def fixLeftovers():
    """Build averaged image embeddings for the words reported as lacking one.

    Steps, in order:
      1. resolve the full-size visual-embeddings folder through DatasetManager,
      2. read the vocabulary with Vocab.readVocabFromCSVFile,
      3. ask Leftovers.findWordsThatLackEmbedings which vocabulary words have
         no embedding under that folder,
      4. print how many such words were found,
      5. pass them to ImageEmbedder.createAverageEmbeddingsForVocab.

    Returns nothing.
    """
    from NormalGlove import Vocab

    embeddingsFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    allWords = Vocab.readVocabFromCSVFile()
    missingWords = Leftovers.findWordsThatLackEmbedings(allWords, embeddingsFolder)
    print("Leftovers found", len(missingWords))

    # The numeric arguments and the [224, 224] pair are forwarded unchanged;
    # their meaning is defined by ImageEmbedder.createAverageEmbeddingsForVocab.
    ImageEmbedder.createAverageEmbeddingsForVocab(
        missingWords, 3, 10, [224, 224], embeddingsFolder, 3
    )
def performPCA(mainFilename, gloveOutputFolder, dimensions, includeSize, skipSize=0, version=1):
    """Create PCA transformers for the visual embeddings and write the reduced files.

    Args:
        mainFilename: Used to name the output folder ("PCA-<mainFilename>") and
            forwarded to PCAReduction.createPCAEmbeddingFiles.
        gloveOutputFolder: Forwarded to PCAReduction.createPCAEmbeddingFiles.
        dimensions: Iterable of numbers; each one has ``skipSize`` added to it
            before being handed to PCAReduction.
        includeSize: Only the first ``includeSize`` embeddings are passed to
            PCAReduction.createPCATransformers.
        skipSize: Added to every entry of ``dimensions`` and also forwarded to
            PCAReduction.createPCAEmbeddingFiles.
        version: Forwarded to DatasetManager.getVisualEmbeddingsFullSizeFolderPath.

    Returns:
        None.

    Raises:
        FileExistsError: If the output path already exists and is not a directory.
    """
    skipDimensions = [d + skipSize for d in dimensions]

    # NOTE(review): the output folder is appended WITHOUT a separator while the
    # embedding file below is appended WITH "/". This presumably relies on the
    # folder path ending in a separator -- confirm against DatasetManager.
    PCAOutputFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "PCA-{}".format(mainFilename)

    # exist_ok=True replaces the former isdir()-then-mkdir() pair: it has no
    # window between the check and the creation, and it also creates missing
    # parent folders instead of failing on them.
    os.makedirs(PCAOutputFolder, exist_ok=True)

    embeddingFilePath = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "/Keyed-VisualGlove-Full"
    model = Model.loadKeyedVectors(embeddingFilePath)

    # One vector per key of model.vocab, in that container's iteration order.
    pureEmbeddings = [model.wv[k] for k in model.vocab]

    PCAReduction.createPCATransformers(
        pureEmbeddings[:includeSize], skipDimensions, PCAOutputFolder)
    PCAReduction.createPCAEmbeddingFiles(
        model, pureEmbeddings, PCAOutputFolder, gloveOutputFolder,
        mainFilename, skipDimensions, skipSize)
def main():
    """Load the full visual embeddings, reduce them and show the result.

    The word -> embedding mapping is read from "VisualGlove-Full.txt" in the
    full-size visual-embeddings folder and saved as 'tempfullEmbeddings'. The
    embeddings are then passed through getTsneEmbeddings, the reduced result is
    saved as 'tempEmbeddings', and visualizeEmbeddings is called with the
    reduced embeddings and their words as labels.

    Returns nothing.
    """
    from Datasets import DatasetManager

    sourceFile = DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "VisualGlove-Full.txt"
    wordToVector = DatasetManager.getWordsAndEmbeddingsFromFile(sourceFile)
    FileProcessing.saveToFile(wordToVector, 'tempfullEmbeddings')

    # Take the keys once so the labels and the vectors share the same order.
    words = list(wordToVector.keys())
    vectors = [wordToVector[word] for word in words]

    print("Getting TSNE embeddings")
    projected = getTsneEmbeddings(vectors)

    print("Saving to file...")
    FileProcessing.saveToFile(projected, 'tempEmbeddings')

    visualizeEmbeddings(projected, words)
wordsFoundInFolders[w] = 1 counter += 1 wordsNotFound = [] for w in fullVocab: if (w not in wordsFoundInFolders): wordsNotFound.append(w) return wordsNotFound if (__name__ == '__main__'): from NormalGlove import Model # removeListCharsFromFile(DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "/VisualGlove-Full.txt", "NewFile") file = DatasetManager.getVisualEmbeddingsFullSizeFolderPath( ) + "/ProperFormat.txt" # currentVocab = DatasetManager._getWordsFromEmbeddingFile(file) # concatenateEmbeddingsFiles(DatasetManager.getVisualEmbeddingsFullSizeFolderPath(), "ProperFormat.txt") temp = Model.loadGloveVectors(file) ''' fullVocab = Vocab.readVocabFromCSVFile() print("Full Vocab loaded") print("Current vocab loaded") currentVocabLookup = {} for w in currentVocab: currentVocabLookup[w] = 1 del currentVocab fullVocabSize = len(fullVocab) missingVocab = []
def concatEmbeddingFiles():
    """Run GloveFormatter.concatenateEmbeddingsFiles on the full-size embeddings folder.

    The folder path comes from DatasetManager.getVisualEmbeddingsFullSizeFolderPath
    and is passed together with the file name "VisualGlove-Full.txt"
    (presumably the name of the combined output file -- confirm against
    GloveFormatter).

    Returns nothing.
    """
    embeddingsFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    fileName = "VisualGlove-Full.txt"
    GloveFormatter.concatenateEmbeddingsFiles(embeddingsFolder, fileName)