def _getCombinedGloveFolder(name, gloveSize, visualSize, version=1):
    """Return the folder path for a combined Glove/Visual embedding set.

    Version 1 datasets live directly under the datasets root; later
    versions are namespaced under a "V<version>" subfolder.
    """
    datasetsRoot = DatasetManager._getDatasetsFolderPath()
    if version == 1:
        return datasetsRoot + "/CombinedGlove-{}/Glove{}-Visual{}/".format(
            name, gloveSize, visualSize)
    return datasetsRoot + "/V{}/CombinedGlove-{}/Glove{}-Visual{}/".format(
        version, name, gloveSize, visualSize)
def getTopAndSkipCombined(top, skip, version=1):
    """Return keyed-vector paths for every standard (glove, visual) size
    pair of a Top-<top>K / Skip-<skip> combined dataset."""
    sizes = [(50, 50), (50, 150), (100, 100), (200, 100), (100, 200),
             (50, 300), (100, 300), (300, 50), (300, 150), (300, 300)]
    if version == 1:
        basePath = DatasetManager._getDatasetsFolderPath() \
            + "/CombinedGlove-Top{}K-Skip{}/".format(top, skip)
    else:
        basePath = DatasetManager._getDatasetsFolderPath() \
            + "/V{}/CombinedGlove-Top{}K-Skip{}/".format(version, top, skip)
    paths = []
    for gloveDim, visualDim in sizes:
        paths.append(basePath + "Glove{}-Visual{}/Keyed-Glove{}-Visual{}".format(
            gloveDim, visualDim, gloveDim, visualDim))
    return paths
def concatToStandardGlove(visualGloveBaseFolder, name, sizeCombinations=None, version=1):
    """Concatenate visual-glove embeddings onto standard Glove embeddings.

    For every (gloveSize, visualGloveSize) pair, combines the standard keyed
    Glove file with the matching visual-glove file, writes the combined
    glove-format file, and converts it to KeyedVectors.

    Parameters:
        visualGloveBaseFolder: folder prefix for the visual-glove files;
            "-<size>/Keyed-VisualGlove-<size>" is appended per pair.
        name: dataset name used to build output paths.
        sizeCombinations: iterable of (gloveSize, visualGloveSize) pairs;
            defaults to STANDARD_COMBINATIONS.
        version: dataset layout version (see _getCombinedGloveFolder).
    """
    if sizeCombinations is None:  # idiom fix: compare to None with `is`
        sizeCombinations = STANDARD_COMBINATIONS
    for gloveSize, visualGloveSize in sizeCombinations:
        gloveFile = DatasetManager._getDatasetsFolderPath(
        ) + "/StandardGlove/Keyed-Glove{}-Visual0".format(gloveSize)
        visualGloveFile = visualGloveBaseFolder + "-{}/Keyed-VisualGlove-{}".format(
            visualGloveSize, visualGloveSize)
        saveDir = _getCombinedGloveFolder(name, gloveSize, visualGloveSize, version)
        # exist_ok=True replaces the racy isdir-then-makedirs check
        os.makedirs(saveDir, exist_ok=True)
        newFileName = _getCombinedGlovedFilename(name, gloveSize, visualGloveSize, version)
        GloveFormatter.combineGloveFiles(visualGloveFile, gloveFile, newFileName)
        keyedFilename = _getCombinedKeyedFilename(name, gloveSize, visualGloveSize, version)
        GloveFormatter.createKeyedVectorsFromGloveFile(newFileName, keyedFilename)
def generateWordCouples():
    """Parse the BLESS dataset into (word, word2, relationType) triples.

    Each line is expected to contain four space-separated fields:
    word, <ignored>, relation type, second word; everything is lower-cased.
    """
    couples = []
    with open(DatasetManager.getBLESSDataset(), 'r') as file:
        for line in file:  # stream instead of readlines(); same line order
            # renamed `type` -> `relationType`: don't shadow the builtin
            word, _, relationType, word2 = line.lower().strip().split(' ')
            couples.append((word, word2, relationType))
    return couples
def main():
    """Load the full-size visual embeddings, reduce them with t-SNE and
    visualize the result; intermediate results are cached to temp files."""
    from Datasets import DatasetManager
    embeddingPath = DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "VisualGlove-Full.txt"
    embeddings = DatasetManager.getWordsAndEmbeddingsFromFile(embeddingPath)
    FileProcessing.saveToFile(embeddings, 'tempfullEmbeddings')
    # idiom fix: values()/list(dict) instead of comprehensions over .keys();
    # dict preserves insertion order, so values align with the labels below
    pureEmbeddings = list(embeddings.values())
    print("Getting TSNE embeddings")
    reducedEmbeddings = getTsneEmbeddings(pureEmbeddings)
    print("Saving to file...")
    FileProcessing.saveToFile(reducedEmbeddings, 'tempEmbeddings')
    labels = list(embeddings)
    visualizeEmbeddings(reducedEmbeddings, labels)
def fixLeftovers():
    """Create average embeddings for vocabulary words that lack any embedding
    in the full-size visual embeddings folder."""
    from NormalGlove import Vocab
    embeddingsFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    fullVocab = Vocab.readVocabFromCSVFile()
    missingWords = Leftovers.findWordsThatLackEmbedings(fullVocab, embeddingsFolder)
    print("Leftovers found", len(missingWords))
    ImageEmbedder.createAverageEmbeddingsForVocab(missingWords, 3, 10, [224, 224],
                                                  embeddingsFolder, 3)
def performPCA(mainFilename, gloveOutputFolder, dimensions, includeSize, skipSize=0, version=1):
    """Fit PCA transformers on the full visual embeddings and write reduced
    embedding files.

    Parameters:
        mainFilename: base name for the PCA output folder and files.
        gloveOutputFolder: destination folder for the glove-format output.
        dimensions: target dimensionalities (skipSize is added to each).
        includeSize: number of leading embeddings used to fit the PCA.
        skipSize: extra leading components computed and then skipped.
        version: dataset layout version.
    """
    skipDimensions = [d + skipSize for d in dimensions]
    PCAOutputFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "PCA-{}".format(mainFilename)
    # makedirs(exist_ok=True): non-racy, and also creates missing parents
    # (the old isdir + mkdir failed when intermediate folders were absent)
    os.makedirs(PCAOutputFolder, exist_ok=True)
    embeddingFilePath = DatasetManager.getVisualEmbeddingsFullSizeFolderPath(
        version) + "/Keyed-VisualGlove-Full"
    model = Model.loadKeyedVectors(embeddingFilePath)
    pureEmbeddings = [model.wv[k] for k in model.vocab]
    PCAReduction.createPCATransformers(pureEmbeddings[:includeSize],
                                       skipDimensions, PCAOutputFolder)
    PCAReduction.createPCAEmbeddingFiles(model, pureEmbeddings, PCAOutputFolder,
                                         gloveOutputFolder, mainFilename,
                                         skipDimensions, skipSize)
def findWordsThatLackEmbedings(fullVocab, folderPathToEmbeddingsFiles):
    """Return the words of fullVocab that appear in none of the embedding
    files inside folderPathToEmbeddingsFiles.

    (Name keeps the original public misspelling so callers keep working.)
    """
    # BUG FIX: the old progress counter counted unique *words* but was
    # printed with a "File" label; now it genuinely counts files.
    wordsFound = set()  # set membership instead of a dict-as-set
    for fileIndex, filePath in enumerate(os.listdir(folderPathToEmbeddingsFiles)):
        print("File", fileIndex)
        wordsFound.update(
            DatasetManager._getWordsFromEmbeddingFile(
                folderPathToEmbeddingsFiles + "/" + filePath))
    return [w for w in fullVocab if w not in wordsFound]
def concatenateEmbeddingsFiles(folderPath, newFileName):
    """Merge every embedding file directly inside folderPath into one
    glove-format file, keeping only the first occurrence of each word."""
    embeddedWords = {}
    with open(newFileName, 'w', encoding='utf-8') as newFile:
        regularFiles = [p for p in os.listdir(folderPath)
                        if not os.path.isdir(folderPath + "/" + p)]
        for filePath in regularFiles:
            print(filePath)
            localEmbeddings = DatasetManager.getWordsAndEmbeddingsFromFile(
                folderPath + "/" + filePath, asStr=True)
            for w in localEmbeddings:  # the old enumerate() index was unused
                if w not in embeddedWords:
                    embeddedWords[w] = 1
                    newFile.write("{} {}\n".format(
                        w, _embeddingsToString(localEmbeddings[w], strEmbeddings=True)))
    print("Processed lines:", len(embeddedWords))
def _resizeWorker(id, vocab, imgSize):
    """Worker that fetches same-size glove images for each word in vocab and
    caches them to disk; failures are counted and skipped (best-effort)."""
    print("Starting worker", id)
    failed = 0
    for i, w in enumerate(vocab):
        try:
            if (i + 1) % 10 == 0:
                print("{}: {}/{} Failed: {}".format(id, i, len(vocab), failed))
            imgs = DatasetManager.getSameSizeGloveImages(w, imgSize, asNumpy=True)
            # explicit len() check kept: imgs may be a numpy container where
            # plain truthiness is ambiguous — TODO confirm return type
            if len(imgs) == 0:
                failed += 1
                continue
            FileProcessing.saveToFile(imgs, _getResizedWordPath(w))
        except Exception:  # deliberate best-effort: skip words that error out
            failed += 1
def _getDataset(filepath=None):
    """Parse the synonyms dataset into a list of section dicts.

    The file consists of sections separated by '='.  Inside a section the
    markers KEY / SYN / ANT introduce word lists (colons and periods act as
    separators).  Sections without a KEY word, or with neither synonyms nor
    antonyms, are dropped.

    Parameters:
        filepath: dataset path; defaults to DatasetManager.getSynonymsDataset().

    Returns:
        list of dicts with 'KEY', 'SYN' and/or 'ANT' keys mapping to
        lower-cased word lists.
    """
    if filepath is None:
        filepath = DatasetManager.getSynonymsDataset()
    with open(filepath, 'r') as file:
        sections = file.read().split('=')
    parsedSections = []
    for s in sections:
        # BUG FIX: the original line ended with a stray `, "\n"`, turning
        # subSections into a (list, "\n") tuple; the "\n" element was then
        # pointlessly re-tokenized (a no-op) every pass.  With the tuple
        # gone, each sub-section string is tokenized directly.
        subSections = s.strip().replace('\n', ' ').replace('.', ' ').split(':')
        sortedSection = {}
        currentSection = ""
        for sub in subSections:
            for w in sub.split(' '):
                w = w.strip()
                if w == "KEY":
                    currentSection = "KEY"
                    sortedSection[w] = []
                elif w == "SYN":
                    currentSection = "SYN"
                    sortedSection[w] = []
                elif w == "ANT":
                    currentSection = "ANT"
                    sortedSection[w] = []
                elif currentSection != "" and len(w) > 0 and not str.isspace(w):
                    sortedSection[currentSection].append(w.lower())
        # a section is only useful with a KEY word and at least one relation
        if 'KEY' not in sortedSection or len(sortedSection['KEY']) == 0:
            continue
        if (('SYN' in sortedSection and len(sortedSection['SYN']) > 0)
                or ('ANT' in sortedSection and len(sortedSection['ANT']) > 0)):
            parsedSections.append(sortedSection)
    return parsedSections
def main():
    """Combine standard Glove files with Top-100K visual embeddings for a
    fixed set of (glove, visual) size pairs and convert the results to
    KeyedVectors."""
    datasetPath = DatasetManager._getDatasetsFolderPath()
    print(datasetPath)
    sizes = [(50, 50), (50, 150), (100, 100), (100, 200), (200, 100)]
    for gloveSize, visualGloveSize in sizes:
        gloveFile = datasetPath + "/StandardGlove/glove.6B.{}d.txt".format(
            gloveSize)
        visualGloveFile = datasetPath + "/VisualEmbeddings/Top-100K-{}/VisualGlove-{}.txt".format(
            visualGloveSize, visualGloveSize)
        # NOTE(review): output dir is hard-coded to one machine's home folder;
        # consider deriving it from datasetPath instead
        saveDir = "/home/ubuntu/VisualGlove/Datasets/CombinedGlove-Top100K/Glove{}-Visual{}/".format(
            gloveSize, visualGloveSize)
        # exist_ok=True replaces the racy isdir-then-makedirs check
        os.makedirs(saveDir, exist_ok=True)
        newFileName = saveDir + "CombinedGlove-{}-{}.txt".format(
            gloveSize, visualGloveSize)
        GloveFormatter.combineGloveFiles(visualGloveFile, gloveFile, newFileName)
        keyedFilename = saveDir + "Keyed-Glove{}-Visual{}".format(
            gloveSize, visualGloveSize)
        GloveFormatter.createKeyedVectorsFromGloveFile(newFileName, keyedFilename)
def main():
    """Concatenate the V2 Top-400K visual embeddings with standard Glove."""
    version = 2
    skipSize = 0
    includeSize = 400000
    dimensions = [25, 50, 100, 150, 200, 300]
    datasetName = "Top{}K-Skip{}".format(round(includeSize / 1000), skipSize)
    outputFolder = DatasetManager.getVisualEmbeddingsFolderPath(version) + datasetName
    # Earlier pipeline steps, kept for reference (disabled):
    # PerformPCA.performPCA(datasetName, outputFolder, dimensions,
    #                       includeSize, skipSize, version)
    # # Convert into KeyedVectors
    # for d in dimensions:
    #     folderPath = outputFolder + "-{}/".format(d)
    #     glovePath = folderPath + "VisualGlove-{}-{}.txt".format(datasetName, d)
    #     keyPath = folderPath + "Keyed-VisualGlove-{}".format(d)
    #     GloveFormatter.createKeyedVectorsFromGloveFile(glovePath, keyPath)
    # Concat with standard Glove
    sizeCombinations = [(100, 300), (50, 300)]
    ConcatKeyedVectors.concatToStandardGlove(outputFolder,
                                             datasetName,
                                             sizeCombinations=sizeCombinations,
                                             version=version)
def main():
    """Convert the V2 full-size VisualGlove text file into gensim KeyedVectors."""
    glovePath = DatasetManager.getVisualEmbeddingsFolderPath(
        2) + "/VisualGlove-2.0 Full.txt"
    GloveFormatter.createKeyedVectorsFromGloveFile(
        glovePath, "Keyed-VisualGlove-2.0-Full")
# NOTE(review): the ''' below opens a triple-quoted string that is not closed
# within this block — it apparently disables code that follows elsewhere in
# the file; left untouched pending manual reconciliation.  TODO confirm.
'''
def readVocabFromCSVFile():
    """Load the Glove vocabulary from its CSV file (the file's first row)."""
    vocabPath = DatasetManager.getGloveVocabCSVPath()
    with open(vocabPath, 'r', encoding='utf-8') as vocabFile:
        allRows = list(csv.reader(vocabFile))
    return allRows[0]
def getTopAndSkipKVisualOnly(top, skip, version=1):
    """Keyed visual-only embedding paths for each standard dimensionality."""
    basePath = DatasetManager.getVisualEmbeddingsFolderPath(version)
    paths = []
    for dim in (50, 100, 200, 300):
        paths.append(basePath + "Top{}K-Skip{}-{}/Keyed-VisualGlove-{}".format(
            top, skip, dim, dim))
    return paths
def getTop100KCombinedPaths():
    """Keyed-vector paths for the Top-100K combined Glove/Visual datasets.

    Fix: folderPath already ends with '/', so the per-size suffix no longer
    starts with one (the old code produced '...Top100K//Glove...'); this
    matches how getTopAndSkipCombined builds its paths.
    """
    sizes = [(50, 50), (50, 150), (100, 100), (200, 100), (100, 200)]
    folderPath = DatasetManager._getDatasetsFolderPath() + "/CombinedGlove-Top100K/"
    return [folderPath + "Glove{}-Visual{}/Keyed-Glove{}-Visual{}".format(g, v, g, v)
            for g, v in sizes]
# NOTE(review): extraction residue — this span duplicates the tail of
# findWordsThatLackEmbedings (statements detached from any function), then a
# __main__ guard, then an unterminated triple-quoted block of disabled code.
# Left byte-identical; needs manual reconciliation with the surrounding file.
# TODO confirm which parts are live.
wordsFoundInFolders[w] = 1 counter += 1 wordsNotFound = [] for w in fullVocab: if (w not in wordsFoundInFolders): wordsNotFound.append(w) return wordsNotFound if (__name__ == '__main__'): from NormalGlove import Model # removeListCharsFromFile(DatasetManager.getVisualEmbeddingsFullSizeFolderPath() + "/VisualGlove-Full.txt", "NewFile") file = DatasetManager.getVisualEmbeddingsFullSizeFolderPath( ) + "/ProperFormat.txt" # currentVocab = DatasetManager._getWordsFromEmbeddingFile(file) # concatenateEmbeddingsFiles(DatasetManager.getVisualEmbeddingsFullSizeFolderPath(), "ProperFormat.txt") temp = Model.loadGloveVectors(file) ''' fullVocab = Vocab.readVocabFromCSVFile() print("Full Vocab loaded") print("Current vocab loaded") currentVocabLookup = {} for w in currentVocab: currentVocabLookup[w] = 1 del currentVocab fullVocabSize = len(fullVocab) missingVocab = []
def _loadVocab():
    """Read the Glove vocabulary CSV; the vocabulary is the file's first row."""
    with open(DatasetManager.getGloveVocabCSVPath(), 'r') as csvFile:
        allRows = list(csv.reader(csvFile))
    firstRow = allRows[0]
    return firstRow
def getNormalGlovePaths():
    """Keyed standard-Glove (no visual component) paths per dimensionality."""
    normalGloveFolder = DatasetManager.getNormalGloveFolderPath()
    paths = []
    for dim in (50, 100, 200, 300):
        paths.append(normalGloveFolder + "/Keyed-Glove{}-Visual0".format(dim))
    return paths
def getTop100KVisualOnly():
    """Keyed visual-only Top-100K embedding paths per dimensionality."""
    visualRoot = DatasetManager.getVisualEmbeddingsFolderPath()
    return [visualRoot + "Top-100K-{}/Keyed-VisualGlove-{}".format(dim, dim)
            for dim in (50, 100, 200, 300)]
def concatEmbeddingFiles():
    """Merge the per-chunk embedding files in the full-size folder into one
    combined glove-format file."""
    fullSizeFolder = DatasetManager.getVisualEmbeddingsFullSizeFolderPath()
    GloveFormatter.concatenateEmbeddingsFiles(fullSizeFolder, "VisualGlove-Full.txt")