def dumpEmbeddings(embeddings, embeddingsFilePath):
    """Serialize a 2D embeddings matrix to a binary file.

    File layout: embeddings count (int), embedding size (int), then each
    embedding's float values in row order, written via the project's
    ``binary`` helpers.

    :param embeddings: 2D array-like of shape (count, size); converted to
        a numpy array if necessary.
    :param embeddingsFilePath: destination path; overwritten if it exists.
    """
    if os.path.exists(embeddingsFilePath):
        os.remove(embeddingsFilePath)

    if not isinstance(embeddings, numpy.ndarray):
        embeddings = numpy.asarray(embeddings)

    embeddingsCount, embeddingSize = embeddings.shape

    # 'wb': the binary.write* helpers emit raw bytes; text mode ('w') would
    # corrupt the output on platforms that translate newlines. This also
    # matches the 'wb+' mode used for the contexts files elsewhere in this
    # module.
    with open(embeddingsFilePath, 'wb') as embeddingsFile:
        binary.writei(embeddingsFile, embeddingsCount)
        binary.writei(embeddingsFile, embeddingSize)

        for embeddingIndex, embedding in enumerate(embeddings):
            binary.writef(embeddingsFile, embedding)

            log.progress('Dumping embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

    log.lineBreak()
def dumpWordMap(indexMap, indexMapFilePath):
    """Serialize a {string key -> int index} map to a binary file.

    File layout: map size (int), then for each entry: key length (int),
    key characters, index (int), written via the project's ``binary``
    helpers.

    :param indexMap: mapping of keys to integer indices; iteration order
        determines on-disk entry order.
    :param indexMapFilePath: destination path; overwritten if it exists.
    """
    if os.path.exists(indexMapFilePath):
        os.remove(indexMapFilePath)

    # 'wb': binary.write* helpers produce raw bytes; text mode ('w') is
    # unsafe on platforms that translate newlines.
    with open(indexMapFilePath, 'wb') as indexMapFile:
        indexMapSize = len(indexMap)

        binary.writei(indexMapFile, indexMapSize)

        # enumerate(..., start=1) replaces the manual itemIndex counter;
        # progress is reported as 1-based, same as before.
        for itemIndex, (key, index) in enumerate(indexMap.items(), start=1):
            binary.writei(indexMapFile, len(key))
            binary.writes(indexMapFile, key)
            binary.writei(indexMapFile, index)

            log.progress('Dumping map: {0:.3f}%.', itemIndex, indexMapSize)

        indexMapFile.flush()

    log.lineBreak()
def processData(inputDirectoryPath, w2vEmbeddingsFilePath, fileIndexMapFilePath,
                wordIndexMapFilePath, wordEmbeddingsFilePath, contextsPath,
                windowSize, negative, strict):
    """Build index contexts from text files and dump maps and embeddings.

    Reads every ``*.txt`` file under ``inputDirectoryPath`` (sorted order),
    converts each word context into index form prefixed with the file index,
    optionally appends ``negative`` negative samples per context, and writes:

    - ``contextsPath``: binary contexts file
      (header: contexts count, window size, negative count)
    - ``fileIndexMapFilePath`` / ``wordIndexMapFilePath``: index maps
    - ``wordEmbeddingsFilePath``: w2v embeddings for the words actually seen

    Contexts containing any word missing from the w2v vocabulary are skipped.

    :param windowSize: total context width; one slot is reserved for the
        file index, the rest for words.
    :param negative: number of negative samples per context; 0 disables the
        second (negative-sampling) pass.
    :param strict: forwarded to generateNegativeSamples.
    """
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = windowSize - fileContextSize

    fileIndexMap = {}
    # OrderedDict so word indices stay aligned with wordEmbeddings order.
    wordIndexMap = collections.OrderedDict()
    wordEmbeddings = []

    # With negative sampling the raw contexts go to a temp file first;
    # the final contextsPath is produced by a second pass over that file.
    noNegativeSamplingPath = contextsPath
    if negative > 0:
        noNegativeSamplingPath += '.temp'

    if os.path.exists(noNegativeSamplingPath):
        os.remove(noNegativeSamplingPath)

    pathName = inputDirectoryPath + '/*.txt'
    textFilePaths = glob.glob(pathName)
    textFilePaths = sorted(textFilePaths)
    textFileCount = len(textFilePaths)

    w2vWordIndexMap, w2vEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsFilePath)

    contextsCount = 0
    with open(noNegativeSamplingPath, 'wb+') as noNegativeSamplingFile:
        binary.writei(noNegativeSamplingFile, 0)  # placeholder for contexts count, back-filled below
        binary.writei(noNegativeSamplingFile, windowSize)
        binary.writei(noNegativeSamplingFile, 0)  # no negative samples in this pass

        startTime = time.time()

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileIndexMap[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath=textFilePath)
            for wordContext in contextProvider.iterate(wordContextSize):
                # Skip any context containing a word absent from the w2v vocabulary.
                allWordsInWordVocabulary = [word in w2vWordIndexMap for word in wordContext]
                if not all(allWordsInWordVocabulary):
                    continue

                for word in wordContext:
                    if word not in wordIndexMap:
                        wordIndexMap[word] = len(wordIndexMap)
                        wordEmbeddingIndex = w2vWordIndexMap[word]
                        wordEmbedding = w2vEmbeddings[wordEmbeddingIndex]
                        wordEmbeddings.append(wordEmbedding)

                # List comprehension instead of map(): identical result, and
                # the list concatenation stays valid under Python 3 as well.
                indexContext = [textFileIndex] + [wordIndexMap[word] for word in wordContext]

                binary.writei(noNegativeSamplingFile, indexContext)
                contextsCount += 1

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Words: {3}. Contexts: {4}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordIndexMap),
                         contextsCount)

        log.lineBreak()

        # Back-fill the contexts-count placeholder at the head of the file.
        noNegativeSamplingFile.seek(0, io.SEEK_SET)
        binary.writei(noNegativeSamplingFile, contextsCount)
        noNegativeSamplingFile.flush()

    if negative > 0:
        with open(contextsPath, 'wb+') as contextsFile:
            startTime = time.time()

            contextProvider = parameters.IndexContextProvider(noNegativeSamplingPath)

            binary.writei(contextsFile, contextsCount)
            binary.writei(contextsFile, windowSize)
            binary.writei(contextsFile, negative)

            batchSize = 10000
            # Explicit floor division: the original relied on Python 2's
            # integer '/', which would yield a float batch count on Python 3.
            batchesCount = contextsCount // batchSize + 1

            wordIndices = list(wordIndexMap.values())
            wordIndices = numpy.asarray(wordIndices)
            maxWordIndex = max(wordIndices)

            for batchIndex in range(0, batchesCount):
                contexts = contextProvider[batchIndex * batchSize : (batchIndex + 1) * batchSize]
                negativeSamples = generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                contexts = numpy.ravel(contexts)

                binary.writei(contextsFile, contexts)

                currentTime = time.time()
                elapsed = currentTime - startTime

                log.progress('Negative sampling: {0:.3f}%. Elapsed: {1}.',
                             batchIndex + 1,
                             batchesCount,
                             log.delta(elapsed))

            log.lineBreak()
            contextsFile.flush()

        os.remove(noNegativeSamplingPath)

    parameters.dumpWordMap(fileIndexMap, fileIndexMapFilePath)
    parameters.dumpWordMap(wordIndexMap, wordIndexMapFilePath)
    parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsFilePath)