def whiteListPriorityComparator(wordInfoX, wordInfoY):
    global pruningStepIndex

    pruningStepIndex += 1
    # Throttle logging to every 1000th comparison.
    if pruningStepIndex % 1000 == 0:
        log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

    wordX, infoX = wordInfoX
    wordY, infoY = wordInfoY

    wordXIsWhite = wordX in whiteList
    wordYIsWhite = wordY in whiteList

    # White-listed words always sort ahead of non-white-listed words.
    if wordXIsWhite and wordYIsWhite:
        return 0
    elif wordXIsWhite:
        return -1
    elif wordYIsWhite:
        return 1

    # Otherwise order by frequency, most frequent first.
    frequencyX = infoX[1]
    frequencyY = infoY[1]

    if frequencyX < frequencyY:
        return 1
    elif frequencyX > frequencyY:
        return -1

    return 0
def loadWord2VecEmbeddings(filePath):
    with open(filePath, 'rb') as file:
        firstLine = file.readline()
        embeddingsCount, embeddingSize = tuple(firstLine.split(' '))
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)
        embeddingFormat = '{0}f'.format(embeddingSize)

        wordIndexMap = {}
        embeddings = []

        log.info('Vocabulary size: {0}. Embedding size: {1}.', embeddingsCount, embeddingSize)

        embeddingIndex = 0
        while True:
            word = ''
            while True:
                char = file.read(1)

                if not char:
                    log.lineBreak()
                    return wordIndexMap, embeddings

                if char == ' ':
                    word = word.strip()
                    break

                word += char

            embedding = struct.unpack(embeddingFormat, file.read(embeddingSize * 4))
            wordIndexMap[word] = len(wordIndexMap)
            embeddings.append(embedding)

            embeddingIndex += 1
            log.progress('Reading embeddings: {0:.3f}%.', embeddingIndex, embeddingsCount)
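# A minimal sketch (not part of the original code) of a writer producing the
# on-disk layout that loadWord2VecEmbeddings above expects: a text header line
# '<vocabulary size> <embedding size>', then, per word, the raw token, a single
# space and <embedding size> float32 values packed with struct. The trailing
# newline per record mirrors the original word2vec tool; the loader strips it.
import struct

def writeWord2VecEmbeddings(filePath, wordIndexMap, embeddings):
    embeddingSize = len(embeddings[0])

    with open(filePath, 'wb') as file:
        file.write('{0} {1}\n'.format(len(wordIndexMap), embeddingSize))

        for word, index in wordIndexMap.items():
            file.write(word + ' ')
            file.write(struct.pack('{0}f'.format(embeddingSize), *embeddings[index]))
            file.write('\n')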
def test(self, classifier, testData, batchSize=None):
    log.start('Testing classifier')

    inputData, labels = testData
    batchSize = batchSize if batchSize is not None else inputData.shape[0]
    batchesCount = inputData.shape[0] / batchSize + 1

    predictions = None
    for batchIndex in xrange(batchesCount):
        inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]

        if predictions is None:
            predictions = classifier.classify(inputBatch)
        else:
            batchPredictions = classifier.classify(inputBatch)
            if len(batchPredictions):
                predictions = numpy.concatenate([predictions, batchPredictions])

        log.progress('Testing classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

    performance = Case.roc_auc_truncated(labels, predictions)

    testMetrics = TestMetrics(performance)
    log.done(testMetrics)

    return testMetrics
def extract(outputDirectoryPath, outputConcatFilePath, connector):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)

    textContainersCount = connector.count()
    log.info('Found {0} text containers.', textContainersCount)

    if os.path.exists(outputConcatFilePath):
        os.remove(outputConcatFilePath)

    pagesCount = 0
    startTime = time.time()

    for textContainerIndex, name, text in connector.iterate():
        text = clean(text)

        outputFilePath = os.path.join(outputDirectoryPath, name + '.txt')

        saveText(outputFilePath, text)
        saveText(outputConcatFilePath, text)

        currentTime = time.time()
        elapsed = currentTime - startTime
        pagesCount += 1

        log.progress('Extracting text containers: {0:.3f}%. Elapsed: {1}. Pages: {2}.',
                     textContainerIndex + 1, textContainersCount,
                     log.delta(elapsed), pagesCount)

    log.lineBreak()
def loadWordVocabulary(vocabularyFilePath, loadFrequencies=True):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            frequency = file.read(4)
            frequency = struct.unpack('i', frequency)[0]

            vocabulary[word] = (index, frequency) if loadFrequencies else index

            log.progress('Loading word vocabulary: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.lineBreak()

    return vocabulary
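# The matching writer (parameters.dumpWordVocabulary) is not shown in this
# section; the following is only a sketch derived from the reader above, which
# expects an int32 item count followed, per item, by an int32 word length, the
# raw word bytes, an int32 index and an int32 frequency.
import struct

def dumpWordVocabularyExample(vocabulary, vocabularyFilePath):
    with open(vocabularyFilePath, 'wb') as file:
        file.write(struct.pack('i', len(vocabulary)))

        for word, (index, frequency) in vocabulary.items():
            file.write(struct.pack('i', len(word)))
            file.write(word)
            file.write(struct.pack('i', index))
            file.write(struct.pack('i', frequency))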
def dumpFileVocabulary(vocabulary, vocabularyFilePath):
    if os.path.exists(vocabularyFilePath):
        os.remove(vocabularyFilePath)

    itemsCount = len(vocabulary)
    itemIndex = 0

    with open(vocabularyFilePath, 'w') as file:
        file.write(struct.pack('i', itemsCount))

        for key, index in vocabulary.items():
            keyLength = len(key)
            keyLength = struct.pack('i', keyLength)
            index = struct.pack('i', index)

            file.write(keyLength)
            file.write(key)
            file.write(index)

            itemIndex += 1
            log.progress('Dumping file vocabulary: {0:.3f}%.', itemIndex, itemsCount)

        file.flush()

    log.lineBreak()
def make():
    words = []
    words += getSyntacticWordRelationsWords('res/Syntactic-Word-Relations/questions-words.txt')
    words += getSATWords('res/SAT-Questions/SAT-package-V3.txt')
    words += getSimLex999Words('res/SimLex-999/SimLex-999.txt')
    words += getWordSimilarity353Words('res/WordSimilarity-353/combined.csv')
    words += getRubensteinGoodenoughWords('res/RG/EN-RG-65.txt')

    words = list(set(words))
    words = sorted(words)

    log.info('Found {0} words.', len(words))

    whiteListPath = 'res/Tools/white_list.txt'
    if os.path.exists(whiteListPath):
        os.remove(whiteListPath)

    with open(whiteListPath, 'w+') as whiteListFile:
        batchSize = 10
        batchesCount = len(words) / batchSize + 1

        for batchIndex in xrange(0, batchesCount):
            batch = words[batchIndex * batchSize:(batchIndex + 1) * batchSize]
            line = ' '.join(batch) + '\n'
            line = line.lower()

            whiteListFile.write(line)

            log.progress('Saving white list: {0:.0f}%.', batchIndex + 1, batchesCount)

        log.lineBreak()
        log.info('White list has been saved.')
def buildWordMaps(texts, w2vWordIndexMap, w2vWordEmbeddings):
    wordIndexMap = collections.OrderedDict()
    wordFrequencyMap = collections.OrderedDict()

    for textIndex, text in enumerate(texts):
        for word in weeding.iterateWords(text):
            if word not in w2vWordIndexMap:
                continue

            if word not in wordIndexMap:
                wordIndexMap[word] = len(wordIndexMap)
                wordFrequencyMap[word] = 1
            else:
                wordFrequencyMap[word] += 1

        log.progress('Building word maps: {0:.3f}%. Words: {1}.', textIndex + 1, len(texts), len(wordIndexMap))

    log.lineBreak()

    wordEmbeddings = numpy.zeros((len(wordIndexMap), w2vWordEmbeddings.shape[1]))
    for word, index in wordIndexMap.items():
        # Look the embedding up by word: the local index differs from the w2v index.
        wordEmbeddings[index] = w2vWordEmbeddings[w2vWordIndexMap[word]]

        log.progress('Copying w2v embeddings: {0:.3f}%.', index + 1, len(wordIndexMap))

    log.lineBreak()

    return wordIndexMap, wordFrequencyMap, wordEmbeddings
def trainTextVectors(connector, w2vEmbeddingsPath, wordIndexMapPath, wordFrequencyMapPath, wordEmbeddingsPath,
                     contextsPath, sample, minCount, windowSize, negative, strict, contextsPerText,
                     superBatchSize, fileEmbeddingSize, epochs, learningRate, fileEmbeddingsPath):
    if exists(wordIndexMapPath) and exists(wordFrequencyMapPath) and exists(wordEmbeddingsPath) \
            and exists(contextsPath) and exists(pathTo.textIndexMap):
        wordIndexMap = parameters.loadMap(wordIndexMapPath)
        wordFrequencyMap = parameters.loadMap(wordFrequencyMapPath)
        wordEmbeddings = parameters.loadEmbeddings(wordEmbeddingsPath)
        textIndexMap = parameters.loadMap(pathTo.textIndexMap)
    else:
        w2vWordIndexMap, w2vWordEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsPath)

        names, texts = extract(connector)
        wordIndexMap, wordFrequencyMap, wordEmbeddings = buildWordMaps(texts, w2vWordIndexMap, w2vWordEmbeddings)

        parameters.dumpWordMap(wordIndexMap, wordIndexMapPath)
        del w2vWordIndexMap
        del w2vWordEmbeddings
        gc.collect()

        parameters.dumpWordMap(wordFrequencyMap, wordFrequencyMapPath)

        log.progress('Dumping contexts...')
        parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsPath)
        log.info('Dumped indices, frequencies and embeddings')

        texts = subsampleAndPrune(texts, wordFrequencyMap, sample, minCount)

        textIndexMap = inferContexts(contextsPath, names, texts, wordIndexMap, windowSize, negative, strict,
                                     contextsPerText)

        parameters.dumpWordMap(textIndexMap, pathTo.textIndexMap)

    with h5py.File(contextsPath, 'r') as contextsFile:
        contexts = contextsFile['contexts']
        log.info('Loaded {0} contexts. Shape: {1}', len(contexts), contexts.shape)

        fileEmbeddings = numpy.random.rand(len(contexts), fileEmbeddingSize).astype('float32')
        trainingBatch = numpy.zeros((superBatchSize, contextsPerText, 1 + windowSize + negative)).astype('int32')
        superBatchesCount = len(contexts) / superBatchSize

        for superBatchIndex in xrange(0, superBatchesCount):
            log.info('Text batch: {0}/{1}.', superBatchIndex + 1, superBatchesCount)

            # TODO: this only works if superBatchSize == textsCount; otherwise text indices do not match
            contexts.read_direct(trainingBatch,
                                 source_sel=numpy.s_[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize])
            trainingBatchReshaped = trainingBatch.reshape((superBatchSize * contextsPerText, 1 + windowSize + negative))

            fileEmbeddingsBatch = fileEmbeddings[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            model = traininig.Model(fileEmbeddingsBatch, wordEmbeddings, contextSize=windowSize - 2, negative=negative)
            traininig.train(model, textIndexMap, wordIndexMap, wordEmbeddings, trainingBatchReshaped,
                            epochs, 1, learningRate)

            fileEmbeddings[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize] = \
                model.fileEmbeddings.get_value()
            contextsFile.flush()

        log.progress('Dumping text embeddings...')
        binary.dumpTensor(fileEmbeddingsPath, fileEmbeddings)
        log.info('Dumping text embeddings complete')
def trainModel(fileVocabulary, wordVocabulary, contextProvider, model, superBatchSize, miniBatchSize,
               parametersPath, embeddingsPath, learningRate, l1Coefficient, l2Coefficient, epochs, metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    superBatchesCount = contextProvider.contextsCount / superBatchSize + 1
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            contextSuperBatch = contextProvider[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            # Column 0 holds the file index, columns 1..-2 the context words, the last column the target word.
            fileIndices, wordIndices, targetWordIndices = \
                contextSuperBatch[:, 0], contextSuperBatch[:, 1:-1], contextSuperBatch[:, -1]

            model.train(wordIndices, targetWordIndices, miniBatchSize, learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary, model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics, **customMetrics)

            # Keep the best model seen so far.
            if previousTotal < sum(metrics):
                model.dump(parametersPath, embeddingsPath)
                previousTotal = sum(metrics)

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress('Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. '
                         'Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                         epoch + 1, epochs,
                         log.delta(elapsed),
                         epoch,
                         secondsPerEpoch,
                         rg, sim353, simLex999, syntRel, sat,
                         customMetrics['simGemJewel'])

    log.lineBreak()

    return model
def loadWordFrequencyMap(indexMapFilePath):
    wordFrequencyMap = loadMap(indexMapFilePath)

    log.progress('Sorting word frequency map...', 1, 1)

    # Keep the map ordered by frequency, most frequent first.
    wordFrequencyMap = sorted(wordFrequencyMap.items(), key=lambda item: item[1], reverse=True)
    wordFrequencyMap = collections.OrderedDict(wordFrequencyMap)

    log.progress('Sorting word frequency map complete.', 1, 1)
    log.lineBreak()

    return wordFrequencyMap
def train(model, fileIndexMap, wordIndexMap, wordEmbeddings, contexts, epochs, batchSize, learningRate,
          metricsPath=None, pathTo=None):
    model.trainingContexts.set_value(contexts)

    contextsCount, contextSize = contexts.shape
    initialLearningRate = learningRate
    startTime = time.time()
    metrics = {
        'meanError': np.nan,
        'medianError': np.nan,
        'maxError': np.nan,
        'minError': np.nan,
        'learningRate': learningRate
    }

    maxError = None
    for epoch in xrange(0, epochs):
        errors = []
        for contextIndex in xrange(0, contextsCount):
            error = model.trainModel(contextIndex, learningRate)
            errors.append(error)

            log.progress('Training model: {0:.3f}%. Epoch: {1}. Elapsed: {2}. '
                         'Error(mean,median,min,max): {3:.3f}, {4:.3f}, {5:.3f}, {6:.3f}. Learning rate: {7}.',
                         epoch * contextsCount + contextIndex + 1,
                         epochs * contextsCount,
                         epoch + 1,
                         log.delta(time.time() - startTime),
                         metrics['meanError'],
                         metrics['medianError'],
                         metrics['minError'],
                         metrics['maxError'],
                         learningRate)

        # Decay the learning rate after each epoch, clamped to 0.01% of its initial value.
        learningRate = learningRate * (1 - (float(epoch) + 1) / epochs)
        learningRate = max(initialLearningRate * 0.0001, learningRate)

        metrics = {
            'meanError': np.mean(errors),
            'medianError': np.median(errors),
            'maxError': np.max(errors),
            'minError': np.min(errors),
            'learningRate': learningRate
        }

        if pathTo is not None and (maxError is None or maxError > metrics['maxError']):
            model.dump(pathTo.fileEmbeddings, pathTo.weights)
            maxError = metrics['maxError']

        if metricsPath is not None:
            validation.dump(metricsPath, epoch, metrics)
def extract(connector):
    textFilesCount = connector.count()

    names = []
    texts = []
    for textFileIndex, name, text in connector.iterate():
        text = extraction.clean(text)

        names.append(name)
        texts.append(text)

        log.progress('Extracting text: {0:.3f}%. Texts: {1}.',
                     textFileIndex + 1, textFilesCount,
                     textFileIndex + 1)

    log.lineBreak()

    return names, texts
def loadEmbeddings(embeddingsFilePath):
    with open(embeddingsFilePath, 'rb') as embeddingsFile:
        embeddingsCount = binary.readi(embeddingsFile)
        embeddingSize = binary.readi(embeddingsFile)

        embeddings = numpy.empty((embeddingsCount, embeddingSize)).astype('float32')

        for embeddingIndex in range(0, embeddingsCount):
            embedding = binary.readf(embeddingsFile, embeddingSize)
            embeddings[embeddingIndex] = embedding

            log.progress('Loading embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

        log.info('Loading embeddings complete. {0} embeddings loaded.', embeddingsCount)

        return embeddings
def inferContexts(contextsPath, names, texts, wordIndexMap, windowSize, negative, strict, contextsCount):
    textIndexMap = collections.OrderedDict()

    def wordsToIndices(textContext):
        indices = map(lambda word: wordIndexMap[word], textContext)
        return indices

    wordIndices = map(lambda item: item[1], wordIndexMap.items())
    wordIndices = numpy.asarray(wordIndices)
    maxWordIndex = max(wordIndices)

    with h5py.File(contextsPath, 'w') as contextsFile:
        tensor = contextsFile.create_dataset('contexts',
                                             dtype='int32',
                                             shape=(0, contextsCount, 1 + windowSize + negative),  # 1 for file index
                                             maxshape=(None, contextsCount, 1 + windowSize + negative),  # 1 for file index
                                             chunks=(1, contextsCount, 1 + windowSize + negative))  # 1 for file index

        textsCount = 0
        for name, text in zip(names, texts):
            contextProvider = processing.WordContextProvider(text=text, minContexts=contextsCount,
                                                             maxContexts=contextsCount)
            contexts = list(contextProvider.iterate(windowSize))

            if len(contexts) > 0:
                contexts = map(wordsToIndices, contexts)
                textIndexMap[name] = len(textIndexMap)
                contexts = numpy.asarray(contexts)
                textIndices = [[textIndexMap[name]]] * len(contexts)
                contexts = numpy.concatenate([textIndices, contexts], axis=1)

                negativeSamples = processing.generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)

                tensor.resize(tensor.shape[0] + 1, axis=0)
                tensor[-1] = contexts

            textsCount += 1
            log.progress('Creating contexts: {0:.3f}%. Text index map: {1}. Contexts: {2}.',
                         textsCount, len(texts),
                         len(tensor),
                         tensor.shape[0] * tensor.shape[1])

    log.lineBreak()

    return textIndexMap
def frequencyComparator(wordInfoX, wordInfoY):
    global pruningStepIndex

    pruningStepIndex += 1
    # Throttle logging to every 1000th comparison.
    if pruningStepIndex % 1000 == 0:
        log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

    wordX, infoX = wordInfoX
    wordY, infoY = wordInfoY

    # Order by frequency, most frequent first.
    frequencyX = infoX[1]
    frequencyY = infoY[1]

    if frequencyX < frequencyY:
        return 1
    elif frequencyX > frequencyY:
        return -1

    return 0
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info("Output directory {0} has been removed.", outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info("Output directory {0} has been created.", outputDirectoryPath)

    pathName = inputDirectoryPath + "/*wiki*.txt.gz"
    dumpPaths = glob.glob(pathName)[:10]
    dumpsCount = len(dumpPaths)
    log.info("Found {0} Wikipedia dumps.", dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress("Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)",
                     dumpIndex + 1, dumpsCount,
                     dumpName, len(pages),
                     log.delta(elapsed),
                     secondsPerFile)

    log.lineBreak()
    log.info("Processing complete.")
def loadW2VParameters(filePath, loadEmbeddings=True):
    with open(filePath, 'rb') as w2vFile:
        firstLine = w2vFile.readline()
        embeddingsCount, embeddingSize = tuple(firstLine.split(' '))
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)

        wordIndexMap = collections.OrderedDict()
        embeddings = numpy.zeros((embeddingsCount, embeddingSize))

        embeddingIndex = 0
        while True:
            word = ''
            while True:
                char = w2vFile.read(1)

                if not char:
                    log.lineBreak()

                    if loadEmbeddings:
                        return wordIndexMap, embeddings
                    else:
                        return wordIndexMap

                if char == ' ':
                    word = word.strip()
                    break

                word += char

            wordIndexMap[word] = len(wordIndexMap)
            if loadEmbeddings:
                embedding = binary.readf(w2vFile, embeddingSize)
                embeddings[wordIndexMap[word]] = embedding
            else:
                w2vFile.seek(embeddingSize * 4, io.SEEK_CUR)

            embeddingIndex += 1
            log.progress('Loading W2V embeddings: {0:.3f}%. {1} embeddings {2} features each.',
                         embeddingIndex, embeddingsCount,
                         embeddingIndex, embeddingSize)
def subsampleAndPrune(texts, wordFrequencyMap, sample, minCount):
    totalLength = 0.
    prunedLength = 0.

    # The frequency map is expected to be sorted by frequency in descending order,
    # so the first entry holds the maximum frequency.
    maxFrequency = wordFrequencyMap.items()[0][1]

    for textIndex, text in enumerate(texts):
        totalLength += len(text)

        texts[textIndex] = weeding.subsampleAndPrune(text, wordFrequencyMap, maxFrequency, sample, minCount)

        prunedLength += len(texts[textIndex])

        log.progress('Subsampling and pruning text: {0:.3f}%. Removed {1:.3f}% of original text.',
                     textIndex + 1, len(texts),
                     (1 - prunedLength / totalLength) * 100)

    log.lineBreak()

    return texts
def dumpEmbeddings(embeddings, embeddingsFilePath):
    if os.path.exists(embeddingsFilePath):
        os.remove(embeddingsFilePath)

    if not isinstance(embeddings, numpy.ndarray):
        embeddings = numpy.asarray(embeddings)

    embeddingsCount, embeddingSize = embeddings.shape

    with open(embeddingsFilePath, 'w') as embeddingsFile:
        binary.writei(embeddingsFile, embeddingsCount)
        binary.writei(embeddingsFile, embeddingSize)

        for embeddingIndex in range(0, embeddingsCount):
            embedding = embeddings[embeddingIndex]

            binary.writef(embeddingsFile, embedding)

            log.progress('Dumping embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

    log.lineBreak()
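# A hypothetical round-trip check (not from the original module) for the
# dumpEmbeddings/loadEmbeddings pair above, assuming the binary.writef/readf
# helpers round-trip float32 values exactly; the path and shape are
# illustrative only.
import numpy

def embeddingsRoundTripExample():
    embeddings = numpy.random.rand(10, 4).astype('float32')

    dumpEmbeddings(embeddings, '/tmp/embeddings.bin')
    restored = loadEmbeddings('/tmp/embeddings.bin')

    assert numpy.allclose(embeddings, restored)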
def loadMap(indexMapFilePath, inverse=False):
    vocabulary = collections.OrderedDict()

    with open(indexMapFilePath, 'rb') as indexMapFile:
        itemsCount = binary.readi(indexMapFile)

        for itemIndex in range(0, itemsCount):
            wordLength = binary.readi(indexMapFile)
            word = binary.reads(indexMapFile, wordLength)
            index = binary.readi(indexMapFile)

            if inverse:
                vocabulary[index] = word
            else:
                vocabulary[word] = index

            log.progress('Loading word map: {0:.3f}%.', itemIndex + 1, itemsCount)

    log.info('Loading word map complete. {0} words loaded.', itemsCount)

    return vocabulary
def createSubmission(self, classifier, testData, batchSize=None):
    log.start('Creating submission')

    batchSize = batchSize if batchSize is not None else testData.shape[0]
    batchesCount = testData.shape[0] / batchSize + 1

    predictions = None
    for batchIndex in xrange(batchesCount):
        inputBatch = testData[batchIndex * batchSize:(batchIndex + 1) * batchSize]

        if predictions is None:
            predictions = classifier.classify(inputBatch)
        elif len(inputBatch):
            predictions = numpy.concatenate([predictions, classifier.classify(inputBatch)])

        log.progress('Creating submission: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

    submission = pandas.DataFrame({"id": self.testData["id"], "prediction": predictions})

    log.done('submission' + str(submission.shape))

    return submission
def loadFileVocabulary(vocabularyFilePath):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            vocabulary[word] = index

            log.progress('Loading file vocabulary: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.lineBreak()

    return vocabulary
def dumpWordMap(indexMap, indexMapFilePath):
    if os.path.exists(indexMapFilePath):
        os.remove(indexMapFilePath)

    with open(indexMapFilePath, 'w') as indexMapFile:
        indexMapSize = len(indexMap)
        itemIndex = 0

        binary.writei(indexMapFile, indexMapSize)
        for key, index in indexMap.items():
            keyLength = len(key)

            binary.writei(indexMapFile, keyLength)
            binary.writes(indexMapFile, key)
            binary.writei(indexMapFile, index)

            itemIndex += 1
            log.progress('Dumping map: {0:.3f}%.', itemIndex, indexMapSize)

        indexMapFile.flush()

    log.lineBreak()
def innerExecute(self, selector):
    words = []
    embeddings = []

    for operator, word in selector.operands:
        embedding = self.wordEmbeddings[self.wordIndexMap[word]]

        if operator == '-':
            embedding = embedding * (-1)

        words.append(word)
        embeddings.append(embedding)

    minIndex, maxIndex = ExplainFunction.getSurroundingIndices(words, self.wordIndexMap, 5000)

    result = embeddings[0]
    for embedding in embeddings[1:]:
        result += embedding

    scores = []
    for index in xrange(minIndex, maxIndex):
        word = self.wordIndexItems[index][0]

        if word not in words:
            embedding = self.wordEmbeddings[index]
            score = vectors.cosineSimilarity(result, embedding)
            scores.append((word, score))

        log.progress('Looking for closest matches: {0:.3f}%.', index - minIndex + 1, maxIndex - minIndex)

    log.lineBreak()

    scores = sorted(scores, key=lambda s: s[1], reverse=True)

    for score in scores[:10]:
        print score
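# vectors.cosineSimilarity is an external helper not shown in this section; the
# following is a minimal sketch of the standard definition it presumably
# implements (an assumption, not the module's actual code).
import numpy

def cosineSimilarityExample(a, b):
    return numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))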
def train(classifier, trainingData, validationData, batchSize=None):
    log.start('Training classifier')

    inputData, labels = trainingData
    batchSize = batchSize if batchSize is not None else inputData.shape[0]
    batchesCount = inputData.shape[0] / batchSize

    start = time.time()

    for batchIndex in xrange(batchesCount):
        inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
        labelsBatch = labels[batchIndex * batchSize:(batchIndex + 1) * batchSize]

        classifier.fit(inputBatch, labelsBatch)

        log.progress('Training classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

    end = time.time()
    elapsed = end - start

    trainingMetrics = TrainingMetrics(elapsed)
    log.done(trainingMetrics)

    return trainingMetrics
def processData(inputDirectoryPath, w2vEmbeddingsFilePath, fileIndexMapFilePath, wordIndexMapFilePath,
                wordEmbeddingsFilePath, contextsPath, windowSize, negative, strict):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = windowSize - fileContextSize

    fileIndexMap = {}
    wordIndexMap = collections.OrderedDict()
    wordEmbeddings = []

    noNegativeSamplingPath = contextsPath
    if negative > 0:
        noNegativeSamplingPath += '.temp'

    if os.path.exists(noNegativeSamplingPath):
        os.remove(noNegativeSamplingPath)

    pathName = inputDirectoryPath + '/*.txt'
    textFilePaths = glob.glob(pathName)
    textFilePaths = sorted(textFilePaths)
    textFileCount = len(textFilePaths)

    w2vWordIndexMap, w2vEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsFilePath)

    contextsCount = 0
    with open(noNegativeSamplingPath, 'wb+') as noNegativeSamplingFile:
        binary.writei(noNegativeSamplingFile, 0)  # this is a placeholder for contexts count
        binary.writei(noNegativeSamplingFile, windowSize)
        binary.writei(noNegativeSamplingFile, 0)

        startTime = time.time()

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileIndexMap[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath=textFilePath)
            for wordContext in contextProvider.iterate(wordContextSize):
                allWordsInWordVocabulary = [word in w2vWordIndexMap for word in wordContext]

                if not all(allWordsInWordVocabulary):
                    continue

                for word in wordContext:
                    if word not in wordIndexMap:
                        wordIndexMap[word] = len(wordIndexMap)
                        wordEmbeddingIndex = w2vWordIndexMap[word]
                        wordEmbedding = w2vEmbeddings[wordEmbeddingIndex]
                        wordEmbeddings.append(wordEmbedding)

                indexContext = [textFileIndex] + map(lambda w: wordIndexMap[w], wordContext)

                binary.writei(noNegativeSamplingFile, indexContext)
                contextsCount += 1

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Words: {3}. Contexts: {4}.',
                         textFileIndex + 1, textFileCount,
                         log.delta(elapsed), secondsPerFile,
                         len(wordIndexMap),
                         contextsCount)

        log.lineBreak()

        noNegativeSamplingFile.seek(0, io.SEEK_SET)
        binary.writei(noNegativeSamplingFile, contextsCount)
        noNegativeSamplingFile.flush()

    if negative > 0:
        with open(contextsPath, 'wb+') as contextsFile:
            startTime = time.time()

            contextProvider = parameters.IndexContextProvider(noNegativeSamplingPath)

            binary.writei(contextsFile, contextsCount)
            binary.writei(contextsFile, windowSize)
            binary.writei(contextsFile, negative)

            batchSize = 10000
            batchesCount = contextsCount / batchSize + 1

            wordIndices = map(lambda item: item[1], wordIndexMap.items())
            wordIndices = numpy.asarray(wordIndices)
            maxWordIndex = max(wordIndices)

            for batchIndex in xrange(0, batchesCount):
                contexts = contextProvider[batchIndex * batchSize:(batchIndex + 1) * batchSize]
                negativeSamples = generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                contexts = numpy.ravel(contexts)

                binary.writei(contextsFile, contexts)

                currentTime = time.time()
                elapsed = currentTime - startTime

                log.progress('Negative sampling: {0:.3f}%. Elapsed: {1}.',
                             batchIndex + 1, batchesCount,
                             log.delta(elapsed))

            log.lineBreak()
            contextsFile.flush()

        os.remove(noNegativeSamplingPath)

    parameters.dumpWordMap(fileIndexMap, fileIndexMapFilePath)
    parameters.dumpWordMap(wordIndexMap, wordIndexMapFilePath)
    parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsFilePath)
def pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList):
    global pruningStepIndex

    originalVocabularyLength = len(wordVocabulary)
    prunedVocabularyLength = min(originalVocabularyLength, maxVocabularySize)

    pruningStepsCount = 0
    if originalVocabularyLength > maxVocabularySize:
        pruningStepsCount += originalVocabularyLength * math.log(originalVocabularyLength)
        pruningStepsCount += prunedVocabularyLength * math.log(prunedVocabularyLength)
    pruningStepsCount += prunedVocabularyLength

    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        # Throttle logging to every 1000th comparison.
        if pruningStepIndex % 1000 == 0:
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = wordVocabulary.items()

    if originalVocabularyLength > maxVocabularySize:
        prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=whiteListPriorityComparator)
        prunedWordVocabulary = prunedWordVocabulary[:maxVocabularySize]

    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        # Throttle logging to every 1000th comparison.
        if pruningStepIndex % 1000 == 0:
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=frequencyComparator)
    prunedWordVocabulary = collections.OrderedDict(prunedWordVocabulary)

    wordIndexMap = {}
    for wordIndex, wordInfo in enumerate(prunedWordVocabulary.items()):
        word, info = wordInfo
        previousIndex, wordFrequency = info
        wordIndexMap[previousIndex] = wordIndex
        prunedWordVocabulary[word] = wordIndex, wordFrequency

        log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)
        pruningStepIndex += 1

    log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepsCount, pruningStepsCount)
    log.lineBreak()

    return prunedWordVocabulary, wordIndexMap
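# The cmp-style comparators above rely on sorted(..., cmp=...), which exists
# only in Python 2. A hedged sketch of the equivalent call via
# functools.cmp_to_key (available since Python 2.7 and required on Python 3,
# where the cmp argument of sorted() was removed):
import functools

def sortWithComparator(wordInfoItems, comparator):
    # comparator is one of the cmp-style functions defined above,
    # e.g. whiteListPriorityComparator or frequencyComparator.
    return sorted(wordInfoItems, key=functools.cmp_to_key(comparator))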
def processData(inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize,
                maxVocabularySize):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + ".unpruned"
    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, "wb+") as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack("i", 0))  # this is a placeholder for contexts count
        unprunedContextsFile.write(struct.pack("i", contextSize))

        pathName = inputDirectoryPath + "/*/*.txt"
        textFilePaths = glob.glob(pathName)[:200]
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)

        startTime = time.time()

        contextFormat = "{0}i".format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            textFileName = os.path.basename(textFilePath)
            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress("Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.",
                         textFileIndex + 1, textFileCount,
                         log.delta(elapsed), secondsPerFile,
                         len(wordVocabulary))

        log.lineBreak()

        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack("i", contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info("Vocabulary has been pruned. {0} items left out of {1}.",
             len(prunedWordVocabulary), originalVocabularyLength)

    with open(unprunedContextsPath, "rb") as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack("i", contextsCount)[0]
        contextSize = struct.unpack("i", contextSize)[0]

        format = "{0}i".format(contextSize)  # plus one spot for file index
        bufferSize = (contextSize) * 4
        prunedContextsCount = 0

        with open(contextsPath, "wb+") as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack("i", 0))  # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack("i", contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                buffer = unprunedContextsFile.read(bufferSize)
                context = struct.unpack(format, buffer)

                fileIndex = context[0]
                indexContext = context[1:]

                if all([index in wordIndexMap for index in indexContext]):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    buffer = struct.pack(format, *context)
                    uncompressedPrunedContexts.write(buffer)

                contextIndex += 1
                contextsPruned = contextIndex - prunedContextsCount + 1

                log.progress("Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.",
                             contextIndex, contextsCount,
                             contextsPruned,
                             float(contextsPruned) * 100 / contextsCount,
                             contextsCount)

            log.lineBreak()

            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack("i", prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
def tsne(X=numpy.array([]), no_dims=2, initial_dims=50, perplexity=30.0, epochs=1000):
    """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions.
    The syntax of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array."""

    # Check inputs
    if X.dtype != "float64":
        print "Error: array X should have type float64."
        return -1
    # if no_dims.__class__ != "<type 'int'>":  # doesn't work yet!
    #     print "Error: number of dimensions should be an integer.";
    #     return -1;

    # Initialize variables
    X = pca(X, initial_dims).real
    (n, d) = X.shape
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    numpy.random.seed(0)
    Y = numpy.random.randn(n, no_dims)
    dY = numpy.zeros((n, no_dims))
    iY = numpy.zeros((n, no_dims))
    gains = numpy.ones((n, no_dims))

    # Compute P-values
    P = x2p(X, 1e-5, perplexity)
    P = P + numpy.transpose(P)
    P = P / numpy.sum(P)
    P = P * 4  # early exaggeration
    P = numpy.maximum(P, 1e-12)

    # Run iterations
    for iter in range(epochs):
        # Compute pairwise affinities
        sum_Y = numpy.sum(numpy.square(Y), 1)
        num = 1 / (1 + numpy.add(numpy.add(-2 * numpy.dot(Y, Y.T), sum_Y).T, sum_Y))
        num[range(n), range(n)] = 0
        Q = num / numpy.sum(num)
        Q = numpy.maximum(Q, 1e-12)

        # Compute gradient
        PQ = P - Q
        for i in range(n):
            dY[i, :] = numpy.sum(numpy.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0)

        # Perform the update
        if iter < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0))
        gains[gains < min_gain] = min_gain
        iY = momentum * iY - eta * (gains * dY)
        Y = Y + iY
        Y = Y - numpy.tile(numpy.mean(Y, 0), (n, 1))

        # Compute current value of cost function
        if (iter + 1) % 10 == 0:
            C = numpy.sum(P * numpy.log(P / Q))
            log.progress('Plotting embeddings: {0:.3f}%. Error: {1:.3f}.', iter + 1, epochs, C)

        # Stop lying about P-values
        if iter == 100:
            P = P / 4

    # Return solution
    return Y
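# A hypothetical usage sketch (not part of the original module): project word
# embeddings to 2D with tsne() above and plot them. matplotlib is an assumption
# here; the original plotting code is not shown in this section.
import matplotlib.pyplot as plt

def plotEmbeddingsExample(embeddings, labels):
    # tsne() requires float64 input and returns an N x no_dims array.
    Y = tsne(embeddings.astype('float64'), no_dims=2, initial_dims=50, perplexity=30.0)

    plt.scatter(Y[:, 0], Y[:, 1])
    for label, (x, y) in zip(labels, Y):
        plt.annotate(label, xy=(x, y))

    plt.show()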