Example #1
    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0
Example #2
def loadWord2VecEmbeddings(filePath):
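    # word2vec binary format: a text header "<count> <size>\n", followed by one
    # entry per word: the word terminated by a space, then <size> 32-bit floats.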
    with open(filePath, 'rb') as file:
        firstLine = file.readline()
        embeddingsCount, embeddingSize = tuple(firstLine.split(' '))
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)
        embeddingFormat = '{0}f'.format(embeddingSize)
        wordIndexMap = {}
        embeddings = []

        log.info('Vocabulary size: {0}. Embedding size: {1}.', embeddingsCount, embeddingSize)

        embeddingIndex = 0
        while True:
            word = ''
            while True:
                char = file.read(1)

                if not char:
                    log.lineBreak()
                    return wordIndexMap, embeddings

                if char == ' ':
                    word = word.strip()
                    break

                word += char

            embedding = struct.unpack(embeddingFormat, file.read(embeddingSize * 4))
            wordIndexMap[word] = len(wordIndexMap)
            embeddings.append(embedding)

            embeddingIndex += 1
            log.progress('Reading embeddings: {0:.3f}%.', embeddingIndex, embeddingsCount)
Example #3
    def test(self, classifier, testData, batchSize=None):
        log.start('Testing classifier')

        inputData, labels = testData

        batchSize = batchSize if batchSize is not None else inputData.shape[0]
        batchesCount = inputData.shape[0] / batchSize + 1
		
        predictions = None
        for batchIndex in xrange(batchesCount):
            inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
			
            if predictions is None:
                predictions = classifier.classify(inputBatch)
            else:
                p = classifier.classify(inputBatch)
                if len(p):
                    predictions = numpy.concatenate([predictions, p])
			
            log.progress('Testing classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

        performance = Case.roc_auc_truncated(labels, predictions)

        testMetrics = TestMetrics(performance)
        log.done(testMetrics)

        return testMetrics
Example #4
def extract(outputDirectoryPath, outputConcatFilePath, connector):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)

    textContainersCount = connector.count()
    log.info('Found {0} text containers.', textContainersCount)

    if os.path.exists(outputConcatFilePath):
        os.remove(outputConcatFilePath)

    pagesCount = 0
    startTime = time.time()

    for textContainerIndex, name, text in connector.iterate():
        text = clean(text)

        outputFilePath = os.path.join(outputDirectoryPath, name + '.txt')

        saveText(outputFilePath, text)
        saveText(outputConcatFilePath, text)

        currentTime = time.time()
        elapsed = currentTime - startTime
        pagesCount += 1

        log.progress('Extracting text containers: {0:.3f}%. Elapsed: {1}. Pages: {2}.',
                     textContainerIndex + 1,
                     textContainersCount,
                     log.delta(elapsed),
                     pagesCount)

    log.lineBreak()
Example #5
def loadWordVocabulary(vocabularyFilePath, loadFrequencies=True):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            frequency = file.read(4)
            frequency = struct.unpack('i', frequency)[0]

            vocabulary[word] = (index, frequency) if loadFrequencies else index

            log.progress('Loading word vocabulary: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.lineBreak()

    return vocabulary
Example #6
def loadWordVocabulary(vocabularyFilePath, loadFrequencies=True):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            frequency = file.read(4)
            frequency = struct.unpack('i', frequency)[0]

            vocabulary[word] = (index, frequency) if loadFrequencies else index

            log.progress('Loading word vocabulary: {0:.3f}%.', itemIndex + 1,
                         itemsCount)

        log.lineBreak()

    return vocabulary
Example #7
def dumpFileVocabulary(vocabulary, vocabularyFilePath):
    if os.path.exists(vocabularyFilePath):
        os.remove(vocabularyFilePath)

    itemsCount = len(vocabulary)
    itemIndex = 0

    with open(vocabularyFilePath, 'w') as file:
        file.write(struct.pack('i', itemsCount))

        for key, index in vocabulary.items():
            keyLength = len(key)
            keyLength = struct.pack('i', keyLength)
            index = struct.pack('i', index)

            file.write(keyLength)
            file.write(key)
            file.write(index)

            itemIndex += 1
            log.progress('Dumping file vocabulary: {0:.3f}%.', itemIndex,
                         itemsCount)

        file.flush()

        log.lineBreak()
Example #8
def dumpFileVocabulary(vocabulary, vocabularyFilePath):
    if os.path.exists(vocabularyFilePath):
        os.remove(vocabularyFilePath)

    itemsCount = len(vocabulary)
    itemIndex = 0

    with open(vocabularyFilePath, 'w') as file:
        file.write(struct.pack('i', itemsCount))

        for key, index in vocabulary.items():
            keyLength = len(key)
            keyLength = struct.pack('i', keyLength)
            index = struct.pack('i', index)

            file.write(keyLength)
            file.write(key)
            file.write(index)

            itemIndex += 1
            log.progress('Dumping file vocabulary: {0:.3f}%.', itemIndex, itemsCount)

        file.flush()

        log.lineBreak()
Example #9
    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0
Example #10
def make():
    words = []

    words += getSyntacticWordRelationsWords(
        'res/Syntactic-Word-Relations/questions-words.txt')
    words += getSATWords('res/SAT-Questions/SAT-package-V3.txt')
    words += getSimLex999Words('res/SimLex-999/SimLex-999.txt')
    words += getWordSimilarity353Words('res/WordSimilarity-353/combined.csv')
    words += getRubensteinGoodenoughWords('res/RG/EN-RG-65.txt')

    words = list(set(words))
    words = sorted(words)

    log.info('Found {0} words.', len(words))

    whiteListPath = 'res/Tools/white_list.txt'
    if os.path.exists(whiteListPath):
        os.remove(whiteListPath)

    with open(whiteListPath, 'w+') as whiteListFile:
        batchSize = 10
        batchesCount = len(words) / batchSize + 1
        for batchIndex in xrange(0, batchesCount):
            batch = words[batchIndex * batchSize:(batchIndex + 1) * batchSize]
            line = ' '.join(batch) + '\n'
            line = line.lower()

            whiteListFile.write(line)

            log.progress('Saving white list: {0:.0f}%.', batchIndex + 1,
                         batchesCount)

    log.lineBreak()
    log.info('White list has been saved.')
Example #11
def buildWordMaps(texts, w2vWordIndexMap, w2vWordEmbeddings):
    wordIndexMap = collections.OrderedDict()
    wordFrequencyMap = collections.OrderedDict()

    for textIndex, text in enumerate(texts):
        for word in weeding.iterateWords(text):
            if word not in w2vWordIndexMap:
                continue

            if word not in wordIndexMap:
                wordIndexMap[word] = len(wordIndexMap)
                wordFrequencyMap[word] = 1
            else:
                wordFrequencyMap[word] += 1

        log.progress('Building word maps: {0:.3f}%. Words: {1}.', textIndex + 1, len(texts), len(wordIndexMap))

    log.lineBreak()

    wordEmbeddings = numpy.zeros((len(wordIndexMap), w2vWordEmbeddings.shape[1]))
    for word, index in wordIndexMap.items():
        # copy each word's embedding from the w2v matrix by its original w2v index
        wordEmbeddings[index] = w2vWordEmbeddings[w2vWordIndexMap[word]]

        log.progress('Copying w2v embeddings: {0:.3f}%.', index + 1, len(wordIndexMap))

    log.lineBreak()

    return wordIndexMap, wordFrequencyMap, wordEmbeddings
Example #12
def make():
    words = []

    words += getSyntacticWordRelationsWords('res/Syntactic-Word-Relations/questions-words.txt')
    words += getSATWords('res/SAT-Questions/SAT-package-V3.txt')
    words += getSimLex999Words('res/SimLex-999/SimLex-999.txt')
    words += getWordSimilarity353Words('res/WordSimilarity-353/combined.csv')
    words += getRubensteinGoodenoughWords('res/RG/EN-RG-65.txt')

    words = list(set(words))
    words = sorted(words)

    log.info('Found {0} words.', len(words))

    whiteListPath = 'res/Tools/white_list.txt'
    if os.path.exists(whiteListPath):
        os.remove(whiteListPath)

    with open(whiteListPath, 'w+') as whiteListFile:
        batchSize = 10
        batchesCount = len(words) / batchSize + 1
        for batchIndex in xrange(0, batchesCount):
            batch = words[batchIndex * batchSize:(batchIndex + 1) * batchSize]
            line = ' '.join(batch) + '\n'
            line = line.lower()

            whiteListFile.write(line)

            log.progress('Saving white list: {0:.0f}%.', batchIndex + 1, batchesCount)

    log.lineBreak()
    log.info('White list has been saved.')
Example #13
def trainTextVectors(connector, w2vEmbeddingsPath, wordIndexMapPath, wordFrequencyMapPath, wordEmbeddingsPath, contextsPath,
                     sample, minCount, windowSize, negative, strict, contextsPerText, superBatchSize, fileEmbeddingSize,
                     epochs, learningRate, fileEmbeddingsPath):
    if exists(wordIndexMapPath) and exists(wordFrequencyMapPath) and exists(wordEmbeddingsPath) \
            and exists(contextsPath) and exists(pathTo.textIndexMap):
        wordIndexMap = parameters.loadMap(wordIndexMapPath)
        wordFrequencyMap = parameters.loadMap(wordFrequencyMapPath)
        wordEmbeddings = parameters.loadEmbeddings(wordEmbeddingsPath)
        textIndexMap = parameters.loadMap(pathTo.textIndexMap)
    else:
        w2vWordIndexMap, w2vWordEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsPath)

        names, texts = extract(connector)
        wordIndexMap, wordFrequencyMap, wordEmbeddings = buildWordMaps(texts, w2vWordIndexMap, w2vWordEmbeddings)

        parameters.dumpWordMap(wordIndexMap, wordIndexMapPath)
        del w2vWordIndexMap
        del w2vWordEmbeddings
        gc.collect()

        parameters.dumpWordMap(wordFrequencyMap, wordFrequencyMapPath)

        log.progress('Dumping word embeddings...')
        parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsPath)
        log.info('Dumped indices, frequencies and embeddings')

        texts = subsampleAndPrune(texts, wordFrequencyMap, sample, minCount)

        textIndexMap = inferContexts(contextsPath, names, texts, wordIndexMap, windowSize, negative, strict, contextsPerText)

        parameters.dumpWordMap(textIndexMap, pathTo.textIndexMap)

    with h5py.File(contextsPath, 'r') as contextsFile:
        contexts = contextsFile['contexts']
        log.info('Loaded {0} contexts. Shape: {1}', len(contexts), contexts.shape)

        fileEmbeddings = numpy.random.rand(len(contexts), fileEmbeddingSize).astype('float32')
        trainingBatch = numpy.zeros((superBatchSize, contextsPerText, 1+windowSize+negative)).astype('int32')
        superBatchesCount = len(contexts) / superBatchSize

        for superBatchIndex in xrange(0, superBatchesCount):
            log.info('Text batch: {0}/{1}.', superBatchIndex + 1, superBatchesCount)

            # TODO: this only works if superBatchSize == textsCount; otherwise text indices do not match
            contexts.read_direct(trainingBatch, source_sel=numpy.s_[superBatchIndex*superBatchSize:(superBatchIndex+1)*superBatchSize])
            trainingBatchReshaped = trainingBatch.reshape((superBatchSize*contextsPerText, 1+windowSize+negative))

            fileEmbeddingsBatch = fileEmbeddings[superBatchIndex*superBatchSize:(superBatchIndex+1)*superBatchSize]

            model = traininig.Model(fileEmbeddingsBatch, wordEmbeddings, contextSize=windowSize-2, negative=negative)
            traininig.train(model, textIndexMap, wordIndexMap, wordEmbeddings, trainingBatchReshaped, epochs, 1, learningRate)

            fileEmbeddings[superBatchIndex*superBatchSize:(superBatchIndex+1)*superBatchSize] = model.fileEmbeddings.get_value()
            contextsFile.flush()

        log.progress('Dumping text embeddings...')
        binary.dumpTensor(fileEmbeddingsPath, fileEmbeddings)
        log.info('Dumping text embeddings complete')
Example #14
def trainModel(fileVocabulary, wordVocabulary, contextProvider, model,
               superBatchSize, miniBatchSize, parametersPath, embeddingsPath,
               learningRate, l1Coefficient, l2Coefficient, epochs,
               metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    superBatchesCount = contextProvider.contextsCount / superBatchSize + 1
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            contextSuperBatch = contextProvider[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            fileIndices, wordIndices, targetWordIndices = \
                contextSuperBatch[:, 1], contextSuperBatch[:, 1:-1], contextSuperBatch[:, -1]

            model.train(wordIndices, targetWordIndices, miniBatchSize,
                        learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary,
                                          model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics,
                            **customMetrics)
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics)

            if previousTotal < sum(metrics):
                model.dump(parametersPath, embeddingsPath)
                previousTotal = sum(metrics)  # remember the best metrics total seen so far

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress(
                'Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                epoch + 1, epochs, log.delta(elapsed), epoch, secondsPerEpoch,
                rg, sim353, simLex999, syntRel, sat,
                customMetrics['simGemJewel'])

    log.lineBreak()

    return model
Example #15
def loadWordRequencyMap(indexMapFilePath):
    wordRequencyMap = loadMap(indexMapFilePath)

    log.progress('Sorting word frequency map...', 1, 1)

    wordRequencyMap = sorted(wordRequencyMap.items(), key=lambda item: item[1], reverse=True)
    wordRequencyMap = collections.OrderedDict(wordRequencyMap)

    log.progress('Sorting word frequency map complete.', 1, 1)
    log.lineBreak()

    return wordRequencyMap
Example #16
def train(model, fileIndexMap, wordIndexMap, wordEmbeddings, contexts,
          epochs, batchSize, learningRate, metricsPath=None, pathTo=None):
    model.trainingContexts.set_value(contexts)

    contextsCount, contextSize = contexts.shape

    initialiLearningRate = learningRate
    startTime = time.time()
    metrics = {
        'meanError': np.nan,
        'medianError': np.nan,
        'maxError': np.nan,
        'minError': np.nan,
        'learningRate': learningRate
    }

    maxError = None

    for epoch in xrange(0, epochs):
        errors = []
        for contextIndex in xrange(0, contextsCount):
            error = model.trainModel(contextIndex, learningRate)
            errors.append(error)

            log.progress('Training model: {0:.3f}%. Epoch: {1}. Elapsed: {2}. Error(mean,median,min,max): {3:.3f}, {4:.3f}, {5:.3f}, {6:.3f}. Learning rate: {7}.',
                     epoch * contextsCount + contextIndex + 1,
                     epochs * contextsCount,
                     epoch + 1,
                     log.delta(time.time() - startTime),
                     metrics['meanError'],
                     metrics['medianError'],
                     metrics['minError'],
                     metrics['maxError'],
                     learningRate)

        learningRate = learningRate * (1 - (float(epoch) + 1) / epochs)
        learningRate = max(initialiLearningRate * 0.0001, learningRate)

        metrics = {
            'meanError': np.mean(errors),
            'medianError': np.median(errors),
            'maxError': np.max(errors),
            'minError': np.min(errors),
            'learningRate': learningRate
        }

        if pathTo is not None and (maxError is None or maxError > metrics['maxError']):
            model.dump(pathTo.fileEmbeddings, pathTo.weights)
            maxError = metrics['maxError']

        if metricsPath is not None:
            validation.dump(metricsPath, epoch, metrics)
Example #17
def extract(connector):
    textFilesCount = connector.count()

    names = []
    texts = []
    for textFileIndex, name, text in connector.iterate():
        text = extraction.clean(text)

        names.append(name)
        texts.append(text)

        log.progress('Extracting text: {0:.3f}%. Texts: {1}.', textFileIndex + 1, textFilesCount, textFileIndex + 1)

    log.lineBreak()

    return names, texts
Example #18
def loadEmbeddings(embeddingsFilePath):
    with open(embeddingsFilePath, 'rb') as embeddingsFile:
        embeddingsCount = binary.readi(embeddingsFile)
        embeddingSize = binary.readi(embeddingsFile)

        embeddings = numpy.empty((embeddingsCount, embeddingSize)).astype('float32')

        for embeddingIndex in range(0, embeddingsCount):
            embedding = binary.readf(embeddingsFile, embeddingSize)
            embeddings[embeddingIndex] = embedding

            log.progress('Loading embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

        log.info('Loading embeddings complete. {0} embeddings loaded.', embeddingsCount)

        return embeddings
Example #19
def inferContexts(contextsPath, names, texts, wordIndexMap, windowSize, negative, strict, contextsCount):
    textIndexMap = collections.OrderedDict()

    def wordsToIndices(textContext):
        indices = map(lambda word: wordIndexMap[word], textContext)
        return indices

    wordIndices = map(lambda item: item[1], wordIndexMap.items())
    wordIndices = numpy.asarray(wordIndices)
    maxWordIndex = max(wordIndices)

    with h5py.File(contextsPath, 'w') as contextsFile:
        tensor = contextsFile.create_dataset('contexts',
                                             dtype='int32',
                                             shape=(0, contextsCount, 1 + windowSize + negative), # 1 for file index
                                             maxshape=(None, contextsCount, 1 + windowSize + negative), # 1 for file index
                                             chunks=(1, contextsCount, 1 + windowSize + negative)) # 1 for file index

        textsCount = 0
        for name, text in zip(names, texts):
            contextProvider = processing.WordContextProvider(text=text, minContexts=contextsCount, maxContexts=contextsCount)
            contexts = list(contextProvider.iterate(windowSize))

            if len(contexts) > 0:
                contexts = map(wordsToIndices, contexts)
                textIndexMap[name] = len(textIndexMap)
                contexts = numpy.asarray(contexts)
                textIndices = [[textIndexMap[name]]] * len(contexts)
                contexts = numpy.concatenate([textIndices, contexts], axis=1)

                negativeSamples = processing.generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                tensor.resize(tensor.shape[0] + 1, axis=0)
                tensor[-1] = contexts

            textsCount += 1
            log.progress('Creating contexts: {0:.3f}%. Text index map: {1}. Contexts: {2}.',
                         textsCount,
                         len(texts),
                         len(tensor),
                         tensor.shape[0] * tensor.shape[1])

    log.lineBreak()

    return textIndexMap
Example #20
    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0
Example #21
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info("Output directory {0} has been removed.", outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info("Output directory {0} has been created.", outputDirectoryPath)

    pathName = inputDirectoryPath + "/*wiki*.txt.gz"
    dumpPaths = glob.glob(pathName)[:10]
    dumpsCount = len(dumpPaths)
    log.info("Found {0} Wikipedia dumps.", dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress(
            "Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)",
            dumpIndex + 1,
            dumpsCount,
            dumpName,
            len(pages),
            log.delta(elapsed),
            secondsPerFile,
        )

    log.lineBreak()
    log.info("Processing complete.")
Example #22
    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0
Example #23
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info('Output directory {0} has been removed.', outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info('Output directory {0} has been created.', outputDirectoryPath)

    pathName = inputDirectoryPath + '/*wiki*.txt.gz'
    dumpPaths = glob.glob(pathName)[:10]
    dumpsCount = len(dumpPaths)
    log.info('Found {0} Wikipedia dumps.', dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress('Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)',
                     dumpIndex + 1,
                     dumpsCount,
                     dumpName,
                     len(pages),
                     log.delta(elapsed),
                     secondsPerFile)

    log.lineBreak()
    log.info('Processing complete.')
Example #24
def loadW2VParameters(filePath, loadEmbeddings=True):
    with open(filePath, 'rb') as w2vFile:
        firstLine = w2vFile.readline()
        embeddingsCount, embeddingSize = tuple(firstLine.split(' '))
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)
        wordIndexMap = collections.OrderedDict()
        embeddings = numpy.zeros((embeddingsCount, embeddingSize))

        embeddingIndex = 0
        while True:
            word = ''
            while True:
                char = w2vFile.read(1)

                if not char:
                    log.lineBreak()

                    if loadEmbeddings:
                        return wordIndexMap, embeddings
                    else:
                        return wordIndexMap

                if char == ' ':
                    word = word.strip()
                    break

                word += char

            wordIndexMap[word] = len(wordIndexMap)
            if loadEmbeddings:
                embedding = binary.readf(w2vFile, embeddingSize)
                embeddings[wordIndexMap[word]] = embedding
            else:
                w2vFile.seek(embeddingSize * 4, io.SEEK_CUR)

            embeddingIndex += 1
            log.progress('Loading W2V embeddings: {0:.3f}%. {1} embeddings {2} features each.',
                         embeddingIndex,
                         embeddingsCount,
                         embeddingIndex,
                         embeddingSize)
Example #25
def subsampleAndPrune(texts, wordFrequencyMap, sample, minCount):
    totalLength = 0.
    prunedLength = 0.

    maxFrequency = wordFrequencyMap.items()[0][1]

    for textIndex, text in enumerate(texts):
        totalLength += len(text)

        texts[textIndex] = weeding.subsampleAndPrune(text, wordFrequencyMap, maxFrequency, sample, minCount)

        prunedLength += len(texts[textIndex])

        log.progress('Subsampling and pruning text: {0:.3f}%. Removed {1:.3f}% of original text.',
                     textIndex + 1,
                     len(texts),
                     (1 - prunedLength/totalLength) * 100)

    log.lineBreak()

    return texts
Example #26
def dumpEmbeddings(embeddings, embeddingsFilePath):
    if os.path.exists(embeddingsFilePath):
        os.remove(embeddingsFilePath)

    if not isinstance(embeddings, numpy.ndarray):
        embeddings = numpy.asarray(embeddings)

    embeddingsCount, embeddingSize = embeddings.shape

    with open(embeddingsFilePath, 'w') as embeddingsFile:
        binary.writei(embeddingsFile, embeddingsCount)
        binary.writei(embeddingsFile, embeddingSize)

        for embeddingIndex in range(0, embeddingsCount):
            embedding = embeddings[embeddingIndex]

            binary.writef(embeddingsFile, embedding)

            log.progress('Dumping embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

        log.lineBreak()
Example #27
def loadMap(indexMapFilePath, inverse=False):
    vocabulary = collections.OrderedDict()

    with open(indexMapFilePath, 'rb') as indexMapFile:
        itemsCount = binary.readi(indexMapFile)

        for itemIndex in range(0, itemsCount):
            wordLength = binary.readi(indexMapFile)
            word = binary.reads(indexMapFile, wordLength)
            index = binary.readi(indexMapFile)

            if inverse:
                vocabulary[index] = word
            else:
                vocabulary[word] = index

            log.progress('Loading word map: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.info('Loading word map complete. {0} words loaded.', itemsCount)

    return vocabulary
Example #28
def trainModel(fileVocabulary, wordVocabulary, contextProvider, model, superBatchSize, miniBatchSize, parametersPath, embeddingsPath, learningRate, l1Coefficient, l2Coefficient, epochs, metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    superBatchesCount = contextProvider.contextsCount / superBatchSize + 1
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            contextSuperBatch = contextProvider[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            fileIndices, wordIndices, targetWordIndices = contextSuperBatch[:,1], contextSuperBatch[:,1:-1], contextSuperBatch[:,-1]

            model.train(wordIndices, targetWordIndices, miniBatchSize, learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary, model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics, **customMetrics)
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics)

            if previousTotal < sum(metrics):
                model.dump(parametersPath, embeddingsPath)
                previousTotal = sum(metrics)  # remember the best metrics total seen so far

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress('Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                         epoch + 1, epochs, log.delta(elapsed), epoch, secondsPerEpoch,
                         rg, sim353, simLex999, syntRel, sat,
                         customMetrics['simGemJewel'])

    log.lineBreak()

    return model
Example #29
    def createSubmission(self, classifier, testData, batchSize=None):
        log.start('Creating submission')
		
        batchSize = batchSize if batchSize is not None else testData.shape[0]
        batchesCount = testData.shape[0] / batchSize + 1
		
        predictions = None
        for batchIndex in xrange(batchesCount):
            inputBatch = testData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
			
            if predictions is None:
                predictions = classifier.classify(inputBatch)
            elif len(inputBatch):
                predictions = numpy.concatenate([predictions, classifier.classify(inputBatch)])
			
            log.progress('Creating submission: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

        submission = pandas.DataFrame({"id": self.testData["id"], "prediction": predictions})

        log.done('submission' + str(submission.shape))

        return submission
Example #30
def loadFileVocabulary(vocabularyFilePath):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            vocabulary[word] = index

            log.progress('Loading file vocabulary: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.lineBreak()

    return vocabulary
Example #31
def dumpWordMap(indexMap, indexMapFilePath):
    if os.path.exists(indexMapFilePath):
        os.remove(indexMapFilePath)

    with open(indexMapFilePath, 'w') as indexMapFile:
        indexMapSize = len(indexMap)
        itemIndex = 0

        binary.writei(indexMapFile, indexMapSize)

        for key, index in indexMap.items():
            keyLength = len(key)

            binary.writei(indexMapFile, keyLength)
            binary.writes(indexMapFile, key)
            binary.writei(indexMapFile, index)

            itemIndex += 1
            log.progress('Dumping map: {0:.3f}%.', itemIndex, indexMapSize)

        indexMapFile.flush()

        log.lineBreak()
Example #32
    def innerExecute(self, selector):
        words = []
        embeddings = []

        for operator, word in selector.operands:
            embedding = self.wordEmbeddings[self.wordIndexMap[word]]
            if operator == '-':
                embedding = embedding * (-1)

            words.append(word)
            embeddings.append(embedding)

        minIndex, maxIndex = ExplainFunction.getSurroundingIndices(words, self.wordIndexMap, 5000)

        # sum the operand embeddings into a copy so the stored word embeddings are not mutated in place
        result = embeddings[0].copy()
        for embedding in embeddings[1:]:
            result += embedding

        scores = []
        for index in xrange(minIndex, maxIndex):
            word = self.wordIndexItems[index][0]

            if word not in words:
                embedding = self.wordEmbeddings[index]
                score = vectors.cosineSimilarity(result, embedding)
                scores.append((word, score))

            log.progress('Looking for closest matches: {0:.3f}%.',
                         index - minIndex + 1,
                         maxIndex - minIndex)

        log.lineBreak()

        scores = sorted(scores, key=lambda s: s[1], reverse=True)

        for score in scores[:10]:
            print score
Example #33
def loadFileVocabulary(vocabularyFilePath):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            vocabulary[word] = index

            log.progress('Loading file vocabulary: {0:.3f}%.', itemIndex + 1,
                         itemsCount)

        log.lineBreak()

    return vocabulary
Example #34
    def train(classifier, trainingData, validationData, batchSize=None):
        log.start('Training classifier')

        inputData, labels = trainingData

        batchSize = batchSize if batchSize is not None else inputData.shape[0]
        batchesCount = inputData.shape[0] / batchSize

        start = time.time()

        for batchIndex in xrange(batchesCount):
            inputBatch = inputData[batchIndex * batchSize:(batchIndex + 1) * batchSize]
            labelsBatch = labels[batchIndex * batchSize:(batchIndex + 1) * batchSize]

            classifier.fit(inputBatch, labelsBatch)
            log.progress('Training classifier: {0}%'.format((batchIndex + 1) * 100 / batchesCount))

        end = time.time()
        elapsed = end - start

        trainingMetrics = TrainingMetrics(elapsed)
        log.done(trainingMetrics)

        return trainingMetrics
Example #35
def processData(inputDirectoryPath, w2vEmbeddingsFilePath, fileIndexMapFilePath,
                wordIndexMapFilePath, wordEmbeddingsFilePath, contextsPath, windowSize, negative, strict):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = windowSize - fileContextSize

    fileIndexMap = {}
    wordIndexMap = collections.OrderedDict()
    wordEmbeddings = []

    noNegativeSamplingPath = contextsPath
    if negative > 0:
        noNegativeSamplingPath += '.temp'

    if os.path.exists(noNegativeSamplingPath):
        os.remove(noNegativeSamplingPath)

    pathName = inputDirectoryPath + '/*.txt'
    textFilePaths = glob.glob(pathName)
    textFilePaths = sorted(textFilePaths)
    textFileCount = len(textFilePaths)

    w2vWordIndexMap, w2vEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsFilePath)

    contextsCount = 0
    with open(noNegativeSamplingPath, 'wb+') as noNegativeSamplingFile:
        binary.writei(noNegativeSamplingFile, 0) # this is a placeholder for contexts count
        binary.writei(noNegativeSamplingFile, windowSize)
        binary.writei(noNegativeSamplingFile, 0) # presumably the negative samples count (none in this intermediate file)

        startTime = time.time()

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileIndexMap[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath=textFilePath)
            for wordContext in contextProvider.iterate(wordContextSize):
                allWordsInWordVocabulary = [word in w2vWordIndexMap for word in wordContext]

                if not all(allWordsInWordVocabulary):
                    continue

                for word in wordContext:
                    if word not in wordIndexMap:
                        wordIndexMap[word] = len(wordIndexMap)
                        wordEmbeddingIndex = w2vWordIndexMap[word]
                        wordEmbedding = w2vEmbeddings[wordEmbeddingIndex]
                        wordEmbeddings.append(wordEmbedding)

                indexContext = [textFileIndex] + map(lambda w: wordIndexMap[w], wordContext)

                binary.writei(noNegativeSamplingFile, indexContext)
                contextsCount += 1

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Words: {3}. Contexts: {4}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordIndexMap),
                         contextsCount)

        log.lineBreak()

        noNegativeSamplingFile.seek(0, io.SEEK_SET)
        binary.writei(noNegativeSamplingFile, contextsCount)
        noNegativeSamplingFile.flush()

    if negative > 0:
        with open(contextsPath, 'wb+') as contextsFile:
            startTime = time.time()

            contextProvider = parameters.IndexContextProvider(noNegativeSamplingPath)

            binary.writei(contextsFile, contextsCount)
            binary.writei(contextsFile, windowSize)
            binary.writei(contextsFile, negative)

            batchSize = 10000
            batchesCount = contextsCount / batchSize + 1

            wordIndices = map(lambda item: item[1], wordIndexMap.items())
            wordIndices = numpy.asarray(wordIndices)
            maxWordIndex = max(wordIndices)

            for batchIndex in xrange(0, batchesCount):
                contexts = contextProvider[batchIndex * batchSize : (batchIndex + 1) * batchSize]
                negativeSamples = generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                contexts = numpy.ravel(contexts)

                binary.writei(contextsFile, contexts)

                currentTime = time.time()
                elapsed = currentTime - startTime

                log.progress('Negative sampling: {0:.3f}%. Elapsed: {1}.',
                     batchIndex + 1,
                     batchesCount,
                     log.delta(elapsed))

            log.lineBreak()
            contextsFile.flush()

            os.remove(noNegativeSamplingPath)

    parameters.dumpWordMap(fileIndexMap, fileIndexMapFilePath)
    parameters.dumpWordMap(wordIndexMap, wordIndexMapFilePath)
    parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsFilePath)
Example #36
def pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList):
    global pruningStepIndex

    originalVocabularyLength = len(wordVocabulary)
    prunedVocabularyLength = min(originalVocabularyLength, maxVocabularySize)

    pruningStepsCount = 0
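    # Rough upper bound on progress steps: an n*log(n) sort of the full vocabulary
    # (only needed when it exceeds maxVocabularySize), an n*log(n) sort of the
    # pruned vocabulary, and a final reindexing pass.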
    if originalVocabularyLength > maxVocabularySize:
        pruningStepsCount += originalVocabularyLength * math.log(originalVocabularyLength)
    pruningStepsCount += prunedVocabularyLength * math.log(prunedVocabularyLength)
    pruningStepsCount += prunedVocabularyLength

    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = wordVocabulary.items()

    if originalVocabularyLength > maxVocabularySize:
        prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=whiteListPriorityComparator)
        prunedWordVocabulary = prunedWordVocabulary[:maxVocabularySize]

    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=frequencyComparator)
    prunedWordVocabulary = collections.OrderedDict(prunedWordVocabulary)

    wordIndexMap = {}
    for wordIndex, wordInfo in enumerate(prunedWordVocabulary.items()):
        word, info = wordInfo
        previousIndex, wordFrequency = info
        wordIndexMap[previousIndex] = wordIndex

        prunedWordVocabulary[word] = wordIndex, wordFrequency

        log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)
        pruningStepIndex += 1

    log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepsCount, pruningStepsCount)
    log.lineBreak()

    return prunedWordVocabulary, wordIndexMap
Example #37
def processData(
    inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize, maxVocabularySize
):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + ".unpruned"

    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, "wb+") as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack("i", 0))  # this is a placeholder for contexts count
        unprunedContextsFile.write(struct.pack("i", contextSize))

        pathName = inputDirectoryPath + "/*/*.txt"
        textFilePaths = glob.glob(pathName)[:200]
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)
        startTime = time.time()

        contextFormat = "{0}i".format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            textFileName = os.path.basename(textFilePath)
            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress(
                "Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.",
                textFileIndex + 1,
                textFileCount,
                log.delta(elapsed),
                secondsPerFile,
                len(wordVocabulary),
            )

        log.lineBreak()

        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack("i", contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info(
        "Vocabulary has been pruned. {0} items left out of {1}.", len(prunedWordVocabulary), originalVocabularyLength
    )

    with open(unprunedContextsPath, "rb") as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack("i", contextsCount)[0]
        contextSize = struct.unpack("i", contextSize)[0]

        format = "{0}i".format(contextSize)  # plus one spot for file index
        bufferSize = (contextSize) * 4
        prunedContextsCount = 0
        with open(contextsPath, "wb+") as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack("i", 0))  # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack("i", contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                buffer = unprunedContextsFile.read(bufferSize)

                context = struct.unpack(format, buffer)
                fileIndex = context[0]
                indexContext = context[1:]

                if all([index in wordIndexMap for index in indexContext]):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    buffer = struct.pack(format, *context)
                    uncompressedPrunedContexts.write(buffer)

                contextIndex += 1
                contextsPruned = contextIndex - prunedContextsCount
                log.progress(
                    "Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.",
                    contextIndex,
                    contextsCount,
                    contextsPruned,
                    float(contextsPruned) * 100 / contextsCount,
                    contextsCount,
                )

            log.lineBreak()

            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack("i", prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
Example #38
def pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList):
    global pruningStepIndex

    originalVocabularyLength = len(wordVocabulary)
    prunedVocabularyLength = min(originalVocabularyLength, maxVocabularySize)

    pruningStepsCount = 0
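    # Rough upper bound on progress steps: an n*log(n) sort of the full vocabulary
    # (only needed when it exceeds maxVocabularySize), an n*log(n) sort of the
    # pruned vocabulary, and a final reindexing pass.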
    if originalVocabularyLength > maxVocabularySize:
        pruningStepsCount += originalVocabularyLength * math.log(originalVocabularyLength)
    pruningStepsCount += prunedVocabularyLength * math.log(prunedVocabularyLength)
    pruningStepsCount += prunedVocabularyLength

    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = wordVocabulary.items()

    if originalVocabularyLength > maxVocabularySize:
        prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=whiteListPriorityComparator)
        prunedWordVocabulary = prunedWordVocabulary[:maxVocabularySize]

    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:
            log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=frequencyComparator)
    prunedWordVocabulary = collections.OrderedDict(prunedWordVocabulary)

    wordIndexMap = {}
    for wordIndex, wordInfo in enumerate(prunedWordVocabulary.items()):
        word, info = wordInfo
        previousIndex, wordFrequency = info
        wordIndexMap[previousIndex] = wordIndex

        prunedWordVocabulary[word] = wordIndex, wordFrequency

        log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)
        pruningStepIndex += 1

    log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepsCount, pruningStepsCount)
    log.lineBreak()

    return prunedWordVocabulary, wordIndexMap
Example #39
def processData(inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize, maxVocabularySize):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + '.unpruned'

    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, 'wb+') as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack('i', 0)) # this is a placeholder for contexts count
        unprunedContextsFile.write(struct.pack('i', contextSize))

        pathName = inputDirectoryPath + '/*/*.txt'
        textFilePaths = glob.glob(pathName)[:200]
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)
        startTime = time.time()

        contextFormat = '{0}i'.format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            textFileName = os.path.basename(textFilePath)
            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordVocabulary))

        log.lineBreak()

        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack('i', contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info('Vocabulary has been pruned. {0} items left out of {1}.', len(prunedWordVocabulary), originalVocabularyLength)

    with open(unprunedContextsPath, 'rb') as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack('i', contextsCount)[0]
        contextSize = struct.unpack('i', contextSize)[0]

        format = '{0}i'.format(contextSize) # the context size read from the file already includes the file-index spot
        bufferSize = (contextSize) * 4
        prunedContextsCount = 0
        with open(contextsPath, 'wb+') as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack('i', 0)) # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack('i', contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                buffer = unprunedContextsFile.read(bufferSize)

                context = struct.unpack(format, buffer)
                fileIndex = context[0]
                indexContext = context[1:]

                if all([index in wordIndexMap for index in indexContext]):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    buffer = struct.pack(format, *context)
                    uncompressedPrunedContexts.write(buffer)

                contextIndex += 1
                contextsPruned = contextIndex - prunedContextsCount
                log.progress('Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.',
                             contextIndex,
                             contextsCount,
                             contextsPruned,
                             float(contextsPruned) * 100 / contextsCount,
                             contextsCount)

            log.lineBreak()

            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack('i', prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
Example #40
def tsne(X=numpy.array([]),
         no_dims=2,
         initial_dims=50,
         perplexity=30.0,
         epochs=1000):
    """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions.
    The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array."""

    # Check inputs
    if X.dtype != "float64":
        print "Error: array X should have type float64."
        return -1
        # if no_dims.__class__ != "<type 'int'>":			# doesn't work yet!
    #	print "Error: number of dimensions should be an integer.";
    #	return -1;

    # Initialize variables
    X = pca(X, initial_dims).real
    (n, d) = X.shape
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    numpy.random.seed(0)
    Y = numpy.random.randn(n, no_dims)
    dY = numpy.zeros((n, no_dims))
    iY = numpy.zeros((n, no_dims))
    gains = numpy.ones((n, no_dims))

    # Compute P-values
    P = x2p(X, 1e-5, perplexity)
    P = P + numpy.transpose(P)
    P = P / numpy.sum(P)
    P = P * 4  # early exaggeration
    P = numpy.maximum(P, 1e-12)

    # Run iterations
    for iter in range(epochs):

        # Compute pairwise affinities
        sum_Y = numpy.sum(numpy.square(Y), 1)
        num = 1 / (
            1 + numpy.add(numpy.add(-2 * numpy.dot(Y, Y.T), sum_Y).T, sum_Y))
        num[range(n), range(n)] = 0
        Q = num / numpy.sum(num)
        Q = numpy.maximum(Q, 1e-12)

        # Compute gradient
        PQ = P - Q
        for i in range(n):
            dY[i, :] = numpy.sum(
                numpy.tile(PQ[:, i] * num[:, i],
                           (no_dims, 1)).T * (Y[i, :] - Y), 0)

        # Perform the update
        if iter < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * (
            (dY > 0) == (iY > 0))
        gains[gains < min_gain] = min_gain
        iY = momentum * iY - eta * (gains * dY)
        Y = Y + iY
        Y = Y - numpy.tile(numpy.mean(Y, 0), (n, 1))

        # Compute current value of cost function
        if (iter + 1) % 10 == 0:
            C = numpy.sum(P * numpy.log(P / Q))
            log.progress('Plotting embeddings: {0:.3f}%. Error: {1:.3f}.',
                         iter + 1, epochs, C)

        # Stop lying about P-values: end the early exaggeration applied above
        if iter == 100:
            P = P / 4

    # Return solution
    return Y
Example #41
def tsne(X=numpy.array([]), no_dims=2, initial_dims=50, perplexity=30.0, epochs=1000):
    """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions.
    The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array."""

    # Check inputs
    if X.dtype != "float64":
        print "Error: array X should have type float64."
        return -1
        # if no_dims.__class__ != "<type 'int'>":			# doesn't work yet!
    #	print "Error: number of dimensions should be an integer.";
    #	return -1;

    # Initialize variables
    X = pca(X, initial_dims).real
    (n, d) = X.shape
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    numpy.random.seed(0)
    Y = numpy.random.randn(n, no_dims)
    dY = numpy.zeros((n, no_dims))
    iY = numpy.zeros((n, no_dims))
    gains = numpy.ones((n, no_dims))

    # Compute P-values
    P = x2p(X, 1e-5, perplexity)
    P = P + numpy.transpose(P)
    P = P / numpy.sum(P)
    P = P * 4  # early exaggeration
    P = numpy.maximum(P, 1e-12)

    # Run iterations
    for iter in range(epochs):

        # Compute pairwise affinities
        sum_Y = numpy.sum(numpy.square(Y), 1)
        num = 1 / (1 + numpy.add(numpy.add(-2 * numpy.dot(Y, Y.T), sum_Y).T, sum_Y))
        num[range(n), range(n)] = 0
        Q = num / numpy.sum(num)
        Q = numpy.maximum(Q, 1e-12)

        # Compute gradient
        PQ = P - Q
        for i in range(n):
            dY[i, :] = numpy.sum(numpy.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0)

        # Perform the update
        if iter < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0))
        gains[gains < min_gain] = min_gain
        iY = momentum * iY - eta * (gains * dY)
        Y = Y + iY
        Y = Y - numpy.tile(numpy.mean(Y, 0), (n, 1))

        # Compute current value of cost function
        if (iter + 1) % 10 == 0:
            C = numpy.sum(P * numpy.log(P / Q))
            log.progress('Plotting embeddings: {0:.3f}%. Error: {1:.3f}.',
                         iter + 1, epochs, C)

        # Stop lying about P-values: end the early exaggeration applied above
        if iter == 100:
            P = P / 4

    # Return solution
    return Y