Example #1
def buildWordMaps(texts, w2vWordIndexMap, w2vWordEmbeddings):
    wordIndexMap = collections.OrderedDict()
    wordFrequencyMap = collections.OrderedDict()

    for textIndex, text in enumerate(texts):
        for word in weeding.iterateWords(text):
            if word not in w2vWordIndexMap:
                continue

            if word not in wordIndexMap:
                wordIndexMap[word] = len(wordIndexMap)
                wordFrequencyMap[word] = 1
            else:
                wordFrequencyMap[word] += 1

        log.progress('Building word maps: {0:.3f}%. Words: {1}.', textIndex + 1, len(texts), len(wordIndexMap))

    log.lineBreak()

    wordEmbeddings = numpy.zeros((len(wordIndexMap), w2vWordEmbeddings.shape[1]))
    for word, index in wordIndexMap.items():
        # look this word up in the w2v vocabulary: its index there generally differs
        # from its index in the newly built map
        wordEmbeddings[index] = w2vWordEmbeddings[w2vWordIndexMap[word]]

        log.progress('Copying w2v embeddings: {0:.3f}%.', index + 1, len(wordIndexMap))

    log.lineBreak()

    return wordIndexMap, wordFrequencyMap, wordEmbeddings
Example #2
def dumpFileVocabulary(vocabulary, vocabularyFilePath):
    if os.path.exists(vocabularyFilePath):
        os.remove(vocabularyFilePath)

    itemsCount = len(vocabulary)
    itemIndex = 0

    with open(vocabularyFilePath, 'wb') as file:  # binary mode: the file holds struct-packed data
        file.write(struct.pack('i', itemsCount))

        for key, index in vocabulary.items():
            keyLength = len(key)
            keyLength = struct.pack('i', keyLength)
            index = struct.pack('i', index)

            file.write(keyLength)
            file.write(key)
            file.write(index)

            itemIndex += 1
            log.progress('Dumping file vocabulary: {0:.3f}%.', itemIndex, itemsCount)

        file.flush()

        log.lineBreak()
Example #3
def loadWordVocabulary(vocabularyFilePath, loadFrequencies=True):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            frequency = file.read(4)
            frequency = struct.unpack('i', frequency)[0]

            vocabulary[word] = (index, frequency) if loadFrequencies else index

            log.progress('Loading word vocabulary: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.lineBreak()

    return vocabulary
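loadWordVocabulary above, together with the dump functions in the neighbouring examples, relies on a simple length-prefixed binary layout: a 4-byte item count, then for each entry a 4-byte key length, the raw key bytes, and 4-byte integer fields (an index, plus a frequency in the word-vocabulary variant read here). Below is a minimal, self-contained round-trip sketch of that word-vocabulary layout; the file name vocab.bin and both helper names are hypothetical, not part of the project.

import collections
import struct


def writeWordVocabulary(vocabulary, path):
    # layout: <count>, then per entry: <key length> <key bytes> <index> <frequency>
    with open(path, 'wb') as f:
        f.write(struct.pack('i', len(vocabulary)))
        for key, (index, frequency) in vocabulary.items():
            encodedKey = key.encode('utf-8')
            f.write(struct.pack('i', len(encodedKey)))
            f.write(encodedKey)
            f.write(struct.pack('ii', index, frequency))


def readWordVocabulary(path):
    vocabulary = collections.OrderedDict()
    with open(path, 'rb') as f:
        itemsCount = struct.unpack('i', f.read(4))[0]
        for _ in range(itemsCount):
            keyLength = struct.unpack('i', f.read(4))[0]
            key = f.read(keyLength).decode('utf-8')
            index, frequency = struct.unpack('ii', f.read(8))
            vocabulary[key] = (index, frequency)
    return vocabulary


vocabulary = collections.OrderedDict([('gem', (0, 3)), ('jewel', (1, 5))])
writeWordVocabulary(vocabulary, 'vocab.bin')
assert readWordVocabulary('vocab.bin') == vocabulary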
Example #4
def dumpFileVocabulary(vocabulary, vocabularyFilePath):
    if os.path.exists(vocabularyFilePath):
        os.remove(vocabularyFilePath)

    itemsCount = len(vocabulary)
    itemIndex = 0

    with open(vocabularyFilePath, 'wb') as file:  # binary mode: the file holds struct-packed data
        file.write(struct.pack('i', itemsCount))

        for key, index in vocabulary.items():
            keyLength = len(key)
            keyLength = struct.pack('i', keyLength)
            index = struct.pack('i', index)

            file.write(keyLength)
            file.write(key)
            file.write(index)

            itemIndex += 1
            log.progress('Dumping file vocabulary: {0:.3f}%.', itemIndex,
                         itemsCount)

        file.flush()

        log.lineBreak()
Example #5
def make():
    words = []

    words += getSyntacticWordRelationsWords(
        'res/Syntactic-Word-Relations/questions-words.txt')
    words += getSATWords('res/SAT-Questions/SAT-package-V3.txt')
    words += getSimLex999Words('res/SimLex-999/SimLex-999.txt')
    words += getWordSimilarity353Words('res/WordSimilarity-353/combined.csv')
    words += getRubensteinGoodenoughWords('res/RG/EN-RG-65.txt')

    words = list(set(words))
    words = sorted(words)

    log.info('Found {0} words.', len(words))

    whiteListPath = 'res/Tools/white_list.txt'
    if os.path.exists(whiteListPath):
        os.remove(whiteListPath)

    with open(whiteListPath, 'w+') as whiteListFile:
        batchSize = 10
        batchesCount = len(words) / batchSize + 1
        for batchIndex in xrange(0, batchesCount):
            batch = words[batchIndex * batchSize:(batchIndex + 1) * batchSize]
            line = ' '.join(batch) + '\n'
            line = line.lower()

            whiteListFile.write(line)

            log.progress('Saving white list: {0:.0f}%.', batchIndex + 1,
                         batchesCount)

    log.lineBreak()
    log.info('White list has been saved.')
Example #6
def loadWordVocabulary(vocabularyFilePath, loadFrequencies=True):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            frequency = file.read(4)
            frequency = struct.unpack('i', frequency)[0]

            vocabulary[word] = (index, frequency) if loadFrequencies else index

            log.progress('Loading word vocabulary: {0:.3f}%.', itemIndex + 1,
                         itemsCount)

        log.lineBreak()

    return vocabulary
Example #7
def make():
    words = []

    words += getSyntacticWordRelationsWords('res/Syntactic-Word-Relations/questions-words.txt')
    words += getSATWords('res/SAT-Questions/SAT-package-V3.txt')
    words += getSimLex999Words('res/SimLex-999/SimLex-999.txt')
    words += getWordSimilarity353Words('res/WordSimilarity-353/combined.csv')
    words += getRubensteinGoodenoughWords('res/RG/EN-RG-65.txt')

    words = list(set(words))
    words = sorted(words)

    log.info('Found {0} words.', len(words))

    whiteListPath = 'res/Tools/white_list.txt'
    if os.path.exists(whiteListPath):
        os.remove(whiteListPath)

    with open(whiteListPath, 'w+') as whiteListFile:
        batchSize = 10
        batchesCount = len(words) / batchSize + 1
        for batchIndex in xrange(0, batchesCount):
            batch = words[batchIndex * batchSize : (batchIndex + 1) * batchSize]
            line = ' '.join(batch) + '\n'
            line = line.lower()

            whiteListFile.write(line)

            log.progress('Saving white list: {0:.0f}%.', batchIndex + 1, batchesCount)

    log.lineBreak()
    log.info('White list has been saved.')
Example #8
def loadWord2VecEmbeddings(filePath):
    with open(filePath, 'rb') as file:
        firstLine = file.readline()
        embeddingsCount, embeddingSize = tuple(firstLine.split(' '))
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)
        embeddingFormat = '{0}f'.format(embeddingSize)
        wordIndexMap = {}
        embeddings = []

        log.info('Vocabulary size: {0}. Embedding size: {1}.', embeddingsCount, embeddingSize)

        embeddingIndex = 0
        while True:
            word = ''
            while True:
                char = file.read(1)

                if not char:
                    log.lineBreak()
                    return wordIndexMap, embeddings

                if char == ' ':
                    word = word.strip()
                    break

                word += char

            embedding = struct.unpack(embeddingFormat, file.read(embeddingSize * 4))
            wordIndexMap[word] = len(wordIndexMap)
            embeddings.append(embedding)

            embeddingIndex += 1
            log.progress('Reading embeddings: {0:.3f}%.', embeddingIndex, embeddingsCount)
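loadWord2VecEmbeddings parses the standard word2vec binary format: a plain-text header line with the vocabulary size and embedding size, then each word as characters terminated by a space, followed by that many 32-bit floats. As a rough illustration, a writer for the same layout might look like the sketch below (the function name and the path w2v.bin are made up; the real word2vec tool also appends a newline after each vector, which the reader above tolerates via word.strip()).

import struct


def writeWord2VecBinary(filePath, wordEmbeddings, embeddingSize):
    # header: "<count> <size>\n", then per word: the word, a space, <size> float32 values
    with open(filePath, 'wb') as f:
        f.write('{0} {1}\n'.format(len(wordEmbeddings), embeddingSize).encode('utf-8'))
        for word, embedding in wordEmbeddings:
            f.write(word.encode('utf-8') + b' ')
            f.write(struct.pack('{0}f'.format(embeddingSize), *embedding))


writeWord2VecBinary('w2v.bin', [('gem', [0.1, 0.2]), ('jewel', [0.3, 0.4])], 2)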
Example #9
def extract(outputDirectoryPath, outputConcatFilePath, connector):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)

    textContainersCount = connector.count()
    log.info('Found {0} text containers.', textContainersCount)

    if os.path.exists(outputConcatFilePath):
        os.remove(outputConcatFilePath)

    pagesCount = 0
    startTime = time.time()

    for textContainerIndex, name, text in connector.iterate():
        text = clean(text)

        outputFilePath = os.path.join(outputDirectoryPath, name + '.txt')

        saveText(outputFilePath, text)
        saveText(outputConcatFilePath, text)

        currentTime = time.time()
        elapsed = currentTime - startTime
        pagesCount += 1

        log.progress('Extracting text containers: {0:.3f}%. Elapsed: {1}. Pages: {2}.',
                     textContainerIndex + 1,
                     textContainersCount,
                     log.delta(elapsed),
                     pagesCount)

    log.lineBreak()
Example #10
def loadWordRequencyMap(indexMapFilePath):
    wordRequencyMap = loadMap(indexMapFilePath)

    log.progress('Sorting word frequency map...', 1, 1)

    wordRequencyMap = sorted(wordRequencyMap.items(), key=lambda item: item[1], reverse=True)
    wordRequencyMap = collections.OrderedDict(wordRequencyMap)

    log.progress('Sorting word frequency map complete.', 1, 1)
    log.lineBreak()

    return wordRequencyMap
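The sort above orders the frequency map by value, descending, and rebuilds an OrderedDict so the most frequent word comes first (Example #18 later relies on this ordering when it takes items()[0][1] as the maximum frequency). A tiny self-contained illustration with made-up counts:

import collections

frequencies = {'gem': 3, 'jewel': 7, 'stone': 5}
ordered = collections.OrderedDict(sorted(frequencies.items(), key=lambda item: item[1], reverse=True))
print(list(ordered.items()))  # [('jewel', 7), ('stone', 5), ('gem', 3)]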
Example #11
def trainModel(fileVocabulary, wordVocabulary, contextProvider, model,
               superBatchSize, miniBatchSize, parametersPath, embeddingsPath,
               learningRate, l1Coefficient, l2Coefficient, epochs,
               metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    superBatchesCount = contextProvider.contextsCount / superBatchSize + 1
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            contextSuperBatch = contextProvider[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            fileIndices, wordIndices, targetWordIndices = contextSuperBatch[:, 1], contextSuperBatch[:, 1:-1], contextSuperBatch[:, -1]

            model.train(wordIndices, targetWordIndices, miniBatchSize,
                        learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary,
                                          model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics,
                            **customMetrics)
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics)

            if previousTotal < sum(metrics):
                model.dump(parametersPath, embeddingsPath)

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress(
                'Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                epoch + 1, epochs, log.delta(elapsed), epoch, secondsPerEpoch,
                rg, sim353, simLex999, syntRel, sat,
                customMetrics['simGemJewel'])

    log.lineBreak()

    return model
Example #12
def extract(connector):
    textFilesCount = connector.count()

    names = []
    texts = []
    for textFileIndex, name, text in connector.iterate():
        text = extraction.clean(text)

        names.append(name)
        texts.append(text)

        log.progress('Extracting text: {0:.3f}%. Texts: {1}.', textFileIndex + 1, textFilesCount, textFileIndex + 1)

    log.lineBreak()

    return names, texts
Example #13
def inferContexts(contextsPath, names, texts, wordIndexMap, windowSize, negative, strict, contextsCount):
    textIndexMap = collections.OrderedDict()

    def wordsToIndices(textContext):
        indices = map(lambda word: wordIndexMap[word], textContext)
        return indices

    wordIndices = map(lambda item: item[1], wordIndexMap.items())
    wordIndices = numpy.asarray(wordIndices)
    maxWordIndex = max(wordIndices)

    with h5py.File(contextsPath, 'w') as contextsFile:
        tensor = contextsFile.create_dataset('contexts',
                                             dtype='int32',
                                             shape=(0, contextsCount, 1 + windowSize + negative), # 1 for file index
                                             maxshape=(None, contextsCount, 1 + windowSize + negative), # 1 for file index
                                             chunks=(1, contextsCount, 1 + windowSize + negative)) # 1 for file index

        textsCount = 0
        for name, text in zip(names, texts):
            contextProvider = processing.WordContextProvider(text=text, minContexts=contextsCount, maxContexts=contextsCount)
            contexts = list(contextProvider.iterate(windowSize))

            if len(contexts) > 0:
                contexts = map(wordsToIndices, contexts)
                textIndexMap[name] = len(textIndexMap)
                contexts = numpy.asarray(contexts)
                textIndices = [[textIndexMap[name]]] * len(contexts)
                contexts = numpy.concatenate([textIndices, contexts], axis=1)

                negativeSamples = processing.generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                tensor.resize(tensor.shape[0] + 1, axis=0)
                tensor[-1] = contexts

            textsCount += 1
            log.progress('Creating contexts: {0:.3f}%. Text index map: {1}. Contexts: {2}.',
                         textsCount,
                         len(texts),
                         len(tensor),
                         tensor.shape[0] * tensor.shape[1])

    log.lineBreak()

    return textIndexMap
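inferContexts grows its HDF5 dataset one text at a time: the dataset is created with zero rows but an unlimited first dimension (maxshape=(None, ...)), and each append is a resize along axis 0 followed by a write into the last slot. A minimal sketch of that pattern with made-up shapes and the hypothetical file name contexts.h5:

import h5py
import numpy

with h5py.File('contexts.h5', 'w') as contextsFile:
    # zero rows initially; the unlimited first axis allows appending later
    tensor = contextsFile.create_dataset('contexts',
                                         dtype='int32',
                                         shape=(0, 4, 3),
                                         maxshape=(None, 4, 3),
                                         chunks=(1, 4, 3))

    for batchIndex in range(5):
        contexts = numpy.full((4, 3), batchIndex, dtype='int32')
        tensor.resize(tensor.shape[0] + 1, axis=0)  # grow by one row
        tensor[-1] = contexts                       # write into the new slot

with h5py.File('contexts.h5', 'r') as contextsFile:
    print(contextsFile['contexts'].shape)  # (5, 4, 3)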
Example #14
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info("Output directory {0} has been removed.", outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info("Output directory {0} has been created.", outputDirectoryPath)

    pathName = inputDirectoryPath + "/*wiki*.txt.gz"
    dumpPaths = glob.glob(pathName)[:10]
    dumpsCount = len(dumpPaths)
    log.info("Found {0} Wikipedia dumps.", dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress(
            "Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)",
            dumpIndex + 1,
            dumpsCount,
            dumpName,
            len(pages),
            log.delta(elapsed),
            secondsPerFile,
        )

    log.lineBreak()
    log.info("Processing complete.")
Example #15
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info('Output directory {0} has been removed.', outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info('Output directory {0} has been created.', outputDirectoryPath)

    pathName = inputDirectoryPath + '/*wiki*.txt.gz'
    dumpPaths = glob.glob(pathName)[:10]
    dumpsCount = len(dumpPaths)
    log.info('Found {0} Wikipedia dumps.', dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress('Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)',
                     dumpIndex + 1,
                     dumpsCount,
                     dumpName,
                     len(pages),
                     log.delta(elapsed),
                     secondsPerFile)

    log.lineBreak()
    log.info('Processing complete.')
Example #16
def loadW2VParameters(filePath, loadEmbeddings=True):
    with open(filePath, 'rb') as w2vFile:
        firstLine = w2vFile.readline()
        embeddingsCount, embeddingSize = tuple(firstLine.split(' '))
        embeddingsCount, embeddingSize = int(embeddingsCount), int(embeddingSize)
        wordIndexMap = collections.OrderedDict()
        embeddings = numpy.zeros((embeddingsCount, embeddingSize))

        embeddingIndex = 0
        while True:
            word = ''
            while True:
                char = w2vFile.read(1)

                if not char:
                    log.lineBreak()

                    if loadEmbeddings:
                        return wordIndexMap, embeddings
                    else:
                        return wordIndexMap

                if char == ' ':
                    word = word.strip()
                    break

                word += char

            wordIndexMap[word] = len(wordIndexMap)
            if loadEmbeddings:
                embedding = binary.readf(w2vFile, embeddingSize)
                embeddings[wordIndexMap[word]] = embedding
            else:
                w2vFile.seek(embeddingSize * 4, io.SEEK_CUR)

            embeddingIndex += 1
            log.progress('Loading W2V embeddings: {0:.3f}%. {1} embeddings {2} features each.',
                         embeddingIndex,
                         embeddingsCount,
                         embeddingIndex,
                         embeddingSize)
Example #17
def dumpEmbeddings(embeddings, embeddingsFilePath):
    if os.path.exists(embeddingsFilePath):
        os.remove(embeddingsFilePath)

    if not isinstance(embeddings, numpy.ndarray):
        embeddings = numpy.asarray(embeddings)

    embeddingsCount, embeddingSize = embeddings.shape

    with open(embeddingsFilePath, 'wb') as embeddingsFile:  # binary mode: the file holds packed ints and floats
        binary.writei(embeddingsFile, embeddingsCount)
        binary.writei(embeddingsFile, embeddingSize)

        for embeddingIndex in range(0, embeddingsCount):
            embedding = embeddings[embeddingIndex]

            binary.writef(embeddingsFile, embedding)

            log.progress('Dumping embeddings: {0:.3f}%.', embeddingIndex + 1, embeddingsCount)

        log.lineBreak()
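binary.writei and binary.writef are project-specific helpers that are not shown here; assuming they simply struct-pack 32-bit integers and floats, a standalone equivalent of dumpEmbeddings could look like the following sketch (that assumption, and the path embeddings.bin, are ours, not the project's).

import struct

import numpy


def dumpEmbeddingsStandalone(embeddings, embeddingsFilePath):
    # assumed layout: <count> and <size> as 4-byte ints, then each embedding as <size> float32 values
    embeddings = numpy.asarray(embeddings, dtype=numpy.float32)
    embeddingsCount, embeddingSize = embeddings.shape

    with open(embeddingsFilePath, 'wb') as embeddingsFile:
        embeddingsFile.write(struct.pack('ii', embeddingsCount, embeddingSize))
        for embedding in embeddings:
            embeddingsFile.write(struct.pack('{0}f'.format(embeddingSize), *embedding))


dumpEmbeddingsStandalone([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], 'embeddings.bin')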
Example #18
def subsampleAndPrune(texts, wordFrequencyMap, sample, minCount):
    totalLength = 0.
    prunedLength = 0.

    maxFrequency = wordFrequencyMap.items()[0][1]

    for textIndex, text in enumerate(texts):
        totalLength += len(text)

        texts[textIndex] = weeding.subsampleAndPrune(text, wordFrequencyMap, maxFrequency, sample, minCount)

        prunedLength += len(texts[textIndex])

        log.progress('Subsampling and pruning text: {0:.3f}%. Removed {1:.3f}% of original text.',
                     textIndex + 1,
                     len(texts),
                     (1 - prunedLength/totalLength) * 100)

    log.lineBreak()

    return texts
Example #19
def trainModel(fileVocabulary, wordVocabulary, contextProvider, model, superBatchSize, miniBatchSize, parametersPath, embeddingsPath, learningRate, l1Coefficient, l2Coefficient, epochs, metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    superBatchesCount = contextProvider.contextsCount / superBatchSize + 1
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            contextSuperBatch = contextProvider[superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            fileIndices, wordIndices, targetWordIndices = contextSuperBatch[:,1], contextSuperBatch[:,1:-1], contextSuperBatch[:,-1]

            model.train(wordIndices, targetWordIndices, miniBatchSize, learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary, model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics, **customMetrics)
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics)

            if previousTotal < sum(metrics):
                model.dump(parametersPath, embeddingsPath)

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress('Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                         epoch + 1, epochs, log.delta(elapsed), epoch, secondsPerEpoch,
                         rg, sim353, simLex999, syntRel, sat,
                         customMetrics['simGemJewel'])

    log.lineBreak()

    return model
Example #20
def loadFileVocabulary(vocabularyFilePath):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            vocabulary[word] = index

            log.progress('Loading file vocabulary: {0:.3f}%.', itemIndex + 1, itemsCount)

        log.lineBreak()

    return vocabulary
Example #21
def dumpWordMap(indexMap, indexMapFilePath):
    if os.path.exists(indexMapFilePath):
        os.remove(indexMapFilePath)

    with open(indexMapFilePath, 'wb') as indexMapFile:  # binary mode: the file holds packed ints and strings
        indexMapSize = len(indexMap)
        itemIndex = 0

        binary.writei(indexMapFile, indexMapSize)

        for key, index in indexMap.items():
            keyLength = len(key)

            binary.writei(indexMapFile, keyLength)
            binary.writes(indexMapFile, key)
            binary.writei(indexMapFile, index)

            itemIndex += 1
            log.progress('Dumping map: {0:.3f}%.', itemIndex, indexMapSize)

        indexMapFile.flush()

        log.lineBreak()
Example #22
    def innerExecute(self, selector):
        words = []
        embeddings = []

        for operator, word in selector.operands:
            embedding = self.wordEmbeddings[self.wordIndexMap[word]]
            if operator == '-':
                embedding = embedding * (-1)

            words.append(word)
            embeddings.append(embedding)

        minIndex, maxIndex = ExplainFunction.getSurroundingIndices(words, self.wordIndexMap, 5000)

        # start from a copy so the in-place += below does not modify the stored embedding
        result = embeddings[0].copy()
        for embedding in embeddings[1:]:
            result += embedding

        scores = []
        for index in xrange(minIndex, maxIndex):
            word = self.wordIndexItems[index][0]

            if word not in words:
                embedding = self.wordEmbeddings[index]
                score = vectors.cosineSimilarity(result, embedding)
                scores.append((word, score))

            log.progress('Looking for closest matches: {0:.3f}%.',
                         index - minIndex + 1,
                         maxIndex - minIndex)

        log.lineBreak()

        scores = sorted(scores, key=lambda s: s[1], reverse=True)

        for score in scores[:10]:
            print score
Example #23
def loadFileVocabulary(vocabularyFilePath):
    vocabulary = collections.OrderedDict()

    with open(vocabularyFilePath, 'rb') as file:
        itemsCount = file.read(4)
        itemsCount = struct.unpack('i', itemsCount)[0]

        for itemIndex in range(0, itemsCount):
            wordLength = file.read(4)
            wordLength = struct.unpack('i', wordLength)[0]

            word = file.read(wordLength)

            index = file.read(4)
            index = struct.unpack('i', index)[0]

            vocabulary[word] = index

            log.progress('Loading file vocabulary: {0:.3f}%.', itemIndex + 1,
                         itemsCount)

        log.lineBreak()

    return vocabulary
Example #24
def processData(inputDirectoryPath, w2vEmbeddingsFilePath, fileIndexMapFilePath,
                wordIndexMapFilePath, wordEmbeddingsFilePath, contextsPath, windowSize, negative, strict):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = windowSize - fileContextSize

    fileIndexMap = {}
    wordIndexMap = collections.OrderedDict()
    wordEmbeddings = []

    noNegativeSamplingPath = contextsPath
    if negative > 0:
        noNegativeSamplingPath += '.temp'

    if os.path.exists(noNegativeSamplingPath):
        os.remove(noNegativeSamplingPath)

    pathName = inputDirectoryPath + '/*.txt'
    textFilePaths = glob.glob(pathName)
    textFilePaths = sorted(textFilePaths)
    textFileCount = len(textFilePaths)

    w2vWordIndexMap, w2vEmbeddings = parameters.loadW2VParameters(w2vEmbeddingsFilePath)

    contextsCount = 0
    with open(noNegativeSamplingPath, 'wb+') as noNegativeSamplingFile:
        binary.writei(noNegativeSamplingFile, 0) # this is a placeholder for contexts count
        binary.writei(noNegativeSamplingFile, windowSize)
        binary.writei(noNegativeSamplingFile, 0)

        startTime = time.time()

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileIndexMap[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath=textFilePath)
            for wordContext in contextProvider.iterate(wordContextSize):
                allWordsInWordVocabulary = [word in w2vWordIndexMap for word in wordContext]

                if not all(allWordsInWordVocabulary):
                    continue

                for word in wordContext:
                    if word not in wordIndexMap:
                        wordIndexMap[word] = len(wordIndexMap)
                        wordEmbeddingIndex = w2vWordIndexMap[word]
                        wordEmbedding = w2vEmbeddings[wordEmbeddingIndex]
                        wordEmbeddings.append(wordEmbedding)

                indexContext = [textFileIndex] + map(lambda w: wordIndexMap[w], wordContext)

                binary.writei(noNegativeSamplingFile, indexContext)
                contextsCount += 1

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Words: {3}. Contexts: {4}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordIndexMap),
                         contextsCount)

        log.lineBreak()

        noNegativeSamplingFile.seek(0, io.SEEK_SET)
        binary.writei(noNegativeSamplingFile, contextsCount)
        noNegativeSamplingFile.flush()

    if negative > 0:
        with open(contextsPath, 'wb+') as contextsFile:
            startTime = time.time()

            contextProvider = parameters.IndexContextProvider(noNegativeSamplingPath)

            binary.writei(contextsFile, contextsCount)
            binary.writei(contextsFile, windowSize)
            binary.writei(contextsFile, negative)

            batchSize = 10000
            batchesCount = contextsCount / batchSize + 1

            wordIndices = map(lambda item: item[1], wordIndexMap.items())
            wordIndices = numpy.asarray(wordIndices)
            maxWordIndex = max(wordIndices)

            for batchIndex in xrange(0, batchesCount):
                contexts = contextProvider[batchIndex * batchSize : (batchIndex + 1) * batchSize]
                negativeSamples = generateNegativeSamples(negative, contexts, wordIndices, maxWordIndex, strict)
                contexts = numpy.concatenate([contexts, negativeSamples], axis=1)
                contexts = numpy.ravel(contexts)

                binary.writei(contextsFile, contexts)

                currentTime = time.time()
                elapsed = currentTime - startTime

                log.progress('Negative sampling: {0:.3f}%. Elapsed: {1}.',
                             batchIndex + 1,
                             batchesCount,
                             log.delta(elapsed))

            log.lineBreak()
            contextsFile.flush()

            os.remove(noNegativeSamplingPath)

    parameters.dumpWordMap(fileIndexMap, fileIndexMapFilePath)
    parameters.dumpWordMap(wordIndexMap, wordIndexMapFilePath)
    parameters.dumpEmbeddings(wordEmbeddings, wordEmbeddingsFilePath)
Example #25
def pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList):
    global pruningStepIndex

    originalVocabularyLength = len(wordVocabulary)
    prunedVocabularyLength = min(originalVocabularyLength, maxVocabularySize)

    pruningStepsCount = 0
    if originalVocabularyLength > maxVocabularySize:
        pruningStepsCount += originalVocabularyLength * math.log(originalVocabularyLength)
    pruningStepsCount += prunedVocabularyLength * math.log(prunedVocabularyLength)
    pruningStepsCount += prunedVocabularyLength

    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:  # log only every 1000th comparison
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = wordVocabulary.items()

    if originalVocabularyLength > maxVocabularySize:
        prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=whiteListPriorityComparator)
        prunedWordVocabulary = prunedWordVocabulary[:maxVocabularySize]

    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:  # log only every 1000th comparison
            log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=frequencyComparator)
    prunedWordVocabulary = collections.OrderedDict(prunedWordVocabulary)

    wordIndexMap = {}
    for wordIndex, wordInfo in enumerate(prunedWordVocabulary.items()):
        word, info = wordInfo
        previousIndex, wordFrequency = info
        wordIndexMap[previousIndex] = wordIndex

        prunedWordVocabulary[word] = wordIndex, wordFrequency

        log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepIndex, pruningStepsCount)
        pruningStepIndex += 1

    log.progress("Pruning word vocabulary: {0:.3f}%.", pruningStepsCount, pruningStepsCount)
    log.lineBreak()

    return prunedWordVocabulary, wordIndexMap
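One portability note: sorted(..., cmp=...) exists only in Python 2, which is what this code base targets (xrange, print statements). Under Python 3 the same comparators can be reused by wrapping them with functools.cmp_to_key; a small illustration with made-up vocabulary items (functools.cmp_to_key is available from Python 2.7 onward):

import functools


def frequencyComparator(wordInfoX, wordInfoY):
    # same ordering rule as above: higher frequency sorts first
    frequencyX = wordInfoX[1][1]
    frequencyY = wordInfoY[1][1]

    if frequencyX < frequencyY:
        return 1
    elif frequencyX > frequencyY:
        return -1

    return 0


items = [('gem', (0, 3)), ('jewel', (1, 7)), ('stone', (2, 5))]

# Python 2: sorted(items, cmp=frequencyComparator)
# Python 2.7+/3: wrap the comparator instead
print(sorted(items, key=functools.cmp_to_key(frequencyComparator)))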
Example #26
def processData(
    inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize, maxVocabularySize
):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + ".unpruned"

    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, "wb+") as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack("i", 0))  # this is a placeholder for contexts count
        unprunedContextsFile.write(struct.pack("i", contextSize))

        pathName = inputDirectoryPath + "/*/*.txt"
        textFilePaths = glob.glob(pathName)[:200]
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)
        startTime = time.time()

        contextFormat = "{0}i".format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            textFileName = os.path.basename(textFilePath)
            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress(
                "Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.",
                textFileIndex + 1,
                textFileCount,
                log.delta(elapsed),
                secondsPerFile,
                len(wordVocabulary),
            )

        log.lineBreak()

        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack("i", contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info(
        "Vocabulary has been pruned. {0} items left out of {1}.", len(prunedWordVocabulary), originalVocabularyLength
    )

    with open(unprunedContextsPath, "rb") as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack("i", contextsCount)[0]
        contextSize = struct.unpack("i", contextSize)[0]

        format = "{0}i".format(contextSize)  # plus one spot for file index
        bufferSize = (contextSize) * 4
        prunedContextsCount = 0
        with open(contextsPath, "wb+") as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack("i", 0))  # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack("i", contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                buffer = unprunedContextsFile.read(bufferSize)

                context = struct.unpack(format, buffer)
                fileIndex = context[0]
                indexContext = context[1:]

                if all([index in wordIndexMap for index in indexContext]):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    buffer = struct.pack(format, *context)
                    uncompressedPrunedContexts.write(buffer)

                contextIndex += 1
                contextsPruned = contextIndex - prunedContextsCount + 1
                log.progress(
                    "Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.",
                    contextIndex,
                    contextsCount,
                    contextsPruned,
                    float(contextsPruned) * 100 / contextsCount,
                    contextsCount,
                )

            log.lineBreak()

            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack("i", prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
Example #27
def processData(inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize, maxVocabularySize):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + '.unpruned'

    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, 'wb+') as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack('i', 0)) # this is a placeholder for contexts count
        unprunedContextsFile.write(struct.pack('i', contextSize))

        pathName = inputDirectoryPath + '/*/*.txt'
        textFilePaths = glob.glob(pathName)[:200]
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)
        startTime = time.time()

        contextFormat = '{0}i'.format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            textFileName = os.path.basename(textFilePath)
            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordVocabulary))

        log.lineBreak()

        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack('i', contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info('Vocabulary has been pruned. {0} items left out of {1}.', len(prunedWordVocabulary), originalVocabularyLength)

    with open(unprunedContextsPath, 'rb') as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack('i', contextsCount)[0]
        contextSize = struct.unpack('i', contextSize)[0]

        format = '{0}i'.format(contextSize) # plus one spot for file index
        bufferSize = (contextSize) * 4
        prunedContextsCount = 0
        with open(contextsPath, 'wb+') as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack('i', 0)) # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack('i', contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                buffer = unprunedContextsFile.read(bufferSize)

                context = struct.unpack(format, buffer)
                fileIndex = context[0]
                indexContext = context[1:]

                if all([index in wordIndexMap for index in indexContext]):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    buffer = struct.pack(format, *context)
                    uncompressedPrunedContexts.write(buffer)

                contextIndex += 1
                contextsPruned = contextIndex - prunedContextsCount + 1
                log.progress('Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.',
                             contextIndex,
                             contextsCount,
                             contextsPruned,
                             float(contextsPruned) * 100 / contextsCount,
                             contextsCount)

            log.lineBreak()

            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack('i', prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
Example #28
def pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList):
    global pruningStepIndex

    originalVocabularyLength = len(wordVocabulary)
    prunedVocabularyLength = min(originalVocabularyLength, maxVocabularySize)

    pruningStepsCount = 0
    if originalVocabularyLength > maxVocabularySize:
        pruningStepsCount += originalVocabularyLength * math.log(originalVocabularyLength)
    pruningStepsCount += prunedVocabularyLength * math.log(prunedVocabularyLength)
    pruningStepsCount += prunedVocabularyLength

    def whiteListPriorityComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:  # log only every 1000th comparison
            log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        wordXIsWhite = wordX in whiteList
        wordYIsWhite = wordY in whiteList

        if wordXIsWhite and wordYIsWhite:
            return 0
        elif wordXIsWhite:
            return -1
        elif wordYIsWhite:
            return 1

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = wordVocabulary.items()

    if originalVocabularyLength > maxVocabularySize:
        prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=whiteListPriorityComparator)
        prunedWordVocabulary = prunedWordVocabulary[:maxVocabularySize]

    def frequencyComparator(wordInfoX, wordInfoY):
        global pruningStepIndex

        pruningStepIndex += 1
        if pruningStepIndex % 1000 == 0:  # log only every 1000th comparison
            log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)

        wordX, infoX = wordInfoX
        wordY, infoY = wordInfoY

        frequencyX = infoX[1]
        frequencyY = infoY[1]

        if frequencyX < frequencyY:
            return 1
        elif frequencyX > frequencyY:
            return -1

        return 0

    prunedWordVocabulary = sorted(prunedWordVocabulary, cmp=frequencyComparator)
    prunedWordVocabulary = collections.OrderedDict(prunedWordVocabulary)

    wordIndexMap = {}
    for wordIndex, wordInfo in enumerate(prunedWordVocabulary.items()):
        word, info = wordInfo
        previousIndex, wordFrequency = info
        wordIndexMap[previousIndex] = wordIndex

        prunedWordVocabulary[word] = wordIndex, wordFrequency

        log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepIndex, pruningStepsCount)
        pruningStepIndex += 1

    log.progress('Pruning word vocabulary: {0:.3f}%.', pruningStepsCount, pruningStepsCount)
    log.lineBreak()

    return prunedWordVocabulary, wordIndexMap