Example #1
def trainModel(fileVocabulary, wordVocabulary, contextProvider, model,
               superBatchSize, miniBatchSize, parametersPath, embeddingsPath,
               learningRate, l1Coefficient, l2Coefficient, epochs,
               metricsPath):
    if os.path.exists(metricsPath):
        os.remove(metricsPath)

    # Ceiling division: add a final partial super batch only when needed.
    superBatchesCount = (contextProvider.contextsCount + superBatchSize - 1) / superBatchSize
    startTime = time.time()
    previousTotal = 0

    for epoch in xrange(0, epochs):
        for superBatchIndex in xrange(0, superBatchesCount):
            # Take the next super batch of contexts.
            contextSuperBatch = contextProvider[
                superBatchIndex * superBatchSize:(superBatchIndex + 1) * superBatchSize]

            # Column 0 holds the file index, the middle columns the context words,
            # and the last column the target word (see processData in Example #3).
            fileIndices = contextSuperBatch[:, 0]
            wordIndices = contextSuperBatch[:, 1:-1]
            targetWordIndices = contextSuperBatch[:, -1]

            model.train(wordIndices, targetWordIndices, miniBatchSize,
                        learningRate, l1Coefficient, l2Coefficient)

            metrics = validation.validate(wordVocabulary, model)
            customMetrics = {
                'simGemJewel': similarity('gem', 'jewel', wordVocabulary,
                                          model)
            }
            validation.dump(metricsPath, epoch, superBatchIndex, *metrics,
                            **customMetrics)

            # Track the best metrics total seen so far; dump the model when it improves.
            if previousTotal < sum(metrics):
                previousTotal = sum(metrics)
                model.dump(parametersPath, embeddingsPath)

            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerEpoch = elapsed / (epoch + 1)

            rg, sim353, simLex999, syntRel, sat = metrics
            log.progress(
                'Training model: {0:.3f}%. Elapsed: {1}. Epoch: {2}. ({3:.3f} sec/epoch), RG: {4}. Sim353: {5}. SimLex999: {6}. SyntRel: {7}. SAT: {8}. Gem/Jewel: {9:.3f}.',
                epoch + 1, epochs, log.delta(elapsed), epoch, secondsPerEpoch,
                rg, sim353, simLex999, syntRel, sat,
                customMetrics['simGemJewel'])

    log.lineBreak()

    return model
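
A hedged usage sketch for trainModel follows; the hyperparameter values, file paths, and the way the model, vocabularies, and context provider are constructed are assumptions, since the snippet above only shows how they are consumed.

# Hypothetical driver code: fileVocabulary, wordVocabulary, contextProvider and
# model are assumed to come from the surrounding project (e.g. from processData
# in Example #3 and the project's model class).
trainedModel = trainModel(
    fileVocabulary, wordVocabulary, contextProvider, model,
    superBatchSize=10000, miniBatchSize=50,
    parametersPath='data/parameters.bin', embeddingsPath='data/embeddings.bin',
    learningRate=0.01, l1Coefficient=0.0, l2Coefficient=0.001,
    epochs=20, metricsPath='data/metrics.csv')
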
Example #2
def prepareWikipediaDumps(inputDirectoryPath, outputDirectoryPath, cleanText=True):
    if os.path.exists(outputDirectoryPath):
        shutil.rmtree(outputDirectoryPath, ignore_errors=True)
        log.info('Output directory {0} has been removed.', outputDirectoryPath)

    os.mkdir(outputDirectoryPath)
    # Hand ownership to uid/gid 1000 (presumably the non-root host user, e.g. when running inside a container).
    os.chown(outputDirectoryPath, 1000, 1000)
    log.info('Output directory {0} has been created.', outputDirectoryPath)

    pathName = inputDirectoryPath + '/*wiki*.txt.gz'
    # Only the first 10 dumps are processed (apparently a debugging cap).
    dumpPaths = glob.glob(pathName)[:10]
    dumpsCount = len(dumpPaths)
    log.info('Found {0} Wikipedia dumps.', dumpsCount)

    startTime = time.time()

    for dumpIndex, dumpPath in enumerate(dumpPaths):
        dumpName, pages = unpackDump(dumpPath, cleanText)

        if len(pages) > 0:
            dumpDirectoryPath = os.path.join(outputDirectoryPath, dumpName)
            os.mkdir(dumpDirectoryPath)
            os.chown(dumpDirectoryPath, 1000, 1000)

            for pageName, pageText in pages:
                savePage(dumpDirectoryPath, pageName, pageText)

        currentTime = time.time()
        elapsed = currentTime - startTime
        secondsPerFile = elapsed / (dumpIndex + 1)

        log.progress('Unpacking Wikipedia dumps: {0:.3f}%. Last dump: {1} ({2} pages). Elapsed: {3}. ({4:.3f} sec/dump)',
                     dumpIndex + 1,
                     dumpsCount,
                     dumpName,
                     len(pages),
                     log.delta(elapsed),
                     secondsPerFile)

    log.lineBreak()
    log.info('Processing complete.')
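
A minimal usage sketch, assuming the input directory holds gzipped *wiki*.txt.gz dumps as implied above; both paths are illustrative. Note that os.chown to uid/gid 1000 normally requires running as root (for example inside a container).

# Hypothetical paths; adjust to the actual dataset location.
prepareWikipediaDumps('data/wikipedia-dumps', 'data/wikipedia-pages', cleanText=True)
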
Example #3
def processData(inputDirectoryPath, fileVocabularyPath, wordVocabularyPath, contextsPath, contextSize, maxVocabularySize):
    if os.path.exists(contextsPath):
        os.remove(contextsPath)

    fileContextSize = 1
    wordContextSize = contextSize - fileContextSize

    fileVocabulary = collections.OrderedDict()
    wordVocabulary = collections.OrderedDict()

    unprunedContextsPath = contextsPath + '.unpruned'

    if os.path.exists(unprunedContextsPath):
        os.remove(unprunedContextsPath)

    with open(unprunedContextsPath, 'wb+') as unprunedContextsFile:
        unprunedContextsFile.write(struct.pack('i', 0)) # this is a placeholder for contexts count
        unprunedContextsFile.write(struct.pack('i', contextSize))

        pathName = inputDirectoryPath + '/*/*.txt'
        # Only the first 200 text files are processed (apparently a debugging cap).
        textFilePaths = glob.glob(pathName)[:200]
        textFilePaths = sorted(textFilePaths)
        textFileCount = len(textFilePaths)
        startTime = time.time()

        contextFormat = '{0}i'.format(contextSize)
        contextsCount = 0

        for textFileIndex, textFilePath in enumerate(textFilePaths):
            fileVocabulary[textFilePath] = textFileIndex

            contextProvider = WordContextProvider(textFilePath)
            for wordContext in contextProvider.next(wordContextSize):
                for word in wordContext:
                    if word not in wordVocabulary:
                        wordVocabulary[word] = (len(wordVocabulary), 1)
                    else:
                        wordIndex, frequency = wordVocabulary[word]
                        wordVocabulary[word] = (wordIndex, frequency + 1)

                indexContext = map(lambda w: wordVocabulary[w][0], wordContext)
                indexContext = [textFileIndex] + indexContext

                unprunedContextsFile.write(struct.pack(contextFormat, *indexContext))
                contextsCount += 1

            textFileName = os.path.basename(textFilePath)
            currentTime = time.time()
            elapsed = currentTime - startTime
            secondsPerFile = elapsed / (textFileIndex + 1)

            log.progress('Reading contexts: {0:.3f}%. Elapsed: {1} ({2:.3f} sec/file). Vocabulary: {3}.',
                         textFileIndex + 1,
                         textFileCount,
                         log.delta(elapsed),
                         secondsPerFile,
                         len(wordVocabulary))

        log.lineBreak()

        unprunedContextsFile.seek(0, io.SEEK_SET)
        unprunedContextsFile.write(struct.pack('i', contextsCount))
        unprunedContextsFile.flush()

    whiteList = whitelist.load()
    originalVocabularyLength = len(wordVocabulary)
    prunedWordVocabulary, wordIndexMap = pruneWordVocabulary(wordVocabulary, maxVocabularySize, whiteList)

    log.info('Vocabulary has been pruned. {0} items left out of {1}.', len(prunedWordVocabulary), originalVocabularyLength)

    with open(unprunedContextsPath, 'rb') as unprunedContextsFile:
        contextsCount = unprunedContextsFile.read(4)
        contextSize = unprunedContextsFile.read(4)

        contextsCount = struct.unpack('i', contextsCount)[0]
        contextSize = struct.unpack('i', contextSize)[0]

        format = '{0}i'.format(contextSize) # contextSize already includes the file-index slot
        bufferSize = contextSize * 4 # each context is contextSize 4-byte ints
        prunedContextsCount = 0
        with open(contextsPath, 'wb+') as uncompressedPrunedContexts:
            uncompressedPrunedContexts.write(struct.pack('i', 0)) # placeholder for contexts count
            uncompressedPrunedContexts.write(struct.pack('i', contextSize))

            contextIndex = 0
            while contextIndex < contextsCount:
                buffer = unprunedContextsFile.read(bufferSize)

                context = struct.unpack(format, buffer)
                fileIndex = context[0]
                indexContext = context[1:]

                if all([index in wordIndexMap for index in indexContext]):
                    prunedContextsCount += 1
                    indexContext = map(lambda wordIndex: wordIndexMap[wordIndex], indexContext)
                    context = [fileIndex] + indexContext
                    buffer = struct.pack(format, *context)
                    uncompressedPrunedContexts.write(buffer)

                contextIndex += 1
                # prunedContextsCount counts contexts that were kept, so the
                # difference is the number of contexts dropped so far.
                contextsPruned = contextIndex - prunedContextsCount
                log.progress('Pruning contexts: {0:.3f}%. {1} contexts ({2:.3f}%) pruned out of {3}.',
                             contextIndex,
                             contextsCount,
                             contextsPruned,
                             float(contextsPruned) * 100 / contextsCount,
                             contextsCount)

            log.lineBreak()

            uncompressedPrunedContexts.seek(0, io.SEEK_SET)
            uncompressedPrunedContexts.write(struct.pack('i', prunedContextsCount))
            uncompressedPrunedContexts.flush()

    os.remove(unprunedContextsPath)

    parameters.dumpFileVocabulary(fileVocabulary, fileVocabularyPath)
    parameters.dumpWordVocabulary(prunedWordVocabulary, wordVocabularyPath)
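
For reference, a small self-contained sketch of reading back the binary contexts file produced above. The layout follows the struct.pack calls in processData: a context count, the context size, then one record of contextSize 4-byte ints per context, whose first value is the file index. The function name and the example path are illustrative.

import struct

def readContexts(contextsPath):
    # Header: [contextsCount:int32][contextSize:int32], then contextsCount
    # records of contextSize int32 values; the first value of each record
    # is the file index, the rest are word indices.
    with open(contextsPath, 'rb') as contextsFile:
        contextsCount = struct.unpack('i', contextsFile.read(4))[0]
        contextSize = struct.unpack('i', contextsFile.read(4))[0]
        contextFormat = '{0}i'.format(contextSize)

        contexts = []
        for _ in xrange(contextsCount):
            record = contextsFile.read(contextSize * 4)
            contexts.append(struct.unpack(contextFormat, record))

    return contexts

# Illustrative call; the path is an assumption.
# contexts = readContexts('contexts.bin')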