Пример #1
0
def getWordCounts(sentences,numSentences,fileName,corpusVocabulary,inverseDictionary, wordCount,wordCountSentence,lastReadSentenceInd):
    """Accumulate word frequencies over a corpus and pickle the result.

    Sentences with an index <= ``lastReadSentenceInd`` are skipped, so a
    previous (partial) run can be resumed by passing in its saved state.

    Args:
        sentences: Iterable of raw sentences; each one is tokenized with
            ``util.splitSentence``.
        numSentences: Total number of sentences; only forwarded to
            ``util.printRemainingTime`` for progress reporting.
        fileName: Path of the pickle file written once all sentences have
            been processed.
        corpusVocabulary: Mapping word -> ``util.Word``; updated in place.
            A new word gets the vocabulary size at insertion time as index.
        inverseDictionary: Mapping index -> word; updated in place.
        wordCount: Mapping word -> total number of occurrences; updated in
            place.
        wordCountSentence: Mapping word -> number of sentences containing
            the word; updated in place.
        lastReadSentenceInd: Index of the last sentence already processed.

    Returns:
        Tuple ``(wordCount, wordCountSentence, corpusVocabulary,
        lastReadSentenceInd)``. ``inverseDictionary`` is not returned, but it
        is mutated in place and included in the pickle.
    """
    start_time_all = time.time()
    start_time = start_time_all

    for indSentence, sentence in enumerate(sentences):

        start_time = util.printRemainingTime(start_time, numSentences, indSentence, 10000)

        # Already handled by a previous run.
        if indSentence <= lastReadSentenceInd:
            continue

        # Words seen so far in *this* sentence; used to count each word at
        # most once per sentence in the per-sentence statistics.
        wordsAlreadyRead = set()

        for word in util.splitSentence(sentence):

            if word in wordCount:
                wordCount[word] += 1
                corpusVocabulary[word].count += 1

                if word not in wordsAlreadyRead:
                    wordCountSentence[word] += 1
                    corpusVocabulary[word].sentenceCount += 1
            else:
                # First time this word is seen in the whole corpus.
                # NOTE(review): the Word counters are not touched here, so
                # util.Word presumably starts count/sentenceCount at 1 -
                # confirm against util.Word.
                wordCount[word] = 1
                wordCountSentence[word] = 1
                newIndex = len(corpusVocabulary)
                inverseDictionary[newIndex] = word
                corpusVocabulary[word] = util.Word(word, newIndex)

            # Mark the word as seen in BOTH branches. Previously a word that
            # was introduced in this sentence was not marked, so a repeat of
            # it within the same sentence incremented the per-sentence
            # counters a second time.
            wordsAlreadyRead.add(word)

        lastReadSentenceInd = indSentence

    # The context manager closes the file even if pickling fails.
    with open(fileName, "wb") as filehandler:
        pickle.dump([wordCount, wordCountSentence, corpusVocabulary,inverseDictionary, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for word count = ' + str(elapsed_time/60) + ' minutes ')

    return(wordCount,wordCountSentence, corpusVocabulary, lastReadSentenceInd)
Пример #2
0
def buildSentenceMatrix(sentences,rows,cols,weights,lastReadSentenceInd, corpusVocabulary, fileName):
    """Build a sparse sentence-by-word weight matrix and pickle it.

    Each distinct word of a sentence contributes one entry whose weight is
    the inverse of the number of sentences containing that word
    (``1 / corpusVocabulary[word].sentenceCount``).

    Args:
        sentences: Sequence of raw sentences; tokenized with
            ``util.splitSentence``.
        rows: List of row (sentence) indices; appended to in place.
        cols: List of column (word) indices; appended to in place.
        weights: List of entry values; appended to in place.
        lastReadSentenceInd: Sentences with index <= this value are skipped.
        corpusVocabulary: Mapping word -> object exposing ``index`` and
            ``sentenceCount``. Every word of every processed sentence must be
            present, otherwise a ``KeyError`` propagates.
        fileName: Path of the pickle file to write.

    Returns:
        Tuple ``(sentenceMatrix, numberWordsPerSentence)`` where
        ``sentenceMatrix`` is a ``csr_matrix`` of shape
        ``(len(sentences), len(corpusVocabulary))`` and
        ``numberWordsPerSentence`` holds the number of distinct words per
        sentence. The latter is freshly zero-initialised on every call, so
        entries of skipped sentences stay 0.
    """
    start_time_all = time.time()
    start_time = start_time_all

    numSentences = len(sentences)

    numberWordsPerSentence = np.zeros(numSentences)

    for indSentence, sentence in enumerate(sentences):

        start_time = util.printRemainingTime(start_time, numSentences, indSentence, 10000)

        # Already handled by a previous run.
        if indSentence <= lastReadSentenceInd:
            continue

        wordsAlreadyRead = set()

        for word in util.splitSentence(sentence):

            # Each word contributes at most one matrix entry per sentence.
            if word in wordsAlreadyRead:
                continue
            wordsAlreadyRead.add(word)

            vocabularyEntry = corpusVocabulary[word]
            rows.append(indSentence)
            cols.append(vocabularyEntry.index)
            # NOTE(review): assumes sentenceCount is never 0 - confirm
            # against the code that fills corpusVocabulary.
            weights.append(1 / vocabularyEntry.sentenceCount)
            numberWordsPerSentence[indSentence] += 1

        lastReadSentenceInd = indSentence

    sentenceMatrix = csr_matrix((weights, (rows, cols)), shape=(numSentences, len(corpusVocabulary)))

    # The context manager closes the file even if pickling fails.
    with open(fileName, "wb") as filehandler:
        pickle.dump([sentenceMatrix, numberWordsPerSentence, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for building sentence matrix = ' + str(elapsed_time/60) + ' minutes ')

    return (sentenceMatrix,numberWordsPerSentence)
Пример #3
0
def buildSentenceDictionaries(sentences,sentenceDictList,lastReadSentenceInd, corpusVocabulary, fileName):
    """Build one ``word -> weight`` dictionary per sentence and pickle them.

    The weight of a word is the inverse of the number of sentences that
    contain it (``1 / corpusVocabulary[word].sentenceCount``).

    Args:
        sentences: Sequence of raw sentences; tokenized with
            ``util.splitSentence``.
        sentenceDictList: List that receives one dictionary per processed
            sentence; appended to in place.
        lastReadSentenceInd: Sentences with index <= this value are skipped.
        corpusVocabulary: Mapping word -> object exposing ``sentenceCount``.
            Every word of every processed sentence must be present,
            otherwise a ``KeyError`` propagates.
        fileName: Path of the pickle file to write.

    Returns:
        ``sentenceDictList`` (the same list object that was passed in).
    """
    start_time_all = time.time()
    start_time = start_time_all

    numSentences = len(sentences)

    for indSentence, sentence in enumerate(sentences):

        start_time = util.printRemainingTime(start_time, numSentences, indSentence, 10000)

        # Already handled by a previous run.
        if indSentence <= lastReadSentenceInd:
            continue

        sentenceDict = dict()

        for word in util.splitSentence(sentence):

            # The dictionary keys double as the "already seen" set, so a
            # repeated word is looked up in the vocabulary only once.
            if word not in sentenceDict:
                # NOTE(review): assumes sentenceCount is never 0 - confirm
                # against the code that fills corpusVocabulary.
                sentenceDict[word] = 1 / corpusVocabulary[word].sentenceCount

        sentenceDictList.append(sentenceDict)

        lastReadSentenceInd = indSentence

    # The context manager closes the file even if pickling fails.
    with open(fileName, "wb") as filehandler:
        pickle.dump([sentenceDictList, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for building sentence list = ' + str(elapsed_time/60) + ' minutes ')

    return sentenceDictList
Пример #4
0
def findSentences(originalQuestions,
                  originalSentences,
                  numberWordsPerSentence,
                  questionMatrix,
                  choiceMatrix,
                  relatedMatrix,
                  numQuestions,
                  sentences,
                  maxSentences,
                  useConceptNet,
                  saveDir,
                  relatedWordsWeight=0.01):
    """Find the most relevant sentences for every question and save them.

    For question ``i`` two files are written: ``saveDir + str(i) + '.txt'``
    (human-readable question, choices and selected sentences) and
    ``saveDir + str(i) + '.pkl'`` (pickled ``[sentence texts, indices]``).
    A question whose ``.txt`` file already exists is skipped, which allows an
    interrupted run to be resumed.

    Args:
        originalQuestions: Sequence of JSON strings; each decodes to an
            object with ``['question']['stem']`` and
            ``['question']['choices']`` (items with ``'text'``/``'label'``).
        originalSentences: Sequence of sentence texts, indexed by the
            indices returned for the best sentences.
        numberWordsPerSentence: Forwarded to
            ``findSentencesForQuestion_SparseMatrices``.
        questionMatrix: Iterable of per-question vectors.
        choiceMatrix: Per-question choice vectors, indexed like
            ``questionMatrix``.
        relatedMatrix: Per-question related-word data, indexed like
            ``questionMatrix``.
        numQuestions: Total number of questions; used for progress output.
        sentences: Forwarded to ``findSentencesForQuestion_SparseMatrices``.
        maxSentences: Forwarded to
            ``findSentencesForQuestion_SparseMatrices``.
        useConceptNet: Forwarded to
            ``findSentencesForQuestion_SparseMatrices``.
        saveDir: String prefix of the output paths. It is concatenated
            as-is, so it must end with a path separator to denote a
            directory.
        relatedWordsWeight: Forwarded to
            ``findSentencesForQuestion_SparseMatrices``.

    Returns:
        None. Results are written to disk only.
    """
    start_time_all = time.time()
    start_time = start_time_all

    for indQ, questionVector in enumerate(questionMatrix):

        savePath = saveDir + str(indQ) + '.txt'
        picklePath = saveDir + str(indQ) + '.pkl'

        if not os.path.exists(savePath):

            bestSentences = findSentencesForQuestion_SparseMatrices(
                originalSentences, questionVector, choiceMatrix[indQ],
                relatedMatrix[indQ], sentences, numberWordsPerSentence,
                maxSentences, useConceptNet, relatedWordsWeight)

            # Gather everything that can fail BEFORE the text file is
            # created: an existing (possibly partial) .txt file would make
            # this question be skipped on the next run.
            questionJson = json.loads(originalQuestions[indQ])['question']
            stem = questionJson['stem']
            choiceLines = [
                c['label'] + ') ' + c['text'] + '\n'
                for c in questionJson['choices']
            ]
            bestIndices = bestSentences.indices
            bestSentenceTexts = [originalSentences[ind] for ind in bestIndices]

            # Context managers close the files even if a write fails.
            with open(savePath, 'w') as f:
                f.write('Question:\n')
                f.write(stem + '\n')
                f.writelines(choiceLines)
                f.write('\n')
                f.write('Relevant Sentences:' + '\n')
                f.writelines(text + '\n' for text in bestSentenceTexts)

            with open(picklePath, "wb") as filehandler:
                pickle.dump([bestSentenceTexts, bestIndices], filehandler)

            gc.collect()

        start_time = util.printRemainingTime(start_time, numQuestions, indQ, 1)

    # Measure from the very beginning: start_time is overwritten by the
    # progress helper on every iteration and is not the overall start.
    elapsed_time = time.time() - start_time_all

    print('elapsed time to find relevant sentences = ' + str(elapsed_time))
def buildQuestionMatrix(questions, numQuestions, fileName, lastReadQuestionInd,
                        rows, cols, occurrences, corpusVocabulary,
                        wordCountThreshold):
    """Build a sparse question-by-word occurrence matrix and pickle it.

    The words of a question are the words of its stem followed by the words
    of all its choices. Each distinct word that is in the vocabulary yields
    one entry with value 1. Words missing from the vocabulary, and words
    whose ``sentenceCount`` is below ``wordCountThreshold``, are additionally
    passed to ``util.updateSparseWithRelatedWords``.

    Args:
        questions: Iterable of JSON strings; each decodes to an object with
            ``['question']['stem']`` and ``['question']['choices']`` (items
            with a ``'text'`` field).
        numQuestions: Number of rows of the resulting matrix; also used for
            progress output.
        fileName: Path of the pickle file to write.
        lastReadQuestionInd: Questions with index <= this value are skipped.
        rows: List of row (question) indices; extended in place.
        cols: List of column (word) indices; extended in place.
        occurrences: List of entry values; extended in place.
        corpusVocabulary: Mapping word -> object exposing ``index`` and
            ``sentenceCount``.
        wordCountThreshold: Words contained in fewer sentences than this are
            treated as rare.

    Returns:
        ``csr_matrix`` of shape ``(numQuestions, len(corpusVocabulary))``.
    """
    start_time = time.time()
    start_time_print = start_time

    for indQ, q in enumerate(questions):

        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ, 100)

        # Already handled by a previous run.
        if indQ <= lastReadQuestionInd:
            continue

        question = json.loads(q)['question']

        # Stem words first, then the words of every answer choice.
        wordsInQuestion = util.splitSentence(question['stem'])
        for c in question['choices']:
            wordsInQuestion.extend(util.splitSentence(c['text']))

        wordsAlreadyRead = set()

        for word in wordsInQuestion:

            # Each word is handled at most once per question.
            if word in wordsAlreadyRead:
                continue
            wordsAlreadyRead.add(word)

            if word in corpusVocabulary:
                vocabularyEntry = corpusVocabulary[word]
                rows.append(indQ)
                cols.append(vocabularyEntry.index)
                occurrences.append(1)

                # Only rare words are expanded with related words.
                isRare = vocabularyEntry.sentenceCount < wordCountThreshold
            else:
                # Unknown word: it has no column of its own, so only its
                # related words can contribute.
                isRare = True

            if isRare:
                (rows, cols,
                 occurrences) = util.updateSparseWithRelatedWords(
                     word, corpusVocabulary, rows, cols, occurrences, indQ)

        lastReadQuestionInd = indQ

    questionMatrix = csr_matrix((occurrences, (rows, cols)),
                                shape=(numQuestions, len(corpusVocabulary)))

    # The context manager closes the file even if pickling fails.
    with open(fileName, "wb") as filehandler:
        pickle.dump([questionMatrix, lastReadQuestionInd], filehandler)

    elapsed_time = time.time() - start_time

    print('total time for building question matrix = ' +
          str(elapsed_time / 60) + ' minutes ')

    return questionMatrix
def buildQuestionMatrix2(testQuestionIndices, questionMatrix, choiceMatrix,
                         relatedMatrix, questions, numQuestions, fileName,
                         lastReadQuestionInd, corpusVocabulary,
                         inverseDictionary, wordCountThreshold):
    """Build per-question stem, choice and related-word vectors and pickle them.

    For every selected question the stem words and the words of all choices
    are vectorized separately with ``getSentenceVector``; related words are
    obtained from ``getRelatedWordsBothways``. The three results are appended
    to ``questionMatrix``, ``choiceMatrix`` and ``relatedMatrix``.

    Args:
        testQuestionIndices: Indices into ``questions`` selecting the
            questions to process. May be a sequence of indices or a dict
            whose values are the indices. When empty, all ``questions`` are
            used.
        questionMatrix: List receiving one stem vector per question;
            appended to in place.
        choiceMatrix: List receiving one choices vector per question;
            appended to in place.
        relatedMatrix: List receiving the related-word result per question;
            appended to in place.
        questions: Sequence of JSON strings; each decodes to an object with
            ``['question']['stem']`` and ``['question']['choices']`` (items
            with a ``'text'`` field).
        numQuestions: Total used for progress output only.
        fileName: Path of the pickle file to write.
        lastReadQuestionInd: Selected questions with position <= this value
            are skipped.
        corpusVocabulary: Vocabulary forwarded to the helper functions.
        inverseDictionary: Index -> word mapping forwarded to
            ``getRelatedWordsBothways``.
        wordCountThreshold: Forwarded to ``getRelatedWordsBothways``.

    Returns:
        Tuple ``(questionMatrix, choiceMatrix, relatedMatrix)`` (the same
        list objects that were passed in).
    """
    start_time = time.time()
    start_time_print = start_time

    # len() rather than truthiness so that array-like inputs work as well.
    if len(testQuestionIndices) > 0:
        # The previous code iterated over testQuestionIndices and then
        # indexed it again with each element, which is only right for a
        # dict (keys -> indices). For a plain sequence of indices that
        # double lookup picked the wrong question or raised IndexError.
        if isinstance(testQuestionIndices, dict):
            selectedIndices = testQuestionIndices.values()
        else:
            selectedIndices = testQuestionIndices
        usedQuestions = [questions[ind] for ind in selectedIndices]
    else:
        usedQuestions = questions

    for indQ, q in enumerate(usedQuestions):

        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ, 100)

        # Already handled by a previous run.
        if indQ <= lastReadQuestionInd:
            continue

        question = json.loads(q)['question']
        wordsInQuestion = util.splitSentence(question['stem'])

        # All answer choices are pooled into a single word list.
        wordsInChoices = []
        for c in question['choices']:
            wordsInChoices.extend(util.splitSentence(c['text']))

        questionVector = getSentenceVector(wordsInQuestion, corpusVocabulary)
        choicesVector = getSentenceVector(wordsInChoices, corpusVocabulary)

        relatedWords = getRelatedWordsBothways(wordsInQuestion, wordsInChoices,
                                               questionVector, choicesVector,
                                               wordCountThreshold,
                                               corpusVocabulary,
                                               inverseDictionary)

        questionMatrix.append(questionVector)
        choiceMatrix.append(choicesVector)
        relatedMatrix.append(relatedWords)

        lastReadQuestionInd = indQ

    # The context manager closes the file even if pickling fails.
    with open(fileName, "wb") as filehandler:
        pickle.dump(
            [questionMatrix, choiceMatrix, relatedMatrix, lastReadQuestionInd],
            filehandler)

    elapsed_time = time.time() - start_time

    print('total time for building question matrix = ' +
          str(elapsed_time / 60) + ' minutes ')

    return (questionMatrix, choiceMatrix, relatedMatrix)