# 示例#1 (Example #1)
def getWordCounts(sentences,numSentences,fileName,corpusVocabulary,inverseDictionary, wordCount,wordCountSentence,lastReadSentenceInd):
    """Accumulate corpus word statistics over *sentences*.

    For every word: total occurrence count (``wordCount`` /
    ``corpusVocabulary[word].count``) and number of distinct sentences it
    appears in (``wordCountSentence`` / ``corpusVocabulary[word].sentenceCount``).
    New words are assigned the next vocabulary index and registered in
    ``inverseDictionary`` (index -> word). Results are pickled to *fileName*
    together with ``lastReadSentenceInd`` so a later run can resume.

    Returns (wordCount, wordCountSentence, corpusVocabulary, lastReadSentenceInd).
    """
    start_time_all = time.time()
    start_time = start_time_all

    for indSentence, sentence in enumerate(sentences):

        start_time = util.printRemainingTime(start_time, numSentences, indSentence, 10000)

        # Resume support: skip sentences processed by a previous run.
        if indSentence <= lastReadSentenceInd:
            continue

        wordsInSentence = util.splitSentence(sentence)

        # Words already seen in THIS sentence, so per-sentence counts are
        # incremented at most once per sentence.
        wordsAlreadyRead = set()

        for word in wordsInSentence:

            if word in wordCount:
                wordCount[word] += 1
                corpusVocabulary[word].count += 1

                if word not in wordsAlreadyRead:
                    wordCountSentence[word] += 1
                    corpusVocabulary[word].sentenceCount += 1

            else:
                # First-ever occurrence: register in vocabulary and in the
                # index -> word inverse mapping.
                wordCount[word] = 1
                wordCountSentence[word] = 1
                inverseDictionary[len(corpusVocabulary)] = word
                corpusVocabulary[word] = util.Word(word,len(corpusVocabulary))

            # BUGFIX: previously only executed in the "word in wordCount"
            # branch, so a brand-new word repeated within its first sentence
            # had its sentence-level count incremented twice.
            wordsAlreadyRead.add(word)

        lastReadSentenceInd = indSentence

    # Checkpoint so a later run can resume from lastReadSentenceInd.
    with open(fileName, "wb") as filehandler:
        pickle.dump([wordCount, wordCountSentence, corpusVocabulary,inverseDictionary, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for word count = ' + str(elapsed_time/60) + ' minutes ')

    return(wordCount,wordCountSentence, corpusVocabulary, lastReadSentenceInd)
# 示例#2 (Example #2)
def buildSentenceMatrix(sentences,rows,cols,weights,lastReadSentenceInd, corpusVocabulary, fileName):
    """Build a sparse (sentence x vocabulary) matrix from *sentences*.

    Each distinct word in a sentence contributes one entry weighted by
    1 / sentenceCount (inverse document frequency within the corpus).
    ``rows``/``cols``/``weights`` are passed in (and mutated) so an
    interrupted run can be resumed from ``lastReadSentenceInd``.

    The resulting matrix, per-sentence distinct-word counts, and the resume
    index are pickled to *fileName*.

    Returns (sentenceMatrix, numberWordsPerSentence).
    """
    start_time_all = time.time()
    start_time = start_time_all

    numSentences = len(sentences)

    numberWordsPerSentence = np.zeros(numSentences)

    for indSentence, sentence in enumerate(sentences):

        start_time = util.printRemainingTime(start_time, numSentences, indSentence, 10000)

        # Resume support: skip sentences processed by a previous run.
        if indSentence <= lastReadSentenceInd:
            continue

        wordsAlreadyRead = set()

        wordsInSentence = util.splitSentence(sentence)

        for word in wordsInSentence:

            # Each distinct word contributes exactly one matrix entry.
            if word not in wordsAlreadyRead:

                indWord = corpusVocabulary[word].index
                rows.append(indSentence)
                cols.append(indWord)
                weights.append(1 / corpusVocabulary[word].sentenceCount)
                numberWordsPerSentence[indSentence] += 1

            wordsAlreadyRead.add(word)

        lastReadSentenceInd = indSentence

    sentenceMatrix = csr_matrix((weights, (rows, cols)), shape=(numSentences, len(corpusVocabulary)))

    # Checkpoint results (with resume index) to disk.
    with open(fileName, "wb") as filehandler:
        pickle.dump([sentenceMatrix, numberWordsPerSentence, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for building sentence matrix = ' + str(elapsed_time/60) + ' minutes ')

    return (sentenceMatrix,numberWordsPerSentence)
# 示例#3 (Example #3)
def buildSentenceDictionaries(sentences,sentenceDictList,lastReadSentenceInd, corpusVocabulary, fileName):
    """Build one {word: 1/sentenceCount} dict per sentence.

    Dict-based counterpart to ``buildSentenceMatrix``: each sentence becomes
    a mapping from its distinct words to their inverse sentence frequency.
    ``sentenceDictList`` is passed in (and appended to) so an interrupted run
    can resume from ``lastReadSentenceInd``. The list and resume index are
    pickled to *fileName*.

    Returns sentenceDictList.
    """
    start_time_all = time.time()
    start_time = start_time_all

    numSentences = len(sentences)

    for indSentence, sentence in enumerate(sentences):

        start_time = util.printRemainingTime(start_time, numSentences, indSentence, 10000)

        # Resume support: skip sentences processed by a previous run.
        if indSentence <= lastReadSentenceInd:
            continue

        sentenceDict = dict()
        wordsAlreadyRead = set()

        wordsInSentence = util.splitSentence(sentence)

        for word in wordsInSentence:

            # Duplicate words within a sentence are recorded once.
            if word not in wordsAlreadyRead:

                sentenceDict[word] = 1 / corpusVocabulary[word].sentenceCount

            wordsAlreadyRead.add(word)

        sentenceDictList.append(sentenceDict)

        lastReadSentenceInd = indSentence

    # Checkpoint results (with resume index) to disk.
    with open(fileName, "wb") as filehandler:
        pickle.dump([sentenceDictList, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for building sentence list = ' + str(elapsed_time/60) + ' minutes ')

    return sentenceDictList
def buildQuestionMatrix(questions, numQuestions, fileName, lastReadQuestionInd,
                        rows, cols, occurrences, corpusVocabulary,
                        wordCountThreshold):
    """Build a sparse (question x vocabulary) occurrence matrix.

    Each question (a JSON line with a 'question' object holding 'stem' and
    'choices') is tokenized together with its answer choices. In-vocabulary
    words add a 1 at their vocabulary index; out-of-vocabulary words — and
    rare words below *wordCountThreshold* — are expanded through ConceptNet
    related words via ``util.updateSparseWithRelatedWords``.

    ``rows``/``cols``/``occurrences`` are passed in (and mutated) so an
    interrupted run can resume from ``lastReadQuestionInd``. The matrix and
    resume index are pickled to *fileName*.

    Returns questionMatrix.
    """
    start_time = time.time()
    start_time_print = start_time

    for indQ, q in enumerate(questions):

        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ, 100)

        # Resume support: skip questions processed by a previous run.
        if indQ <= lastReadQuestionInd:
            continue

        j = json.loads(q)
        question = j['question']
        stem = question['stem']
        wordsInQuestion = util.splitSentence(stem)

        # Fold the words of every answer choice into the question's word list.
        choices = question['choices']
        for c in choices:
            choiceText = c['text']
            wordsInChoice = util.splitSentence(choiceText)
            wordsInQuestion.extend(wordsInChoice)

        wordsAlreadyRead = set()

        for word in wordsInQuestion:

            if word not in wordsAlreadyRead:

                if word not in corpusVocabulary:  # word not in the vocabulary, so it's a rare word: look for similar words

                    (rows, cols,
                     occurrences) = util.updateSparseWithRelatedWords(
                         word, corpusVocabulary, rows, cols, occurrences, indQ)

                else:

                    indWord = corpusVocabulary[word].index
                    rows.append(indQ)
                    cols.append(indWord)
                    occurrences.append(1)

                    if corpusVocabulary[
                            word].sentenceCount < wordCountThreshold:  # only use ConceptNet for rare words

                        (rows, cols,
                         occurrences) = util.updateSparseWithRelatedWords(
                             word, corpusVocabulary, rows, cols, occurrences,
                             indQ)

            wordsAlreadyRead.add(word)

        lastReadQuestionInd = indQ

    questionMatrix = csr_matrix((occurrences, (rows, cols)),
                                shape=(numQuestions, len(corpusVocabulary)))

    # Checkpoint results (with resume index) to disk.
    with open(fileName, "wb") as filehandler:
        pickle.dump([questionMatrix, lastReadQuestionInd], filehandler)

    elapsed_time = time.time() - start_time

    print('total time for building question matrix = ' +
          str(elapsed_time / 60) + ' minutes ')

    return questionMatrix
def buildQuestionDictionaries(f, fileName, lastReadQuestionInd, questionList,
                              corpusVocabulary, wordCountThreshold):
    """Build one {word: relatedWords} dict per question read from file *f*.

    Dict-based counterpart to ``buildQuestionMatrix``: for each question
    (JSON line with 'stem' and 'choices'), every distinct word maps to its
    ConceptNet related words when it is out-of-vocabulary or rarer than
    *wordCountThreshold*, and to an empty list otherwise.

    ``questionList`` is passed in (and appended to) so an interrupted run can
    resume from ``lastReadQuestionInd``. The list and resume index are
    pickled to *fileName*.

    Returns questionList.
    """
    printEvery = 100

    questions = f.readlines()
    numQuestions = len(questions)

    start_time = time.time()
    start_time_print = start_time

    for indQ, q in enumerate(questions):

        # Consistency fix: use the shared progress helper instead of a
        # hand-rolled duplicate of the same elapsed/remaining-time logic.
        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ,
                                                   printEvery)

        # Resume support: skip questions processed by a previous run.
        if indQ <= lastReadQuestionInd:
            continue

        j = json.loads(q)
        question = j['question']
        stem = question['stem']
        wordsInQuestion = util.splitSentence(stem)

        # Fold the words of every answer choice into the question's word list.
        choices = question['choices']
        for c in choices:
            choiceText = c['text']
            wordsInChoice = util.splitSentence(choiceText)
            wordsInQuestion.extend(wordsInChoice)

        wordsAlreadyRead = set()

        wordVec = dict()

        for word in wordsInQuestion:

            if word not in wordsAlreadyRead:

                if word not in corpusVocabulary:

                    # Out-of-vocabulary: expand through ConceptNet.
                    relatedWords = util.getConceptNetRelatedWords(word)
                    wordVec[word] = relatedWords

                else:
                    if corpusVocabulary[
                            word].sentenceCount < wordCountThreshold:

                        # Rare word: also expand through ConceptNet.
                        relatedWords = util.getConceptNetRelatedWords(word)
                        wordVec[word] = relatedWords

                    else:
                        wordVec[word] = []

            wordsAlreadyRead.add(word)

        questionList.append(wordVec)

        lastReadQuestionInd = indQ

    # Checkpoint results (with resume index) to disk.
    with open(fileName, "wb") as filehandler:
        pickle.dump([questionList, lastReadQuestionInd], filehandler)

    elapsed_time = time.time() - start_time

    # Fixed log message: this function builds question dictionaries, not the matrix.
    print('total time for building question dictionaries = ' +
          str(elapsed_time / 60) + ' minutes ')

    return questionList
def buildQuestionMatrix2(testQuestionIndices, questionMatrix, choiceMatrix,
                         relatedMatrix, questions, numQuestions, fileName,
                         lastReadQuestionInd, corpusVocabulary,
                         inverseDictionary, wordCountThreshold):
    """Build separate question / choice / related-word vectors per question.

    When *testQuestionIndices* is non-empty, only those questions are
    processed; otherwise all of *questions*. For each question the stem and
    the concatenated choice texts are vectorized with ``getSentenceVector``,
    and ConceptNet-related words are collected with
    ``getRelatedWordsBothways``. The three result lists (passed in and
    appended to for resume support) plus ``lastReadQuestionInd`` are pickled
    to *fileName*.

    Returns (questionMatrix, choiceMatrix, relatedMatrix).
    """
    start_time = time.time()
    start_time_print = start_time

    usedQuestions = []

    if len(testQuestionIndices) > 0:
        # BUGFIX: the loop variable already IS an index into `questions`;
        # the old code double-indexed (`questions[testQuestionIndices[q]]`)
        # and selected the wrong questions (or raised IndexError).
        for q in testQuestionIndices:
            usedQuestions.append(questions[q])
    else:
        usedQuestions = questions

    for indQ, q in enumerate(usedQuestions):

        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ, 100)

        # Resume support: skip questions processed by a previous run.
        if indQ <= lastReadQuestionInd:
            continue

        j = json.loads(q)
        question = j['question']
        stem = question['stem']
        wordsInQuestion = util.splitSentence(stem)

        choices = question['choices']

        # Unlike buildQuestionMatrix, choice words are kept separate from
        # the question words so they get their own vector.
        wordsInChoices = []

        for c in choices:
            choiceText = c['text']
            wordsInCurrChoice = util.splitSentence(choiceText)
            wordsInChoices.extend(wordsInCurrChoice)

        questionVector = getSentenceVector(wordsInQuestion, corpusVocabulary)
        choicesVector = getSentenceVector(wordsInChoices, corpusVocabulary)

        relatedWords = getRelatedWordsBothways(wordsInQuestion, wordsInChoices,
                                               questionVector, choicesVector,
                                               wordCountThreshold,
                                               corpusVocabulary,
                                               inverseDictionary)

        questionMatrix.append(questionVector)
        choiceMatrix.append(choicesVector)
        relatedMatrix.append(relatedWords)

        lastReadQuestionInd = indQ

    # Checkpoint results (with resume index) to disk.
    with open(fileName, "wb") as filehandler:
        pickle.dump(
            [questionMatrix, choiceMatrix, relatedMatrix, lastReadQuestionInd],
            filehandler)

    elapsed_time = time.time() - start_time

    print('total time for building question matrix = ' +
          str(elapsed_time / 60) + ' minutes ')

    return (questionMatrix, choiceMatrix, relatedMatrix)