예제 #1
0
class Summarizer:
    """Collect keyword data for a catchphrase and for each body sentence."""

    def __init__(self):
        self.parser = Parser()

    def summarize(self, text, title):
        """Return keyword data for the catchphrase, then for each sentence.

        Args:
            text: Iterable of body sentences. Each item is handed to
                ``self.parser.getKeywords`` on its own.
            title: The catchphrase, handed to ``getKeywords`` as one unit.

        Returns:
            A list of ``(keywords, word_count)`` tuples exactly as produced
            by ``getKeywords``: the first entry belongs to ``title``, the
            remaining entries follow the order of ``text``.
        """
        # Step 1: keywords of the catchphrase always come first.
        keywords, word_count = self.parser.getKeywords(title)
        result = [(keywords, word_count)]

        # Step 2: keywords of every body sentence, in input order.
        for sentence in text:
            keywords, word_count = self.parser.getKeywords(sentence)
            result.append((keywords, word_count))

        return result
예제 #2
0
def predictBayesianModel(sentenceList=None,
                         input_path="./model/train_model.npz",
                         word_index_file="./model/word_index.npz"):
    """Score each body sentence against the catchphrase using a saved model.

    Args:
        sentenceList: Sequence of strings. Item 0 is the catchphrase; every
            following item is a body sentence to score. When omitted, a
            small built-in three-sentence example is used.
        input_path: Path of the ``.npz`` model archive. Only ``arr_1``
            (theta) is read; ``arr_0`` (pi) is not needed for scoring.
        word_index_file: Path of the ``.npz`` vocabulary archive; ``arr_0``
            holds the catchword vocabulary and ``arr_1`` the body-word
            vocabulary.

    Returns:
        A list with one score per body sentence (``sentenceList[1:]``), in
        order. Empty when there are no body sentences.
    """
    if sentenceList is None:
        # Default is built per call so no list object is shared across calls.
        sentenceList = [
            'hello world occupation lease', 'machine learning board',
            'machine learning lease occupation'
        ]
    if len(sentenceList) == 0:
        return []

    # np.load on an .npz returns a lazily-read archive that keeps the file
    # open; read the arrays inside ``with`` so the handle is closed.
    with np.load(input_path) as model_file:
        theta_bar = model_file['arr_1']

    with np.load(word_index_file) as index_file:
        catchword_list = index_file['arr_0'].tolist()
        bodyword_list = index_file['arr_1'].tolist()

    # Map each vocabulary word to its first position (what list.index would
    # return) so that lookups do not rescan the whole vocabulary every time.
    catchword_position = {}
    for position, vocab_word in enumerate(catchword_list):
        catchword_position.setdefault(vocab_word, position)
    bodyword_position = {}
    for position, vocab_word in enumerate(bodyword_list):
        bodyword_position.setdefault(vocab_word, position)

    parser = Parser()

    # Positions of the catchphrase's keywords that the model knows about.
    catch_keywords, _ = parser.getKeywords(sentenceList[0])
    catchword_positionList = [
        catchword_position[elem['word']]
        for elem in catch_keywords
        if elem['word'] in catchword_position
    ]

    # For every known word of a body sentence, add up its theta entries over
    # all catchphrase words, weight by the word's count, and sum per sentence.
    scoreRecord = []
    for sentence in sentenceList[1:]:
        keywords, _ = parser.getKeywords(sentence)

        sentence_score = 0
        for elem in keywords:
            row = bodyword_position.get(elem['word'])
            if row is None:
                # Word unknown to the model contributes nothing.
                continue

            # The row must be the body word's vocabulary position; the
            # sentence's ordinal number (used here previously) is unrelated
            # to the trained matrix.
            # NOTE(review): assumes theta_bar is laid out as
            # [body word, catchword] -- confirm against the training code.
            word_score = sum(
                theta_bar[row][column] for column in catchword_positionList)
            sentence_score += word_score * elem['count']

        scoreRecord.append(sentence_score)

    return scoreRecord