class Summarizer:
    """Collect parser keyword statistics for a catchphrase and its body sentences."""

    def __init__(self):
        # Parser comes from elsewhere in the project; only its
        # getKeywords() method is used by this class.
        self.parser = Parser()

    def summarize(self, text, title):
        """Run the parser over the catchphrase and over every body sentence.

        Args:
            text: sequence of body sentences; each one is passed to
                ``self.parser.getKeywords`` on its own.
            title: the catchphrase (a single sentence).

        Returns:
            A list of ``(keywords, word_count)`` tuples exactly as produced
            by ``self.parser.getKeywords``: the first entry belongs to
            ``title``, followed by one entry per element of ``text`` in
            order.
        """
        # Step 1: keywords of the catchphrase always come first.
        catchphrase_keywords, catchphrase_word_count = self.parser.getKeywords(title)
        result = [(catchphrase_keywords, catchphrase_word_count)]

        # Step 2: keywords of each body sentence, in input order.
        for sentence in text:
            sentence_keywords, sentence_word_count = self.parser.getKeywords(sentence)
            result.append((sentence_keywords, sentence_word_count))

        return result
def _first_index_map(words):
    """Map each word to the position of its first occurrence in *words*."""
    positions = {}
    for position, word in enumerate(words):
        # setdefault keeps the earliest position, matching list.index().
        positions.setdefault(word, position)
    return positions


def predictBayesianModel(sentenceList=(
        'hello world occupation lease',
        'machine learning board',
        'machine learning lease occupation',
        ), input_path="./model/train_model.npz",
        word_index_file="./model/word_index.npz"):
    """Score every body sentence against the catchphrase with a trained model.

    The first element of ``sentenceList`` is the catchphrase; every following
    element is a body sentence. For each known word of a body sentence the
    ``theta_bar`` entries of all known catchphrase words are summed, the sum
    is multiplied by the word's count in the sentence, and the per-word
    results are added up to give the sentence score.

    Args:
        sentenceList: sequence of sentences, catchphrase first.
        input_path: ``.npz`` file whose ``arr_1`` entry is ``theta_bar``.
        word_index_file: ``.npz`` file whose ``arr_0`` entry is the catchword
            vocabulary and whose ``arr_1`` entry is the body-word vocabulary.

    Returns:
        A list with one score per body sentence (``sentenceList[1:]``), in
        order. Empty when there are no body sentences.

    NOTE(review): ``theta_bar`` is assumed to be indexed as
    ``theta_bar[body_word_position][catchword_position]`` and both
    vocabularies are assumed to be 1-D arrays of words -- confirm against
    the training code. The previous implementation indexed the rows with the
    sentence loop counter instead of the body-word position, so every word of
    a sentence read the same, unrelated row.
    """
    # Nothing to score, and no catchphrase to read, without any sentence.
    if len(sentenceList) == 0:
        return []

    # NpzFile keeps the archive open until closed; the arrays pulled out
    # inside the `with` blocks stay valid afterwards.
    with np.load(input_path) as model:
        theta_bar = model['arr_1']
    with np.load(word_index_file) as word_index:
        catchword_list = word_index['arr_0'].tolist()
        bodyword_list = word_index['arr_1'].tolist()

    # One dictionary lookup per word instead of two linear list scans.
    catchword_position = _first_index_map(catchword_list)
    bodyword_position = _first_index_map(bodyword_list)

    parser = Parser()

    # Columns of theta_bar selected by the catchphrase words that the model
    # knows; unknown catchphrase words are ignored.
    catch_keywords, _ = parser.getKeywords(sentenceList[0])
    catchword_positionList = [
        catchword_position[elem['word']]
        for elem in catch_keywords
        if elem['word'] in catchword_position
    ]

    scoreRecord = []
    for sentence in sentenceList[1:]:
        keywords, _ = parser.getKeywords(sentence)
        sentence_score = 0
        for elem in keywords:
            row = bodyword_position.get(elem['word'])
            if row is None:
                # Words outside the body vocabulary contribute nothing.
                continue
            row_scores = theta_bar[row]
            word_score = sum(row_scores[col] for col in catchword_positionList)
            sentence_score += word_score * elem['count']
        scoreRecord.append(sentence_score)

    return scoreRecord