def ExtractKeywords(self):
    tk_segm = Help.GenSpeaksWordsTag(self.segm.cleanSentences, self.segm.cleanSpeakers, self.segm.cleanSentTags)  #generate list of tokenized words with relative speaker
    tk_segm_list = Help.Expand(tk_segm)  #words, speakers, tags
    num_words = len(tk_segm_list[0])
    num_segm = len(self.segm.cleanSentences)
    idf_w = np.zeros(num_words)
    ext_score = np.zeros(num_words)
    prob_s_w = np.zeros((num_words, num_segm))
    num_s_w = np.zeros((num_words, num_segm))
    nent = np.zeros(num_words)
    den_w = np.zeros(num_words)
    #tfidf is created considering the words ordered as in spacy_loc_single_w
    for i in range(num_words):
        idf_w[i] = self.freqVec[self.prep.singleWords.index(tk_segm_list[0][i])]  #check this later
    for i in range(num_words):  #i iterates over words
        for j in range(num_segm):  #j iterates over segments
            freq = Help.WordSegmFrequency(tk_segm[2][j], tk_segm_list[0][i])  #times word i is uttered in segment j
            num_s_w[i][j] = freq
            den_w[i] += freq  #sum over all segments
    for i in range(num_words):
        for j in range(num_segm):
            prob_s_w[i][j] = num_s_w[i][j] / den_w[i]  #probability of being in segment j given word i
    for i in range(num_words):
        for j in range(num_segm):
            if tk_segm_list[0][i] in tk_segm[2][j]:
                nent[i] -= prob_s_w[i][j] * np.log(prob_s_w[i][j])  #negative entropy of each word
    for i in range(num_words):
        ext_score[i] = self.kIdf * idf_w[i] + self.kNent * nent[i]  #sum global and local scores
    loc_single_words, scores = Help.RemoveMultipleKeywords(tk_segm_list[0], ext_score)  #remove duplicate keywords
    newScores = self.PosAndNer(loc_single_words, scores, tk_segm_list)  #filter according to POS and adapt weight according to NER
    self.Revert(loc_single_words, newScores)  #revert scores and extract top Tm%
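# Hedged illustration, not part of the pipeline: how the extraction score above combines a
# global idf term with a local "negative entropy" term over segment counts. The counts, the
# idf value and the k_idf / k_nent weights below are invented purely for demonstration.
import numpy as np

def _demo_keyword_score(counts_per_segment, idf, k_idf=1.0, k_nent=1.0):
    """counts_per_segment: times a word is uttered in each segment (toy data, nonzero total)."""
    counts = np.asarray(counts_per_segment, dtype=float)
    probs = counts / counts.sum()                        #p(segment | word)
    nent = -sum(p * np.log(p) for p in probs if p > 0)   #entropy over the segments containing the word
    return k_idf * idf + k_nent * nent

#example: a word appearing 3, 0 and 1 times in three segments, with idf = 2.1
#print(_demo_keyword_score([3, 0, 1], 2.1))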
def CopmputeLTS(self, wFreq):
    #topicModel['Docs'] is a list of length num_documents where each element is a list of (topic_id, topic_prob) pairs
    #wFreq is a list of length num_documents giving the frequency of the term in each doc
    LTS = np.zeros(self.topicModel['NumTopics'], dtype=float)
    num = np.zeros(self.topicModel['NumTopics'], dtype=float)
    den = np.zeros(self.topicModel['NumTopics'], dtype=float)
    for y in range(len(self.topicModel['Docs'])):  #iterate over documents
        if wFreq[y]:
            for x in range(len(self.topicModel['Docs'][y])):
                idx = self.topicModel['Docs'][y][x][0] - 1  #topic ids are 1-based
                prob = self.topicModel['Docs'][y][x][1]
                num[idx] += wFreq[y] * prob
                den[idx] += wFreq[y] * (1 - prob)
    for x in range(len(LTS)):
        LTS[x] = Help.SafeDiv(num[x], den[x])  #first sum over docs, then divide
    return np.sum(LTS)  #sum over all topics: the LTS of a single word with the given term frequencies
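# Hedged, standalone sketch of the LTS computation above with toy data, to make the data
# layout explicit: docs[d] is the per-document list of (topic_id, topic_prob) pairs and
# w_freq[d] is the term frequency in document d. All numbers are invented; the zero-guard
# mirrors what Help.SafeDiv is assumed to provide.
import numpy as np

def _demo_lts(docs, w_freq, num_topics):
    num = np.zeros(num_topics)
    den = np.zeros(num_topics)
    for d, topics in enumerate(docs):           #iterate over documents
        if w_freq[d]:
            for topic_id, prob in topics:       #1-based topic ids, as in the model above
                num[topic_id - 1] += w_freq[d] * prob
                den[topic_id - 1] += w_freq[d] * (1 - prob)
    with np.errstate(divide='ignore', invalid='ignore'):
        lts = np.where(den > 0, num / den, 0.0)  #assumed SafeDiv behaviour: 0 when the denominator is 0
    return lts.sum()

#print(_demo_lts([[(1, 0.7), (2, 0.3)], [(2, 0.9)]], w_freq=[2, 1], num_topics=2))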
def Segmentation(self):
    #get boundaries, CB and number of segments
    self.GetBoundaries()
    #segment according to the previous results
    newSegments = self.Segmenter()  #[sentences, speakers, original sentences, tags], one list per segment
    for x in range(len(newSegments[0])):
        locSentences = newSegments[0][x]
        locSpeakers = newSegments[1][x]
        vec = [locSentences, locSpeakers]
        self.speakerDistr.append(self.ScoreSegment(vec)[1])
    for x in range(len(newSegments[0])):
        self.cleanSentences.append(Help.RemoveMinorSpeaker(newSegments[0][x], newSegments[1][x], self.speakerDistr[x]))
        self.cleanSpeakers.append(Help.RemoveMinorSpeaker(newSegments[1][x], newSegments[1][x], self.speakerDistr[x]))
        self.cleanSentOrig.append(Help.RemoveMinorSpeaker(newSegments[2][x], newSegments[1][x], self.speakerDistr[x]))
        self.cleanSentTags.append(Help.RemoveMinorSpeaker(newSegments[3][x], newSegments[1][x], self.speakerDistr[x]))
def Score(self, lenText):
    score = np.zeros(lenText)
    smooth_score = np.zeros(lenText)
    #iterate over all windows (one per sentence); each can be a potential segment boundary,
    #except the first and last sentences
    for i in range(1, lenText - 1):
        maxim_i = np.maximum(0, i - self.winLength)
        maxim_f = np.maximum(1, i)
        minim = np.maximum(np.minimum(lenText, i + self.winLength), maxim_f)
        win_left_idx = []
        win_right_idx = []
        for x in range(maxim_i, maxim_f):  #min and max to avoid index problems
            win_left_idx.append(x)
        for x in range(i, minim):  #min and max to avoid index problems
            win_right_idx.append(x)
        if i > 1:
            win_l = [self.prep.speakers[win_left_idx[0]:win_left_idx[-1]],
                     self.prep.sentLemma[win_left_idx[0]:win_left_idx[-1]]]  #create the left window
        else:  #when i = 1 the left window is only the first element
            win_l = [[self.prep.speakers[0]], [self.prep.sentLemma[0]]]
        if i < lenText - 2:
            win_r = [self.prep.speakers[win_right_idx[0]:win_right_idx[-1] + 1],
                     self.prep.sentLemma[win_right_idx[0]:win_right_idx[-1] + 1]]  #create the right window
        else:
            win_r = [[self.prep.speakers[-1]], [self.prep.sentLemma[-1]]]
        WC_l = self.WC(win_l)
        WC_r = self.WC(win_r)
        dist_wc = Help.Dist(WC_l, WC_r)
        WI_l = self.WI(win_l[1], win_l[0])
        WI_r = self.WI(win_r[1], win_r[0])
        dist_wi = Help.Dist(WI_l, WI_r)
        score[i] = dist_wc + dist_wi
    for i in range(1, lenText - 1):
        temp_score = 0  #local score for smoothing
        low, up = self.SafeSmooth(i, len(score))  #check not to go out of bounds when smoothing
        for j in range(low, up):
            temp_score += score[j]
        smooth_score[i] = temp_score / (1 + self.smoothParam)
    return score, smooth_score
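# Score compares the left and right windows through Help.Dist, whose definition is not shown
# in this listing. A minimal sketch under the assumption that it is a plain Euclidean distance
# between two equal-length vectors; the real helper may differ.
import numpy as np

def _dist_sketch(v1, v2):
    """Hypothetical stand-in for Help.Dist: Euclidean distance between two vectors."""
    return float(np.linalg.norm(np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)))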
def CreateLus(self):
    Ns = self.prep.numSpeakers
    Lus = np.zeros((len(self.segm.cleanSentences[self.íter]), Ns))  #matrix [num_utterances X num_speakers]
    for i in range(len(self.segm.cleanSentences[self.íter])):
        v1 = Help.CreateSentenceVector(self.segm.cleanSentences[self.íter][i], self.freqVec, self.prep.singleWords)
        for j in range(Ns):
            if j + 1 in self.segm.cleanSpeakers[self.íter]:
                v2 = Help.CreateSpeakerVector(j, self.segm.cleanSentences[self.íter], self.segm.cleanSpeakers[self.íter], self.speakVec)
                if Help.NotValidCos(v1, v2):
                    v1, v2 = Help.ReshapeVec(v1, v2)
                cos_sim = 1 - sp.distance.cosine(v1, v2)  #cosine similarity between utterance i and speaker j
                if math.isnan(cos_sim):
                    Lus[i][j] = 0.
                else:
                    Lus[i][j] = cos_sim
    return Lus
def ScoreSegment(self, segm):
    sum_c = []
    c_idx = []
    cat = Help.GenCat(self.Ns)  #generate the categories vector
    cat = cat[1:]  #[0,0,0,0,0] not allowed, there is always at least one speaker
    #sum_c: score per category, find the minimum
    #c_idx: index of each category, to find the category corresponding to the minimum score
    for c in cat:  #c is a single boolean category vector used for calling DstrId
        sum_dist = Help.Dist(Help.Dstr(segm, self.Ns), Help.DstrId(self.Ns, c))  #score segments
        sum_c.append(sum_dist)
        c_idx.append(c)
    min_score = np.min(sum_c)
    min_cat = c_idx[np.argmin(sum_c)]  #in the list of categories, take the one whose index minimizes the score
    return min_score, min_cat
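# ScoreSegment relies on Help.GenCat to enumerate every boolean "which speakers are active"
# vector of length Ns (the all-zero vector is stripped right after the call). A minimal sketch
# of such an enumerator, assuming a plain itertools.product ordering with the all-zero vector first.
from itertools import product

def _gen_cat_sketch(ns):
    """Hypothetical stand-in for Help.GenCat: all 2**ns boolean speaker-activation vectors."""
    return [list(bits) for bits in product([0, 1], repeat=ns)]

#_gen_cat_sketch(2) -> [[0, 0], [0, 1], [1, 0], [1, 1]]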
def WI(self, w, s):
    WI_vec = np.zeros(self.Ns)
    if w:
        suidf_win = Help.CreateSentenceVector(w, self.freq, self.prep.singleWords)
        den = 0  #shared denominator, not reset per speaker
        num = []  #one numerator per speaker
        for j in range(self.Ns):
            num_t = 0  #numerator for the given speaker
            for k in range(len(s)):
                if s[k] == j + 1:
                    num_t += suidf_win[k]
                    den += suidf_win[k]
            num.append(num_t)
        for j in range(self.Ns):
            WI_vec[j] = Help.SafeDiv(num[j], den)
    return WI_vec
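# WI, WC and several other routines lean on Help.SafeDiv to guard against division by zero.
# A minimal sketch of the assumed behaviour (return 0 when the denominator is 0); the real
# helper may handle the edge case differently.
def _safe_div_sketch(num, den):
    """Hypothetical stand-in for Help.SafeDiv."""
    return num / den if den else 0.0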
def Suidf(self):
    #computes suidf for all the words in the meeting (not in the dataset; not sure which is better)
    surp_w_s = np.zeros((self.Ns, len(self.meetingWords)))  #matrix [num_speakers X num_words]
    surp_w = np.zeros(len(self.meetingWords))
    suidf_v = np.zeros(len(self.meetingWords))
    for c in range(len(self.meetingWords)):  #outer loop, iterate over words
        w_ref = self.meetingWords[c]
        for j in range(self.Ns):
            num = 0
            den = 0
            for k in range(self.Ns):
                if j != k:
                    num += self.meetingHisto[k + 1][c]  #number of times speaker k+1 utters w_ref
                    den += np.sum(self.meetingHisto[k + 1][:])  #number of words uttered by speaker k+1
            surp_w_s[j][c] = -np.log(Help.SafeDiv(num, den))
            if surp_w_s[j][c] == np.inf:
                surp_w_s[j][c] = self.high * self.Ns
    for f in range(len(self.meetingWords)):  #f is the index of each single word
        word = self.meetingWords[f]
        summ = 0
        for c in range(self.Ns):
            summ += surp_w_s[c][f]
        surp_w[f] = Help.SafeDiv(summ, self.Ns)
        suidf_v[f] = surp_w[f] * self.HowMany(f) * np.sqrt(self.idf[self.meetingWords.index(word)]) / self.Ns  #HowMany: number of speakers who uttered the word
    return suidf_v
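# Hedged worked example of the surprisal term above, following the comments in Suidf
# (a word's surprisal for speaker j is judged from the *other* speakers' counts).
# Counts are invented: the word appears 3 times in speaker 1's 10 words and once in
# speaker 2's 20 words.
import numpy as np

_surp_s1 = -np.log(1 / 20)   #≈ 3.00: surprisal for speaker 1, based on speaker 2's usage
_surp_s2 = -np.log(3 / 10)   #≈ 1.20: surprisal for speaker 2, based on speaker 1's usage
#suidf(w) would then be mean(_surp_s1, _surp_s2) * HowMany(w) * sqrt(idf(w)) / Ns,
#matching the final loop of Suidf above.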
def CreateLss(self):
    #speakVec is the speaker tfidf
    Ns = self.prep.numSpeakers
    Lss = np.zeros((Ns, Ns))  #matrix [num_speakers X num_speakers]
    for i in range(Ns):
        if i + 1 in self.segm.cleanSpeakers[self.íter]:
            v1 = Help.CreateSpeakerVector(i, self.segm.cleanSentences[self.íter], self.segm.cleanSpeakers[self.íter], self.speakVec)
            for j in range(Ns):
                if j + 1 in self.segm.cleanSpeakers[self.íter]:
                    v2 = Help.CreateSpeakerVector(j, self.segm.cleanSentences[self.íter], self.segm.cleanSpeakers[self.íter], self.speakVec)
                    if Help.NotValidCos(v1, v2):
                        v1, v2 = Help.ReshapeVec(v1, v2)
                    cos_sim = 1 - sp.distance.cosine(v1, v2)  #cosine similarity between speakers i and j
                    if math.isnan(cos_sim):
                        Lss[i][j] = 0.
                    else:
                        Lss[i][j] = cos_sim
    return Lss
def Summarize(self, x=1):
    for meeting in self.transcripts:
        print('\n\n\n\n\nMeeting ' + str(x) + ' ...')
        #preprocessing
        prep = Preprocessing()
        prep.Preprocess(meeting)
        print("Preprocessing completed ...")
        #frequency vectors
        freq = FrequencyMeasures(prep.meetingHisto, prep.singleWords, self.histograms['ListWordsVector'], prep.numSpeakers)
        freq.GetAll()
        print("Frequencies computed ...")
        #functional segmentation
        segm = FuncSegm(prep, freq.suidf, prep.numSpeakers)
        segm.Segmentation()
        print("Segmentation completed ...")
        #keywords
        keyw = Extractor(prep, segm, freq.idf)
        keyw.ExtractKeywords()
        print("Keywords extracted ...")
        #check if monologue or dialogue and apply the specific method
        localSummary = []
        i = 1
        for dstr in segm.speakerDistr:
            if len(segm.cleanSentences[i - 1]) > 1:
                if np.sum(dstr) == 1:
                    mon = Monologue(segm, keyw, i - 1)
                    mon.Summarize()
                    localSummary.append(mon.summary)
                    print("Monologue summarized ...")
                else:
                    dial = Dialogue(prep, segm, self.histograms, self.topicModels, freq.suidf, freq.tfidfSpeak, i - 1)
                    dial.Summarize()
                    localSummary.append(dial.summary)
                    print("Dialogue summarized ...")
            elif len(segm.cleanSentences[i - 1]) == 1:
                localSummary.append(str(segm.cleanSentOrig[i - 1]))
            #empty segments are skipped
            i += 1
        #join, save and append the final summary
        txtSummary = ' '.join(localSummary)
        Help.SaveFileTxt(txtSummary, 'summary_' + str(x), self.resultPath)
        x += 1
        self.summaries.append(txtSummary)
        print("Summary stored ...")
    print("Dataset summarized!!!")
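# Hedged usage sketch for the pipeline above. The Summarizer class name and constructor
# arguments are assumptions inferred from the attributes this method reads (transcripts,
# histograms, topicModels, resultPath, summaries); adapt them to the real API.
#summarizer = Summarizer(transcripts, histograms, topicModels, resultPath='results/')
#summarizer.Summarize()
#print(len(summarizer.summaries))  #one joined summary text per meeting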
def WC(self, win):
    WC_vec = np.zeros(self.Ns)  #no need for a matrix, store the result in the score vector
    length = 0
    for s in win[1]:
        length += len(s)  #total number of words in the window
    for j in range(self.Ns):  #WC_j for the window around i
        count = 0
        for x, s in enumerate(win[0]):
            if s == j + 1:
                count += len(win[1][x])  #number of words uttered by speaker j in the window
        WC_vec[j] = Help.SafeDiv(count, length)  #match with dimensionality
    return WC_vec
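# Hedged toy example of the WC vector above, with invented data: a 2-speaker window where
# speaker 1 utters sentences of 3 and 2 words and speaker 2 one sentence of 5 words gives
# each speaker half of the 10 window words, i.e. WC = [0.5, 0.5].
#   win = [[1, 2, 1], [['a', 'b', 'c'], ['d', 'e', 'f', 'g', 'h'], ['i', 'j']]]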
def CreateLuu(self, top=False, lex=True):
    #top=True computes topical similarity, otherwise lex=True computes lexical similarity
    Luu = np.zeros((len(self.segm.cleanSentences[self.íter]), len(self.segm.cleanSentences[self.íter])))  #matrix [num_utterances X num_utterances]
    if (top and lex) or ((not top) and (not lex)):  #invalid parameters: Luu can be based on only one kind of similarity
        top = False  #reset default parameters
        lex = True
    if top:  #topic similarity
        prob_top_sent = np.zeros((len(self.topicModel['Terms']), len(self.segm.cleanSentences[self.íter])))
        for x in range(len(self.topicModel['Terms'])):
            for y in range(len(self.segm.cleanSentences[self.íter])):
                num = 0
                den = 0
                for w in self.segm.cleanSentences[self.íter][y]:
                    try:
                        tk_id = self.topicModel['Dictionary'].token2id[w]
                        num += Help.FreqWordInSentence(w, self.segm.cleanSentences[self.íter][y]) * self.topicModel['Terms'][x][tk_id]
                    except KeyError:  #w is not in the topic model dictionary
                        num += Help.FreqWordInSentence(w, self.segm.cleanSentences[self.íter][y]) * self.small
                    den += Help.FreqWordInSentence(w, self.segm.cleanSentences[self.íter][y])
                prob_top_sent[x][y] = Help.SafeDiv(num, den)
        for x in range(len(self.segm.cleanSentences[self.íter])):
            for y in range(len(self.segm.cleanSentences[self.íter])):
                LTS_sum = 0
                for w in self.segm.cleanSentences[self.íter][y]:
                    wFreq = self.ComputeTermFrequency(w)  #vector with the frequency of the word in each doc
                    if np.sum(wFreq):  #if w doesn't appear in the dictionary, don't waste time
                        LTS_sum += self.CopmputeLTS(wFreq)  #sum over all topics (LTS of a single word with that frequency)
                prob = Help.SumTopics(prob_top_sent, x)  #should x or y be passed here?
                Luu[x][y] = LTS_sum * prob
    else:  #lexical similarity
        for i in range(len(self.segm.cleanSentences[self.íter])):
            v1 = Help.CreateSentenceVector(self.segm.cleanSentences[self.íter][i], self.freqVec, self.prep.singleWords)
            for j in range(len(self.segm.cleanSentences[self.íter])):
                v2 = Help.CreateSentenceVector(self.segm.cleanSentences[self.íter][j], self.freqVec, self.prep.singleWords)
                if Help.NotValidCos(v1, v2):
                    v1, v2 = Help.ReshapeVec(v1, v2)  #pad the vectors so their dimensions match before taking the cosine
                cos_sim = 1 - sp.distance.cosine(v1, v2)  #cosine similarity only if vectors have the same size
                if math.isnan(cos_sim):
                    Luu[i][j] = 0.
                else:
                    Luu[i][j] = cos_sim
    #return norm(Luu, norm='l1')
    return Luu  #matrix representing lexical (topical) similarity via word overlap (via LDA)
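# Hedged, standalone illustration of the lexical-similarity branch above: two toy weighted
# bag-of-words vectors compared with scipy's cosine distance, using the same NaN guard as
# CreateLuu. The vectors are invented for demonstration only.
import math
import numpy as np
from scipy.spatial import distance as sp_distance

_v1 = np.array([0.5, 0.0, 1.2])
_v2 = np.array([0.4, 0.3, 0.9])
_cos_sim = 1 - sp_distance.cosine(_v1, _v2)
if math.isnan(_cos_sim):  #e.g. when one of the vectors is all zeros
    _cos_sim = 0.0
#print(_cos_sim)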