Example #1
    def ExtractKeywords(self):
        tk_segm = Help.GenSpeaksWordsTag(self.segm.cleanSentences, self.segm.cleanSpeakers, self.segm.cleanSentTags) #generate list of tokenized words with the corresponding speakers and tags
        tk_segm_list = Help.Expand(tk_segm) #words, speakers, tags
        idf_w = np.zeros(len(tk_segm_list[0]))
        ext_score = np.zeros(len(tk_segm_list[0]))
        num_segm = len(self.segm.cleanSentences)
        prob_s_w = np.zeros((len(tk_segm_list[0]), num_segm))
        num_s_w = np.zeros((len(tk_segm_list[0]), num_segm))
        nent  = np.zeros(len(tk_segm_list[0]))
        den_w = np.zeros(len(tk_segm_list[0]))

        #the idf vector (freqVec) is ordered as self.prep.singleWords
        for i in range(0, len(tk_segm_list[0])):
            idf_w[i] = self.freqVec[self.prep.singleWords.index(tk_segm_list[0][i])] 

        for i in range(0, len(tk_segm_list[0])): #i iterates over words
            for j in range(0, num_segm): #j iterates over segments
                freq = Help.WordSegmFrequency(tk_segm[2][j], tk_segm_list[0][i]) #times w is uttered in segment j
                num_s_w[i][j] = freq
                den_w[i] = freq + den_w[i] #sum over all segments

        for i in range(0, len(tk_segm_list[0])):
            for j in range(0, num_segm):
                prob_s_w[i][j] = num_s_w[i][j] / den_w[i] #probability of being in segment j given word i
    
        for i in range(0, len(tk_segm_list[0])):
            for j in range(0, num_segm):
                if tk_segm_list[0][i] in tk_segm[2][j]:
                    nent[i] = nent[i] - (prob_s_w[i][j] * np.log(prob_s_w[i][j])) #negative entropy of each word
    
        for i in range(len(tk_segm_list[0])):
            ext_score[i] = self.kIdf * idf_w[i] + self.kNent * nent[i] #sum global and local scores
        [loc_single_words, scores] = Help.RemoveMultipleKeywords(tk_segm_list[0], ext_score) #remove multiple keywords
        newScores = self.PosAndNer(loc_single_words, scores, tk_segm_list) #filter according to POS and adapt weight acc to NER
        self.Revert(loc_single_words, newScores) #revert scores and extract top Tm%
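# --- Illustration (not from the example above) ---
# The score combined in ExtractKeywords is kIdf * idf(w) + kNent * (-sum_j p_j * log p_j),
# where p_j is the probability that word w falls in segment j. A minimal, self-contained
# sketch of that formula; the function name and arguments below are hypothetical, not part of Help.
import numpy as np

def keyword_score(idf_w, counts_per_segment, k_idf=1.0, k_nent=1.0):
    counts = np.asarray(counts_per_segment, dtype=float)
    total = counts.sum()
    if total == 0:
        return 0.0
    p = counts / total                            #probability of segment j given the word
    nent = -np.sum(p[p > 0] * np.log(p[p > 0]))   #skip segments where the word never occurs
    return k_idf * idf_w + k_nent * nent

print(keyword_score(2.1, [3, 0, 1]))  #e.g. a word uttered 3, 0 and 1 times across three segments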
    def ComputeLTS(self, wFreq):
        #topicModel['Docs'] is a list of length num_documents; each element is a list of (topic_id, topic_prob) pairs
        # wFreq is a list with size #documents giving the frequency of the term in each doc
        LTS = np.zeros(self.topicModel['NumTopics'], dtype=float)
        num = np.zeros(self.topicModel['NumTopics'], dtype=float)
        den = np.zeros(self.topicModel['NumTopics'], dtype=float)

        for y in range(len(self.topicModel['Docs'])):  #iterate over documents
            if wFreq[y]:
                for x in range(len(self.topicModel['Docs'][y])):

                    idx = self.topicModel['Docs'][y][x][0] - 1

                    prob = self.topicModel['Docs'][y][x][1]
                    add_num = wFreq[y] * prob

                    add_den = wFreq[y] * (1 - prob)

                    num[idx] = num[idx] + add_num
                    den[idx] = den[idx] + add_den

        for x in range(len(LTS)):
            LTS[x] = Help.SafeDiv(
                num[x], den[x]
            )  #first sum over docs, then divide, then sum over topics

        return np.sum(
            LTS
        )  #return sum over all topics (LTS of a single word with frequency term_freq)
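# --- Illustration (not from the example above) ---
# ComputeLTS and several functions below rely on Help.SafeDiv, whose source is not shown here.
# From the way it is called, it plausibly behaves like a zero-safe division; a sketch under that assumption:
def safe_div(num, den):
    return num / den if den else 0.0  #return 0 instead of dividing by zero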
Example #3
    def Segmentation(self):
        #get boundaries, CB and number of segments
        self.GetBoundaries()
        #segment according to previous results
        newSegments = self.Segmenter()

        for x in range(len(newSegments[0])):
            locSentences = newSegments[0][x]
            locSpeakers = newSegments[1][x]  
            vec = [locSentences, locSpeakers]
            self.speakerDistr.append(self.ScoreSegment(vec)[1])

        for x in range(len(newSegments[0])):
            self.cleanSentences.append(Help.RemoveMinorSpeaker(newSegments[0][x], newSegments[1][x], self.speakerDistr[x]))    
            self.cleanSpeakers.append(Help.RemoveMinorSpeaker(newSegments[1][x], newSegments[1][x], self.speakerDistr[x]))  
            self.cleanSentOrig.append(Help.RemoveMinorSpeaker(newSegments[2][x], newSegments[1][x], self.speakerDistr[x]))  
            self.cleanSentTags.append(Help.RemoveMinorSpeaker(newSegments[3][x], newSegments[1][x], self.speakerDistr[x]))  
Example #4
    def Score(self, lenText):    

        score = np.zeros(lenText)
        smooth_score = np.zeros(lenText)
        for i in range(1, lenText-1): # iterate over all windows (as many as the number of sentences);
                                      # every sentence but the first and last can be a potential segment boundary
            maxim_i = np.maximum(0, i-self.winLength)
            maxim_f = np.maximum(1, i)
            minim = np.maximum(np.minimum(lenText, i + self.winLength), maxim_f)  
            win_left_idx = []
            win_right_idx = []
        
            for x in range(maxim_i, maxim_f): # min and max to avoid index problems
                win_left_idx.append(x)
            
            for x in range(i, minim): # min and max to avoid index problems
                win_right_idx.append(x)
        
            if i > 1:
                win_l = [self.prep.speakers[win_left_idx[0] : win_left_idx[-1]], self.prep.sentLemma[win_left_idx[0] : win_left_idx[-1]]] #create win left_i
            else: #when i = 1 --> win_l is only the first el 
                win_l = [[self.prep.speakers[0]], [self.prep.sentLemma[0]]]
            if i < lenText - 2:
                win_r = [self.prep.speakers[win_right_idx[0] : win_right_idx[-1] +1], self.prep.sentLemma[win_right_idx[0] : win_right_idx[-1] +1]] #create win right_i
            else:
                win_r = [[self.prep.speakers[-1]], [self.prep.sentLemma[-1]]]
        
          
            WC_l = self.WC(win_l)
            WC_r = self.WC(win_r)
            dist_wc = Help.Dist(WC_l, WC_r)
            WI_l = self.WI(win_l[1], win_l[0])
            WI_r = self.WI(win_r[1], win_r[0])
            dist_wi = Help.Dist(WI_l, WI_r)
            score[i] = dist_wc + dist_wi  
    
        for i in range(1, lenText-1):
            temp_score = 0 #loc score for smoothing
            bound = self.SafeSmooth(i, len(score)) #check not to go out of size when smoothing
            low = bound[0]
            up = bound[1]
            for j in range(low, up): 
                temp_score += score[j] 
            smooth_score[i] = temp_score / (1 + self.smoothParam)
        return score, smooth_score
    def CreateLus(self):
        Ns = self.prep.numSpeakers
        Lus = np.zeros((len(self.segm.cleanSentences[self.íter]), Ns))  #matrix [num_utterances X num_speakers]
        for i in range(0, len(self.segm.cleanSentences[self.íter])):
            v1 = Help.CreateSentenceVector(
                self.segm.cleanSentences[self.íter][i], self.freqVec,
                self.prep.singleWords)
            for j in range(0, Ns):
                if j + 1 in self.segm.cleanSpeakers[self.íter]:
                    v2 = Help.CreateSpeakerVector(
                        j, self.segm.cleanSentences[self.íter],
                        self.segm.cleanSpeakers[self.íter], self.speakVec)
                    if Help.NotValidCos(v1, v2):
                        v1, v2 = Help.ReshapeVec(v1, v2)
                    cos_dist = 1 - sp.distance.cosine(v1, v2)
                    if math.isnan(cos_dist):
                        Lus[i][j] = 0.
                    else:
                        Lus[i][j] = cos_dist
        return Lus
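# --- Illustration (not from the example above) ---
# The reshape-then-cosine pattern in CreateLus also appears in CreateLss and CreateLuu below.
# It could be factored into one helper like this; the padding strategy is an assumption about
# what Help.ReshapeVec does, and the names are hypothetical.
import math
import numpy as np
from scipy.spatial import distance

def cosine_sim(v1, v2):
    v1, v2 = np.asarray(v1, dtype=float), np.asarray(v2, dtype=float)
    n = max(len(v1), len(v2))
    v1 = np.pad(v1, (0, n - len(v1)))  #pad the shorter vector with zeros
    v2 = np.pad(v2, (0, n - len(v2)))
    sim = 1 - distance.cosine(v1, v2)
    return 0.0 if math.isnan(sim) else sim  #zero vectors give nan, map it to 0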
Example #6
    def ScoreSegment(self, segm):
        sum_c = []
        c_idx = []

        cat = Help.GenCat(self.Ns) #generate categories vector
        cat = cat[1:] #[0,0,0,0,0] not allowed, there's always at least one speaker
 
        #sum_c score per each cat, find min
        #c_idx idx of each cat, to find the cat corresp to min score
        for c in cat:
 
            sum_dist = Help.Dist(Help.Dstr(segm, self.Ns), Help.DstrId(self.Ns,c)) #score segments
            sum_c.append(sum_dist)
            c_idx.append(c)
             #c single category boolean vector used for calling dstr_id
        
        min_score = np.min(sum_c) 
        min_cat = c_idx[np.argmin(sum_c)] # in the list of categories, take el of idx that minimizes the score         

        return min_score, min_cat
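# --- Illustration (not from the example above) ---
# Help.GenCat is not shown, but from how ScoreSegment uses it (a list of 0/1 speaker-presence
# vectors whose first element is all zeros) it plausibly enumerates every boolean vector of
# length Ns. A sketch under that assumption; the name gen_cat is hypothetical.
from itertools import product

def gen_cat(num_speakers):
    return [list(bits) for bits in product([0, 1], repeat=num_speakers)]

print(gen_cat(2))  #[[0, 0], [0, 1], [1, 0], [1, 1]] -- ScoreSegment drops the first entry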
Example #7
    def WI(self, w, s):

        WI_vec = np.zeros(self.Ns)
        if w:
            suidf_win = Help.CreateSentenceVector(w, self.freq, self.prep.singleWords)

            den = 0 #doesn't have to be reset
            num = [] #append num per each speaker
            for j in range(self.Ns):
                num_t = 0 #numerator for given speaker
                for k in range(len(s)):
                    if s[k] == j+1:
                        num_t += suidf_win[k]
                        den += suidf_win[k]
                num.append(num_t)

            for j in range(0, self.Ns):
                WI_vec[j] = Help.SafeDiv(num[j], den)

        return WI_vec
    def Suidf(self):  #computes suidf for all the words in the meeting (not in the dataset, not sure what's better)

        surp_w_s = np.zeros((self.Ns, len(self.meetingWords)))  #matrix [num_speakers X num_words]
        surp_w = np.zeros(len(self.meetingWords))
        suidf_v = np.zeros(len(self.meetingWords))  #separate array, do not alias surp_w

        for c in range(0, len(self.meetingWords)):  #outer loop, iterate over the words to match
            w_ref = self.meetingWords[c]

            for j in range(0, self.Ns):
                num = 0
                den = 0
                for k in range(0, self.Ns):
                    if j != k:
                        num += self.meetingHisto[k + 1][c]  #number of times speaker k+1 utters w_ref
                        den += np.sum(self.meetingHisto[k + 1][:])  #total number of words uttered by speaker k+1
                surp_w_s[j][c] = -np.log(Help.SafeDiv(num, den))
                if surp_w_s[j][c] == np.inf:
                    surp_w_s[j][c] = self.high * self.Ns

        for f in range(0, len(self.meetingWords)):  #f idx of each single word
            word = self.meetingWords[f]
            summ = 0
            for c in range(0, self.Ns):
                summ += surp_w_s[c][f]
            surp_w[f] = Help.SafeDiv(summ, self.Ns)
            #suidf_v[f] = surp_w[f] * howmany(word, f, num_speak) * np.sqrt(idf(word)) / num_speak
            suidf_v[f] = surp_w[f] * self.HowMany(f) * np.sqrt(self.idf[self.meetingWords.index(word)]) / self.Ns  #HowMany: number of speakers who uttered the word
        return suidf_v
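# --- Illustration (not from the example above) ---
# Per word, Suidf computes suidf(w) = surprise(w) * HowMany(w) * sqrt(idf(w)) / Ns, where
# surprise(w) averages over speakers the quantity -log(share of the other speakers' words that are w).
# A toy numeric check of that surprise term, with hypothetical counts:
import numpy as np

others_count_w = 2    #the other speakers utter w twice
others_total   = 40   #out of 40 words overall
print(-np.log(others_count_w / others_total))  #about 3.0: rarely used by the others, hence surprising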
    def CreateLss(self):  #freq vec is the speaker tfidf
        Ns = self.prep.numSpeakers
        Lss = np.zeros((Ns, Ns))

        for i in range(0, Ns):
            if i + 1 in self.segm.cleanSpeakers[self.íter]:
                v1 = Help.CreateSpeakerVector(
                    i, self.segm.cleanSentences[self.íter],
                    self.segm.cleanSpeakers[self.íter], self.speakVec)
                for j in range(0, Ns):
                    if j + 1 in self.segm.cleanSpeakers[self.íter]:
                        v2 = Help.CreateSpeakerVector(
                            j, self.segm.cleanSentences[self.íter],
                            self.segm.cleanSpeakers[self.íter], self.speakVec)
                        if Help.NotValidCos(v1, v2):
                            v1, v2 = Help.ReshapeVec(v1, v2)
                        cos_dist = 1 - sp.distance.cosine(v1, v2)
                        if math.isnan(cos_dist):
                            Lss[i][j] = 0.
                        else:
                            Lss[i][j] = cos_dist

        return Lss
    def Summarize(self, x = 1):

        for meeting in self.transcripts:
            print('\n\n\n\n\nMeeting ' + str(x) + ' ...')
            #preprocessing
            prep = Preprocessing()
            prep.Preprocess(meeting)
            print("Preprocessing completed ...")
            #frequency vectors
            freq = FrequencyMeasures(prep.meetingHisto, prep.singleWords, self.histograms['ListWordsVector'], prep.numSpeakers)
            freq.GetAll()
            print("Frequencies computed ...")
            #functional segmentation
            segm = FuncSegm(prep, freq.suidf, prep.numSpeakers)
            segm.Segmentation()
            print("Segmentation completed ...")
            #keywords
            keyw = Extractor(prep, segm, freq.idf)
            keyw.ExtractKeywords()
            print("Keywords extracted ...")
            #check whether each segment is a monologue or a dialogue and apply the specific method
            localSummary = []
            for i, dstr in enumerate(segm.speakerDistr):
                if len(segm.cleanSentences[i]) > 1:
                    if np.sum(dstr) == 1:
                        mon = Monologue(segm, keyw, i)
                        mon.Summarize()
                        localSummary.append(mon.summary)
                        print("Monologue summarized ...")
                    else:
                        dial = Dialogue(prep, segm, self.histograms, self.topicModels, freq.suidf, freq.tfidfSpeak, i)
                        dial.Summarize()
                        localSummary.append(dial.summary)
                        print("Dialogue summarized ...")
                elif len(segm.cleanSentences[i]) == 1:
                    localSummary.append(str(segm.cleanSentOrig[i]))
                else:
                    pass #empty segment, nothing to add
            
            #join, save and append the final summary
            txtSummary = ' '.join(localSummary)
            Help.SaveFileTxt(txtSummary, 'summary_' + str(x), self.resultPath)
            x += 1
            self.summaries.append(txtSummary)
            print("Summary stored ...")
        
        print("Dataset summarized!!!")
Example #11
    def WC(self, win):
    
        WC_vec = np.zeros(self.Ns) #don't need a matrix, store the result in score vector
        length = 0

        for s in win[1]:
            length = length + len(s)    
    
        for j in range(0, self.Ns): #share of the window's words uttered by speaker j
            count = 0
            for x, s in enumerate(win[0]):
                if s == j+1:
                    count = count + len(win[1][x]) # number of words uttered by j in win
            WC_vec[j] = Help.SafeDiv(count, length)    #normalize by the window length
        return WC_vec
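# --- Illustration (not from the example above) ---
# For intuition: with Ns = 2 and a window whose sentences have speakers [1, 2, 1] and
# lengths 3, 2 and 1 words, WC returns the per-speaker share of the window's words, [4/6, 2/6].
# A standalone restatement with those hypothetical inputs:
speakers  = [1, 2, 1]                              #win[0]
sentences = [['a', 'b', 'c'], ['d', 'e'], ['f']]   #win[1], lemmatized sentences
length = sum(len(s) for s in sentences)            #6 words in the window
wc = [sum(len(sentences[x]) for x, s in enumerate(speakers) if s == j + 1) / length
      for j in range(2)]
print(wc)  #[0.666..., 0.333...]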
    def CreateLuu(
        self,
        top=False,
        lex=True
    ):  # top=True computes topical similarity; otherwise lex=True (the default) computes lexical similarity

        Luu = np.zeros((len(self.segm.cleanSentences[self.íter]),
                        len(self.segm.cleanSentences[self.íter])
                        ))  # matrix [num_utterances X num_utterances]
        if (top and lex) or (
            (not top) and (not lex)
        ):  # if error in passing parameters (Luu can be based only on one kind of similarity)
            top = False  # reset default parameters
            lex = True  # reset default parameters

        if top:  #topic similarity

            prob_top_sent = np.zeros(
                (len(self.topicModel['Terms']),
                 len(self.segm.cleanSentences[self.íter])))
            for x in range(len(self.topicModel['Terms'])):
                for y in range(len(self.segm.cleanSentences[self.íter])):
                    num = 0
                    den = 0
                    for w in self.segm.cleanSentences[self.íter][y]:
                        #                    idx_w = find_index_word(w, corpus, tokens_topic_model) #ret -1 if w not in corpus

                        try:
                            tk_id = self.topicModel['Dictionary'].token2id[w]
                            num += (Help.FreqWordInSentence(
                                w, self.segm.cleanSentences[self.íter][y]) *
                                    self.topicModel['Terms'][x][tk_id])
                        except KeyError:  #w is not in the topic-model dictionary, back off to a small constant
                            num += (Help.FreqWordInSentence(
                                w, self.segm.cleanSentences[self.íter][y]) *
                                    self.small)
                        den += Help.FreqWordInSentence(
                            w, self.segm.cleanSentences[self.íter][y])
                    prob_top_sent[x][y] = Help.SafeDiv(num, den)

            for x in range(len(self.segm.cleanSentences[self.íter])):
                for y in range(len(self.segm.cleanSentences[self.íter])):
                    LTS_sum = 0
                    prob = 0
                    for w in self.segm.cleanSentences[self.íter][y]:

                        wFreq = self.ComputeTermFrequency(
                            w
                        )  #creates a vector with the frequency of the word per each doc
                        if np.sum(
                                wFreq
                        ):  #if w doesn't appear in the dictionary, don't waste time
                            LTS_sum += self.ComputeLTS(
                                wFreq
                            )  #return sum over all topics (LTS of a single word with frequency term_freq)
                    prob = Help.SumTopics(prob_top_sent,
                                          x)  #should I pass x or y?
                    Luu[x][y] = LTS_sum * prob

        else:  #lexical similarity
            for i in range(len(self.segm.cleanSentences[self.íter])):
                v1 = Help.CreateSentenceVector(
                    self.segm.cleanSentences[self.íter][i], self.freqVec,
                    self.prep.singleWords)
                for j in range(len(self.segm.cleanSentences[self.íter])):
                    v2 = Help.CreateSentenceVector(
                        self.segm.cleanSentences[self.íter][j], self.freqVec,
                        self.prep.singleWords)
                    if Help.NotValidCos(v1, v2):
                        v1, v2 = Help.ReshapeVec(v1, v2)
                        #if v1 and v2 have mismatched dimensions, pad them so the cosine similarity can be computed
                    cos_sim = 1 - sp.distance.cosine(v1, v2)
                    if math.isnan(cos_sim):
                        Luu[i][j] = 0.
                    else:
                        Luu[i][j] = cos_sim

                    # cosine similarity only if vectors of same size

    #    return norm(Luu, norm='l1') #matrix representing lexical (topic) similarity via word overlap (via LDA)
        return Luu
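# --- Illustration (not from the example above) ---
# The commented-out return above suggests an L1 normalization of Luu was considered.
# With scikit-learn that would look like the following (an assumption about which `norm` was meant):
import numpy as np
from sklearn.preprocessing import normalize

Luu = np.random.rand(4, 4)         #stand-in for the matrix returned by CreateLuu
Luu_n = normalize(Luu, norm='l1')  #each row of the similarity matrix sums to 1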