Example #1
    def summarize(self, text, limit_type='word', limit=100):
        """Centroid-based summarization with a bag-of-words (tf-idf) centroid."""
        raw_sentences = self.sent_tokenize(text)      # split the input text into sentences
        clean_sentences = self.preprocess_text(text)  # preprocess sentences (nltk or regex)

        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(clean_sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)                                        # aggregate tf-idf weight of each term
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())  # normalize by the maximum weight
        for i in range(centroid_vector.shape[0]):
            if centroid_vector[i] <= self.topic_threshold:                    # keep only the most topical terms
                centroid_vector[i] = 0

        sentences_scores = []
        for i in range(tfidf.shape[0]):
            score = base.similarity(tfidf[i, :], centroid_vector)
            sentences_scores.append((i, raw_sentences[i], score, tfidf[i, :]))

        sentence_scores_sort = sorted(sentences_scores, key=lambda el: el[2], reverse=True)

        count = 0
        sentences_summary = []
        for s in sentence_scores_sort:
            if count > limit:
                break
            include_flag = True
            for ps in sentences_summary:
                sim = base.similarity(s[3], ps[3])
                # print(s[0], ps[0], sim)
                if sim > self.sim_threshold:
                    include_flag = False
            if include_flag:
                # print(s[0], s[1])
                sentences_summary.append(s)
                if limit_type == 'word':
                    count += len(s[1].split())
                else:
                    count += len(s[1])

        summary = "\n".join([s[1] for s in sentences_summary])
        return summary
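The snippets above and below score sentences with base.similarity, which is not shown on this page. A minimal sketch of what it presumably computes, namely the cosine similarity between two dense vectors (the zero-norm guard is an assumption):

import numpy as np

def similarity(v1, v2):
    """Cosine similarity between two 1-D vectors (assumed behaviour of base.similarity)."""
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom > 0 else 0.0  # guard against all-zero vectors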
Example #2
    def summarize(self, text, limit_type='word', limit=100):
        """ 
        Main function for text summarization using word2vec embeddings. 
        The idea is similar to the previous , but more extense. 
        """
        raw_sentences = self.sent_tokenize(text) # tokenize by sentences
        clean_sentences = self.preprocess_text(text)  # preprocess sentences with model's prepreocessor (nltk or regex)

        if self.debug:
            print("ORIGINAL TEXT STATS = {0} chars, {1} words, {2} sentences".format(len(text),
                                                                                     len(text.split(' ')),
                                                                                     len(raw_sentences)))
            print("*** RAW SENTENCES ***")
            for i, s in enumerate(raw_sentences):
                print(i, s)
            print("*** CLEAN SENTENCES ***")
            for i, s in enumerate(clean_sentences):
                print(i, s)

        centroid_words = self.get_topic_idf(clean_sentences)  # obtain all words whose tf-idf weight exceeds topic_threshold
        # print(centroid_words)
        
        if self.debug:
            print("*** CENTROID WORDS ***")
            print(len(centroid_words), centroid_words)

        self.word_vectors_cache(clean_sentences)  # obtain word representations from the model embeddings
        centroid_vector = self.compose_vectors(centroid_words, debug=True)  # build the centroid vector from the centroid word embeddings

        tfidf, centroid_bow = self.get_bow(clean_sentences)  # obtain the tf-idf vectors and the resulting centroid bow vector
        max_length = get_max_length(clean_sentences)  # max length (in words) of the clean sentences

        sentences_scores = []  # to store the scores
        for i in range(len(clean_sentences)):  # for each clean sentence
            scores = []
            words = clean_sentences[i].split()  # split the sentence into words
            sentence_vector = self.compose_vectors(words)  # obtain the sentence embedding representation

            scores.append(base.similarity(sentence_vector, centroid_vector))  # similarity between sentence and centroid embeddings
            scores.append(self.bow_param * base.similarity(tfidf[i, :], centroid_bow))  # similarity between the i-th tf-idf vector and the bow centroid
            scores.append(self.length_param * (1 - (len(words) / max_length)))  # length score: complement of the ratio of sentence length to max length
            scores.append(self.position_param * (1 / (i + 1)))  # position score: earlier sentences are weighted higher
            score = average_score(scores)  # average over all the scores obtained
            # score = stanford_cerainty_factor(scores)

            sentences_scores.append((i, raw_sentences[i], score, sentence_vector))  # (index, sentence i, avg score, sentence vector)

            if self.debug:
                print(i, scores, score)

        sentence_scores_sort = sorted(sentences_scores, key=lambda el: el[2], reverse=True)  # sort the sentences by score, descending
        if self.debug:
            print("*** SENTENCE SCORES ***")
            for s in sentence_scores_sort:
                print(s[0], s[1], s[2])

        count = 0
        sentences_summary = []

        if self.keep_first:  # if we want to keep the first sentence of the text
            for s in sentence_scores_sort:
                if s[0] == 0:
                    sentences_summary.append(s)
                    if limit_type == 'word':
                        count += len(s[1].split())
                    else:
                        count += len(s[1])
                    sentence_scores_sort.remove(s)
                    break

        for s in sentence_scores_sort:  # for each sentence, in score order
            if count > limit:  # if the count exceeds the limit, stop
                break
            include_flag = True
            for ps in sentences_summary:  # for every sentence already in the summary
                sim = base.similarity(s[3], ps[3])  # compare how similar they are
                # print(s[0], ps[0], sim)
                if sim > self.sim_threshold:  # if too similar
                    include_flag = False  # don't include it
            if include_flag:
                # print(s[0], s[1])
                sentences_summary.append(s)  # append the sentence to the summary
                if limit_type == 'word':  # count the limit in words or characters
                    count += len(s[1].split())
                else:
                    count += len(s[1])

        if self.reordering:  # restore the original sentence order
            sentences_summary = sorted(sentences_summary, key=lambda el: el[0], reverse=False)

        summary = "\n".join([s[1] for s in sentences_summary])  # build the summary by joining the selected sentences

        if self.debug:
            print("SUMMARY TEXT STATS = {0} chars, {1} words, {2} sentences".format(len(summary),
                                                                                    len(summary.split(' ')),
                                                                                    len(sentences_summary)))

            print("*** SUMMARY ***")
            print(summary)

        return summary
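This variant also calls two module-level helpers that the snippet does not define, get_max_length and average_score. Plausible minimal implementations, inferred from how they are used above (the original module may differ):

def get_max_length(sentences):
    """Word count of the longest preprocessed sentence (assumed behaviour)."""
    return max(len(s.split()) for s in sentences)

def average_score(scores):
    """Arithmetic mean of the partial scores (assumed behaviour)."""
    return sum(scores) / len(scores)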
Example #3
    def summarize(self, text, limit_type='word', limit=100):

        raw_sentences = self.sent_tokenize(text)  # sent_tokenize input text
        clean_sentences = self.preprocess_text(
            text)  # preprocess sentences using nltk or regex

        vectorizer = CountVectorizer()  # instantiate CountVectorizer object
        sent_word_matrix = vectorizer.fit_transform(
            clean_sentences)  # sentence-term count matrix

        transformer = TfidfTransformer(
            norm=None, sublinear_tf=False,
            smooth_idf=False)  # instantiate tfidf weighting
        tfidf = transformer.fit_transform(
            sent_word_matrix)  # fit tfidf weighting to the counts matrix
        tfidf = tfidf.toarray()  # convert to numpy array

        centroid_vector = tfidf.sum(0)  # centroid vector for the input text
        centroid_vector = np.divide(
            centroid_vector,
            centroid_vector.max())  # normalize by the maximum value
        for i in range(centroid_vector.shape[0]):  # for each term in the vocabulary
            if centroid_vector[i] <= self.topic_threshold:
                centroid_vector[i] = 0

        sentences_scores = []  # store sentence scores
        for i in range(tfidf.shape[0]):
            score = base.similarity(
                tfidf[i, :], centroid_vector
            )  # compute similarity of the sentences and centroid vector
            sentences_scores.append(
                (i, raw_sentences[i], score,
                 tfidf[i, :]))  # (i, sentence i, score , tfidf_vector)

        sentence_scores_sort = sorted(
            sentences_scores, key=lambda el: el[2],
            reverse=True)  # DESC sort sentence scores

        count = 0
        sentences_summary = []  # to store the summary
        for s in sentence_scores_sort:
            # stop once the summary reaches the word/character limit
            if count > limit:
                break
            include_flag = True  # include the sentence or not
            for ps in sentences_summary:  # for each sentence in the current summary
                sim = base.similarity(s[3], ps[3])  # obtain their similarity score
                # print(s[0], ps[0], sim)
                if sim > self.sim_threshold:  # if too similar to an existing sentence in the summary, discard
                    include_flag = False
            if include_flag:
                # print(s[0], s[1])
                sentences_summary.append(s)  # include the sentence in the summary
                if limit_type == 'word':
                    count += len(s[1].split())  # summary length measured in words
                else:
                    count += len(s[1])  # summary length measured in characters

        summary = "\n".join([s[1] for s in sentences_summary])  # create the summary
        return summary
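To run the bag-of-words variant end to end without the surrounding class, here is a condensed, self-contained sketch. NLTK's sent_tokenize stands in for self.sent_tokenize, preprocessing is reduced to lowercasing, and only the word-based limit is kept, so results will differ from the class-based code above:

import numpy as np
from nltk.tokenize import sent_tokenize  # requires the NLTK 'punkt' data package
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def summarize_bow(text, limit=100, topic_threshold=0.3, sim_threshold=0.95):
    """Condensed sketch of the centroid/BOW summarizer above (word-based limit only)."""
    raw_sentences = sent_tokenize(text)
    clean_sentences = [s.lower() for s in raw_sentences]  # stands in for preprocess_text

    # sentence-term matrix weighted with raw (un-normalized) tf-idf
    counts = CountVectorizer().fit_transform(clean_sentences)
    tfidf = TfidfTransformer(norm=None, sublinear_tf=False,
                             smooth_idf=False).fit_transform(counts).toarray()

    # centroid: summed tf-idf weights, normalized by the maximum and thresholded
    centroid = tfidf.sum(0)
    centroid = centroid / centroid.max()
    centroid[centroid <= topic_threshold] = 0

    def cosine(a, b):
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom > 0 else 0.0

    # score each sentence against the centroid, then greedily pick non-redundant sentences
    scored = sorted(((i, cosine(tfidf[i], centroid)) for i in range(len(raw_sentences))),
                    key=lambda el: el[1], reverse=True)
    picked, count = [], 0
    for i, _ in scored:
        if count > limit:
            break
        if all(cosine(tfidf[i], tfidf[j]) <= sim_threshold for j in picked):
            picked.append(i)
            count += len(raw_sentences[i].split())
    return "\n".join(raw_sentences[i] for i in picked)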
Example #4
    def summarize(self, text, limit_type='word', limit=100):
        raw_sentences = self.sent_tokenize(text)
        clean_sentences = self.preprocess_text(text)

        if self.debug:
            print("ORIGINAL TEXT STATS = {0} chars, {1} words, {2} sentences".
                  format(len(text), len(text.split(' ')), len(raw_sentences)))
            print("*** RAW SENTENCES ***")
            for i, s in enumerate(raw_sentences):
                print(i, s)
            print("*** CLEAN SENTENCES ***")
            for i, s in enumerate(clean_sentences):
                print(i, s)

        centroid_words = self.get_topic_idf(clean_sentences)

        if self.debug:
            print("*** CENTROID WORDS ***")
            print(len(centroid_words), centroid_words)

        self.word_vectors_cache(clean_sentences)
        centroid_vector = self.compose_vectors(centroid_words)

        tfidf, centroid_bow = self.get_bow(clean_sentences)
        max_length = get_max_length(clean_sentences)

        sentences_scores = []
        for i in range(len(clean_sentences)):
            scores = []
            words = clean_sentences[i].split()
            sentence_vector = self.compose_vectors(words)

            scores.append(base.similarity(sentence_vector, centroid_vector))
            scores.append(self.bow_param *
                          base.similarity(tfidf[i, :], centroid_bow))
            scores.append(self.length_param * (1 - (len(words) / max_length)))
            scores.append(self.position_param * (1 / (i + 1)))

            score = average_score(scores)
            # score = stanford_cerainty_factor(scores)

            sentences_scores.append(
                (i, raw_sentences[i], score, sentence_vector))

            if self.debug:
                print(i, scores, score)

        sentence_scores_sort = sorted(sentences_scores,
                                      key=lambda el: el[2],
                                      reverse=True)
        if self.debug:
            print("*** SENTENCE SCORES ***")
            for s in sentence_scores_sort:
                print(s[0], s[1], s[2])

        count = 0
        sentences_summary = []

        if self.keep_first:
            for s in sentence_scores_sort:
                if s[0] == 0:
                    sentences_summary.append(s)
                    if limit_type == 'word':
                        count += len(s[1].split())
                    else:
                        count += len(s[1])
                    sentence_scores_sort.remove(s)
                    break

        for s in sentence_scores_sort:
            if count > limit:
                break
            include_flag = True
            for ps in sentences_summary:
                sim = base.similarity(s[3], ps[3])
                # print(s[0], ps[0], sim)
                if sim > self.sim_threshold:
                    include_flag = False
            if include_flag:
                # print(s[0], s[1])
                sentences_summary.append(s)
                if limit_type == 'word':
                    count += len(s[1].split())
                else:
                    count += len(s[1])

        if self.reordering:
            sentences_summary = sorted(sentences_summary,
                                       key=lambda el: el[0],
                                       reverse=False)

        summary = "\n".join([s[1] for s in sentences_summary])

        if self.debug:
            print("SUMMARY TEXT STATS = {0} chars, {1} words, {2} sentences".
                  format(len(summary), len(summary.split(' ')),
                         len(sentences_summary)))

            print("*** SUMMARY ***")
            print(summary)

        return summary
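Finally, the embedding-based variants rely on self.get_topic_idf to select the centroid words, which is also not shown. A standalone sketch of the assumed behaviour, reconstructed from the tf-idf thresholding logic of Example #1 (the exact implementation behind the call may differ):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def get_topic_idf(sentences, topic_threshold=0.3):
    """Assumed behaviour: return the words whose normalized tf-idf weight exceeds topic_threshold."""
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix).toarray()

    centroid_vector = tfidf.sum(0)
    centroid_vector = centroid_vector / centroid_vector.max()  # normalize by the maximum weight

    feature_names = vectorizer.get_feature_names_out()  # scikit-learn >= 1.0; older versions use get_feature_names()
    return [word for word, weight in zip(feature_names, centroid_vector) if weight > topic_threshold]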