# Assumes the usual module-level imports:
#   import numpy as np
#   from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#   plus the package's `base` module providing base.similarity (sketched below)
def summarize(self, text, limit_type='word', limit=100):
    raw_sentences = self.sent_tokenize(text)  # tokenize the input text into sentences
    clean_sentences = self.preprocess_text(text)  # preprocess sentences (nltk or regex)

    vectorizer = CountVectorizer()  # build the sentence-term count matrix
    sent_word_matrix = vectorizer.fit_transform(clean_sentences)
    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix)  # apply tfidf weighting to the counts matrix
    tfidf = tfidf.toarray()  # convert to a numpy array

    centroid_vector = tfidf.sum(0)  # centroid vector for the input text
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())  # normalize by the maximum value
    for i in range(centroid_vector.shape[0]):  # zero out vocabulary terms below the topic threshold
        if centroid_vector[i] <= self.topic_threshold:
            centroid_vector[i] = 0

    sentences_scores = []  # store sentence scores
    for i in range(tfidf.shape[0]):
        score = base.similarity(tfidf[i, :], centroid_vector)  # similarity of the sentence and the centroid vector
        sentences_scores.append((i, raw_sentences[i], score, tfidf[i, :]))  # (i, sentence i, score, tfidf vector)

    sentence_scores_sort = sorted(sentences_scores, key=lambda el: el[2], reverse=True)  # sort scores in descending order

    count = 0
    sentences_summary = []  # to store the summary
    for s in sentence_scores_sort:  # greedily fill the summary up to `limit` words (or characters)
        if count > limit:
            break
        include_flag = True  # include the sentence or not
        for ps in sentences_summary:  # compare with each sentence already in the summary
            sim = base.similarity(s[3], ps[3])  # obtain their similarity score
            # print(s[0], ps[0], sim)
            if sim > self.sim_threshold:  # too similar to a sentence already selected: discard
                include_flag = False
        if include_flag:
            # print(s[0], s[1])
            sentences_summary.append(s)  # include the sentence in the summary
            if limit_type == 'word':
                count += len(s[1].split())  # the limit counts words
            else:
                count += len(s[1])  # the limit counts characters

    summary = "\n".join([s[1] for s in sentences_summary])  # create the summary
    return summary
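# Both summarize() methods in this section call base.similarity(), which is not defined
# here. A minimal sketch, assuming it is plain cosine similarity between two 1-D numpy
# vectors (the `base` module name comes from the calls above; the body is an assumption):
import numpy as np


def similarity(v1, v2):
    """Cosine similarity between two vectors; 0.0 if either vector is all zeros."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return float(np.dot(v1, v2) / norm_product)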
def summarize(self, text, limit_type='word', limit=100):
    """
    Main function for text summarization using word2vec embeddings.
    The idea is similar to the previous bag-of-words method, but more extensive.
    """
    raw_sentences = self.sent_tokenize(text)  # tokenize the input text into sentences
    clean_sentences = self.preprocess_text(text)  # preprocess sentences with the model's preprocessor (nltk or regex)

    if self.debug:
        print("ORIGINAL TEXT STATS = {0} chars, {1} words, {2} sentences".format(
            len(text), len(text.split(' ')), len(raw_sentences)))
        print("*** RAW SENTENCES ***")
        for i, s in enumerate(raw_sentences):
            print(i, s)
        print("*** CLEAN SENTENCES ***")
        for i, s in enumerate(clean_sentences):
            print(i, s)

    centroid_words = self.get_topic_idf(clean_sentences)  # all words whose tfidf weight exceeds topic_threshold
    # print(centroid_words)
    if self.debug:
        print("*** CENTROID WORDS ***")
        print(len(centroid_words), centroid_words)

    self.word_vectors_cache(clean_sentences)  # cache the word representations from the model embeddings
    centroid_vector = self.compose_vectors(centroid_words, debug=True)  # centroid vector composed from the centroid word embeddings

    tfidf, centroid_bow = self.get_bow(clean_sentences)  # sentence tfidf vectors and the resulting tfidf centroid vector
    max_length = get_max_length(clean_sentences)  # maximum length of the clean sentences

    sentences_scores = []  # to store the scores
    for i in range(len(clean_sentences)):  # for each clean sentence
        scores = []
        words = clean_sentences[i].split()  # words of the clean sentence
        sentence_vector = self.compose_vectors(words)  # sentence embedding representation

        scores.append(base.similarity(sentence_vector, centroid_vector))  # similarity between the sentence and the centroid
        scores.append(self.bow_param * base.similarity(tfidf[i, :], centroid_bow))  # similarity of the i-th tfidf vector and the tfidf centroid
        scores.append(self.length_param * (1 - (len(words) / max_length)))  # length score: complement of the ratio of sentence length to the maximum length
        scores.append(self.position_param * (1 / (i + 1)))  # position score: earlier sentences are more relevant
        score = average_score(scores)  # average over all the scores obtained
        # score = stanford_cerainty_factor(scores)

        sentences_scores.append((i, raw_sentences[i], score, sentence_vector))  # (index, sentence i, avg score, sentence vector)
        if self.debug:
            print(i, scores, score)

    sentence_scores_sort = sorted(sentences_scores, key=lambda el: el[2], reverse=True)  # sort the sentences by their score
    if self.debug:
        print("*** SENTENCE SCORES ***")
        for s in sentence_scores_sort:
            print(s[0], s[1], s[2])

    count = 0
    sentences_summary = []

    if self.keep_first:  # if we want to keep the first sentence of the text
        for s in sentence_scores_sort:
            if s[0] == 0:
                sentences_summary.append(s)
                if limit_type == 'word':
                    count += len(s[1].split())
                else:
                    count += len(s[1])
                sentence_scores_sort.remove(s)
                break

    for s in sentence_scores_sort:  # for each sentence, in score order
        if count > limit:  # if the count exceeds the limit, stop
            break
        include_flag = True
        for ps in sentences_summary:  # compare with every sentence already in the summary
            sim = base.similarity(s[3], ps[3])  # compute how similar they are
            # print(s[0], ps[0], sim)
            if sim > self.sim_threshold:  # if too similar
                include_flag = False  # don't include it
        if include_flag:
            # print(s[0], s[1])
            sentences_summary.append(s)  # append to the summary
            if limit_type == 'word':  # count the limit by words or by characters
                count += len(s[1].split())
            else:
                count += len(s[1])

    if self.reordering:  # reorder the selected sentences by their original position
        sentences_summary = sorted(sentences_summary, key=lambda el: el[0], reverse=False)

    summary = "\n".join([s[1] for s in sentences_summary])  # obtain the summary by joining the selected sentences
    if self.debug:
        print("SUMMARY TEXT STATS = {0} chars, {1} words, {2} sentences".format(
            len(summary), len(summary.split(' ')), len(sentences_summary)))
        print("*** SUMMARY ***")
        print(summary)
    return summary
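# The embedding-based summarize() above also relies on helpers that are not shown in
# this section: get_topic_idf(), compose_vectors(), get_max_length() and average_score().
# Minimal sketches, written as plain functions, under the assumptions stated in each
# docstring (the call-site comments above describe their intent; the bodies here are
# assumptions, not the original implementations):
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_max_length(sentences):
    """Assumed: length in words of the longest clean sentence."""
    return max(len(s.split()) for s in sentences)


def average_score(scores):
    """Assumed: arithmetic mean of the partial scores."""
    return sum(scores) / len(scores)


def get_topic_idf(sentences, topic_threshold):
    """Assumed: vocabulary words whose max-normalized tfidf weight exceeds
    topic_threshold, mirroring the centroid construction of the bag-of-words method above."""
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(sentences)
    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    centroid = transformer.fit_transform(counts).toarray().sum(0)
    centroid = centroid / centroid.max()
    words = vectorizer.get_feature_names_out()
    return [w for w, weight in zip(words, centroid) if weight > topic_threshold]


def compose_vectors(words, embeddings):
    """Assumed: sum of the embeddings of the in-vocabulary words, where `embeddings`
    is a dict-like mapping word -> 1-D numpy vector (what word_vectors_cache would fill)."""
    known = [embeddings[w] for w in words if w in embeddings]
    if not known:
        return np.zeros(len(next(iter(embeddings.values()))))
    return np.sum(known, axis=0)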
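# A tiny self-contained demo of the bag-of-words centroid scoring on a toy text. It
# mirrors the first summarize() method without the class plumbing; the threshold and
# the example sentences are arbitrary demo values, not defaults from the original code.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

sentences = [
    "the cat sat on the mat",
    "dogs and cats are common household pets",
    "the stock market fell sharply today",
]

counts = CountVectorizer().fit_transform(sentences)
tfidf = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False).fit_transform(counts).toarray()

centroid = tfidf.sum(0)
centroid = centroid / centroid.max()
centroid[centroid <= 0.3] = 0  # topic_threshold = 0.3 (demo value)


def cosine(v1, v2):
    n = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / n) if n else 0.0


scores = [cosine(tfidf[i], centroid) for i in range(tfidf.shape[0])]
best_score, best_sentence = max(zip(scores, sentences))
print(best_score, best_sentence)  # the most "central" sentence of the toy text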