import collections

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import plugin  # project-internal module providing the page-influence ranking helpers


def optimise(
    self,
    sents,
    k,
    filter,
    ix_to_label,
    pairwise_sims,
    sent_coverages,
    avg_sent_sims,
):
    alpha = self.a / len(sents)
    remaining = set(range(len(sents)))
    selected = []
    while len(remaining) > 0 and len(selected) < k:
        # Score each remaining sentence by the coverage/diversity objective of
        # the summary that would result from adding it.
        i_to_score = {}
        for i in remaining:
            summary_indices = selected + [i]
            cov = self.compute_summary_coverage(
                alpha, summary_indices, sent_coverages, pairwise_sims
            )
            div = self.compute_summary_diversity(
                summary_indices, ix_to_label, avg_sent_sims
            )
            i_to_score[i] = cov + self.div_weight * div
        if self.plug:
            # Re-rank candidates by combining their objective score with the
            # page number of their source article.
            sent_pages = [int(s.article_page) for s in sents]
            count_dict = {
                i: (score, sent_pages[i]) for i, score in i_to_score.items()
            }
            ranked = plugin.get_combined_1st_rank(
                count_dict, page_weight=self.plug, output_one=False
            )
        else:
            ranked = sorted(i_to_score.items(), key=lambda x: x[1], reverse=True)
        # Greedily add the best-ranked sentence that passes the filter.
        for i, _ in ranked:
            s = sents[i]
            remaining.remove(i)
            if filter and not filter(s):
                continue
            selected.append(i)
            break
    return selected

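# Every ranking path in this section delegates to plugin.get_combined_1st_rank.
# The `plugin` module is not shown here; the function below is a hypothetical
# sketch inferred purely from the call sites, not the actual implementation.
# It assumes `count_dict` maps each key to a (score, page) pair and combines
# the score rank (higher score = better) with the page rank (lower page
# number = more prominent placement), weighted by `page_weight`.
def get_combined_1st_rank(count_dict, page_weight=1.0, output_one=True):
    keys = list(count_dict)
    by_score = sorted(keys, key=lambda k: count_dict[k][0], reverse=True)
    by_page = sorted(keys, key=lambda k: count_dict[k][1])
    score_rank = {k: r for r, k in enumerate(by_score)}
    page_rank = {k: r for r, k in enumerate(by_page)}
    # Lower combined rank position = better; page influence grows with page_weight.
    ranked = sorted(
        count_dict.items(),
        key=lambda kv: score_rank[kv[0]] + page_weight * page_rank[kv[0]],
    )
    return ranked[0][0] if output_one else ranked
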
def rank(self, clusters, collection=None, vectorizer=None, plug=False):
    if plug:
        # Rank clusters by size, weighted by the mean page number of the
        # articles the cluster's sentences come from.
        count_dict = {
            i: (
                len(cluster),
                plugin.get_page_sum_from_cluster(cluster) / len(cluster),
            )
            for i, cluster in enumerate(clusters)
        }
        ranked = plugin.get_combined_1st_rank(
            count_dict, page_weight=plug, output_one=False
        )
        return [clusters[i] for i, _ in ranked]
    else:
        return sorted(clusters, key=len, reverse=True)

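# plugin.get_page_sum_from_cluster is likewise assumed rather than shown.
# Given that rank() divides its result by len(cluster) to obtain a mean page
# number, a plausible sketch (assuming iterating a cluster yields its
# sentences, each carrying the page of its source article) is:
def get_page_sum_from_cluster(cluster):
    return sum(int(s.article_page) for s in cluster)
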
def most_mentioned_time_with_influence_rank(self, page_weight=1.0):
    # Count daily-resolution time mentions and accumulate the page numbers of
    # the articles that mention them.
    count_dict = {}
    for a in self.articles:
        for s in a.sentences:
            if s.time and s.time_level == "d":
                count, page_sum = count_dict.get(s.time, (0, 0))
                count_dict[s.time] = (count + 1, page_sum + int(s.article_page))
    # Turn each accumulated page sum into a mean page number.
    count_dict = {
        time: (count, page_sum / count)
        for time, (count, page_sum) in count_dict.items()
    }
    if count_dict:  # i.e. at least one daily time mention was found
        return plugin.get_combined_1st_rank(count_dict, page_weight)
    else:
        return None

def summarize(self, sents, k, vectorizer, filter=None):
    raw_sents = [s.raw for s in sents]
    try:
        X = vectorizer.transform(raw_sents)
        for i, s in enumerate(sents):
            s.vector = X[i]
    except Exception:
        # The vectorizer cannot transform these sentences (e.g. empty vocabulary).
        return None
    scores = self.score_sentences(X)
    indices = list(range(len(sents)))
    if self.plug:
        # Combine sentence scores with the page numbers of their source articles.
        sent_pages = [int(s.article_page) for s in sents]
        count_dict = {i: (scores[i], sent_pages[i]) for i in indices}
        ranked = plugin.get_combined_1st_rank(
            count_dict, page_weight=self.plug, output_one=False
        )
    else:
        ranked = sorted(zip(indices, scores), key=lambda x: x[1], reverse=True)
    summary_sents = []
    summary_vectors = []
    for i, _ in ranked:
        if len(summary_sents) >= k:
            break
        new_x = X[i]
        s = sents[i]
        # Skip sentences that fail the filter or are too similar to a
        # sentence already in the summary.
        if filter and not filter(s):
            continue
        if any(
            cosine_similarity(new_x, x)[0, 0] > self.max_sim
            for x in summary_vectors
        ):
            continue
        summary_sents.append(s)
        summary_vectors.append(new_x)
    return [s.raw for s in summary_sents]

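# Self-contained illustration of the redundancy check used in summarize():
# a candidate is dropped once its cosine similarity to any already selected
# sentence exceeds a threshold (self.max_sim above; the 0.5 here is an
# arbitrary demo value, not the value used by the class).
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "The court announced its verdict on Monday.",
    "On Monday the court announced the verdict.",  # near-duplicate of docs[0]
    "Protests erupted in the capital overnight.",
]
X_demo = TfidfVectorizer().fit_transform(docs)
kept = []
for i in range(X_demo.shape[0]):
    if all(cosine_similarity(X_demo[i], X_demo[j])[0, 0] <= 0.5 for j in kept):
        kept.append(i)
print(kept)  # the near-duplicate is likely filtered out, leaving [0, 2]
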
def optimise(self, centroid, X, sents, k, filter):
    remaining = set(range(len(sents)))
    selected = []
    while len(remaining) > 0 and len(selected) < k:
        if len(selected) > 0:
            # Sum the vectors of the sentences selected so far.
            summary_vector = sparse.vstack([X[i] for i in selected])
            summary_vector = sparse.csr_matrix(summary_vector.sum(0))
        i_to_score = {}
        for i in remaining:
            # Score each candidate by how close the summary would be to the
            # collection centroid if the candidate were added.
            if len(selected) > 0:
                new_summary_vector = sparse.vstack([X[i], summary_vector])
                new_summary_vector = normalize(new_summary_vector.sum(0))
            else:
                new_summary_vector = X[i]
            i_to_score[i] = cosine_similarity(new_summary_vector, centroid)[0, 0]
        if self.plug:
            sent_pages = [int(s.article_page) for s in sents]
            count_dict = {
                i: (score, sent_pages[i]) for i, score in i_to_score.items()
            }
            ranked = plugin.get_combined_1st_rank(
                count_dict, page_weight=self.plug, output_one=False
            )
        else:
            ranked = sorted(i_to_score.items(), key=lambda x: x[1], reverse=True)
        # Take the best-ranked candidate that passes the filter and is not
        # redundant with the sentences already selected.
        for i, _ in ranked:
            s = sents[i]
            remaining.remove(i)
            if filter and not filter(s):
                continue
            if self.is_redundant(i, selected, X):
                continue
            selected.append(i)
            break
    return selected

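# Side note on the sparse arithmetic above: vstack + sum(0) yields a 1 x V
# matrix equal to the element-wise sum of the stacked sentence vectors, i.e.
# the unnormalized bag-of-words profile of the current summary. Minimal demo:
rows = sparse.csr_matrix([[1, 0, 2], [0, 3, 0]])
summed = sparse.csr_matrix(sparse.vstack([rows[0], rows[1]]).sum(0))
print(summed.toarray())  # [[1 3 2]]
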
def rank_dates(self, collection, plug=False):
    date_to_count = collections.defaultdict(int)
    if plug:
        date_to_page_sum = collections.defaultdict(int)
    for a in collection.articles():
        d = a.time.date()
        date_to_count[d] += 1
        if plug:
            date_to_page_sum[d] += a.page
    if plug:
        # Rank dates by article count combined with the mean page number of
        # the articles published on them.
        count_dict = {
            d: (count, date_to_page_sum[d] / count)
            for d, count in date_to_count.items()
        }
        ranked = plugin.get_combined_1st_rank(
            count_dict, page_weight=plug, output_one=False
        )
    else:
        ranked = sorted(date_to_count.items(), key=lambda x: x[1], reverse=True)
    return [d for d, _ in ranked]

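# Toy check of the page-influenced date ranking above, reusing the
# get_combined_1st_rank sketch from earlier (dates are plain strings here for
# brevity). "2001-09-11" has fewer articles but far better average placement:
toy_counts = {
    "2001-09-11": (3, 1.0),   # (article count, mean page number)
    "2001-09-12": (5, 12.0),
    "2001-09-13": (2, 8.0),
}
toy_ranked = get_combined_1st_rank(toy_counts, page_weight=1.0, output_one=False)
print([d for d, _ in toy_ranked])  # ['2001-09-11', '2001-09-12', '2001-09-13']
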
def rank_dates(self, collection, plug=False):
    if plug:
        dates, X, pages_to_count = self.extract_features(collection, plug)
    else:
        dates, X = self.extract_features(collection)
    # Scale each feature column to unit L2 norm before scoring.
    X = normalize(X, norm="l2", axis=0)
    if self.method == "classification":
        Y = [y[1] for y in self.model["model"].predict_proba(X)]
    else:
        Y = self.model["model"].predict(X)
    if plug:
        count_dict = {d: (Y[i], pages_to_count[d]) for i, d in enumerate(dates)}
        ranked = [
            d
            for d, _ in plugin.get_combined_1st_rank(
                count_dict, page_weight=plug, output_one=False
            )
        ]
    else:
        ranked = [
            d for d, _ in sorted(zip(dates, Y), key=lambda x: x[1], reverse=True)
        ]
    return ranked

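# normalize(X, norm="l2", axis=0) above scales each *feature column* to unit
# L2 norm; axis=1, the sklearn default, would scale each sample instead:
print(normalize([[3.0, 1.0], [4.0, 1.0]], norm="l2", axis=0))
# -> columns become [0.6, 0.8] and [~0.707, ~0.707]
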
def rank(self, clusters, collection=None, vectorizer=None, plug=False):
    # Count how often each date is mentioned anywhere in the collection.
    date_to_count = collections.defaultdict(int)
    for a in collection.articles():
        for s in a.sentences:
            d = s.get_date()
            if d:
                date_to_count[d] += 1
    # Pre-sort by size so that ties in mention count are broken by cluster size.
    clusters = sorted(clusters, reverse=True, key=len)

    def get_count(c):
        # Score a cluster by how often its most-mentioned date occurs in the
        # whole collection.
        if plug:
            t = c.most_mentioned_time_with_influence_rank(plug)
        else:
            t = c.most_mentioned_time()
        return date_to_count[t.date()] if t else 0

    clusters = sorted(clusters, reverse=True, key=get_count)
    if plug:
        count_dict = {
            i: (
                len(cluster),
                plugin.get_page_sum_from_cluster(cluster) / len(cluster),
            )
            for i, cluster in enumerate(clusters)
        }
        ranked = plugin.get_combined_1st_rank(
            count_dict, page_weight=plug, output_one=False
        )
        return [clusters[i] for i, _ in ranked]
    else:
        return clusters