示例#1
0
    def optimise(
        self,
        sents,
        k,
        filter,
        ix_to_label,
        pairwise_sims,
        sent_coverages,
        avg_sent_sims,
    ):
        """Greedily select up to ``k`` sentence indices.

        In every round each remaining index ``i`` is scored as
        ``coverage(selected + [i]) + self.div_weight * diversity(selected + [i])``
        using ``self.compute_summary_coverage`` and
        ``self.compute_summary_diversity``. Candidates are then walked from
        best to worst; the first one accepted by ``filter`` is added to the
        selection. Every candidate visited on the way (accepted or rejected)
        is removed from the pool for good.

        Args:
            sents: Indexable sequence of sentence objects. When ``self.plug``
                is truthy each sentence must expose ``article_page``
                (convertible with ``int``).
            k: Maximum number of indices to return.
            filter: Optional predicate ``filter(sentence) -> bool``; a falsy
                value disables filtering.
            ix_to_label: Forwarded to ``compute_summary_diversity``.
            pairwise_sims: Forwarded to ``compute_summary_coverage``.
            sent_coverages: Forwarded to ``compute_summary_coverage``.
            avg_sent_sims: Forwarded to ``compute_summary_diversity``.

        Returns:
            List of selected indices into ``sents`` in selection order;
            empty when ``sents`` is empty.
        """
        if not sents:
            # Nothing to choose from; also avoids dividing by zero below.
            return []

        alpha = self.a / len(sents)
        remaining = set(range(len(sents)))
        selected = []

        while remaining and len(selected) < k:
            i_to_score = {}
            for i in remaining:
                summary_indices = selected + [i]
                cov = self.compute_summary_coverage(
                    alpha, summary_indices, sent_coverages, pairwise_sims
                )
                div = self.compute_summary_diversity(
                    summary_indices, ix_to_label, avg_sent_sims
                )
                i_to_score[i] = cov + self.div_weight * div

            if self.plug:
                # Only the still-available candidates need a page number;
                # NOTE(review): get_combined_1st_rank is assumed to return
                # (index, value) pairs ordered best first - confirm in plugin.
                count_dict = {
                    i: (score, int(sents[i].article_page))
                    for i, score in i_to_score.items()
                }
                ranked = plugin.get_combined_1st_rank(
                    count_dict, page_weight=self.plug, output_one=False
                )
            else:
                ranked = sorted(i_to_score.items(), key=lambda x: x[1], reverse=True)

            for i, _ in ranked:
                # A visited candidate never returns to the pool, even when
                # the filter rejects it.
                remaining.remove(i)
                if filter and not filter(sents[i]):
                    continue
                selected.append(i)
                break

        return selected
示例#2
0
 def rank(self, clusters, collection=None, vectorizer=None, plug=False):
     """Order clusters from most to least important.

     Without ``plug`` the clusters are simply sorted by size, largest
     first (ties keep their input order). With a truthy ``plug`` each
     cluster is described by ``(size, page_sum / size)`` and the ordering
     is delegated to ``plugin.get_combined_1st_rank`` with ``plug`` as the
     page weight.

     ``collection`` and ``vectorizer`` are accepted but not used here.
     """
     if not plug:
         return sorted(clusters, key=len, reverse=True)

     size_and_mean_page = {}
     for position, members in enumerate(clusters):
         size = len(members)
         size_and_mean_page[position] = (
             size,
             plugin.get_page_sum_from_cluster(members) / size,
         )
     ordering = plugin.get_combined_1st_rank(
         size_and_mean_page, page_weight=plug, output_one=False
     )
     # The first element of each ranked entry is the cluster's position.
     return [clusters[position] for position, _ in ordering]
示例#3
0
    def most_mentioned_time_with_influence_rank(self, page_weight=1.0):
        """Pick a time from day-level mentions, weighted by article page.

        Every sentence in ``self.articles`` with a truthy ``time`` and
        ``time_level == "d"`` contributes one mention and its
        ``int(article_page)`` to that time's tally. Each time is then
        summarised as ``(mention_count, mean_page)`` and handed to
        ``plugin.get_combined_1st_rank`` together with ``page_weight``.

        Returns:
            Whatever ``plugin.get_combined_1st_rank`` returns, or ``None``
            when no sentence carries a day-level time.
        """
        tallies = {}
        for article in self.articles:
            for sentence in article.sentences:
                if not (sentence.time and sentence.time_level == "d"):
                    continue
                mentions, page_total = tallies.get(sentence.time, (0, 0))
                tallies[sentence.time] = (
                    mentions + 1,
                    page_total + int(sentence.article_page),
                )

        if not tallies:
            # No day-level time was mentioned anywhere.
            return None

        count_dict = {
            time: (mentions, page_total / mentions)
            for time, (mentions, page_total) in tallies.items()
        }
        return plugin.get_combined_1st_rank(count_dict, page_weight)
示例#4
0
    def summarize(self, sents, k, vectorizer, filter=None):
        """Build a summary of at most ``k`` raw sentences.

        Sentences are vectorised, scored with ``self.score_sentences`` and
        visited from best to worst. A sentence is kept when it passes
        ``filter`` (if given) and its cosine similarity to every sentence
        already kept is not above ``self.max_sim``.

        Args:
            sents: Sequence of sentence objects exposing ``raw``; each one
                gets its vector stored on ``vector`` as a side effect. When
                ``self.plug`` is truthy, ``article_page`` is read as well.
            k: Maximum number of sentences in the summary.
            vectorizer: Object whose ``transform(list_of_str)`` returns an
                indexable collection with one entry per sentence.
            filter: Optional predicate ``filter(sentence) -> bool``.

        Returns:
            List of raw sentence strings, or ``None`` when vectorisation
            fails.
        """
        raw_sents = [s.raw for s in sents]

        try:
            X = vectorizer.transform(raw_sents)
            for i, s in enumerate(sents):
                s.vector = X[i]
        except Exception:
            # Best effort: an unusable vectoriser means "no summary".
            # Deliberately not a bare ``except`` so KeyboardInterrupt and
            # SystemExit still propagate.
            return None

        scores = self.score_sentences(X)
        indices = list(range(len(sents)))

        if self.plug:
            sent_pages = [int(s.article_page) for s in sents]
            count_dict = {i: (scores[i], sent_pages[i]) for i in indices}
            # NOTE(review): assumed to yield (index, value) pairs, best
            # first - confirm against plugin.get_combined_1st_rank.
            ranked = plugin.get_combined_1st_rank(
                count_dict, page_weight=self.plug, output_one=False
            )
        else:
            ranked = sorted(zip(indices, scores), key=lambda x: x[1], reverse=True)

        summary_sents = []
        summary_vectors = []
        for i, _ in ranked:
            if len(summary_sents) >= k:
                break
            candidate = sents[i]
            # Apply the filter first so similarity is never computed for a
            # sentence that is going to be discarded anyway.
            if filter and not filter(candidate):
                continue
            new_x = X[i]
            if any(
                cosine_similarity(new_x, x)[0, 0] > self.max_sim
                for x in summary_vectors
            ):
                continue
            summary_sents.append(candidate)
            summary_vectors.append(new_x)

        return [s.raw for s in summary_sents]
示例#5
0
    def optimise(self, centroid, X, sents, k, filter):
        """Greedily select up to ``k`` indices whose summed vector tracks ``centroid``.

        In every round each remaining index ``i`` is scored by the cosine
        similarity between ``centroid`` and the (normalised) sum of the
        vectors already selected plus ``X[i]``. Candidates are walked from
        best to worst; the first one that passes ``filter`` and is not
        flagged by ``self.is_redundant`` is selected. Every candidate
        visited on the way is removed from the pool for good.

        Args:
            centroid: Vector passed as second argument to
                ``cosine_similarity``.
            X: Row-indexable matrix with one row per sentence
                (assumed scipy sparse, as rows go through ``sparse.vstack``).
            sents: Sequence of sentence objects; ``article_page`` is read
                when ``self.plug`` is truthy.
            k: Maximum number of indices to return.
            filter: Optional predicate ``filter(sentence) -> bool``; a falsy
                value disables filtering.

        Returns:
            List of selected indices into ``sents`` in selection order.
        """
        remaining = set(range(len(sents)))
        selected = []
        while remaining and len(selected) < k:
            if selected:
                summary_vector = sparse.vstack([X[i] for i in selected])
                summary_vector = sparse.csr_matrix(summary_vector.sum(0))
            i_to_score = {}
            for i in remaining:
                if selected:
                    new_summary_vector = sparse.vstack([X[i], summary_vector])
                    # ``.sum(0)`` on a scipy sparse matrix yields an
                    # ``np.matrix``, which newer scikit-learn input
                    # validation refuses; wrap it the same way
                    # ``summary_vector`` is wrapped above.
                    new_summary_vector = normalize(
                        sparse.csr_matrix(new_summary_vector.sum(0))
                    )
                else:
                    new_summary_vector = X[i]
                i_to_score[i] = cosine_similarity(new_summary_vector, centroid)[0, 0]

            if self.plug:
                # Only the still-available candidates need a page number;
                # NOTE(review): get_combined_1st_rank is assumed to return
                # (index, value) pairs ordered best first - confirm in plugin.
                count_dict = {
                    i: (score, int(sents[i].article_page))
                    for i, score in i_to_score.items()
                }
                ranked = plugin.get_combined_1st_rank(
                    count_dict, page_weight=self.plug, output_one=False
                )
            else:
                ranked = sorted(i_to_score.items(), key=lambda x: x[1], reverse=True)

            for i, _ in ranked:
                # A visited candidate never returns to the pool, whether it
                # is selected, filtered out or redundant.
                remaining.remove(i)
                if filter and not filter(sents[i]):
                    continue
                if self.is_redundant(i, selected, X):
                    continue
                selected.append(i)
                break
        return selected
示例#6
0
 def rank_dates(self, collection, plug=False):
     """Rank publication dates by how many articles appeared on them.

     Each article from ``collection.articles()`` counts once towards
     ``article.time.date()``. Without ``plug`` the dates are returned by
     descending article count (ties keep first-seen order). With a truthy
     ``plug`` each date is described by ``(count, page_total / count)``,
     where ``page_total`` sums ``article.page``, and the ordering is
     delegated to ``plugin.get_combined_1st_rank`` with ``plug`` as the
     page weight.

     Returns:
         List of dates, best first.
     """
     article_counts = collections.Counter()
     page_totals = collections.Counter()
     for article in collection.articles():
         day = article.time.date()
         article_counts[day] += 1
         if plug:
             page_totals[day] += article.page

     if not plug:
         # most_common() sorts by count, descending, keeping first-seen
         # order among equal counts.
         return [day for day, _ in article_counts.most_common()]

     count_dict = {}
     for day, n_articles in article_counts.items():
         count_dict[day] = (n_articles, page_totals[day] / n_articles)
     ordering = plugin.get_combined_1st_rank(
         count_dict, page_weight=plug, output_one=False
     )
     return [day for day, _ in ordering]
示例#7
0
 def rank_dates(self, collection, plug=False):
     """Rank dates by the score a trained model assigns to their features.

     Features come from ``self.extract_features`` and are L2-normalised
     per column (``axis=0``). When ``self.method`` is ``"classification"``
     the score is element 1 of each ``predict_proba`` row; otherwise it is
     the ``predict`` output. Without ``plug`` dates are sorted by score,
     highest first. With a truthy ``plug`` each date is described by
     ``(score, pages_to_count[date])`` and the ordering is delegated to
     ``plugin.get_combined_1st_rank`` with ``plug`` as the page weight.

     Returns:
         List of dates, best first.
     """
     pages_to_count = None
     if plug:
         dates, X, pages_to_count = self.extract_features(collection, plug)
     else:
         dates, X = self.extract_features(collection)

     X = normalize(X, norm="l2", axis=0)

     if self.method == "classification":
         Y = [row[1] for row in self.model["model"].predict_proba(X)]
     else:
         Y = self.model["model"].predict(X)

     if not plug:
         by_score = sorted(zip(dates, Y), key=lambda pair: pair[1], reverse=True)
         return [pair[0] for pair in by_score]

     count_dict = {}
     for position, day in enumerate(dates):
         count_dict[day] = (Y[position], pages_to_count[day])
     ordering = plugin.get_combined_1st_rank(
         count_dict, page_weight=plug, output_one=False
     )
     return [entry[0] for entry in ordering]
示例#8
0
    def rank(self, clusters, collection=None, vectorizer=None, plug=False):
        """Order clusters using size and how often their date is mentioned.

        First, every truthy ``sentence.get_date()`` across
        ``collection.articles()`` is counted. Clusters are then stably
        sorted by size and, after that, by the mention count of their most
        mentioned time (0 when a cluster has none). Finally:

        * without ``plug`` they are stably re-sorted by size, so mention
          count only breaks ties between equally sized clusters;
        * with a truthy ``plug`` each cluster is described by
          ``(size, page_sum / size)`` and the ordering is delegated to
          ``plugin.get_combined_1st_rank`` with ``plug`` as the page weight.

        ``vectorizer`` is accepted but not used here.
        """
        date_to_count = collections.defaultdict(int)
        for article in collection.articles():
            for sentence in article.sentences:
                mentioned = sentence.get_date()
                if mentioned:
                    date_to_count[mentioned] += 1

        def mention_count(cluster):
            # Number of sentences mentioning the cluster's own top date.
            if plug:
                top_time = cluster.most_mentioned_time_with_influence_rank(plug)
            else:
                top_time = cluster.most_mentioned_time()
            return date_to_count[top_time.date()] if top_time else 0

        # Both sorts are stable, so the size order survives among clusters
        # that share a mention count.
        by_size = sorted(clusters, reverse=True, key=len)
        by_mentions = sorted(by_size, reverse=True, key=mention_count)

        if not plug:
            return sorted(by_mentions, key=len, reverse=True)

        count_dict = {}
        for position, members in enumerate(by_mentions):
            size = len(members)
            count_dict[position] = (
                size,
                plugin.get_page_sum_from_cluster(members) / size,
            )
        ordering = plugin.get_combined_1st_rank(
            count_dict, page_weight=plug, output_one=False
        )
        return [by_mentions[position] for position, _ in ordering]