Example #1
def summarize(title, text, summary_length=5):
    """
    Summarizes a single document.

    Args:
        | title (str)   -- the document title
        | text (str)    -- the document text
        | summary_length (int)  -- the preferred length of the summary, in sentences (default=5)

    Returns:
        | summary (list)    -- list of sentences selected for the summary.

    Currently uses a modified version of `PyTeaser <https://github.com/xiaoxu193/PyTeaser>`_, which is based on `TextTeaser <https://github.com/MojoJolo/textteaser>`_.
    """
    summary = []
    keys = keywords(text)
    title_tokens = tokenize(title)

    # Score sentences and use the top selections.
    ranks = score(sent_tokenize(text), title_tokens,
                  keys).most_common(summary_length)
    for rank in ranks:
        summary.append(rank[0])

    return summary
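
A minimal usage sketch (not part of the original listing): the sample title and text below are made up, and it assumes `summarize` and its helpers are importable from the module these snippets come from.

# Hypothetical usage of summarize(); everything below is illustrative.
title = "City council approves new transit plan"
text = (
    "The city council voted on Tuesday to approve a new transit plan. "
    "The plan adds three bus lines and extends service hours. "
    "Critics argued the plan does not address rising fares. "
    "Supporters said it will reduce traffic congestion downtown. "
    "Construction on the first new line is expected to begin next year."
)

# Returns up to `summary_length` of the highest-scoring sentences.
top_sentences = summarize(title, text, summary_length=3)
for sentence in top_sentences:
    print(sentence)
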
Example #2
def keywords(text):
    """
    Gets the top 10 keywords and their frequency-based scores
    from a document.
    Sorts them in descending order by score.
    """
    from operator import itemgetter  # for sorting

    text = sub(r'[^\w ]', '', text)  # strip special chars
    text_with_stops = [x.strip('.').lower() for x in text.split()]

    numWords = len(text_with_stops)
    text = tokenize(text)
    freq = Counter()
    for word in text:
        freq[word] += 1

    minSize = min(10, len(freq))
    keywords = dict(freq.most_common(minSize))  # take up to the top 10

    for k in keywords:
        articleScore = keywords[k] * 1.0 / numWords
        keywords[k] = articleScore * 1.5 + 1

    keywords = sorted(keywords.items(), key=itemgetter(1), reverse=True)
    return dict(keywords)
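
An illustrative sketch (not from the original source) of what `keywords` produces: a dict mapping each top term to a frequency-based score, which can be passed straight into `score` below. The sample text is made up.

# Hypothetical usage of keywords().
sample = ("The cat sat on the mat. The cat chased the mouse. "
          "Later the mouse hid under the mat.")
scores = keywords(sample)

# scores is a dict like {'cat': ..., 'mat': ..., 'mouse': ...},
# where each value is (occurrences / total words) * 1.5 + 1.
for word, weight in scores.items():
    print(word, round(weight, 3))
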
Example #3
def score(sentences, title_words, keywords):
    """
    Score sentences based on their features.

    Args:
        | sentences (list)      -- list of sentences to score
        | title_words (list)    -- list of words in the title
        | keywords (dict)       -- dict of keyword scores from the document (as returned by keywords())
    """
    num_sentences = len(sentences)
    ranks = Counter()
    for i, s in enumerate(sentences):
        sentence = tokenize(s)

        # Calculate features.
        title_score = score_title(title_words, sentence)
        s_length = sentence_length(sentence)
        s_position = sentence_position(i + 1, num_sentences)
        sbs_feature = sbs(sentence, keywords)
        dbs_feature = dbs(sentence, keywords)
        frequency = (sbs_feature + dbs_feature) / 2.0 * 10.0

        # Weighted average of feature scores.
        total_score = (title_score * 1.5 + frequency * 2.0 + s_length * 1.0 +
                       s_position * 1.0) / 4.0
        ranks[s] = total_score
    return ranks
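
A sketch of how `keywords` and `score` fit together (illustrative only; `sent_tokenize` and `tokenize` are the same helpers the snippets above rely on, and the text is made up).

# Hypothetical pipeline wiring keywords() into score().
title = "Cats and mice"
text = ("The cat watched the garden for hours. "
        "A mouse crept out from under the fence. "
        "The cat chased the mouse across the yard.")

keys = keywords(text)                    # dict of keyword -> score
sentences = sent_tokenize(text)          # split the document into sentences
ranks = score(sentences, tokenize(title), keys)

# ranks is a Counter keyed by sentence; most_common() yields the best candidates.
for sentence, sentence_score in ranks.most_common(2):
    print(round(sentence_score, 3), sentence)
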
Example #4
def multisummarize(docs, summary_length=5):
    """
    Summarizes multiple documents.

    Args:
        | docs (list)           -- list of documents (i.e. texts)
        | summary_length (int)  -- the preferred length of the summary, in sentences (default=5)

    .. note::
        The current implementation is super naive,
        so the quality and coherence of its summaries are pretty damn terrible.
        But its purpose for now is to provide *some* API for
        multi-document summarization.

    Returns:
        | summary (list)    -- list of sentences selected for the summary.

    .. note::
        BTW: this is super slow; it takes well over a minute for 4 moderately-sized documents.
    """
    # Collect all sentences from the input documents.
    # Also collect position information about each sentence.
    sents = []
    for doc in docs:
        sents += [(sent, vectorize(sent), pos + 1)
                  for pos, sent in enumerate(sent_tokenize(doc))]
    clusters = []

    # Cluster the sentences.
    for sent in sents:
        # sent = (sent, vec, pos)

        # Keep track of the maximum scoring cluster
        # (above some minimum similarity)
        # and the avg sim score.
        # The higher the min_sim,
        # the harder it is to join a cluster.
        min_sim = 0.2
        max_cluster = None, min_sim
        for cluster in clusters:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += (1 - cosine(sent[1], sent_c[1]))
            avg_sim = avg_sim / len(cluster)
            if avg_sim >= max_cluster[1]:
                max_cluster = cluster, avg_sim

        # If a cluster was found,
        # add the sentence to it
        if max_cluster[0]:
            max_cluster[0].append(sent)

        # Otherwise, create a new cluster.
        else:
            clusters.append([sent])

    # Rank the clusters.
    # Assuming that clusters with more sentences are more important,
    # take the top `summary_length`.
    ranked_clusters = sorted(clusters, key=lambda x: -len(x))[:summary_length]

    # For each sentence cluster, select the highest scoring sentence.
    # Again - very naive.
    ideal_length = 20
    summary_sentences = []
    for cluster in ranked_clusters:
        max_sent = '', 0
        for sent in cluster:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += 1 - cosine(sent[1], sent_c[1])
            avg_sim = avg_sim / len(cluster)
            pos = sent[2]
            length = fabs(ideal_length - len(tokenize(sent[0]))) / ideal_length

            # Score is the average similarity penalized by distance from ideal length,
            # weighted by the inverse of the position.
            score = (avg_sim - length / 2) / pos
            if score >= max_sent[1]:
                max_sent = sent[0], score
        summary_sentences.append(max_sent[0])

    return summary_sentences
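
A minimal usage sketch for `multisummarize` (illustrative; it assumes `vectorize`, `cosine`, `fabs`, and the tokenizers used above are available from the same module, and the documents are made up).

# Hypothetical usage of multisummarize() on a small set of documents.
docs = [
    "Cats are small domesticated animals. They are popular pets. "
    "Cats tend to sleep for most of the day.",
    "Dogs are loyal companions. Many households keep dogs as pets. "
    "Dogs need daily walks and attention.",
    "Both cats and dogs are common household pets. "
    "Pet ownership has grown steadily over the past decade.",
]

# Returns one representative sentence per cluster, up to `summary_length` sentences.
summary = multisummarize(docs, summary_length=3)
print(" ".join(summary))
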