Example #1
import numpy as np

from gismo.embedding import Embedding


def make_scores(summaries: list, candidate_key: str, ref_key: str,
                evaluation_method, embedding: Embedding) -> list:
    """
    Computes scores with a certain evaluation method for a list of candidate summaries and the corresponding reference summaries
    Args:
        summaries: A `list` of dictionnaries.
        candidate_key: A string corresponding to the candidate summary in the dictionnaries.
        ref_key: A string corresponding to the reference summary in the dictionnaries.
        evaluation_lethod: A function that computes the score (float from two vectors).
        embedding: An Embedding that will allow us to get the representative vectors of the summaries
    Returns:
        A list containing the score of each summary.
    """
    measures = list()
    for summary in summaries:
        candidate_vect = embedding.query_projection(
            summary[candidate_key])[0].toarray()
        ref_vect = embedding.query_projection(summary[ref_key])[0].toarray()
        dim = np.size(ref_vect)
        measures.append(
            evaluation_method(candidate_vect.reshape(dim),
                              ref_vect.reshape(dim)))
    return measures
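A minimal usage sketch for make_scores, assuming the usual gismo workflow of fitting an Embedding on a Corpus; the sample summaries and the cosine helper below are illustrative and not part of the original code.

import numpy as np
from gismo.corpus import Corpus
from gismo.embedding import Embedding


def cosine(u: np.ndarray, v: np.ndarray) -> float:
    # Plain cosine similarity between two 1-D vectors (illustrative scorer).
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))


# Toy data: each entry pairs a candidate summary with its reference.
summaries = [
    {"candidate": "the cat sat on the mat",
     "reference": "a cat was sitting on the mat"},
    {"candidate": "stocks rallied on friday",
     "reference": "the stock market rose sharply on friday"}
]

# Fit an Embedding on the texts so that query_projection knows the vocabulary.
corpus = Corpus(source=summaries,
                to_text=lambda s: s["candidate"] + " " + s["reference"])
embedding = Embedding()
embedding.fit_transform(corpus)

scores = make_scores(summaries,
                     candidate_key="candidate",
                     ref_key="reference",
                     evaluation_method=cosine,
                     embedding=embedding)
print(scores)  # one float per candidate/reference pair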
Example #2
import spacy

from gismo.embedding import Embedding

# The helpers used below (make_gismo, make_sentences_wiki, is_relevant_sentence,
# cosine_similarity) are assumed to come from the surrounding package; the
# coreference step additionally relies on the neuralcoref spaCy extension.


def summarize_cluster(
        documents: list,
        centroid,  # csr_matrix
        siblings_centroids,  # csr_matrix
        query: str = "",
        num_documents: int = None,
        num_sentences: int = None,
        ratio: float = 0.05,
        embedding: Embedding = None,
        is_documents_embedding: bool = False,
        num_keywords: int = 15,
        size_generic_query: int = 5,
        used_sentences: set = None,
        filter_sentences=is_relevant_sentence,
        get_content=lambda x: x["content"] + x["summary"]) -> list:
    """
    Extended summarizer that produces a list of sentences and a list of keywords.
    Args:
        documents: A list of dict corresponding to documents.
        query: A string.
        num_documents: An int corresponding to the number of top documents
                        to be taking into account for the summary.
        num_sentences: An int corresponding of the number of sentences wanted in the summary.
        ratio: A float in [0, 1] giving the length of the summary
                as a proportion of the length of the num_documents kept.
        embedding: An Embedding fitted on a bigger corpus than documents.
        num_keywords: An int corresponding to the number of keywords returned
        used_sentences: A set of "forbidden" sentences.
        filter_sentences: A function returning a bool, allowing the selection of a sentence.
        get_content: A function that allows the retrieval of a document's content.
        centroid: the centroid of the cluster that is summarized.
        centroid_siblings: the siblings centroids of the cluster that is summarized.
    Returns:
        A tuple containing:
            A list of the summary sentences,
            A list of keywords.
    """
    # Requires the spaCy model `en_core_web_sm` to be installed.
    nlp = spacy.load('en_core_web_sm')

    assert num_sentences or ratio
    assert isinstance(documents, list)

    if used_sentences is None:
        used_sentences = set()

    # Get number of documents
    if num_documents is None:
        num_documents = len(documents)
    else:
        num_documents = min(len(documents), num_documents)

    # Find best documents
    assert num_documents != 0
    if num_documents == 1:
        best_documents = [documents[0]]

    else:
        documents_gismo = make_gismo(
            documents=documents,
            other_embedding=embedding,
            is_documents_embedding=is_documents_embedding)
        documents_gismo.rank(query)
        best_documents = documents_gismo.get_documents_by_rank(k=num_documents)
        if query == "":
            query = " ".join(
                documents_gismo.get_features_by_rank(k=size_generic_query))
    #    print("finding best documents : ", time.clock() - start_time)
    # Split best document into sentences.
    #    start_time = time.clock()
    contents_sentences = [
        sentence for document in best_documents
        for sentence in make_sentences_wiki(get_content(document))
    ]

    assert contents_sentences is not None

    #    print("Splitting best docs in sentences : ", time.clock() - start_time)
    # Scale the number of sentences proportionally to the total number
    # of sentences in the top documents.
    if num_sentences is None:
        num_sentences = max(int(ratio * len(contents_sentences)), 1)

    # Keep more candidate sentences than requested so that sentences removed
    # by the later filtering steps can be compensated for.
    stretching_for_duplicates = 7

    # Score each candidate sentence by its similarity to the cluster centroid,
    # penalized by its similarity to the sibling centroids, and keep the best ones.
    if siblings_centroids is not None:
        # shape[0] is used because len() is ambiguous for a csr_matrix.
        number_of_siblings = siblings_centroids.shape[0]
    else:
        number_of_siblings = 0
    summary = sorted(
        [
            {
                "sentence": contents_sentences[i],
                "index": i,
                "score": 2 * (number_of_siblings + 1) * cosine_similarity(
                    embedding.query_projection(contents_sentences[i])[0],
                    centroid
                ) - sum([
                    cosine_similarity(
                        embedding.query_projection(contents_sentences[i])[0],
                        siblings_centroids[sibling_index]
                    )
                    for sibling_index in range(number_of_siblings)
                ])
            }
            for i in range(len(contents_sentences))
            if filter_sentences(contents_sentences[i]) and \
               (contents_sentences[i] not in used_sentences)
        ],
        key=lambda k: k["score"],
        reverse=True
    )[:(stretching_for_duplicates * num_sentences)]

    # Remove leading adverbs, drop nominal sentences and resolve pronouns.
    sentences_to_remove = list()
    for sentence_dict in summary:
        sentence = nlp(sentence_dict["sentence"])
        if sentence[0].pos_ == "ADV":
            if sentence[1].pos_ == "PUNCT":
                sentence = sentence[2:]
            else:
                sentence = sentence[1:]
            sentence_dict["sentence"] = sentence.text
        if "VBZ" not in {token.tag_ for token in sentence}:
            sentences_to_remove.append(sentence_dict)
        if "PRP" in {token.tag_ for token in sentence}:  # elif si VBZ ici
            i = int(sentence_dict["index"])
            extract_str = " ".join(
                [sentence for sentence in contents_sentences[i - 2:i + 1]])
            extract = nlp(extract_str)
            if extract._.has_coref:
                resolved_extract = extract._.coref_resolved
                sentence_dict["sentence"] = make_sentences_wiki(
                    resolved_extract)[-1]

    summary = [
        sentence_dict for sentence_dict in summary
        if sentence_dict not in sentences_to_remove
    ]

    return [
        sentence_dict["sentence"] for sentence_dict in summary[:num_sentences]
    ]
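A hedged usage sketch for summarize_cluster: the toy documents, the prefitted Embedding and the way the centroid is built (a simple mean of embedded documents) are assumptions made for illustration; in the real pipeline the centroid comes from the clustering step, and the spaCy model en_core_web_sm plus the neuralcoref extension must be installed.

from scipy.sparse import csr_matrix, vstack
from gismo.corpus import Corpus
from gismo.embedding import Embedding

# Toy documents shaped as expected by the default get_content.
docs = [
    {"content": "Paris is the capital of France. It hosts the Louvre museum.",
     "summary": "Paris is the capital of France."},
    {"content": "The Seine crosses Paris. The river flows into the English Channel.",
     "summary": "The Seine flows through Paris."}
]

# Fit an Embedding on the documents (stand-in for the real, larger corpus).
corpus = Corpus(source=docs, to_text=lambda d: d["content"] + " " + d["summary"])
embedding = Embedding()
embedding.fit_transform(corpus)

# Stand-in centroid: mean of the embedded documents of this cluster.
rows = vstack([embedding.query_projection(d["content"])[0] for d in docs])
centroid = csr_matrix(rows.mean(axis=0))

sentences = summarize_cluster(
    documents=docs,
    centroid=centroid,
    siblings_centroids=None,  # no sibling clusters in this toy example
    query="Paris",
    num_sentences=2,
    embedding=embedding)
for sentence in sentences:
    print("-", sentence)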