import numpy as np
import spacy

from gismo.embedding import Embedding

# Assumed to be provided by the surrounding package: make_gismo,
# make_sentences_wiki, is_relevant_sentence and cosine_similarity.


def make_scores(summaries: list, candidate_key: str, ref_key: str,
                evaluation_method, embedding: Embedding) -> list:
    """
    Computes scores with a given evaluation method for a list of candidate
    summaries and the corresponding reference summaries.

    Args:
        summaries: A `list` of dictionaries.
        candidate_key: The key of the candidate summary in the dictionaries.
        ref_key: The key of the reference summary in the dictionaries.
        evaluation_method: A function that computes a score (a float) from
            two vectors.
        embedding: An Embedding used to get the representative vectors of
            the summaries.

    Returns:
        A list containing the score of each summary.
    """
    measures = list()
    for summary in summaries:
        candidate_vect = (embedding.query_projection(
            summary[candidate_key]))[0].toarray()
        ref_vect = (embedding.query_projection(summary[ref_key]))[0].toarray()
        dim = np.size(ref_vect)
        measures.append(
            evaluation_method(candidate_vect.reshape(dim),
                              ref_vect.reshape(dim)))
    return measures
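
# Minimal usage sketch for make_scores (illustrative, not part of the
# original module). It assumes an Embedding already fitted on a corpus,
# and uses a plain cosine similarity as the evaluation method. The
# `summaries` entries and key names below are hypothetical.
def _example_make_scores(embedding: Embedding) -> list:
    summaries = [
        {"candidate": "the cat sat on the mat",
         "reference": "a cat was sitting on the mat"},
    ]

    def cosine(u, v):
        # Cosine similarity between two 1-D numpy vectors.
        norm = np.linalg.norm(u) * np.linalg.norm(v)
        return float(u @ v / norm) if norm else 0.0

    return make_scores(summaries, candidate_key="candidate",
                       ref_key="reference", evaluation_method=cosine,
                       embedding=embedding)
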
def summarize_cluster(
        documents: list,
        centroid,  # csr_matrix
        siblings_centroids,  # csr_matrix
        query: str = "",
        num_documents: int = None,
        num_sentences: int = None,
        ratio: float = 0.05,
        embedding: Embedding = None,
        is_documents_embedding: bool = False,
        num_keywords: int = 15,
        size_generic_query: int = 5,
        used_sentences: set = None,
        filter_sentences=is_relevant_sentence,
        get_content=lambda x: x["content"] + x["summary"]) -> list:
    """
    Extended summarizer that produces a list of sentences.

    Args:
        documents: A list of dicts corresponding to documents.
        centroid: The centroid of the cluster being summarized.
        siblings_centroids: The centroids of the sibling clusters.
        query: A string.
        num_documents: An int corresponding to the number of top documents
            to be taken into account for the summary.
        num_sentences: An int corresponding to the number of sentences
            wanted in the summary.
        ratio: A float in [0, 1] giving the length of the summary as a
            proportion of the length of the num_documents kept.
        embedding: An Embedding fitted on a bigger corpus than documents.
        is_documents_embedding: Flag passed to make_gismo indicating whether
            the embedding is fitted on the documents themselves.
        num_keywords: An int corresponding to the number of keywords
            returned (currently unused; the keywords step is disabled).
        size_generic_query: Number of top-ranked features used to build a
            generic query when `query` is empty.
        used_sentences: A set of "forbidden" sentences.
        filter_sentences: A function returning a bool, allowing the
            selection of a sentence.
        get_content: A function that retrieves a document's content.

    Returns:
        A list of the summary sentences.
    """
    nlp = spacy.load('en_core_web_sm')
    assert num_sentences or ratio
    assert isinstance(documents, list)
    if used_sentences is None:
        used_sentences = set()
    # Get the number of documents.
    if num_documents is None:
        num_documents = len(documents)
    else:
        num_documents = min(len(documents), num_documents)
    # Find the best documents.
    assert num_documents != 0
    if num_documents == 1:
        best_documents = [documents[0]]
    else:
        documents_gismo = make_gismo(
            documents=documents,
            other_embedding=embedding,
            is_documents_embedding=is_documents_embedding)
        documents_gismo.rank(query)
        best_documents = documents_gismo.get_documents_by_rank(
            k=num_documents)
        if query == "":
            # Build a generic query from the top-ranked features.
            query = " ".join(
                documents_gismo.get_features_by_rank(k=size_generic_query))
    # Split the best documents into sentences.
    contents_sentences = [
        sentence
        for document in best_documents
        for sentence in make_sentences_wiki(get_content(document))
    ]
    assert contents_sentences is not None
    # Scale the number of sentences proportionally to the total number
    # of sentences in the top documents.
    if num_sentences is None:
        num_sentences = max(int(ratio * len(contents_sentences)), 1)
    # Keep extra candidates so that the filtering below does not leave us
    # short of sentences.
    stretching_for_duplicates = 7
    # Compute the scores and select sentences: each sentence is rewarded
    # for its similarity to the cluster centroid and penalized for its
    # similarity to the sibling centroids.
    if siblings_centroids is not None:
        number_of_siblings = len(siblings_centroids)
    else:
        number_of_siblings = 0
    summary = sorted(
        [
            {
                "sentence": contents_sentences[i],
                "index": i,
                "score": 2 * (number_of_siblings + 1) * cosine_similarity(
                    embedding.query_projection(contents_sentences[i])[0],
                    centroid
                ) - sum([
                    cosine_similarity(
                        embedding.query_projection(contents_sentences[i])[0],
                        siblings_centroids[sibling_index]
                    )
                    for sibling_index in range(number_of_siblings)
                ])
            }
            for i in range(len(contents_sentences))
            if filter_sentences(contents_sentences[i])
            and contents_sentences[i] not in used_sentences
        ],
        key=lambda k: k["score"],
        reverse=True
    )[:(stretching_for_duplicates * num_sentences)]
    # Removing adverbs and nominal sentences, pronoun resolution.
    # The coreference step relies on the neuralcoref spacy extension
    # (the `._.has_coref` / `._.coref_resolved` attributes).
    sentences_to_remove = list()
    for sentence_dict in summary:
        sentence = nlp(sentence_dict["sentence"])
        # Drop a leading adverb (and the punctuation that follows it).
        if sentence[0].pos_ == "ADV" and len(sentence) > 1:
            if sentence[1].pos_ == "PUNCT":
                sentence = sentence[2:]
            else:
                sentence = sentence[1:]
            sentence_dict["sentence"] = sentence.text
        # Discard nominal sentences (no conjugated verb).
        if "VBZ" not in {token.tag_ for token in sentence}:
            sentences_to_remove.append(sentence_dict)
        # Note: could be an `elif` of the VBZ check above.
        if "PRP" in {token.tag_ for token in sentence}:
            # Resolve pronouns using up to two preceding sentences as
            # context (clamped so the slice never wraps around).
            i = int(sentence_dict["index"])
            extract_str = " ".join(contents_sentences[max(i - 2, 0):i + 1])
            extract = nlp(extract_str)
            if extract._.has_coref:
                resolved_extract = extract._.coref_resolved
                sentence_dict["sentence"] = make_sentences_wiki(
                    resolved_extract)[-1]
    summary = [
        sentence for sentence in summary
        if sentence not in sentences_to_remove
    ]
    return [
        sentence_dict["sentence"]
        for sentence_dict in summary[:num_sentences]
    ]  # Keyword extraction (the former second element of the returned
    # tuple) is currently disabled.
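
# Minimal usage sketch for summarize_cluster (illustrative, not part of
# the original module). The document fields, the cluster composition and
# the way the centroids are built below are assumptions: each centroid is
# simply the projection of the concatenated contents of a cluster, and
# the sibling centroids are passed as a list of one-row projections so
# that len() and integer indexing behave as the function expects. Running
# this requires the 'en_core_web_sm' spacy model (and neuralcoref for the
# pronoun-resolution step).
def _example_summarize_cluster(embedding: Embedding) -> list:
    cluster = [
        {"content": "Gismo ranks documents against a query.",
         "summary": ""},
        {"content": "The ranking relies on a diffusion over the corpus.",
         "summary": ""},
    ]
    sibling_cluster = [
        {"content": "Unrelated documents about cooking recipes.",
         "summary": ""},
    ]
    centroid = embedding.query_projection(
        " ".join(doc["content"] for doc in cluster))[0]
    siblings_centroids = [embedding.query_projection(
        " ".join(doc["content"] for doc in sibling_cluster))[0]]
    return summarize_cluster(
        documents=cluster,
        centroid=centroid,
        siblings_centroids=siblings_centroids,
        embedding=embedding,
        num_sentences=2,
    )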