def __init__(self, x_embedding=None, y_embedding=None, filename=None, path=".", **kwargs):
    if filename is not None:
        self.load(filename=filename, path=path)
    else:
        # Cross embedding: the features of x_embedding act as documents,
        # the features of y_embedding act as features.
        embedding = Embedding()
        embedding.n = x_embedding.m
        embedding.m = y_embedding.m
        embedding.features = y_embedding.features
        embedding.x = np.dot(x_embedding.y, y_embedding.x)
        embedding.x_norm = np.ones(embedding.n)
        embedding.y = np.dot(y_embedding.y, x_embedding.x)
        embedding.y_norm = np.ones(embedding.m)
        embedding.idf = y_embedding.idf
        super().__init__(corpus=Corpus(x_embedding.features, to_text=lambda x: x),
                         embedding=embedding, **kwargs)
        # Keep the original projections so queries can be expressed in either space.
        self.x_projection = x_embedding.query_projection
        self.y_projection = y_embedding.query_projection
def my_gismo():
    corpus = Corpus(toy_source_dict, lambda x: x['content'])
    vectorizer = CountVectorizer(dtype=float)
    embedding = Embedding(vectorizer=vectorizer)
    embedding.fit_transform(corpus)
    gismo = Gismo(corpus, embedding)
    gismo.parameters.distortion = 0.0
    gismo.rank("Gizmo")
    return gismo
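# A minimal usage sketch (illustrative, not part of the original module) showing how
# the `my_gismo` fixture is typically exercised: rank a query, then retrieve the best
# documents and features. Only calls already used elsewhere in this code base
# (`rank`, `get_documents_by_rank`, `get_features_by_rank`) are assumed.
def example_my_gismo_usage():
    gismo = my_gismo()
    gismo.rank("Gizmo")
    best_docs = gismo.get_documents_by_rank(k=2)    # top documents from the toy corpus
    best_words = gismo.get_features_by_rank(k=5)    # top vocabulary terms for the query
    return best_docs, best_words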
def get_reduced_gismo(self, gismo, rebuild=True):
    reduced_corpus = Corpus(self.get_reduced_source(gismo, rebuild=rebuild),
                            to_text=gismo.corpus.to_text)
    reduced_embedding = Embedding(vectorizer=gismo.embedding.vectorizer)
    reduced_embedding.fit_transform(reduced_corpus)
    reduced_gismo = Gismo(reduced_corpus, reduced_embedding)
    reduced_gismo.parameters = gismo.parameters
    return reduced_gismo
def initialize_embedding(
        documents: list,
        stop_words: list = None,
        max_ngram: int = 1,
        min_df: float = 0.02,
        max_df: float = 0.85,
        document_to_text=simplified_document_to_string,
        preprocessor=None) -> Embedding:
    """
    Initializes an embedding and fits it on the documents.

    Parameters
    ----------
    documents: A `list` of `dict` representing documents with strings in the values.
    stop_words: A `list` of words to ignore in the vocabulary.
    max_ngram: The maximum length of n-grams to take into account (e.g. 2 to include bigrams in the vocabulary).
    min_df: Minimum document frequency for a word to enter the vocabulary; if an int, the word must appear in at least min_df documents.
    max_df: Maximum document frequency for a word to enter the vocabulary.
    document_to_text: Callback(Document) -> str.
    preprocessor: Optional callable passed to CountVectorizer to preprocess text before tokenization.

    Returns
    -------
    Embedding:
        The embedding fitted on the documents.
    """
    corpus = Corpus(documents, document_to_text)
    vectorizer = CountVectorizer(dtype=float, stop_words=stop_words,
                                 ngram_range=(1, max_ngram),
                                 min_df=min_df, max_df=max_df,
                                 preprocessor=preprocessor)
    embedding = Embedding(vectorizer=vectorizer)
    embedding.fit_transform(corpus)
    return embedding
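# A minimal usage sketch (illustrative, not from the original module): fit an embedding
# on a couple of toy documents. The `title`/`content` fields and the custom
# `document_to_text` lambda are assumptions made for this example only.
def example_initialize_embedding():
    docs = [
        {"title": "doc1", "content": "Gismo embeds documents and features."},
        {"title": "doc2", "content": "Embeddings support queries and summaries."},
    ]
    embedding = initialize_embedding(
        docs,
        min_df=1,                                # keep every word in this tiny corpus
        document_to_text=lambda d: d["content"])
    return embedding.features                    # fitted vocabulary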
def test_embedding_io():
    corpus = Corpus(toy_source_text)
    embedding = Embedding()
    embedding.fit_transform(corpus)
    assert embedding.features[3] == 'demon'
    with tempfile.TemporaryDirectory() as tmp:
        embedding.save(filename="test", path=tmp)
        new_embedding = Embedding(filename="test", path=tmp)
        assert new_embedding.features[3] == 'demon'
def build_sentence_gismo(self, itf=None, s_g_p=None):
    """
    Creates the Gismo of sentences (:attr:`~sisu.summarizer.Summarizer.sentence_gismo_`)

    Parameters
    ----------
    itf: :class:`bool`, optional
        Applies TF-IDTF embedding. If False, TF-IDF embedding is used.
    s_g_p: :class:`dict`
        Parameters for the sentence Gismo.

    Returns
    -------
    None
    """
    if itf is None:
        itf = self.parameters.itf
    if s_g_p is None:
        s_g_p = self.parameters.sentence_gismo_parameters
    sentence_corpus = Corpus(source=self.sentences_, to_text=lambda s: s['sanitized'])
    sentence_embedding = Embedding() if itf else IdfEmbedding()
    sentence_embedding.fit_ext(embedding=self.gismo.embedding)
    sentence_embedding.transform(sentence_corpus)
    self.sentence_gismo_ = Gismo(sentence_corpus, sentence_embedding, **s_g_p)
def make_scores(summaries: list, candidate_key: str, ref_key: str,
                evaluation_method, embedding: Embedding) -> list:
    """
    Computes scores with a given evaluation method for a list of candidate summaries
    and the corresponding reference summaries.

    Args:
        summaries: A `list` of dictionaries.
        candidate_key: The key of the candidate summary in the dictionaries.
        ref_key: The key of the reference summary in the dictionaries.
        evaluation_method: A function that computes a score (float) from two vectors.
        embedding: An Embedding used to get the representative vectors of the summaries.

    Returns:
        A list containing the score of each summary.
    """
    measures = list()
    for summary in summaries:
        candidate_vect = embedding.query_projection(summary[candidate_key])[0].toarray()
        ref_vect = embedding.query_projection(summary[ref_key])[0].toarray()
        dim = np.size(ref_vect)
        measures.append(
            evaluation_method(candidate_vect.reshape(dim), ref_vect.reshape(dim)))
    return measures
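# A minimal usage sketch (illustrative): score candidate summaries against references
# with a plain cosine similarity on dense vectors. The `fitted_embedding` argument and
# the toy summary dictionary are assumptions; any Embedding fitted on a larger corpus
# (e.g. via initialize_embedding above) would do.
import numpy as np

def cosine_score(u, v):
    # Cosine similarity between two 1-D numpy vectors; 0.0 when a vector is all zeros.
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

def example_make_scores(fitted_embedding):
    summaries = [
        {"candidate": "gum arabic exports rise", "reference": "gum arabic export currency"},
    ]
    return make_scores(summaries, candidate_key="candidate", ref_key="reference",
                       evaluation_method=cosine_score, embedding=fitted_embedding)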
def make_sent_gismo(self, query=None, txt=None, k=None, **kwargs):
    """
    Construct a sentence-level Gismo stored in the :py:attr:`sent_gismo` attribute.

    Parameters
    ----------
    query: str (optional)
        Query to run on the document-level Gismo.
    txt: str (optional)
        Text to use for sentence extraction. If not set, the sentences will be
        extracted from the top documents.
    k: int (optional)
        Number of top documents used for the build. If not set, the
        :py:func:`~gismo.common.auto_k` heuristic will be used.
    kwargs: dict
        Custom default runtime parameters to pass to the sentence-level Gismo.
        You just need to specify the parameters that differ from
        :obj:`~gismo.parameters.DEFAULT_PARAMETERS`. Note that distortion will be
        automatically deactivated. If you really want it, manually change the value
        of ``self.sent_gismo.parameters.distortion`` afterwards.

    Returns
    -------
    Sentencizer
    """
    if txt is None:
        if query is not None:
            self.doc_gismo.rank(query)
        txt = [(self.doc_gismo.corpus.to_text(self.doc_gismo.corpus[i]), i)
               for i in self.doc_gismo.get_documents_by_rank(k, post=False)]
    self.splitter(txt)
    local_embedding = Embedding()
    local_embedding.fit_ext(self.doc_gismo.embedding)
    local_embedding.transform(self.sent_corpus)
    self.sent_gismo = Gismo(self.sent_corpus, local_embedding, **kwargs)
    self.sent_gismo.parameters.distortion = 0.0
    self.sent_gismo.post_documents_item = lambda g, i: g.corpus.to_text(g.corpus[i])
    return self
def summarize_cluster(
        documents: list,
        centroid,  # csr_matrix
        siblings_centroids,  # csr_matrix
        query: str = "",
        num_documents: int = None,
        num_sentences: int = None,
        ratio: float = 0.05,
        embedding: Embedding = None,
        is_documents_embedding: bool = False,
        num_keywords: int = 15,
        size_generic_query: int = 5,
        used_sentences: set = None,
        filter_sentences=is_relevant_sentence,
        get_content=lambda x: x["content"] + x["summary"]) -> list:
    """
    Extended summarizer that produces the sentences of a cluster summary.

    Args:
        documents: A list of dict corresponding to documents.
        centroid: The centroid of the cluster that is summarized.
        siblings_centroids: The centroids of the sibling clusters.
        query: A string.
        num_documents: An int, the number of top documents taken into account for the summary.
        num_sentences: An int, the number of sentences wanted in the summary.
        ratio: A float in [0, 1] giving the length of the summary as a proportion of the length of the num_documents kept.
        embedding: An Embedding fitted on a bigger corpus than documents.
        is_documents_embedding: If True, `embedding` is already the embedding of `documents` and is reused as-is; otherwise it is only used as an external fit.
        num_keywords: An int, the number of keywords considered.
        size_generic_query: An int, the number of top features used as a generic query when `query` is empty.
        used_sentences: A set of "forbidden" sentences.
        filter_sentences: A function returning a bool, allowing the selection of a sentence.
        get_content: A function that allows the retrieval of a document's content.

    Returns:
        A list of the summary sentences.
    """
    nlp = spacy.load('en_core_web_sm')
    assert num_sentences or ratio
    assert isinstance(documents, list)
    if used_sentences is None:
        used_sentences = set()
    # Get number of documents
    if num_documents is None:
        num_documents = len(documents)
    else:
        num_documents = min(len(documents), num_documents)
    # Find best documents
    assert num_documents != 0
    if num_documents == 1:
        best_documents = [documents[0]]
    else:
        documents_gismo = make_gismo(
            documents=documents,
            other_embedding=embedding,
            is_documents_embedding=is_documents_embedding)
        documents_gismo.rank(query)
        best_documents = documents_gismo.get_documents_by_rank(k=num_documents)
        if query == "":
            query = " ".join(
                documents_gismo.get_features_by_rank(k=size_generic_query))
    # Split best documents into sentences.
    contents_sentences = [
        sentence
        for document in best_documents
        for sentence in make_sentences_wiki(get_content(document))
    ]
    assert contents_sentences is not None
    # Scale the number of sentences proportionally to the total number
    # of sentences in the top documents.
    if num_sentences is None:
        num_sentences = max(int(ratio * len(contents_sentences)), 1)
    stretching_for_duplicates = 7
    # Computation of the score and selection of sentences
    if siblings_centroids is not None:
        number_of_siblings = len(siblings_centroids)
    else:
        number_of_siblings = 0
    summary = sorted(
        [
            {
                "sentence": contents_sentences[i],
                "index": i,
                "score": 2 * (number_of_siblings + 1) * cosine_similarity(
                    embedding.query_projection(contents_sentences[i])[0],
                    centroid
                ) - sum(
                    cosine_similarity(
                        embedding.query_projection(contents_sentences[i])[0],
                        siblings_centroids[sibling_index]
                    )
                    for sibling_index in range(number_of_siblings)
                )
            }
            for i in range(len(contents_sentences))
            if filter_sentences(contents_sentences[i])
            and contents_sentences[i] not in used_sentences
        ],
        key=lambda k: k["score"],
        reverse=True
    )[:(stretching_for_duplicates * num_sentences)]
    # Removing adverbs and nominal sentences, pronoun resolution
    sentences_to_remove = list()
    for (sum_index, sentence_dict) in enumerate(summary):
        sentence = nlp(sentence_dict["sentence"])
        if sentence[0].pos_ == "ADV":
            if sentence[1].pos_ == "PUNCT":
                sentence = sentence[2:]
            else:
                sentence = sentence[1:]
            sentence_dict["sentence"] = sentence.text
        if "VBZ" not in {token.tag_ for token in sentence}:
            sentences_to_remove.append(sentence_dict)
        if "PRP" in {token.tag_ for token in sentence}:
            i = int(sentence_dict["index"])
            extract_str = " ".join(contents_sentences[i - 2:i + 1])
            extract = nlp(extract_str)
            if extract._.has_coref:
                resolved_extract = extract._.coref_resolved
                sentence_dict["sentence"] = make_sentences_wiki(resolved_extract)[-1]
    summary = [
        sentence for sentence in summary
        if sentence not in sentences_to_remove
    ]
    return [
        sentence_dict["sentence"]
        for sentence_dict in summary[:num_sentences]
    ]
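# A hedged sketch (not from the original code) of how the cluster inputs to
# summarize_cluster can be assembled: each centroid is taken here as the mean of the
# query projections of a cluster's documents. The clustering itself, the helper name
# `build_centroid`, and the "content" field are assumptions made for illustration.
def build_centroid(embedding, documents, get_content=lambda d: d["content"]):
    # Average of the (sparse, normalized) projections over the cluster.
    vectors = [embedding.query_projection(get_content(d))[0] for d in documents]
    return sum(vectors) / len(vectors)

def example_summarize_cluster(embedding, cluster_docs, sibling_clusters):
    centroid = build_centroid(embedding, cluster_docs)
    siblings_centroids = [build_centroid(embedding, docs) for docs in sibling_clusters]
    return summarize_cluster(
        cluster_docs,
        centroid,
        siblings_centroids,
        num_sentences=3,
        embedding=embedding)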
def old_make_gismo(
        documents: list,
        alpha: float = .2,
        other_embedding: Embedding = None,
        is_documents_embedding: bool = False,
        document_to_text=simplified_document_to_string) -> Gismo:
    """
    Make a Gismo object from a list of documents.

    Args:
        documents: A `list` of documents with strings in the values.
        alpha: A `float` in [0, 1] indicating the damping factor used in the D-iteration used by Gismo.
        other_embedding: An embedding already fitted on a corpus.
        is_documents_embedding: If True, `other_embedding` is already the embedding of `documents` and is copied; otherwise it is only used as an external fit.
        document_to_text: Callback(Document) -> str.

    Returns:
        A Gismo object made from the given documents and embedding.
    """
    def post_document(gismo: Gismo, i: int) -> dict:
        return gismo.corpus[i]

    corpus = Corpus(documents, document_to_text)
    if other_embedding is None:
        vectorizer = CountVectorizer(dtype=float)
        embedding = Embedding(vectorizer=vectorizer)
        embedding.fit_transform(corpus)
    else:
        if is_documents_embedding:
            # The provided embedding already matches `documents`: reuse it directly.
            embedding = copy.copy(other_embedding)
        else:
            # Fit on the external embedding's vocabulary, then transform this corpus.
            embedding = Embedding()
            embedding.fit_ext(other_embedding)
            embedding.transform(corpus)
    gismo = Gismo(corpus, embedding)
    gismo.post_document = post_document
    gismo.diteration.alpha = alpha
    return gismo
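# A minimal sketch (illustrative) of the two ways old_make_gismo can be used: either let
# it fit its own CountVectorizer-based embedding, or pass an embedding fitted on a larger
# corpus so that only `transform` runs on the new documents. `big_corpus_embedding` is an
# assumed, pre-fitted Embedding; the query string is a placeholder.
def example_old_make_gismo(documents, big_corpus_embedding=None):
    if big_corpus_embedding is None:
        gismo = old_make_gismo(documents)                              # self-contained fit
    else:
        gismo = old_make_gismo(documents,
                               other_embedding=big_corpus_embedding)   # reuse external fit
    gismo.rank("query terms")
    return gismo.get_documents_by_rank(k=5)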
def summarize(documents, query="", num_documents=None, num_sentences=None,
              ratio=0.05, embedding=None, num_keywords: int = 15,
              size_generic_query: int = 5, used_sentences: set = None,
              get_content=lambda x: x["content"]) -> tuple:
    """
    Produces a list of sentences and a list of keywords.

    Parameters
    ----------
    documents: :class:`list`
        A list of documents.
    query: :class:`str`, optional
        Textual query to focus the summary on one subject.
    num_documents: :class:`int`, optional
        Number of top documents taken into account for the summary.
    num_sentences: :class:`int`, optional
        Number of sentences wanted in the summary. Overrides ratio.
    ratio: :class:`float` in ]0, 1], optional
        Length of the summary as a proportion of the length of the num_documents kept.
    embedding: :class:`~gismo.embedding.Embedding`, optional
        An Embedding fitted on a bigger corpus than documents.
    num_keywords: :class:`int`, optional
        Number of keywords returned.
    size_generic_query: :class:`int`, optional
        Number of top keywords used as a generic query when `query` is empty.
    used_sentences: :class:`set`, optional
        A set of "forbidden" sentences. Will be updated inplace.
    get_content: callable, optional
        A function that allows the retrieval of a document's content.

    Returns
    -------
    :class:`tuple`
        A list of the summary sentences and a list of keywords.

    Examples
    --------
    >>> from gismo.datasets.reuters import get_reuters_news
    >>> summarize(get_reuters_news(), num_documents=10, num_sentences=4) # doctest: +NORMALIZE_WHITESPACE
    (['Gum arabic has a history dating back to ancient times.',
    'Hungry nomads pluck gum arabic as they pass with grazing goats and cattle.',
    'For impoverished sub-Saharan states producing the bulk of world demand, gum arabic simply means export currency.',
    "After years of war-induced poverty, gum arabic is offering drought-stricken Chad's rural poor a lifeline to the production plants of the world's food and beverage giants."],
    ['norilsk', 'icewine', 'amiel', 'gum', 'arabic', 'her', 'tibet', 'chad', 'deng', 'oil', 'grapes', 'she', 'his', 'czechs', 'chechnya'])
    >>> summarize(get_reuters_news(), query="Ericsson", num_documents=10, num_sentences=5) # doctest: +NORMALIZE_WHITESPACE
    (['The restraints are few in areas such as consumer products, while in sectors such as banking, distribution and insurance, foreign firms are kept on a very tight leash.',
    'These latest wins follow a recent $350 million contract win with Telefon AB L.M.',
    'Pocket is the first from the high-priced 1996 auction known to have filed for bankruptcy protection.',
    '"That is, assuming the deal is done right," she added.',
    '"Generally speaking, the easiest place to make a profit tends to be in the consumer industry, usually fairly small-scale operations," said Anne Stevenson-Yang, director of China operations for the U.S.-China Business Council.'],
    ['ericsson', 'sweden', 'motorola', 'telecommuncation', 'communciation', 'bolstering', 'priced', 'sectors', 'makers', 'equipment', 'schaumberg', 'lm', 'done', 'manufacturing', 'consumer'])
    """
    if used_sentences is None:
        used_sentences = set()
    if num_documents is None:
        num_documents = len(documents)
    doc_corpus = Corpus(source=documents, to_text=get_content)
    if embedding:
        doc_embedding = Embedding()
        doc_embedding.fit_ext(embedding)
        doc_embedding.transform(corpus=doc_corpus)
    else:
        vectorizer = CountVectorizer(dtype=float)
        doc_embedding = Embedding(vectorizer=vectorizer)
        doc_embedding.fit_transform(corpus=doc_corpus)
    documents_gismo = Gismo(corpus=doc_corpus, embedding=doc_embedding, alpha=.2)
    documents_gismo.rank(query)
    best_documents = documents_gismo.get_documents_by_rank(k=num_documents)
    # Split best documents into sentences. Remove duplicates.
    contents_sentences = sorted({
        sentence
        for document in best_documents
        for sentence in make_sentences(get_content(document))
    })
    # Scale the number of sentences proportionally to the total number
    # of sentences in the top documents.
    if num_sentences is None:
        num_sentences = int(ratio * len(contents_sentences))
    # Prepare the sentence-based gismo.
    sent_corpus = Corpus(source=contents_sentences)
    sent_embedding = Embedding()
    if embedding:
        sent_embedding.fit_ext(embedding)
    else:
        sent_embedding.fit_ext(doc_embedding)
    sent_embedding.transform(corpus=sent_corpus)
    sentences_gismo = Gismo(corpus=sent_corpus, embedding=sent_embedding, alpha=.2)
    sentences_gismo.rank(query)
    keywords = sentences_gismo.get_features_by_rank(k=num_keywords)
    if query == "":
        sentences_gismo.rank(" ".join(keywords[:size_generic_query]))
    # List of sentence indices by decreasing relevance.
    sentences_ranks = sentences_gismo.diteration.x_order
    num_kept_sentences = 0
    i = 0
    ranked_sentences = list()
    while num_kept_sentences < num_sentences and i < len(contents_sentences):
        sentence = contents_sentences[sentences_ranks[i]]
        if sentence not in used_sentences and is_relevant_sentence(sentence):
            used_sentences.add(sentence)
            ranked_sentences.append(sentence)
            num_kept_sentences += 1
        i += 1
    return ranked_sentences, keywords
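# A minimal usage sketch (illustrative, not from the original module): run the flat
# summarizer on a small in-memory corpus. The toy documents and the "content" field are
# assumptions; any list of dicts compatible with `get_content` works. Depending on the
# relevance filter, fewer sentences than requested may come back on such a tiny corpus.
def example_summarize():
    docs = [
        {"content": "Gum arabic is harvested in Chad. It is exported worldwide."},
        {"content": "Gum arabic prices depend on drought and on demand from food giants."},
    ]
    used = set()  # shared across calls to avoid repeating sentences
    sentences, keywords = summarize(docs, query="gum arabic",
                                    num_sentences=2, used_sentences=used)
    return sentences, keywords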
def make_tree(documents: list,
              query: str = "",
              depth: int = 1,
              trees: list = None,
              documents_gismo: Gismo = None,
              num_documents: int = None,
              num_sentences: int = None,
              embedding: Embedding = None,
              used_sentences: set = None) -> list:
    r"""
    Builds a hierarchical summary.

    Parameters
    ----------
    documents: :class:`list` of :class:`dict`
        A list of dict corresponding to documents; only the values of the "content" key will be summarized.
    query: :class:`str`, optional
        Textual query to focus the summary on one subject.
    depth: :class:`int`, optional
        An int giving the depth of the summary (depth one is a sequential summary).
    trees: :class:`list`, optional
        A list of dict being completed; needed for the recursion.
    documents_gismo: :class:`~gismo.gismo.Gismo`
        Pre-existing Gismo.
    num_documents: :class:`int`, optional
        Number of top documents taken into account for the summary.
    num_sentences: :class:`int`, optional
        Number of sentences wanted in the summary.
    embedding: :class:`~gismo.embedding.Embedding`, optional
        An Embedding fitted on a bigger corpus than documents.
    used_sentences: :class:`set`, optional
        A set of "forbidden" sentences. Will be updated inplace.

    Returns
    -------
    :class:`list` of :class:`dict`
        A list of dict corresponding to the hierarchical summary.

    Examples
    --------
    >>> from gismo.datasets.reuters import get_reuters_news
    >>> make_tree(get_reuters_news(), query="Orange", num_documents=10, num_sentences=3, depth=2) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    [{'text': 'But some analysts still believe Orange is overvalued.',
    'current_keywords': ['orange', 'one', 'is', 'at', 'on', 'in', 'and', 'its', 'shares', 'has', 'analysts', 'of', 'market', 'believe', 'overvalued'],
    'url': None,
    'children': [{'text': 'Trading sources said China was staying out of the market, and that Indian meal was currently overvalued by a good $20 a tonne.',
    'current_keywords': ['orange', 'overvalued', 'analysts', 'that', 'and', 'are', 'compared', 'believe', 'market', 'but', 'some', 'still', 'of', 'said', 'we'],
    'url': None, 'children': []},
    {'text': 'Since the purchase, widely seen by analysts as overvalued, Quaker has struggled with the line of ready-to-drink teas and juices.',
    'current_keywords': ['orange', 'overvalued', 'analysts', 'that', 'and', 'are', 'compared', 'believe', 'market', 'but', 'some', 'still', 'of', 'said', 'we'],
    'url': None, 'children': []},
    {'text': '"No question that if the dollar continues to be overvalued and continues to be strong, we\'ll see some price erosion later in the year."',
    'current_keywords': ['orange', 'overvalued', 'analysts', 'that', 'and', 'are', 'compared', 'believe', 'market', 'but', 'some', 'still', 'of', 'said', 'we'],
    'url': None, 'children': []}]},
    {'text': 'Orange shares were 2.5p higher at 188p on Friday.',
    'current_keywords': ['orange', 'one', 'is', 'at', 'on', 'in', 'and', 'its', 'shares', 'has', 'analysts', 'of', 'market', 'believe', 'overvalued'],
    'url': None,
    'children': [{'text': 'Orange, Calif.-based Bergen is the largest U.S. distributor of generic drugs, while Miami-based Ivax is a generic drug manufacturing giant.',
    'current_keywords': ['orange', 'higher', 'shares', 'friday', 'on', 'at', 'and', 'in', 'its', 'of', 'percent', 'one', 'mobile', 'to', 'market'],
    'url': None, 'children': []},
    {'text': 'One-2-One and Orange ORA.L, which offer only digital services, are due to release their connection figures next week.',
    'current_keywords': ['orange', 'higher', 'shares', 'friday', 'on', 'at', 'and', 'in', 'its', 'of', 'percent', 'one', 'mobile', 'to', 'market'],
    'url': None, 'children': []},
    {'text': "Dodd noted that BT's plans to raise the price of calls to Orange and One 2 One handsets would be beneficial.",
    'current_keywords': ['orange', 'higher', 'shares', 'friday', 'on', 'at', 'and', 'in', 'its', 'of', 'percent', 'one', 'mobile', 'to', 'market'],
    'url': None, 'children': []}]},
    {'text': 'Orange already has a full roaming agreement in Germany and a partial one in France, centred on Paris.',
    'current_keywords': ['orange', 'one', 'is', 'at', 'on', 'in', 'and', 'its', 'shares', 'has', 'analysts', 'of', 'market', 'believe', 'overvalued'],
    'url': None,
    'children': [{'text': 'Orange says its offer of roaming services between the UK and other countries is part of its aim to provide customers with the best value for money.',
    'current_keywords': ['orange', 'roaming', 'partial', 'centred', 'paris', 'france', 'germany', 'agreement', 'full', 'on', 'and', 'in', 'of', 'for', 'with'],
    'url': None, 'children': []},
    {'text': 'As with all roaming agreements, the financial details of the Swiss deal remain a trade secret.',
    'current_keywords': ['orange', 'roaming', 'partial', 'centred', 'paris', 'france', 'germany', 'agreement', 'full', 'on', 'and', 'in', 'of', 'for', 'with'],
    'url': None, 'children': []},
    {'text': '"We look forward in 1997 to continuing to move ahead and to extending our international service through new roaming agreements and the introduction of dual band handsets."',
    'current_keywords': ['orange', 'roaming', 'partial', 'centred', 'paris', 'france', 'germany', 'agreement', 'full', 'on', 'and', 'in', 'of', 'for', 'with'],
    'url': None, 'children': []}]}]
    """
    num_keywords = 15
    if used_sentences is None:
        used_sentences = set()
    if depth == 0:
        return list()
    if documents_gismo is None:
        doc_corpus = Corpus(source=documents, to_text=simplified_document_to_string)
        if embedding:
            doc_embedding = Embedding()
            doc_embedding.fit_ext(embedding)
            doc_embedding.transform(corpus=doc_corpus)
        else:
            vectorizer = CountVectorizer(dtype=float)
            doc_embedding = Embedding(vectorizer=vectorizer)
            doc_embedding.fit_transform(corpus=doc_corpus)
        documents_gismo = Gismo(corpus=doc_corpus, embedding=doc_embedding, alpha=.2)
    documents_gismo.rank(query)
    best_documents = [
        (i, documents_gismo.corpus[i])
        for i in documents_gismo.diteration.x_order[:num_documents]
    ]  # documents_gismo.get_documents_by_rank(k=num_documents)
    sentences_dictionaries = [
        {
            "sentence": sentence,
            "url": document.get("url"),
            "doc_index": i,
        }
        for i, document in best_documents
        for sentence in list(OrderedDict.fromkeys(make_sentences(document["content"])))
    ]
    sent_corpus = Corpus(source=sentences_dictionaries, to_text=lambda s: s['sentence'])
    if embedding:
        sent_embedding = Embedding()
        sent_embedding.fit_ext(embedding)
        sent_embedding.transform(corpus=sent_corpus)
    else:
        vectorizer = CountVectorizer(dtype=float)
        sent_embedding = Embedding(vectorizer=vectorizer)
        sent_embedding.fit_transform(corpus=sent_corpus)
    sentences_gismo = Gismo(corpus=sent_corpus, embedding=sent_embedding, alpha=.2)
    sentences_gismo.rank(query)
    keywords = sentences_gismo.get_features_by_rank(k=num_keywords)
    sentences_ranks = sentences_gismo.diteration.x_order
    num_kept_sentences = 0
    ranked_sentences_dict = list()
    for rank in sentences_ranks:
        sentence_dict = sentences_dictionaries[rank]
        sentence = sentence_dict["sentence"]
        if sentence not in used_sentences and is_relevant_sentence(sentence):
            ranked_sentences_dict.append(sentence_dict)
            used_sentences.add(sentence)
            num_kept_sentences += 1
            if num_kept_sentences >= num_sentences:
                break
    children = ranked_sentences_dict
    return [{
        "text": child["sentence"],
        "current_keywords": keywords,
        "url": child.get("url"),
        "children": make_tree(trees=trees,
                              depth=depth - 1,
                              documents_gismo=documents_gismo,
                              documents=documents,
                              query=make_query(" ".join([query, child["sentence"]])),
                              num_sentences=num_sentences,
                              embedding=embedding,
                              used_sentences=used_sentences)
    } for child in children]