def __init__(self, x_embedding=None, y_embedding=None, filename=None, path=".", **kwargs):
    if filename is not None:
        self.load(filename=filename, path=path)
    else:
        # Cross embedding: the features of x_embedding act as documents,
        # the features of y_embedding act as features.
        embedding = Embedding()
        embedding.n = x_embedding.m
        embedding.m = y_embedding.m
        embedding.features = y_embedding.features
        embedding.x = np.dot(x_embedding.y, y_embedding.x)
        embedding.x_norm = np.ones(embedding.n)
        embedding.y = np.dot(y_embedding.y, x_embedding.x)
        embedding.y_norm = np.ones(embedding.m)
        embedding.idf = y_embedding.idf
        super().__init__(corpus=Corpus(x_embedding.features, to_text=lambda x: x),
                         embedding=embedding, **kwargs)
        # Keep the original projections so queries can be expressed in either space.
        self.x_projection = x_embedding.query_projection
        self.y_projection = y_embedding.query_projection
def my_gismo():
    corpus = Corpus(toy_source_dict, lambda x: x['content'])
    vectorizer = CountVectorizer(dtype=float)
    embedding = Embedding(vectorizer=vectorizer)
    embedding.fit_transform(corpus)
    gismo = Gismo(corpus, embedding)
    gismo.parameters.distortion = 0.0
    gismo.rank("Gizmo")
    return gismo
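# A minimal usage sketch (illustrative, not part of the original module) showing how
# the `my_gismo` fixture is typically exercised: rank a query, then retrieve the best
# documents and features. Only calls already used elsewhere in this code base
# (`rank`, `get_documents_by_rank`, `get_features_by_rank`) are assumed.
def example_my_gismo_usage():
    gismo = my_gismo()
    gismo.rank("Gizmo")
    best_docs = gismo.get_documents_by_rank(k=2)    # top documents from the toy corpus
    best_words = gismo.get_features_by_rank(k=5)    # top vocabulary terms for the query
    return best_docs, best_words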
def get_reduced_gismo(self, gismo, rebuild=True):
    reduced_corpus = Corpus(self.get_reduced_source(gismo, rebuild=rebuild),
                            to_text=gismo.corpus.to_text)
    reduced_embedding = Embedding(vectorizer=gismo.embedding.vectorizer)
    reduced_embedding.fit_transform(reduced_corpus)
    reduced_gismo = Gismo(reduced_corpus, reduced_embedding)
    reduced_gismo.parameters = gismo.parameters
    return reduced_gismo
def initialize_embedding(
        documents: list,
        stop_words: list = None,
        max_ngram: int = 1,
        min_df: float = 0.02,
        max_df: float = 0.85,
        document_to_text=simplified_document_to_string,
        preprocessor=None) -> Embedding:
    """
    Initializes an embedding and fits it on the documents.

    Parameters
    ----------
    documents: A `list` of `dict` representing documents with strings in the values.
    stop_words: A `list` of words to ignore in the vocabulary.
    max_ngram: The maximum length of n-grams to take into account (e.g. 2 to include bigrams in the vocabulary).
    min_df: Minimum document frequency for a word to enter the vocabulary; if an int, the word must appear in at least min_df documents.
    max_df: Maximum document frequency for a word to enter the vocabulary.
    document_to_text: Callback(Document) -> str.
    preprocessor: Optional callable passed to CountVectorizer to preprocess text before tokenization.

    Returns
    -------
    Embedding:
        The embedding fitted on the documents.
    """
    corpus = Corpus(documents, document_to_text)
    vectorizer = CountVectorizer(dtype=float, stop_words=stop_words,
                                 ngram_range=(1, max_ngram),
                                 min_df=min_df, max_df=max_df,
                                 preprocessor=preprocessor)
    embedding = Embedding(vectorizer=vectorizer)
    embedding.fit_transform(corpus)
    return embedding
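# A minimal usage sketch (illustrative, not from the original module): fit an embedding
# on a couple of toy documents. The `title`/`content` fields and the custom
# `document_to_text` lambda are assumptions made for this example only.
def example_initialize_embedding():
    docs = [
        {"title": "doc1", "content": "Gismo embeds documents and features."},
        {"title": "doc2", "content": "Embeddings support queries and summaries."},
    ]
    embedding = initialize_embedding(
        docs,
        min_df=1,                                # keep every word in this tiny corpus
        document_to_text=lambda d: d["content"])
    return embedding.features                    # fitted vocabulary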
def test_embedding_io():
    corpus = Corpus(toy_source_text)
    embedding = Embedding()
    embedding.fit_transform(corpus)
    assert embedding.features[3] == 'demon'
    with tempfile.TemporaryDirectory() as tmp:
        embedding.save(filename="test", path=tmp)
        new_embedding = Embedding(filename="test", path=tmp)
        assert new_embedding.features[3] == 'demon'
def build_sentence_gismo(self, itf=None, s_g_p=None):
    """
    Creates the Gismo of sentences (:attr:`~sisu.summarizer.Summarizer.sentence_gismo_`)

    Parameters
    ----------
    itf: :class:`bool`, optional
        Applies TF-IDTF embedding. If False, TF-IDF embedding is used.
    s_g_p: :class:`dict`
        Parameters for the sentence Gismo.

    Returns
    -------
    None
    """
    if itf is None:
        itf = self.parameters.itf
    if s_g_p is None:
        s_g_p = self.parameters.sentence_gismo_parameters
    sentence_corpus = Corpus(source=self.sentences_, to_text=lambda s: s['sanitized'])
    sentence_embedding = Embedding() if itf else IdfEmbedding()
    sentence_embedding.fit_ext(embedding=self.gismo.embedding)
    sentence_embedding.transform(sentence_corpus)
    self.sentence_gismo_ = Gismo(sentence_corpus, sentence_embedding, **s_g_p)
def make_scores(summaries: list, candidate_key: str, ref_key: str,
                evaluation_method, embedding: Embedding) -> list:
    """
    Computes scores with a given evaluation method for a list of candidate summaries
    and the corresponding reference summaries.

    Args:
        summaries: A `list` of dictionaries.
        candidate_key: The key of the candidate summary in the dictionaries.
        ref_key: The key of the reference summary in the dictionaries.
        evaluation_method: A function that computes a score (float) from two vectors.
        embedding: An Embedding used to get the representative vectors of the summaries.

    Returns:
        A list containing the score of each summary.
    """
    measures = list()
    for summary in summaries:
        candidate_vect = embedding.query_projection(summary[candidate_key])[0].toarray()
        ref_vect = embedding.query_projection(summary[ref_key])[0].toarray()
        dim = np.size(ref_vect)
        measures.append(
            evaluation_method(candidate_vect.reshape(dim), ref_vect.reshape(dim)))
    return measures
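# A minimal usage sketch (illustrative): score candidate summaries against references
# with a plain cosine similarity on dense vectors. The `fitted_embedding` argument and
# the toy summary dictionary are assumptions; any Embedding fitted on a larger corpus
# (e.g. via initialize_embedding above) would do.
import numpy as np

def cosine_score(u, v):
    # Cosine similarity between two 1-D numpy vectors; 0.0 when a vector is all zeros.
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

def example_make_scores(fitted_embedding):
    summaries = [
        {"candidate": "gum arabic exports rise", "reference": "gum arabic export currency"},
    ]
    return make_scores(summaries, candidate_key="candidate", ref_key="reference",
                       evaluation_method=cosine_score, embedding=fitted_embedding)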
def make_sent_gismo(self, query=None, txt=None, k=None, **kwargs):
    """
    Construct a sentence-level Gismo stored in the :py:attr:`sent_gismo` attribute.

    Parameters
    ----------
    query: str (optional)
        Query to run on the document-level Gismo.
    txt: str (optional)
        Text to use for sentence extraction. If not set, the sentences will be
        extracted from the top documents.
    k: int (optional)
        Number of top documents used for the build. If not set, the
        :py:func:`~gismo.common.auto_k` heuristic will be used.
    kwargs: dict
        Custom default runtime parameters to pass to the sentence-level Gismo.
        You just need to specify the parameters that differ from
        :obj:`~gismo.parameters.DEFAULT_PARAMETERS`. Note that distortion will be
        automatically deactivated. If you really want it, manually change the value
        of ``self.sent_gismo.parameters.distortion`` afterwards.

    Returns
    -------
    Sentencizer
    """
    if txt is None:
        if query is not None:
            self.doc_gismo.rank(query)
        txt = [(self.doc_gismo.corpus.to_text(self.doc_gismo.corpus[i]), i)
               for i in self.doc_gismo.get_documents_by_rank(k, post=False)]
    self.splitter(txt)
    local_embedding = Embedding()
    local_embedding.fit_ext(self.doc_gismo.embedding)
    local_embedding.transform(self.sent_corpus)
    self.sent_gismo = Gismo(self.sent_corpus, local_embedding, **kwargs)
    self.sent_gismo.parameters.distortion = 0.0
    self.sent_gismo.post_documents_item = lambda g, i: g.corpus.to_text(g.corpus[i])
    return self
def summarize_cluster(
        documents: list,
        centroid,  # csr_matrix
        siblings_centroids,  # csr_matrix
        query: str = "",
        num_documents: int = None,
        num_sentences: int = None,
        ratio: float = 0.05,
        embedding: Embedding = None,
        is_documents_embedding: bool = False,
        num_keywords: int = 15,
        size_generic_query: int = 5,
        used_sentences: set = None,
        filter_sentences=is_relevant_sentence,
        get_content=lambda x: x["content"] + x["summary"]) -> list:
    """
    Extended summarizer that produces the sentences of a cluster summary.

    Args:
        documents: A list of dict corresponding to documents.
        centroid: The centroid of the cluster that is summarized.
        siblings_centroids: The centroids of the sibling clusters.
        query: A string.
        num_documents: An int, the number of top documents taken into account for the summary.
        num_sentences: An int, the number of sentences wanted in the summary.
        ratio: A float in [0, 1] giving the length of the summary as a proportion of the length of the num_documents kept.
        embedding: An Embedding fitted on a bigger corpus than documents.
        is_documents_embedding: If True, `embedding` is already the embedding of `documents` and is reused as-is; otherwise it is only used as an external fit.
        num_keywords: An int, the number of keywords considered.
        size_generic_query: An int, the number of top features used as a generic query when `query` is empty.
        used_sentences: A set of "forbidden" sentences.
        filter_sentences: A function returning a bool, allowing the selection of a sentence.
        get_content: A function that allows the retrieval of a document's content.

    Returns:
        A list of the summary sentences.
    """
    nlp = spacy.load('en_core_web_sm')
    assert num_sentences or ratio
    assert isinstance(documents, list)
    if used_sentences is None:
        used_sentences = set()
    # Get number of documents
    if num_documents is None:
        num_documents = len(documents)
    else:
        num_documents = min(len(documents), num_documents)
    # Find best documents
    assert num_documents != 0
    if num_documents == 1:
        best_documents = [documents[0]]
    else:
        documents_gismo = make_gismo(
            documents=documents,
            other_embedding=embedding,
            is_documents_embedding=is_documents_embedding)
        documents_gismo.rank(query)
        best_documents = documents_gismo.get_documents_by_rank(k=num_documents)
        if query == "":
            query = " ".join(
                documents_gismo.get_features_by_rank(k=size_generic_query))
    # Split best documents into sentences.
    contents_sentences = [
        sentence
        for document in best_documents
        for sentence in make_sentences_wiki(get_content(document))
    ]
    assert contents_sentences is not None
    # Scale the number of sentences proportionally to the total number
    # of sentences in the top documents.
    if num_sentences is None:
        num_sentences = max(int(ratio * len(contents_sentences)), 1)
    stretching_for_duplicates = 7
    # Computation of the score and selection of sentences
    if siblings_centroids is not None:
        number_of_siblings = len(siblings_centroids)
    else:
        number_of_siblings = 0
    summary = sorted(
        [
            {
                "sentence": contents_sentences[i],
                "index": i,
                "score": 2 * (number_of_siblings + 1) * cosine_similarity(
                    embedding.query_projection(contents_sentences[i])[0],
                    centroid
                ) - sum(
                    cosine_similarity(
                        embedding.query_projection(contents_sentences[i])[0],
                        siblings_centroids[sibling_index]
                    )
                    for sibling_index in range(number_of_siblings)
                )
            }
            for i in range(len(contents_sentences))
            if filter_sentences(contents_sentences[i])
            and contents_sentences[i] not in used_sentences
        ],
        key=lambda k: k["score"],
        reverse=True
    )[:(stretching_for_duplicates * num_sentences)]
    # Removing adverbs and nominal sentences, pronoun resolution
    sentences_to_remove = list()
    for (sum_index, sentence_dict) in enumerate(summary):
        sentence = nlp(sentence_dict["sentence"])
        if sentence[0].pos_ == "ADV":
            if sentence[1].pos_ == "PUNCT":
                sentence = sentence[2:]
            else:
                sentence = sentence[1:]
            sentence_dict["sentence"] = sentence.text
        if "VBZ" not in {token.tag_ for token in sentence}:
            sentences_to_remove.append(sentence_dict)
        if "PRP" in {token.tag_ for token in sentence}:
            i = int(sentence_dict["index"])
            extract_str = " ".join(contents_sentences[i - 2:i + 1])
            extract = nlp(extract_str)
            if extract._.has_coref:
                resolved_extract = extract._.coref_resolved
                sentence_dict["sentence"] = make_sentences_wiki(resolved_extract)[-1]
    summary = [
        sentence for sentence in summary
        if sentence not in sentences_to_remove
    ]
    return [
        sentence_dict["sentence"]
        for sentence_dict in summary[:num_sentences]
    ]
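# A hedged sketch (not from the original code) of how the cluster inputs to
# summarize_cluster can be assembled: each centroid is taken here as the mean of the
# query projections of a cluster's documents. The clustering itself, the helper name
# `build_centroid`, and the "content" field are assumptions made for illustration.
def build_centroid(embedding, documents, get_content=lambda d: d["content"]):
    # Average of the (sparse, normalized) projections over the cluster.
    vectors = [embedding.query_projection(get_content(d))[0] for d in documents]
    return sum(vectors) / len(vectors)

def example_summarize_cluster(embedding, cluster_docs, sibling_clusters):
    centroid = build_centroid(embedding, cluster_docs)
    siblings_centroids = [build_centroid(embedding, docs) for docs in sibling_clusters]
    return summarize_cluster(
        cluster_docs,
        centroid,
        siblings_centroids,
        num_sentences=3,
        embedding=embedding)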
def old_make_gismo(
        documents: list,
        alpha: float = .2,
        other_embedding: Embedding = None,
        is_documents_embedding: bool = False,
        document_to_text=simplified_document_to_string) -> Gismo:
    """
    Make a Gismo object from a list of documents.

    Args:
        documents: A `list` of documents with strings in the values.
        alpha: A `float` in [0, 1] indicating the damping factor used in the D-iteration used by Gismo.
        other_embedding: An embedding already fitted on a corpus.
        is_documents_embedding: If True, `other_embedding` is already the embedding of `documents` and is copied; otherwise it is only used as an external fit.
        document_to_text: Callback(Document) -> str.

    Returns:
        A Gismo object made from the given documents and embedding.
    """
    def post_document(gismo: Gismo, i: int) -> dict:
        return gismo.corpus[i]

    corpus = Corpus(documents, document_to_text)
    if other_embedding is None:
        vectorizer = CountVectorizer(dtype=float)
        embedding = Embedding(vectorizer=vectorizer)
        embedding.fit_transform(corpus)
    else:
        if is_documents_embedding:
            # The provided embedding already matches `documents`: reuse it directly.
            embedding = copy.copy(other_embedding)
        else:
            # Fit on the external embedding's vocabulary, then transform this corpus.
            embedding = Embedding()
            embedding.fit_ext(other_embedding)
            embedding.transform(corpus)
    gismo = Gismo(corpus, embedding)
    gismo.post_document = post_document
    gismo.diteration.alpha = alpha
    return gismo
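# A minimal sketch (illustrative) of the two ways old_make_gismo can be used: either let
# it fit its own CountVectorizer-based embedding, or pass an embedding fitted on a larger
# corpus so that only `transform` runs on the new documents. `big_corpus_embedding` is an
# assumed, pre-fitted Embedding; the query string is a placeholder.
def example_old_make_gismo(documents, big_corpus_embedding=None):
    if big_corpus_embedding is None:
        gismo = old_make_gismo(documents)                              # self-contained fit
    else:
        gismo = old_make_gismo(documents,
                               other_embedding=big_corpus_embedding)   # reuse external fit
    gismo.rank("query terms")
    return gismo.get_documents_by_rank(k=5)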
def summarize(documents, query="", num_documents=None, num_sentences=None,
              ratio=0.05, embedding=None, num_keywords: int = 15,
              size_generic_query: int = 5, used_sentences: set = None,
              get_content=lambda x: x["content"]) -> tuple:
    """
    Produces a list of sentences and a list of keywords.

    Parameters
    ----------
    documents: :class:`list`
        A list of documents.
    query: :class:`str`, optional
        Textual query to focus the summary on one subject.
    num_documents: :class:`int`, optional
        Number of top documents taken into account for the summary.
    num_sentences: :class:`int`, optional
        Number of sentences wanted in the summary. Overrides ratio.
    ratio: :class:`float` in ]0, 1], optional
        Length of the summary as a proportion of the length of the num_documents kept.
    embedding: :class:`~gismo.embedding.Embedding`, optional
        An Embedding fitted on a bigger corpus than documents.
    num_keywords: :class:`int`, optional
        Number of keywords returned.
    size_generic_query: :class:`int`, optional
        Number of top keywords used as a generic query when `query` is empty.
    used_sentences: :class:`set`, optional
        A set of "forbidden" sentences. Will be updated inplace.
    get_content: callable, optional
        A function that allows the retrieval of a document's content.

    Returns
    -------
    :class:`tuple`
        A list of the summary sentences and a list of keywords.

    Examples
    --------
    >>> from gismo.datasets.reuters import get_reuters_news
    >>> summarize(get_reuters_news(), num_documents=10, num_sentences=4) # doctest: +NORMALIZE_WHITESPACE
    (['Gum arabic has a history dating back to ancient times.',
    'Hungry nomads pluck gum arabic as they pass with grazing goats and cattle.',
    'For impoverished sub-Saharan states producing the bulk of world demand, gum arabic simply means export currency.',
    "After years of war-induced poverty, gum arabic is offering drought-stricken Chad's rural poor a lifeline to the production plants of the world's food and beverage giants."],
    ['norilsk', 'icewine', 'amiel', 'gum', 'arabic', 'her', 'tibet', 'chad', 'deng', 'oil', 'grapes', 'she', 'his', 'czechs', 'chechnya'])
    >>> summarize(get_reuters_news(), query="Ericsson", num_documents=10, num_sentences=5) # doctest: +NORMALIZE_WHITESPACE
    (['The restraints are few in areas such as consumer products, while in sectors such as banking, distribution and insurance, foreign firms are kept on a very tight leash.',
    'These latest wins follow a recent $350 million contract win with Telefon AB L.M.',
    'Pocket is the first from the high-priced 1996 auction known to have filed for bankruptcy protection.',
    '"That is, assuming the deal is done right," she added.',
    '"Generally speaking, the easiest place to make a profit tends to be in the consumer industry, usually fairly small-scale operations," said Anne Stevenson-Yang, director of China operations for the U.S.-China Business Council.'],
    ['ericsson', 'sweden', 'motorola', 'telecommuncation', 'communciation', 'bolstering', 'priced', 'sectors', 'makers', 'equipment', 'schaumberg', 'lm', 'done', 'manufacturing', 'consumer'])
    """
    if used_sentences is None:
        used_sentences = set()
    if num_documents is None:
        num_documents = len(documents)
    doc_corpus = Corpus(source=documents, to_text=get_content)
    if embedding:
        doc_embedding = Embedding()
        doc_embedding.fit_ext(embedding)
        doc_embedding.transform(corpus=doc_corpus)
    else:
        vectorizer = CountVectorizer(dtype=float)
        doc_embedding = Embedding(vectorizer=vectorizer)
        doc_embedding.fit_transform(corpus=doc_corpus)
    documents_gismo = Gismo(corpus=doc_corpus, embedding=doc_embedding, alpha=.2)
    documents_gismo.rank(query)
    best_documents = documents_gismo.get_documents_by_rank(k=num_documents)
    # Split best documents into sentences. Remove duplicates.
    contents_sentences = sorted({
        sentence
        for document in best_documents
        for sentence in make_sentences(get_content(document))
    })
    # Scale the number of sentences proportionally to the total number
    # of sentences in the top documents.
    if num_sentences is None:
        num_sentences = int(ratio * len(contents_sentences))
    # Prepare the sentence-based gismo.
    sent_corpus = Corpus(source=contents_sentences)
    sent_embedding = Embedding()
    if embedding:
        sent_embedding.fit_ext(embedding)
    else:
        sent_embedding.fit_ext(doc_embedding)
    sent_embedding.transform(corpus=sent_corpus)
    sentences_gismo = Gismo(corpus=sent_corpus, embedding=sent_embedding, alpha=.2)
    sentences_gismo.rank(query)
    keywords = sentences_gismo.get_features_by_rank(k=num_keywords)
    if query == "":
        sentences_gismo.rank(" ".join(keywords[:size_generic_query]))
    # List of sentence indices by decreasing relevance.
    sentences_ranks = sentences_gismo.diteration.x_order
    num_kept_sentences = 0
    i = 0
    ranked_sentences = list()
    while num_kept_sentences < num_sentences and i < len(contents_sentences):
        sentence = contents_sentences[sentences_ranks[i]]
        if sentence not in used_sentences and is_relevant_sentence(sentence):
            used_sentences.add(sentence)
            ranked_sentences.append(sentence)
            num_kept_sentences += 1
        i += 1
    return ranked_sentences, keywords
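# A minimal usage sketch (illustrative, not from the original module): run the flat
# summarizer on a small in-memory corpus. The toy documents and the "content" field are
# assumptions; any list of dicts compatible with `get_content` works. Depending on the
# relevance filter, fewer sentences than requested may come back on such a tiny corpus.
def example_summarize():
    docs = [
        {"content": "Gum arabic is harvested in Chad. It is exported worldwide."},
        {"content": "Gum arabic prices depend on drought and on demand from food giants."},
    ]
    used = set()  # shared across calls to avoid repeating sentences
    sentences, keywords = summarize(docs, query="gum arabic",
                                    num_sentences=2, used_sentences=used)
    return sentences, keywords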
def make_tree(documents: list,
              query: str = "",
              depth: int = 1,
              trees: list = None,
              documents_gismo: Gismo = None,
              num_documents: int = None,
              num_sentences: int = None,
              embedding: Embedding = None,
              used_sentences: set = None) -> list:
    r"""
    Builds a hierarchical summary.

    Parameters
    ----------
    documents: :class:`list` of :class:`dict`
        A list of dict corresponding to documents; only the values of the "content" key will be summarized.
    query: :class:`str`, optional
        Textual query to focus the summary on one subject.
    depth: :class:`int`, optional
        An int giving the depth of the summary (depth one is a sequential summary).
    trees: :class:`list`, optional
        A list of dict being completed; needed for the recursion.
    documents_gismo: :class:`~gismo.gismo.Gismo`
        Pre-existing Gismo.
    num_documents: :class:`int`, optional
        Number of top documents taken into account for the summary.
    num_sentences: :class:`int`, optional
        Number of sentences wanted in the summary.
    embedding: :class:`~gismo.embedding.Embedding`, optional
        An Embedding fitted on a bigger corpus than documents.
    used_sentences: :class:`set`, optional
        A set of "forbidden" sentences. Will be updated inplace.

    Returns
    -------
    :class:`list` of :class:`dict`
        A list of dict corresponding to the hierarchical summary.

    Examples
    --------
    >>> from gismo.datasets.reuters import get_reuters_news
    >>> make_tree(get_reuters_news(), query="Orange", num_documents=10, num_sentences=3, depth=2) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    [{'text': 'But some analysts still believe Orange is overvalued.',
    'current_keywords': ['orange', 'one', 'is', 'at', 'on', 'in', 'and', 'its', 'shares', 'has', 'analysts', 'of', 'market', 'believe', 'overvalued'],
    'url': None,
    'children': [{'text': 'Trading sources said China was staying out of the market, and that Indian meal was currently overvalued by a good $20 a tonne.',
    'current_keywords': ['orange', 'overvalued', 'analysts', 'that', 'and', 'are', 'compared', 'believe', 'market', 'but', 'some', 'still', 'of', 'said', 'we'],
    'url': None, 'children': []},
    {'text': 'Since the purchase, widely seen by analysts as overvalued, Quaker has struggled with the line of ready-to-drink teas and juices.',
    'current_keywords': ['orange', 'overvalued', 'analysts', 'that', 'and', 'are', 'compared', 'believe', 'market', 'but', 'some', 'still', 'of', 'said', 'we'],
    'url': None, 'children': []},
    {'text': '"No question that if the dollar continues to be overvalued and continues to be strong, we\'ll see some price erosion later in the year."',
    'current_keywords': ['orange', 'overvalued', 'analysts', 'that', 'and', 'are', 'compared', 'believe', 'market', 'but', 'some', 'still', 'of', 'said', 'we'],
    'url': None, 'children': []}]},
    {'text': 'Orange shares were 2.5p higher at 188p on Friday.',
    'current_keywords': ['orange', 'one', 'is', 'at', 'on', 'in', 'and', 'its', 'shares', 'has', 'analysts', 'of', 'market', 'believe', 'overvalued'],
    'url': None,
    'children': [{'text': 'Orange, Calif.-based Bergen is the largest U.S. distributor of generic drugs, while Miami-based Ivax is a generic drug manufacturing giant.',
    'current_keywords': ['orange', 'higher', 'shares', 'friday', 'on', 'at', 'and', 'in', 'its', 'of', 'percent', 'one', 'mobile', 'to', 'market'],
    'url': None, 'children': []},
    {'text': 'One-2-One and Orange ORA.L, which offer only digital services, are due to release their connection figures next week.',
    'current_keywords': ['orange', 'higher', 'shares', 'friday', 'on', 'at', 'and', 'in', 'its', 'of', 'percent', 'one', 'mobile', 'to', 'market'],
    'url': None, 'children': []},
    {'text': "Dodd noted that BT's plans to raise the price of calls to Orange and One 2 One handsets would be beneficial.",
    'current_keywords': ['orange', 'higher', 'shares', 'friday', 'on', 'at', 'and', 'in', 'its', 'of', 'percent', 'one', 'mobile', 'to', 'market'],
    'url': None, 'children': []}]},
    {'text': 'Orange already has a full roaming agreement in Germany and a partial one in France, centred on Paris.',
    'current_keywords': ['orange', 'one', 'is', 'at', 'on', 'in', 'and', 'its', 'shares', 'has', 'analysts', 'of', 'market', 'believe', 'overvalued'],
    'url': None,
    'children': [{'text': 'Orange says its offer of roaming services between the UK and other countries is part of its aim to provide customers with the best value for money.',
    'current_keywords': ['orange', 'roaming', 'partial', 'centred', 'paris', 'france', 'germany', 'agreement', 'full', 'on', 'and', 'in', 'of', 'for', 'with'],
    'url': None, 'children': []},
    {'text': 'As with all roaming agreements, the financial details of the Swiss deal remain a trade secret.',
    'current_keywords': ['orange', 'roaming', 'partial', 'centred', 'paris', 'france', 'germany', 'agreement', 'full', 'on', 'and', 'in', 'of', 'for', 'with'],
    'url': None, 'children': []},
    {'text': '"We look forward in 1997 to continuing to move ahead and to extending our international service through new roaming agreements and the introduction of dual band handsets."',
    'current_keywords': ['orange', 'roaming', 'partial', 'centred', 'paris', 'france', 'germany', 'agreement', 'full', 'on', 'and', 'in', 'of', 'for', 'with'],
    'url': None, 'children': []}]}]
    """
    num_keywords = 15
    if used_sentences is None:
        used_sentences = set()
    if depth == 0:
        return list()
    if documents_gismo is None:
        doc_corpus = Corpus(source=documents, to_text=simplified_document_to_string)
        if embedding:
            doc_embedding = Embedding()
            doc_embedding.fit_ext(embedding)
            doc_embedding.transform(corpus=doc_corpus)
        else:
            vectorizer = CountVectorizer(dtype=float)
            doc_embedding = Embedding(vectorizer=vectorizer)
            doc_embedding.fit_transform(corpus=doc_corpus)
        documents_gismo = Gismo(corpus=doc_corpus, embedding=doc_embedding, alpha=.2)
    documents_gismo.rank(query)
    best_documents = [
        (i, documents_gismo.corpus[i])
        for i in documents_gismo.diteration.x_order[:num_documents]
    ]  # documents_gismo.get_documents_by_rank(k=num_documents)
    sentences_dictionaries = [
        {
            "sentence": sentence,
            "url": document.get("url"),
            "doc_index": i,
        }
        for i, document in best_documents
        for sentence in list(OrderedDict.fromkeys(make_sentences(document["content"])))
    ]
    sent_corpus = Corpus(source=sentences_dictionaries, to_text=lambda s: s['sentence'])
    if embedding:
        sent_embedding = Embedding()
        sent_embedding.fit_ext(embedding)
        sent_embedding.transform(corpus=sent_corpus)
    else:
        vectorizer = CountVectorizer(dtype=float)
        sent_embedding = Embedding(vectorizer=vectorizer)
        sent_embedding.fit_transform(corpus=sent_corpus)
    sentences_gismo = Gismo(corpus=sent_corpus, embedding=sent_embedding, alpha=.2)
    sentences_gismo.rank(query)
    keywords = sentences_gismo.get_features_by_rank(k=num_keywords)
    sentences_ranks = sentences_gismo.diteration.x_order
    num_kept_sentences = 0
    ranked_sentences_dict = list()
    for rank in sentences_ranks:
        sentence_dict = sentences_dictionaries[rank]
        sentence = sentence_dict["sentence"]
        if sentence not in used_sentences and is_relevant_sentence(sentence):
            ranked_sentences_dict.append(sentence_dict)
            used_sentences.add(sentence)
            num_kept_sentences += 1
            if num_kept_sentences >= num_sentences:
                break
    children = ranked_sentences_dict
    return [{
        "text": child["sentence"],
        "current_keywords": keywords,
        "url": child.get("url"),
        "children": make_tree(trees=trees,
                              depth=depth - 1,
                              documents_gismo=documents_gismo,
                              documents=documents,
                              query=make_query(" ".join([query, child["sentence"]])),
                              num_sentences=num_sentences,
                              embedding=embedding,
                              used_sentences=used_sentences)
    } for child in children]