Example #1
from typing import Callable, List, Tuple

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import skipgrams

# malaya-internal dependencies, assumed importable from the malaya package:
# sastrawi, simple_textcleaning, SkipGramVectorizer, STOPWORDS,
# LDA2VEC and _DEEP_TOPIC.


def lda2vec(
    corpus: List[str],
    n_topics: int,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    stemming: Callable = sastrawi,
    cleaning: Callable = simple_textcleaning,
    vectorizer: str = 'bow',
    stop_words: List[str] = None,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 3,
    skip: int = 5,
    **kwargs,
):
    """
    Train an LDA2Vec model for topic modelling on the given corpus (a list of strings).

    Parameters
    ----------
    corpus: List[str]
    n_topics: int
        number of topics (size of the decomposition).
    stemming: function, (default=sastrawi)
        function to stem the corpus.
    max_df: float, (default=0.95)
        maximum document frequency for a word to be selected.
    min_df: int, (default=2)
        minimum document frequency for a word to be selected.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stop_words: list, (default=None)
        list of stop words to remove. If None, default is malaya.texts._text_functions.STOPWORDS
    window_size: int, (default=2)
        window size for generating skip-gram pairs.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        number of training iterations.
    switch_loss: int, (default=3)
        baseline to switch from document based loss to document + word based loss.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Words.
        * ``'tfidf'`` - Term Frequency-Inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Words with skipping certain n-grams.
    skip: int, (default=5)
        skip value if vectorizer = 'skip-gram'.

    Returns
    -------
    _DEEP_TOPIC: malaya.topic_modelling._DEEP_TOPIC class
    """
    if stemming is not None and not callable(stemming):
        raise ValueError('stemming must be a callable type or None')

    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not 0 < max_df <= 1:
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')

    # resolve the stop word list before it is handed to the vectorizer
    if stop_words is None:
        stop_words = STOPWORDS

    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer
    vectorizer_kwargs = dict(
        ngram_range=ngram,
        min_df=min_df,
        max_df=max_df,
        stop_words=stop_words,
    )
    if vectorizer == 'skip-gram':
        # `skip` is only meaningful for the skip-gram vectorizer
        # (assumed keyword of malaya's SkipGramVectorizer)
        vectorizer_kwargs['skip'] = skip
    tf_vectorizer = Vectorizer(**vectorizer_kwargs)

    if cleaning is not None:
        corpus = [cleaning(text) for text in corpus]
    if stemming is not None:
        corpus = [stemming(text) for text in corpus]
    stop_words = set(stop_words)  # set for fast membership checks
    text_clean = [
        ' '.join(word for word in text.split() if word not in stop_words)
        for text in corpus
    ]
    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))
    # note: get_feature_names() was removed in scikit-learn 1.2;
    # use get_feature_names_out() there
    feature_names = tf_vectorizer.get_feature_names()
    dictionary = {word: no for no, word in enumerate(feature_names)}
    reversed_dictionary = {no: word for no, word in enumerate(feature_names)}
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        # generate (pivot, target) skip-gram pairs within each document
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pivot, target in pairs:
            pivot_words.append(pivot)
            target_words.append(target)
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(pivot_words,
                                                 target_words,
                                                 doc_ids,
                                                 random_state=10)
    num_unique_documents = len(idx_text_clean)

    model = LDA2VEC(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(pivot_words,
                target_words,
                doc_ids,
                epoch,
                switch_loss=switch_loss)
    return _DEEP_TOPIC(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
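
A minimal usage sketch for this version, assuming malaya and its TensorFlow backend are installed. The toy corpus, the min_df=1 override (the default min_df=2 would empty the vocabulary on three short documents), and the top_topics call are illustrative assumptions, not part of the example above.

# hypothetical toy corpus; any list of Malay strings works
corpus = [
    'kerajaan umum bajet baharu untuk rakyat',
    'rakyat sambut baik bajet kerajaan',
    'pasukan bola sepak negara menang perlawanan semalam',
]

model = lda2vec(
    corpus,
    n_topics=2,
    min_df=1,          # default min_df=2 filters out most words on a tiny corpus
    vectorizer='bow',
    epoch=5,
)
# the returned _DEEP_TOPIC object exposes topic-inspection helpers;
# top_topics here is assumed from malaya's topic-modelling interface
model.top_topics(2, top_n=5)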
Example #2
from typing import List

from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import skipgrams

# malaya-internal dependencies, assumed importable from the malaya package:
# simple_textcleaning, get_stopwords, validator, LDA2Vec and DeepTopic.


def lda2vec(
    corpus: List[str],
    vectorizer,
    n_topics: int = 10,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 1000,
    **kwargs,
):
    """
    Train an LDA2Vec model for topic modelling on the given corpus (a list of strings).

    Parameters
    ----------
    corpus: List[str]
    vectorizer : object
        Should have ``fit`` and ``transform`` methods. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int, (default=10)
        number of topics (size of the decomposition).
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    window_size: int, (default=2)
        window size for generating skip-gram pairs.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        number of training iterations.
    switch_loss: int, (default=1000)
        baseline to switch from document based loss to document + word based loss.

    Returns
    -------
    result: malaya.topic_modelling.DeepTopic class
    """
    validator.validate_function(cleaning, 'cleaning')
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)

    tf_vectorizer = vectorizer

    if cleaning is not None:
        corpus = [cleaning(text) for text in corpus]
    stopwords = set(stopwords)  # set for fast membership checks
    text_clean = [
        ' '.join(word for word in text.split() if word not in stopwords)
        for text in corpus
    ]
    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))
    # note: get_feature_names() was removed in scikit-learn 1.2;
    # use get_feature_names_out() there
    feature_names = tf_vectorizer.get_feature_names()
    dictionary = {word: no for no, word in enumerate(feature_names)}
    reversed_dictionary = {no: word for no, word in enumerate(feature_names)}
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        # generate (pivot, target) skip-gram pairs within each document
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pivot, target in pairs:
            pivot_words.append(pivot)
            target_words.append(target)
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(pivot_words,
                                                 target_words,
                                                 doc_ids,
                                                 random_state=10)
    num_unique_documents = len(idx_text_clean)

    model = LDA2Vec(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(pivot_words,
                target_words,
                doc_ids,
                epoch,
                switch_loss=switch_loss)
    return DeepTopic(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
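
A minimal usage sketch for this newer signature, where the caller supplies the vectorizer object. The toy corpus, the min_df=1 setting, and the top_topics call are illustrative assumptions, not part of the example above.

from sklearn.feature_extraction.text import CountVectorizer

# hypothetical toy corpus; any list of Malay strings works
corpus = [
    'kerajaan umum bajet baharu untuk rakyat',
    'rakyat sambut baik bajet kerajaan',
    'pasukan bola sepak negara menang perlawanan semalam',
]

# min_df=1 keeps the tiny toy vocabulary; tune for real corpora
bow = CountVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.95)
model = lda2vec(corpus, bow, n_topics=2, epoch=5)
# the returned DeepTopic object exposes topic-inspection helpers;
# top_topics here is assumed from malaya's topic-modelling interface
model.top_topics(2, top_n=5)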