def lda2vec(
    corpus: List[str],
    n_topics: int,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    stemming: Callable = sastrawi,
    cleaning: Callable = simple_textcleaning,
    vectorizer: str = 'bow',
    stop_words: List[str] = None,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 3,
    skip: int = 5,
    **kwargs,
):
    """
    Train an LDA2Vec model to do topic modelling based on the corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        size of decomposition column.
    max_df: float, (default=0.95)
        maximum document frequency for a word to be selected.
    min_df: int, (default=2)
        minimum document frequency for a word to be selected.
    ngram: tuple, (default=(1,3))
        n-gram sizes to train the corpus on.
    stemming: function, (default=sastrawi)
        function to stem the corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term Frequency Inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.
    stop_words: list, (default=None)
        list of stop words to remove. If None, default is malaya.texts._text_functions.STOPWORDS
    window_size: int, (default=2)
        window size used to generate skip-gram pairs.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        training iteration, how many loops needed to train.
    switch_loss: int, (default=3)
        baseline to switch from document based loss to document + word based loss.
    skip: int, (default=5)
        skip value if vectorizer = 'skip-gram'.

    Returns
    -------
    _DEEP_TOPIC: malaya.topic_modelling._DEEP_TOPIC class
    """
    if stemming is not None and not callable(stemming):
        raise ValueError('stemming must be a callable type or None')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError('max_df must be bigger than 0, less than or equal to 1')

    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer

    # Resolve the default stop words before the vectorizer is constructed,
    # otherwise the vectorizer silently receives `stop_words=None`.
    if stop_words is None:
        stop_words = STOPWORDS

    tf_vectorizer = Vectorizer(
        ngram_range=ngram,
        min_df=min_df,
        max_df=max_df,
        stop_words=stop_words,
    )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = stemming(corpus[i])

    # Remove stop words from the raw text as well, so the skip-gram pairs
    # generated below never pivot on a stop word.
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stop_words])
        )

    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))

    dictionary = {
        i: no for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    reversed_dictionary = {
        no: i for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()

    # Generate (pivot, target) skip-gram pairs per document; the document id
    # accompanies every pair so the model can learn document vectors.
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pair in pairs:
            pivot_words.append(pair[0])
            target_words.append(pair[1])
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(
        pivot_words, target_words, doc_ids, random_state=10
    )
    num_unique_documents = len(idx_text_clean)

    model = LDA2VEC(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(
        pivot_words, target_words, doc_ids, epoch, switch_loss=switch_loss
    )
    return _DEEP_TOPIC(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
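
# A minimal usage sketch of the string-flag API defined just above (not part
# of the original source). The corpus strings are hypothetical, and the
# surrounding malaya module is assumed to provide the names used inside
# `lda2vec` (sastrawi, simple_textcleaning, the vectorizer classes, LDA2VEC,
# _DEEP_TOPIC, STOPWORDS).
if __name__ == '__main__':
    corpus = [
        'kerajaan bentang bajet baharu di parlimen',
        'pembangkang bidas bajet kerajaan',
        'harga minyak dunia jatuh minggu ini',
    ]
    # `vectorizer='bow'` makes the function build a CountVectorizer
    # internally; min_df=1 keeps the toy vocabulary from being filtered away.
    deep_topic = lda2vec(corpus, n_topics=3, vectorizer='bow', min_df=1, epoch=5)
    print(deep_topic)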
def lda2vec(
    corpus: List[str],
    vectorizer,
    n_topics: int = 10,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 1000,
    **kwargs,
):
    """
    Train an LDA2Vec model to do topic modelling based on the corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    vectorizer: object
        Should have ``fit``, ``transform`` and ``get_feature_names`` methods. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int, (default=10)
        size of decomposition column.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.text.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    window_size: int, (default=2)
        window size used to generate skip-gram pairs.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        training iteration, how many loops needed to train.
    switch_loss: int, (default=1000)
        baseline to switch from document based loss to document + word based loss.

    Returns
    -------
    result: malaya.topic_modelling.DeepTopic class
    """
    validator.validate_function(cleaning, 'cleaning')
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)
    tf_vectorizer = vectorizer

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    # Remove stop words from the raw text so the skip-gram pairs generated
    # below never pivot on a stop word.
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))

    dictionary = {
        i: no for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    reversed_dictionary = {
        no: i for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()

    # Generate (pivot, target) skip-gram pairs per document; the document id
    # accompanies every pair so the model can learn document vectors.
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pair in pairs:
            pivot_words.append(pair[0])
            target_words.append(pair[1])
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(
        pivot_words, target_words, doc_ids, random_state=10
    )
    num_unique_documents = len(idx_text_clean)

    model = LDA2Vec(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(
        pivot_words, target_words, doc_ids, epoch, switch_loss=switch_loss
    )
    return DeepTopic(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
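
# A minimal usage sketch of the refactored API above (not part of the original
# source). The caller now constructs the vectorizer explicitly; a plain
# scikit-learn CountVectorizer is the simplest choice. It assumes a
# scikit-learn version that still exposes `get_feature_names`, since the
# function body relies on that method, and the corpus strings are hypothetical.
if __name__ == '__main__':
    from sklearn.feature_extraction.text import CountVectorizer

    corpus = [
        'kerajaan bentang bajet baharu di parlimen',
        'pembangkang bidas bajet kerajaan',
        'harga minyak dunia jatuh minggu ini',
    ]
    # min_df=1 keeps the toy vocabulary from being filtered away.
    bow = CountVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.95)
    deep_topic = lda2vec(corpus, bow, n_topics=3, epoch=5)
    print(deep_topic)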