def sklearn(
    corpus: List[str],
    model,
    vectorizer,
    n_topics: int,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Train a sklearn model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    model: object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.decomposition.TruncatedSVD`` - LSA algorithm.
        * ``sklearn.decomposition.LatentDirichletAllocation`` - LDA algorithm.
        * ``sklearn.decomposition.NMF`` - NMF algorithm.
    vectorizer: object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int
        size of decomposition column.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: malaya.topic_modelling.Topic class
    """
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if not hasattr(vectorizer, 'fit_transform'):
        raise ValueError('vectorizer must have `fit_transform` method')
    if len(corpus) < n_topics:
        raise ValueError(
            'length corpus must be bigger than or equal to n_topics'
        )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    tf = vectorizer.fit_transform(corpus)
    tf_features = vectorizer.get_feature_names()
    compose = model(n_topics).fit(tf)
    return Topic(
        tf_features, compose, corpus, compose.transform(tf), vectorizer, tf
    )
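# A minimal usage sketch for `sklearn` above. It assumes the decomposition class
# accepts the number of components as its first positional argument (true for
# LatentDirichletAllocation / NMF / TruncatedSVD); `top_topics` is an assumed
# helper on the returned Topic object.
#
#     from sklearn.decomposition import LatentDirichletAllocation
#     from sklearn.feature_extraction.text import CountVectorizer
#
#     topic = sklearn(
#         corpus,
#         model=LatentDirichletAllocation,
#         vectorizer=CountVectorizer(),
#         n_topics=5,
#     )
#     topic.top_topics(5, top_n=10)  # assumed helper, inspect the Topic class for exact API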
def _base_topic_modelling(
    corpus: List[str],
    n_topics: int,
    decomposition,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    cleaning=simple_textcleaning,
    stopwords: List[str] = None,
    **kwargs,
):
    validator.validate_function(cleaning, 'cleaning')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1'
        )
    if len(corpus) < n_topics:
        raise ValueError(
            'length corpus must be bigger than or equal to n_topics'
        )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    Vectorizer = vectorizer_mapping.get(vectorizer)
    if not Vectorizer:
        raise ValueError(
            'vectorizer is not supported, please check supported vectorizers from `malaya.topic_model.available_vectorizer()`'
        )
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=stopwords,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    compose = decomposition(n_topics).fit(tf)
    return TOPIC(
        tf_features,
        compose,
        [classification_textcleaning(c) for c in corpus],
        compose.transform(tf),
        tf_vectorizer,
        tf,
    )
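# A minimal sketch of how the private helper above can be driven directly;
# public wrappers are assumed to pass a concrete decomposition class through
# `decomposition` and a vectorizer key understood by `vectorizer_mapping`.
#
#     from sklearn.decomposition import NMF
#
#     topic = _base_topic_modelling(corpus, n_topics=10, decomposition=NMF, vectorizer='bow')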
def cluster_entity_linking(
    corpus: List[str],
    vectorizer,
    entity_model,
    topic_modeling_model,
    threshold: float = 0.3,
    topic_decomposition: int = 2,
    topic_length: int = 10,
    fuzzy_ratio: int = 70,
    accepted_entities: List[str] = [
        'law',
        'location',
        'organization',
        'person',
        'event',
    ],
    cleaning=simple_textcleaning,
    colors: List[str] = None,
    stopwords=get_stopwords,
    max_df: float = 1.0,
    min_df: int = 1,
    ngram: Tuple[int, int] = (2, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot undirected graph for entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
    vectorizer: class
    entity_model: object
        entity tagging model with a `predict` method.
    topic_modeling_model: callable
        topic modelling function, must accept `max_df`, `min_df` and `ngram` parameters.
    threshold: float, (default=0.3)
        0.3 means, 30% above absolute pearson correlation.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        size of topic models.
    fuzzy_ratio: int, (default=70)
        size of ratio for fuzzywuzzy.
    colors: list
        list of colors, length must be the same as len(accepted_entities) + 1.
    max_df: float, (default=1.0)
        maximum of a word selected based on document frequency.
    min_df: int, (default=1)
        minimum of a word selected based on document frequency.
    ngram: tuple, (default=(2,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """

    import inspect

    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if 'max_df' not in inspect.getargspec(topic_modeling_model)[0]:
        raise ValueError('topic_modeling_model must have `max_df` parameter')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1'
        )
    if not (fuzzy_ratio > 0 and fuzzy_ratio <= 100):
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100'
        )
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1'
        )

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
        from fuzzywuzzy import fuzz

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx, fuzzywuzzy not installed. Please install it and try again.'
        )

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    corpus = [string for string in corpus if len(string) > 5]

    if not colors:
        colors = sns.color_palette(n_colors=len(accepted_entities) + 1)
    else:
        if len(colors) != (len(accepted_entities) + 1):
            raise ValueError(
                'len of colors must same as %d' % (len(accepted_entities) + 1)
            )

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    topics = []
    for no, topic in enumerate(topic_model.comp.components_):
        for i in topic.argsort()[: -topic_length - 1 : -1]:
            topics.append(topic_model.features[i])

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity]))
        )

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    topics_corpus = []
    for topic in topics:
        nested_corpus = []
        for string in corpus:
            if (
                topic in string
                or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio
            ):
                nested_corpus.append(string)
        topics_corpus.append(' '.join(nested_corpus))

    corpus = topics_corpus

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                attentions.extend(text_clean[i:index])
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    for node in G:
        # `G.nodes[node]` instead of the removed `G.node[node]` attribute,
        # for compatibility with networkx >= 2.4.
        node_colors.append(color_dict[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(len(G.nodes()))
    )
    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
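# A minimal usage sketch for `cluster_entity_linking` above. The loader names and
# the `lda` topic modelling function are assumptions about the surrounding malaya
# API; any entity tagger with `predict` and any topic modelling function accepting
# `max_df` / `min_df` / `ngram` will do.
#
#     entity_model = malaya.entity.transformer(model='albert')      # assumed loader
#     vectorizer = malaya.transformer.load(model='albert')          # assumed loader
#     result = cluster_entity_linking(
#         corpus,
#         vectorizer=vectorizer,
#         entity_model=entity_model,
#         topic_modeling_model=lda,  # assumed module-level topic modelling function
#         threshold=0.3,
#     )
#     result['G']  # networkx graph linking entities and topics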
def cluster_graph(
    corpus: List[str],
    vectorizer,
    threshold: float = 0.9,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    clustering=KMeans,
    figsize: Tuple[int, int] = (17, 9),
    with_labels: bool = True,
    batch_size: int = 20,
):
    """
    Plot undirected graph with similar texts.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class.
    threshold: float, (default=0.9)
        0.9 means, 90% above absolute pearson correlation.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must be the same as corpus.
    colors: List[str], (default=None)
        list of colors, length must be the same as num_clusters.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be same with number of clusters'
            )
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1'
        )

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx not installed. Please install it and try again.'
        )

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i])[0]
                )[::-1]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1]:]])
                )
    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    for node in G:
        # `G.nodes[node]` instead of the removed `G.node[node]` attribute,
        # for compatibility with networkx >= 2.4.
        node_colors.append(colors[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(len(G.nodes()))
    )
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
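# A minimal usage sketch for `cluster_graph` above. TfidfVectorizer stands in for
# any object with `fit`/`transform`/`get_feature_names`; a transformer vectorizer
# exposing `vectorize`/`attention` works the same way.
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#
#     result = cluster_graph(corpus, TfidfVectorizer(), threshold=0.9, num_clusters=5)
#     result['G'].number_of_nodes()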
def cluster_dendogram(
    corpus: List[str],
    vectorizer,
    titles: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    random_samples: float = 0.3,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot hierarchical dendrogram with similar texts.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class.
    titles: List[str], (default=None)
        list of titles, length must be the same as corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    corpus = random.sample(corpus, k=int(random_samples * len(corpus)))

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0]
                )[::-1]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1]:]])
                )
    plt.figure(figsize=figsize)
    ax = dendrogram(linkage_matrix, orientation='right', labels=titles)
    # booleans instead of the deprecated 'off' strings for newer matplotlib.
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
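# A minimal usage sketch for `cluster_dendogram` above; TfidfVectorizer stands in
# for any object with `fit`/`transform`/`get_feature_names`.
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#
#     result = cluster_dendogram(corpus, TfidfVectorizer(), random_samples=0.5)
#     result['linkage_matrix'].shape  # (n_samples - 1, 4) ward linkage matrix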
def cluster_scatter(
    corpus: List[str],
    vectorizer,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    clustering=KMeans,
    decomposition=MDS,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot scatter plot on similar text clusters.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must be the same as corpus.
    colors: List[str], (default=None)
        list of colors, length must be the same as num_clusters.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'X': X, 'Y': Y, 'labels': clusters, 'vector': transformed_text_clean, 'titles': titles}
    """
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be same with number of clusters'
            )
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    km = clustering(n_clusters=num_clusters)
    dist = 1 - cosine_similarity(transformed_text_clean)
    km.fit(transformed_text_clean)
    clusters = km.labels_.tolist()
    # `decomposition` is passed as a class, so compare the class itself instead of
    # calling `isinstance` on it.
    if decomposition is MDS:
        decomposed = decomposition(
            n_components=2, dissimilarity='precomputed'
        )
    else:
        decomposed = decomposition(n_components=2)
    pos = decomposed.fit_transform(dist)

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0]
                )[::-1]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1]:]])
                )
    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)
    X, Y = pos[:, 0], pos[:, 1]
    plt.figure(figsize=figsize)
    for i in np.unique(clusters):
        plt.scatter(
            X[clusters == i],
            Y[clusters == i],
            color=colors[i],
            label='cluster %d' % (i),
        )
    for i in range(len(X)):
        plt.text(X[i], Y[i], titles[i], size=8)
    plt.legend()
    plt.show()
    return {
        'X': X,
        'Y': Y,
        'labels': clusters,
        'vector': transformed_text_clean,
        'titles': titles,
    }
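# A minimal usage sketch for `cluster_scatter` above; TfidfVectorizer stands in
# for any object with `fit`/`transform`/`get_feature_names`.
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#
#     result = cluster_scatter(corpus, TfidfVectorizer(), num_clusters=5)
#     result['labels']  # cluster id per document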
def attention(
    corpus: List[str],
    n_topics: int,
    vectorizer,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    ngram: Tuple[int, int] = (1, 3),
    batch_size: int = 10,
):
    """
    Use attention from a transformer model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        size of decomposition column.
    vectorizer: object
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=10)
        size of strings for each vectorization and attention.

    Returns
    -------
    result: malaya.topic_modelling.AttentionTopic class
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(vectorizer, 'attention') and not hasattr(
        vectorizer, 'vectorize'
    ):
        raise ValueError(
            'vectorizer must have `attention` and `vectorize` methods'
        )
    validator.validate_function(cleaning, 'cleaning')
    if len(corpus) < n_topics:
        raise ValueError(
            'length corpus must be bigger than or equal to n_topics'
        )

    from sklearn.cluster import KMeans

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    def generate_ngram(seq, ngram=(1, 3)):
        g = []
        for i in range(ngram[0], ngram[-1] + 1):
            g.extend(list(ngrams_generator(seq, i)))
        return g

    rows, attentions = [], []
    for i in range(0, len(corpus), batch_size):
        index = min(i + batch_size, len(corpus))
        rows.append(vectorizer.vectorize(corpus[i:index]))
        attentions.extend(vectorizer.attention(corpus[i:index]))

    concat = np.concatenate(rows, axis=0)
    kmeans = KMeans(n_clusters=n_topics, random_state=0).fit(concat)
    labels = kmeans.labels_

    overall, filtered_a = [], []
    for a in attentions:
        f = [i for i in a if i[0] not in stopwords]
        overall.extend(f)
        filtered_a.append(f)

    o_ngram = generate_ngram(overall, ngram)
    features = []
    for i in o_ngram:
        features.append(' '.join([w[0] for w in i]))
    features = list(set(features))

    components = np.zeros((n_topics, len(features)))
    for no, i in enumerate(labels):
        f = generate_ngram(filtered_a[no], ngram)
        for w in f:
            word = ' '.join([r[0] for r in w])
            score = np.mean([r[1] for r in w])
            if word in features:
                components[i, features.index(word)] += score

    return AttentionTopic(features, components)
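# A minimal usage sketch for `attention` above, assuming a malaya transformer
# vectorizer exposing both `vectorize` and `attention`; the loader name is
# illustrative and may differ between malaya versions.
#
#     electra = malaya.transformer.load(model='electra')  # assumed loader
#     topic = attention(corpus, n_topics=10, vectorizer=electra)
#     topic.top_topics(5, top_n=10)  # assumed helper on the returned AttentionTopic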
def lda2vec(
    corpus: List[str],
    vectorizer,
    n_topics: int = 10,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 1000,
    **kwargs,
):
    """
    Train a LDA2Vec model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    vectorizer: object
        Should have `fit` and `transform` methods. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int, (default=10)
        size of decomposition column.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    window_size: int, (default=2)
        window size of skip-gram pairs.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        training iteration, how many loops to train.
    switch_loss: int, (default=1000)
        baseline to switch from document based loss to document + word based loss.

    Returns
    -------
    result: malaya.topic_modelling.DeepTopic class
    """
    validator.validate_function(cleaning, 'cleaning')
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)

    tf_vectorizer = vectorizer
    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )
    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))

    dictionary = {
        i: no for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    reversed_dictionary = {
        no: i for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()
    doc_ids = np.arange(len(idx_text_clean))
    num_unique_documents = doc_ids.max()
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pair in pairs:
            temp_data = pair
            pivot_words.append(temp_data[0])
            target_words.append(temp_data[1])
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(
        pivot_words, target_words, doc_ids, random_state=10
    )
    num_unique_documents = len(idx_text_clean)
    model = LDA2Vec(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(
        pivot_words, target_words, doc_ids, epoch, switch_loss=switch_loss
    )
    return DeepTopic(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
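# A minimal usage sketch for the vectorizer-object `lda2vec` variant above,
# assuming SkipGramCountVectorizer is importable from malaya.text.vectorizer with
# the keyword arguments shown.
#
#     from malaya.text.vectorizer import SkipGramCountVectorizer
#
#     vectorizer = SkipGramCountVectorizer(max_df=0.95, min_df=1, skip=5)
#     deep_topic = lda2vec(corpus, vectorizer, n_topics=10, epoch=5)
#     deep_topic.top_topics(5, top_n=10)  # assumed helper on the returned DeepTopic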
def lda2vec(
    corpus: List[str],
    n_topics: int,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    vectorizer: str = 'bow',
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 3,
    skip: int = 5,
    **kwargs,
):
    """
    Train a LDA2Vec model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        size of decomposition column.
    max_df: float, (default=0.95)
        maximum of a word selected based on document frequency.
    min_df: int, (default=2)
        minimum of a word selected based on document frequency.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].
    window_size: int, (default=2)
        window size of skip-gram pairs.
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        training iteration, how many loops to train.
    switch_loss: int, (default=3)
        baseline to switch from document based loss to document + word based loss.
    skip: int, (default=5)
        skip value if vectorizer = 'skip-gram'.

    Returns
    -------
    result: malaya.topic_modelling.DEEP_TOPIC class
    """
    validator.validate_function(cleaning, 'cleaning')
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1'
        )

    Vectorizer = vectorizer_mapping.get(vectorizer)
    if not Vectorizer:
        raise ValueError(
            'vectorizer is not supported, please check supported vectorizers from `malaya.topic_model.available_vectorizer()`'
        )
    tf_vectorizer = Vectorizer(
        ngram_range=ngram,
        min_df=min_df,
        max_df=max_df,
        stop_words=stopwords,
    )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stopwords])
        )
    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))

    dictionary = {
        i: no for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    reversed_dictionary = {
        no: i for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()
    doc_ids = np.arange(len(idx_text_clean))
    num_unique_documents = doc_ids.max()
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pair in pairs:
            temp_data = pair
            pivot_words.append(temp_data[0])
            target_words.append(temp_data[1])
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(
        pivot_words, target_words, doc_ids, random_state=10
    )
    num_unique_documents = len(idx_text_clean)
    model = LDA2VEC(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(
        pivot_words, target_words, doc_ids, epoch, switch_loss=switch_loss
    )
    return DEEP_TOPIC(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
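# A minimal usage sketch for the string-vectorizer `lda2vec` variant above;
# `visualize_topics` is an assumed helper on the returned DEEP_TOPIC object and
# may differ between malaya versions.
#
#     deep_topic = lda2vec(corpus, n_topics=10, vectorizer='skip-gram', epoch=5)
#     deep_topic.visualize_topics(notebook_mode=True)  # assumed helper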