def draw_graph(self, colors=None):
    """Draw ``self.graph`` with a Fruchterman-Reingold layout.

    Nodes are sized from ``self.betweenness``, edges are drawn almost
    transparent and every node in ``self.nodes`` is labelled with itself.

    Parameters
    ----------
    colors: color or sequence of colors, optional
        Forwarded to networkx as ``node_color``. When ``None`` or empty,
        ``self.colors`` is used instead.
    """
    pos = fruchterman_reingold_layout(self.graph)

    # Quadratic scaling; the +50 offset means a node with zero betweenness
    # still gets a size of 50.
    # NOTE(review): assumes self.betweenness iterates in the same order as
    # the nodes of self.graph - confirm against where it is computed.
    node_sizes = [100000 * x * x + 50 for x in self.betweenness.values()]

    # `colors if colors else ...` raises "truth value of an array is
    # ambiguous" for numpy arrays, so test for None / emptiness explicitly.
    if colors is None or (hasattr(colors, '__len__') and len(colors) == 0):
        colors = self.colors

    # NOTE(review): `line_color` is kept from the original call; confirm
    # that the pinned networkx version accepts this keyword.
    nx.draw_networkx_nodes(
        self.graph,
        pos,
        node_size=node_sizes,
        node_color=colors,
        alpha=0.5,
        line_color=None,
    )
    nx.draw_networkx_edges(self.graph, pos, alpha=0.05, style='solid')
    nx.draw_networkx_labels(
        self.graph,
        pos,
        {node: node for node in self.nodes},
        font_size=8,
    )
def cluster_entity_linking(
    corpus: List[str],
    vectorizer,
    entity_model,
    topic_modeling_model,
    threshold: float = 0.3,
    topic_decomposition: int = 2,
    topic_length: int = 10,
    fuzzy_ratio: int = 70,
    accepted_entities: List[str] = [
        'law',
        'location',
        'organization',
        'person',
        'event',
    ],
    cleaning=simple_textcleaning,
    stemming=sastrawi,
    colors: List[str] = None,
    stop_words: List[str] = STOPWORDS,
    max_df: float = 1.0,
    min_df: int = 1,
    ngram: Tuple[int, int] = (2, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    plot undirected graph for Entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
        a string is split into sentences; a list is joined with '. ' and
        split into sentences again. Sentences of 5 characters or fewer are
        dropped.
    vectorizer: class
        object with either `fit` / `transform`, or `vectorize`.
    entity_model: object
        its `predict(string)` output is passed to `cluster_entities`.
    topic_modeling_model: callable
        called as `topic_modeling_model(corpus, topic_decomposition,
        ngram=..., max_df=..., min_df=...)`; must accept `max_df`. The
        result must expose `comp.components_` and `features`.
    threshold: float, (default=0.3)
        minimum absolute pearson correlation between two nodes for an edge
        to be drawn.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        number of top features taken from each topic component.
    fuzzy_ratio: int, (default=70)
        minimum `fuzz.token_set_ratio` for a sentence to be attached to a
        topic / entity.
    accepted_entities: List[str]
        entity types to plot. The default list is never mutated here.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus, or None.
    stemming: function, (default=sastrawi)
        function to stem the corpus, or None.
    colors: list
        list of colors, length must be `len(accepted_entities) + 1`
        (the last color is used for topics).
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove.
    max_df: float, (default=1.0)
        maximum of a word selected based on document frequency.
    min_df: int, (default=1)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(2,3))
        n-grams size to train a corpus.
    figsize: tuple, (default=(17, 9))
        matplotlib figure size.
    batch_size: int, (default=20)
        number of strings per `vectorizer.vectorize` call. Only used when
        the vectorizer has no `fit` method.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    import inspect

    if stemming is not None and not callable(stemming):
        raise ValueError('stemming must be a callable type or None')
    if cleaning is not None and not callable(cleaning):
        raise ValueError('cleaning must be a callable type or None')
    if not hasattr(vectorizer, 'vectorize') and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must has `fit` and `vectorize` methods')
    # inspect.getargspec was removed in Python 3.11; signature() is the
    # supported replacement and also understands keyword-only parameters.
    if 'max_df' not in inspect.signature(topic_modeling_model).parameters:
        raise ValueError('topic_modeling_model must has `max_df` parameter')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not (0 < fuzzy_ratio <= 100):
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100')
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not (0 < threshold <= 1):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
        from fuzzywuzzy import fuzz
    except ImportError as err:
        # Only a missing package should be reported as "not installed".
        raise ImportError(
            'matplotlib, seaborn, networkx, fuzzywuzzy not installed. Please install it and try again.'
        ) from err
    sns.set()

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = split_into_sentences('. '.join(corpus))
    corpus = [string for string in corpus if len(string) > 5]

    expected_colors = len(accepted_entities) + 1
    if not colors:
        colors = sns.color_palette(n_colors=expected_colors)
    elif len(colors) != expected_colors:
        raise ValueError('len of colors must same as %d' % expected_colors)

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    # Top `topic_length` features of every component, highest weight first.
    topics = [
        topic_model.features[i]
        for topic in topic_model.comp.components_
        for i in topic.argsort()[:-topic_length - 1:-1]
    ]

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity])))

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    # Entities become graph nodes too, appended after the topics.
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    # One pseudo-document per node: every sentence that mentions it.
    documents = []
    for topic in topics:
        matched = [
            string for string in corpus
            if topic in string
            or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio
        ]
        documents.append(' '.join(matched))

    if cleaning:
        documents = [cleaning(text) for text in documents]
    if stemming:
        documents = [stemming(text) for text in documents]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stop_words)
        for text in documents
    ]

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        DxT = np.asarray(vectorizer.transform(text_clean).todense())
    else:
        batches = [
            vectorizer.vectorize(text_clean[start:start + batch_size])
            for start in range(0, len(text_clean), batch_size)
        ]
        DxT = np.concatenate(batches, axis=0)

    # Absolute pearson correlation between every pair of pseudo-documents.
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    # The correlation matrix is symmetric and the graph is undirected, so
    # visiting the upper triangle once yields the same edges.
    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            weight = DxD[i, j]
            if weight >= threshold:
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    # G.node was removed in networkx 2.4; nodes(data=True) works everywhere.
    for node, data in G.nodes(data=True):
        node_colors.append(color_dict[data['label']])
        node_labels[node] = data['text']

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(max(len(G.nodes()), 1)))

    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    # Dummy points whose only purpose is to create the legend entries.
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
def cluster_graph(
    corpus: List[str],
    vectorizer,
    threshold: float = 0.9,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stop_words: List[str] = STOPWORDS,
    stemming=sastrawi,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    clustering=KMeans,
    figsize: Tuple[int, int] = (17, 9),
    with_labels: bool = True,
    batch_size: int = 20,
):
    """
    plot undirected graph with similar texts.

    Parameters
    ----------
    corpus: List[str]
        the list passed in is not modified.
    vectorizer: class
        vectorizer class, with either `fit` / `transform` /
        feature-name lookup, or `vectorize` / `attention`.
    threshold: float, (default=0.9)
        minimum absolute pearson correlation between two texts for an edge
        to be drawn.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must same with corpus. When omitted, a
        title is built from the `ngram[1]` strongest features (or
        attention words) of each text.
    colors: List[str], (default=None)
        list of colors, length must same with num_clusters.
    stemming: function, (default=malaya.stem.sastrawi)
        function to stem the corpus.
    stop_words: List[str], (default=malaya.texts.function.STOPWORDS)
        list of stop words to remove.
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    ngram: Tuple[int, int], (default=(1,3))
        only `ngram[1]` is used here, as the number of words per
        generated title.
    clustering: class, (default=KMeans)
        instantiated as `clustering(n_clusters=num_clusters)`; must
        provide `fit` and `labels_`.
    figsize: Tuple[int, int], (default=(17, 9))
        matplotlib figure size.
    with_labels: bool, (default=True)
        draw node titles on the plot.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    if stemming is not None and not callable(stemming):
        raise ValueError('stemming must be a callable type or None')
    if cleaning is not None and not callable(cleaning):
        raise ValueError('cleaning must be a callable type or None')
    if titles and len(titles) != len(corpus):
        raise ValueError('length of titles must be same with corpus')
    if colors and len(colors) != num_clusters:
        raise ValueError(
            'size of colors must be same with number of clusters')
    if not hasattr(vectorizer, 'vectorize') and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must has `fit` and `vectorize` methods')
    if not (0 < threshold <= 1):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
    except ImportError as err:
        # Only a missing package should be reported as "not installed".
        raise ImportError(
            'matplotlib, seaborn, networkx not installed. Please install it and try again.'
        ) from err
    sns.set()

    # Build new lists instead of overwriting the caller's corpus in place.
    texts = list(corpus)
    if cleaning is not None:
        texts = [cleaning(text) for text in texts]
    if stemming:
        texts = [stemming(text) for text in texts]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stop_words)
        for text in texts
    ]

    uses_fit = hasattr(vectorizer, 'fit')
    attentions = []
    if uses_fit:
        vectorizer.fit(text_clean)
        # np.matrix (what .todense() returns) is rejected by recent
        # scikit-learn estimators, so convert to a plain ndarray.
        DxT = np.asarray(vectorizer.transform(text_clean).todense())
    else:
        batches = []
        for start in range(0, len(text_clean), batch_size):
            batch = text_clean[start:start + batch_size]
            batches.append(vectorizer.vectorize(batch))
            # Attention is only needed to generate titles.
            if not titles:
                attentions.extend(vectorizer.attention(batch))
        DxT = np.concatenate(batches, axis=0)

    # Absolute pearson correlation between every pair of texts.
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        top_k = ngram[1]
        titles = []
        if uses_fit:
            # get_feature_names was replaced by get_feature_names_out in
            # newer scikit-learn; support whichever the vectorizer has.
            get_names = getattr(vectorizer, 'get_feature_names_out', None)
            if get_names is None:
                get_names = vectorizer.get_feature_names
            features = get_names()
            for row in DxT:
                strongest = np.argsort(row)[::-1][:top_k]
                titles.append(' '.join(features[k] for k in strongest))
        else:
            for attention in attentions:
                # sorted() leaves the vectorizer's own list untouched.
                ranked = sorted(attention, key=lambda pair: pair[1])
                titles.append(' '.join(pair[0] for pair in ranked[-top_k:]))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    # The correlation matrix is symmetric and the graph is undirected, so
    # visiting the upper triangle once yields the same edges.
    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            weight = DxD[i, j]
            if weight >= threshold:
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    # G.node was removed in networkx 2.4; nodes(data=True) works everywhere.
    for node, data in G.nodes(data=True):
        node_colors.append(colors[data['label']])
        node_labels[node] = data['text']

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(max(len(G.nodes()), 1)))
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
def cluster_graph(
    corpus,
    titles=None,
    colors=None,
    threshold=0.3,
    stemming=True,
    max_df=0.95,
    min_df=2,
    ngram=(1, 3),
    cleaning=simple_textcleaning,
    vectorizer='bow',
    stop_words=STOPWORDS,
    num_clusters=5,
    clustering=KMeans,
    figsize=(17, 9),
    with_labels=True,
):
    """
    plot undirected graph with similar texts

    Parameters
    ----------
    corpus: list
        non-empty list of strings. The list passed in is not modified.
    titles: list
        list of titles, length must same with corpus. When omitted, a
        title is built from the `ngram[1]` strongest features of each
        text.
    colors: list
        list of colors, length must same with num_clusters
    threshold: float, (default=0.3)
        minimum dot product between two document vectors for an edge to
        be drawn.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will apply
    max_df: float, (default=0.95)
        maximum of a word selected based on document frequency.
        Must be bigger than 0 and less than 1.
    min_df: int, (default=2)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word
        * ``'tfidf'`` - Term frequency inverse Document Frequency
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams
    clustering: class, (default=KMeans)
        instantiated as `clustering(n_clusters=num_clusters)`; must
        provide `fit` and `labels_`.
    figsize: tuple, (default=(17, 9))
        matplotlib figure size.
    with_labels: bool, (default=True)
        draw node titles on the plot.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
        nodes without any edge are removed from `G` before plotting.
    """
    if not isinstance(corpus, list):
        raise ValueError('corpus must be a list')
    # Checked before corpus[0] so an empty list gives a clear error
    # instead of an IndexError.
    if not corpus:
        raise ValueError('corpus must not be empty')
    if not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if titles is not None and not isinstance(titles, list):
        raise ValueError('titles must be a list or None')
    if colors is not None and not isinstance(colors, list):
        raise ValueError('colors must be a list or None')
    if titles and len(titles) != len(corpus):
        raise ValueError('length of titles must be same with corpus')
    if colors and len(colors) != num_clusters:
        raise ValueError(
            'size of colors must be same with number of clusters'
        )
    if not isinstance(vectorizer, str):
        raise ValueError('vectorizer must be a string')
    if not isinstance(stemming, bool):
        # The original message said 'bool must be a boolean'.
        raise ValueError('stemming must be a boolean')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if not isinstance(ngram, tuple):
        raise ValueError('ngram must be a tuple')
    if len(ngram) != 2:
        raise ValueError('ngram size must equal to 2')
    if not isinstance(min_df, int):
        raise ValueError('min_df must be an integer')
    if not isinstance(max_df, (int, float)):
        raise ValueError('max_df must be an integer or a float')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df < 1):
        raise ValueError('max_df must be bigger than 0, less than 1')

    # The original repeated this dispatch twice; once is enough.
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
    except ImportError as err:
        # Only a missing package should be reported as "not installed".
        raise ImportError(
            'matplotlib, seaborn, networkx not installed. Please install it and try again.'
        ) from err
    sns.set()

    tf_vectorizer = Vectorizer(
        ngram_range=ngram,
        min_df=min_df,
        max_df=max_df,
        stop_words=stop_words,
    )

    # Build new lists instead of overwriting the caller's corpus in place.
    texts = list(corpus)
    if cleaning is not None:
        texts = [cleaning(text) for text in texts]
    if stemming:
        texts = [sastrawi(text) for text in texts]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stop_words)
        for text in texts
    ]

    tf_vectorizer.fit(text_clean)
    DxT = tf_vectorizer.transform(text_clean)
    # Use the sparse matrix's own dot(); np.dot is not supported on
    # scipy sparse matrices.
    dense_DxD = DxT.dot(DxT.T).toarray()

    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        # get_feature_names was replaced by get_feature_names_out in newer
        # scikit-learn; support whichever the vectorizer has.
        get_names = getattr(tf_vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = tf_vectorizer.get_feature_names
        features = get_names()
        titles = []
        for row in range(DxT.shape[0]):
            strongest = np.argsort(DxT[row].toarray()[0])[::-1][:ngram[1]]
            titles.append(' '.join(features[k] for k in strongest))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    len_dense = len(dense_DxD)
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            weight = dense_DxD[i, j]
            if weight >= threshold:
                G.add_edge(i, j, weight=weight)

    # Drop texts that are not similar to anything else.
    isolated = [
        node for node, degree in dict(G.degree()).items() if degree == 0
    ]
    G.remove_nodes_from(isolated)

    node_colors, node_labels = [], {}
    # G.node was removed in networkx 2.4; nodes(data=True) works everywhere.
    for node, data in G.nodes(data=True):
        node_colors.append(colors[data['label']])
        node_labels[node] = data['text']

    # max(..., 1) avoids dividing by zero when every node was isolated.
    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(max(len(G.nodes()), 1))
    )
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
def _build_sentiment_network(texts, sentiments):
    """Return the plotly traces of the sentence-similarity network.

    Each text becomes a node carrying its sentiment label; pairs of texts
    are linked with their tf-idf dot product as edge weight. The result is
    one line trace per edge followed by a single marker trace for all nodes.
    """
    if len(texts) != len(sentiments):
        raise ValueError('texts and sentiments must have the same length')

    tfidf = TfidfVectorizer(stop_words='english', norm='l2')
    DxT = tfidf.fit_transform(texts)
    # Use the sparse matrix's own dot(); np.dot is not supported on scipy
    # sparse matrices.
    dense_DxD = DxT.dot(DxT.T).toarray()

    G = nx.Graph()
    for idx, (text, sentiment) in enumerate(zip(texts, sentiments)):
        G.add_node(idx, text=text, sentiment=sentiment)

    # NOTE(review): products of tf-idf vectors are never negative, so with
    # cutoff = 0 and `>=` every pair of texts is linked (including weight-0
    # edges). Kept as in the original; confirm whether `>` was intended.
    cutoff = 0
    len_dense = len(dense_DxD)
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            weight = dense_DxD[i, j]
            if weight >= cutoff:
                G.add_edge(i, j, weight=weight)

    isolated = [
        node for node, degree in dict(G.degree()).items() if degree == 0
    ]
    G.remove_nodes_from(isolated)

    # max(..., 1) avoids dividing by zero on an empty graph.
    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(max(len(G.nodes()), 1)))

    edge_data = []
    for u, v, attrs in G.edges(data=True):
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_data.append(
            go.Scatter(
                x=[x0, x1, None],
                y=[y0, y1, None],
                # Plain dicts instead of the deprecated go.Line / go.Marker.
                line=dict(width=3.0 * attrs['weight'], color='#888'),
                hoverinfo='none',
                mode='lines',
            ))

    # NOTE(review): colours are the strings '1' / '2' as in the original,
    # while the colorbar ticks are the numbers 1 / 2 - confirm intent.
    colors = {0: '1', 1: '2'}
    xs, ys, hover_texts, marker_colors = [], [], [], []
    for node, attrs in G.nodes(data=True):
        x, y = pos[node]
        xs.append(x)
        ys.append(y)
        hover_texts.append(attrs['text'])
        marker_colors.append(colors[attrs['sentiment']])

    # Build the trace from complete lists; appending to trace properties
    # after construction is not supported by all plotly versions.
    node_data = go.Scatter(
        x=xs,
        y=ys,
        text=hover_texts,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            reversescale=True,
            color=marker_colors,
            size=5.0,
            colorbar=dict(
                thickness=15,
                xanchor='left',
                tickmode='array',
                tickvals=[1, 2],
                ticktext=['negative', 'positive'],
                ticks='outside',
            ),
            line=dict(width=0.5),
        ),
    )
    return edge_data + [node_data]


def filterP(links):
    """Analyse the paragraphs fetched for every link and aggregate them.

    `getP` is run over `links` via `run_parallel_in_threads`. Each result
    is enriched in place with entity / noun tokens and with per-sentence
    and averaged classifier scores (sentiment, emotion, msg, subjectivity,
    polarity, irony, bias).

    Parameters
    ----------
    links: iterable
        passed unchanged to `run_parallel_in_threads(getP, links)`.
        Presumably URLs - confirm against `getP`.

    Returns
    -------
    dictionary with keys 'overall_sentiment', 'overall_emotion',
    'overall_msg', 'overall_subjectivity', 'overall_polarity',
    'overall_irony', 'overall_bias', 'person', 'org', 'gpe', 'outputs',
    'wordcloud' (up to 200 `(token, count)` pairs, most frequent first)
    and 'sentiment-network' (plotly traces).
    """
    outputs = run_parallel_in_threads(getP, links)

    overall_emotion, overall_sentiment, overall_msg = [], [], []
    overall_subj, overall_pol, overall_irony, overall_bias = [], [], [], []
    overall_local_entities_nouns = []
    persons, orgs, gpes = [], [], []
    df_texts, df_sentiments = [], []

    for output in outputs:
        local_entities_nouns = []
        local_persons, local_orgs, local_gpes = [], [], []
        entity_buckets = {
            'PERSON': local_persons,
            'ORG': local_orgs,
            'GPE': local_gpes,
        }
        for sentence in output['p']:
            for token in nlp(sentence):
                word = str(token)
                bucket = entity_buckets.get(token.ent_type_)
                if bucket is not None:
                    bucket.append(word)
                # Keep any named entity or (proper) noun that is not a
                # stop word.
                if (len(token.ent_type_) > 0
                        or token.tag_ in ['NNP', 'NN']
                        ) and word.lower() not in english_stopwords:
                    local_entities_nouns.append(word)

        classifier_texts = output['p-classifier']
        sentiments = getsentiment(classifier_texts)
        df_sentiments += np.argmax(sentiments, axis=1).tolist()
        df_texts += classifier_texts
        emotions = getemotion(classifier_texts)
        msgs = getmsg(classifier_texts)
        subjectivities, polarities, ironies, biases = getpolar(
            classifier_texts)

        overall_local_entities_nouns += local_entities_nouns
        persons += local_persons
        orgs += local_orgs
        gpes += local_gpes

        # 15 most frequent entity / noun tokens of this result.
        unique_tokens, token_counts = np.unique(
            local_entities_nouns, return_counts=True)
        by_frequency = unique_tokens[np.argsort(token_counts)[::-1]].tolist()
        output['tokens'] = by_frequency[:15]

        output['sentiment'] = sentiments.tolist()
        output['emotion'] = emotions.tolist()
        output['msg'] = msgs.tolist()
        output['subjectivity'] = subjectivities.tolist()
        output['polarity'] = polarities.tolist()
        output['irony'] = ironies.tolist()
        output['bias'] = biases.tolist()
        output['person'] = list(set(local_persons))
        output['org'] = list(set(local_orgs))
        output['gpes'] = list(set(local_gpes))

        avg_sentiment = sentiments.mean(axis=0)
        avg_emotion = emotions.mean(axis=0)
        avg_msg = msgs.mean(axis=0)
        avg_subjectivity = subjectivities.mean()
        avg_polarity = polarities.mean()
        avg_irony = ironies.mean()
        avg_bias = biases.mean()

        overall_emotion.append(avg_emotion)
        overall_sentiment.append(avg_sentiment)
        overall_msg.append(avg_msg)
        overall_subj.append(avg_subjectivity)
        overall_pol.append(avg_polarity)
        overall_irony.append(avg_irony)
        overall_bias.append(avg_bias)

        output['avg_sentiment'] = avg_sentiment.tolist()
        output['avg_emotion'] = avg_emotion.tolist()
        output['avg_msg'] = avg_msg.tolist()
        output['avg_subjectivity'] = avg_subjectivity.tolist()
        output['avg_polarity'] = avg_polarity.tolist()
        output['avg_irony'] = avg_irony.tolist()
        output['avg_bias'] = avg_bias.tolist()

    sentiment_network = _build_sentiment_network(df_texts, df_sentiments)

    overall_unique, overall_count = np.unique(
        overall_local_entities_nouns, return_counts=True)
    # Sort once and reuse the order for both tokens and counts.
    top = np.argsort(overall_count)[::-1][:200]
    overall_unique = overall_unique[top].tolist()
    overall_count = overall_count[top].tolist()

    return {
        'overall_sentiment': np.array(overall_sentiment).mean(axis=0).tolist(),
        'overall_emotion': np.array(overall_emotion).mean(axis=0).tolist(),
        'overall_msg': np.array(overall_msg).mean(axis=0).tolist(),
        'overall_subjectivity': np.array(overall_subj).mean().tolist(),
        'overall_polarity': np.array(overall_pol).mean().tolist(),
        'overall_irony': np.array(overall_irony).mean().tolist(),
        'overall_bias': np.array(overall_bias).mean().tolist(),
        'person': list(set(persons)),
        'org': list(set(orgs)),
        'gpe': list(set(gpes)),
        'outputs': outputs,
        'wordcloud': list(zip(overall_unique, overall_count)),
        'sentiment-network': sentiment_network,
    }
def cluster_entity_linking(corpus,
                           entity_model,
                           topic_modeling_model,
                           topic_decomposition=2,
                           topic_length=10,
                           threshold=0.3,
                           fuzzy_ratio=70,
                           accepted_entities=[
                               'law', 'location', 'organization', 'person',
                               'event'
                           ],
                           colors=None,
                           max_df=1.0,
                           min_df=1,
                           ngram=(2, 3),
                           stemming=True,
                           cleaning=simple_textcleaning,
                           vectorizer='bow',
                           stop_words=STOPWORDS,
                           figsize=(17, 9),
                           **kwargs):
    """
    plot undirected graph for Entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
        a string has its newlines turned into dots and is split with
        `split_by_dot`; a list of strings is joined and split on dots and
        newlines. Sentences of 5 characters or fewer are dropped.
    entity_model: object
        its `predict(string)` output is passed to `cluster_entities`.
    topic_modeling_model: callable
        called as `topic_modeling_model(corpus, topic_decomposition,
        stemming=..., vectorizer=..., ngram=..., max_df=..., min_df=...)`.
        The result must expose `comp.components_` and `features`.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        number of top features taken from each topic component.
    threshold: float, (default=0.3)
        minimum dot product between two node vectors for an edge to be
        drawn.
    fuzzy_ratio: int, (default=70)
        minimum `fuzz.token_set_ratio` for a sentence to be attached to a
        topic / entity.
    accepted_entities: list
        entity types to plot. The default list is never mutated here.
    colors: list
        list of colors, length must be `len(accepted_entities) + 1`
        (the last color is used for topics).
    max_df: float, (default=1.0)
        maximum of a word selected based on document frequency.
    min_df: int, (default=1)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(2,3))
        n-grams size to train a corpus.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will apply.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove.
    figsize: tuple, (default=(17, 9))
        matplotlib figure size.
    **kwargs:
        forwarded to the vectorizer constructor.

    Returns
    -------
    dictionary: {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
    """
    if not isinstance(corpus, (list, str)):
        raise ValueError('corpus must be a list')
    if isinstance(corpus, list):
        # Checked before corpus[0] so an empty list gives a clear error
        # instead of an IndexError.
        if not corpus:
            raise ValueError('corpus must not be empty')
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if colors is not None and not isinstance(colors, list):
        raise ValueError('colors must be a list or None')
    if not isinstance(vectorizer, str):
        raise ValueError('vectorizer must be a string')
    if not isinstance(stemming, bool):
        # The original message said 'bool must be a boolean'.
        raise ValueError('stemming must be a boolean')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if not isinstance(ngram, tuple):
        raise ValueError('ngram must be a tuple')
    if len(ngram) != 2:
        raise ValueError('ngram size must equal to 2')
    if not isinstance(min_df, int):
        raise ValueError('min_df must be an integer')
    if not isinstance(topic_decomposition, int):
        raise ValueError('topic_decomposition must be an integer')
    if not isinstance(topic_length, int):
        raise ValueError('topic_length must be an integer')
    if not isinstance(fuzzy_ratio, int):
        raise ValueError('fuzzy_ratio must be an integer')
    if not isinstance(max_df, float):
        raise ValueError('max_df must be a float')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not (0 < fuzzy_ratio <= 100):
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100')
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not (0 < threshold <= 1):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
    except ImportError as err:
        # Only a missing package should be reported as "not installed".
        raise ImportError(
            'matplotlib, seaborn, networkx not installed. Please install it and try again.'
        ) from err
    sns.set()

    # The original repeated this dispatch twice; once is enough.
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer

    if isinstance(corpus, str):
        corpus = split_by_dot(corpus.replace('\n', '.'))
    else:
        joined = ' '.join(c + '.' for c in corpus)
        # Raw string: '\S' in a normal literal is an invalid escape.
        corpus = re.findall(r'(?=\S)[^.\n]+(?<=\S)', joined)
    corpus = [string for string in corpus if len(string) > 5]

    expected_colors = len(accepted_entities) + 1
    if not colors:
        colors = sns.color_palette(n_colors=expected_colors)
    elif len(colors) != expected_colors:
        raise ValueError('len of colors must same as %d' % expected_colors)

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        stemming=stemming,
        vectorizer=vectorizer,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    # Top `topic_length` features of every component, highest weight first.
    topics = [
        topic_model.features[i]
        for topic in topic_model.comp.components_
        for i in topic.argsort()[:-topic_length - 1:-1]
    ]

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity])))

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    # Entities become graph nodes too, appended after the topics.
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    # One pseudo-document per node: every sentence that mentions it.
    # NOTE(review): `fuzz` is not imported inside this function; it is
    # presumably a module-level import of fuzzywuzzy - confirm.
    topics_corpus = []
    for topic in topics:
        matched = [
            string for string in corpus
            if topic in string
            or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio
        ]
        topics_corpus.append(' '.join(matched))

    tf_vectorizer = Vectorizer(ngram_range=ngram,
                               min_df=min_df,
                               max_df=max_df,
                               stop_words=stop_words,
                               **kwargs)
    if cleaning is not None:
        topics_corpus = [cleaning(text) for text in topics_corpus]
    if stemming:
        topics_corpus = [sastrawi(text) for text in topics_corpus]

    tf_vectorizer.fit(topics_corpus)
    DxT = tf_vectorizer.transform(topics_corpus)
    # Use the sparse matrix's own dot(); np.dot is not supported on scipy
    # sparse matrices.
    dense_DxD = DxT.dot(DxT.T).toarray()

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    # DxT . DxT^T is symmetric and the graph is undirected, so visiting the
    # upper triangle once yields the same edges.
    len_dense = len(dense_DxD)
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            weight = dense_DxD[i, j]
            if weight >= threshold:
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    # G.node was removed in networkx 2.4; nodes(data=True) works everywhere.
    for node, data in G.nodes(data=True):
        node_colors.append(color_dict[data['label']])
        node_labels[node] = data['text']

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(max(len(G.nodes()), 1)))

    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    # Dummy points whose only purpose is to create the legend entries.
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }