def normalizer(speller=None, **kwargs):
    """
    Build a NORMALIZER around a tokenizer and an optional spelling corrector.

    Parameters
    ----------
    speller : spelling correction object, optional (default = None)
        Checked by ``validator.validate_object_methods`` against the method
        names ``correct`` and ``normalize_elongated``.
    **kwargs :
        Forwarded unchanged to ``malaya.preprocessing.TOKENIZER``.

    Returns
    -------
    result: malaya.normalize.NORMALIZER class
    """
    required_methods = ['correct', 'normalize_elongated']
    validator.validate_object_methods(speller, required_methods, 'speller')

    # Imported lazily, only once the speller has passed validation.
    from malaya.preprocessing import TOKENIZER

    tokenize = TOKENIZER(**kwargs).tokenize
    return NORMALIZER(tokenize, speller)
def preprocessing(
    normalize: List[str] = [
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number',
    ],
    annotate: List[str] = [
        'allcaps',
        'elongated',
        'repeated',
        'emphasis',
        'censored',
        'hashtag',
    ],
    lowercase: bool = True,
    fix_unidecode: bool = True,
    expand_english_contractions: bool = True,
    translate_english_to_bm: bool = True,
    speller=None,
    segmenter=None,
    stemmer=None,
    **kwargs,
):
    """
    Load Preprocessing class.

    Parameters
    ----------
    normalize: list
        normalizing tokens, can check all supported normalizing at
        `malaya.preprocessing.get_normalize()`.
    annotate: list
        annotate tokens <open></open>,
        only accept ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'].
    lowercase: bool
    fix_unidecode: bool
    expand_english_contractions: bool
        expand english contractions
    translate_english_to_bm: bool
        translate english words to bahasa malaysia words. When True, the
        english-malay dictionary is fetched through `check_file` and loaded
        from JSON.
    speller: object
        spelling correction object, validated against the methods `correct`
        and `normalize_elongated`.
    segmenter: object
        segmentation object, need to have a method `segment`.
        If provide, it will expand hashtags, #mondayblues == monday blues
    stemmer: object
        stemmer object, need to have a method `stem`.
        If provide, it will stem or lemmatize the string.
    **kwargs:
        forwarded to `check_file` when `translate_english_to_bm` is True.

    Returns
    -------
    result : malaya.preprocessing.PREPROCESSING class

    Raises
    ------
    ValueError
        if `normalize` or `annotate` contains an unsupported element.
    """
    if any(e not in _normalize for e in normalize):
        raise ValueError(
            'normalize element not able to recognize, supported normalization can check at get_normalize()'
        )
    if any(e not in _annotate for e in annotate):
        raise ValueError(
            "annotate only accept ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']"
        )
    validator.validate_object_methods(
        speller, ['correct', 'normalize_elongated'], 'speller'
    )
    validator.validate_object(segmenter, 'segment', 'segmenter')
    validator.validate_object(stemmer, 'stem', 'stemmer')

    translator = None
    if translate_english_to_bm:
        check_file(
            PATH_PREPROCESSING['english-malay'],
            S3_PATH_PREPROCESSING['english-malay'],
            **kwargs,
        )
        # JSON is UTF-8; do not rely on the platform's locale encoding.
        with open(
            PATH_PREPROCESSING['english-malay']['model'], encoding='utf-8'
        ) as fopen:
            translator = json.load(fopen)

    return PREPROCESSING(
        # Copies, so the shared default lists can never be mutated downstream.
        normalize=list(normalize),
        annotate=list(annotate),
        lowercase=lowercase,
        fix_unidecode=fix_unidecode,
        expand_english_contractions=expand_english_contractions,
        translator=translator,
        speller=speller,
        segmenter=segmenter,
        stemmer=stemmer,
    )
def cluster_entity_linking(
    corpus: List[str],
    vectorizer,
    entity_model,
    topic_modeling_model,
    threshold: float = 0.3,
    topic_decomposition: int = 2,
    topic_length: int = 10,
    fuzzy_ratio: int = 70,
    accepted_entities: List[str] = [
        'law',
        'location',
        'organization',
        'person',
        'event',
    ],
    cleaning=simple_textcleaning,
    colors: List[str] = None,
    stopwords=get_stopwords,
    max_df: float = 1.0,
    min_df: int = 1,
    ngram: Tuple[int, int] = (2, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    plot undirected graph for Entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
        a list is joined with '. ' and re-split into sentences; sentences of
        5 characters or fewer are dropped.
    vectorizer: class
        object with either `fit` / `transform` or `vectorize`.
    entity_model: object
        must expose `predict(string)`; its output is passed to
        `cluster_entities`.
    topic_modeling_model: callable
        called as `topic_modeling_model(corpus, topic_decomposition,
        ngram=..., max_df=..., min_df=...)`; must accept a `max_df`
        parameter and return an object exposing `comp.components_` and
        `features`.
    threshold: float, (default=0.3)
        0.3 means, 30% above absolute pearson correlation.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        size of topic models.
    fuzzy_ratio: int, (default=70)
        size of ratio for fuzzywuzzy.
    accepted_entities: List[str]
        entity types read from the clustered entity predictions.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    colors: list
        list of colors, length must be `len(accepted_entities) + 1`;
        the last color is used for topics.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str] or List[str] or Tuple[str]
    max_df: float, (default=1.0)
        maximum of a word selected based on document frequency.
    min_df: int, (default=1)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(2,3))
        n-grams size to train a corpus.
    figsize: Tuple[int, int], (default=(17, 9))
        figure size passed to matplotlib.
    batch_size: int, (default=20)
        size of strings for each `vectorizer.vectorize` call. Only used when
        the vectorizer has no `fit` method.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}

    Raises
    ------
    ValueError
        on invalid `min_df`, `max_df`, `fuzzy_ratio`, `threshold`, `colors`
        or a `topic_modeling_model` without a `max_df` parameter.
    ModuleNotFoundError
        if one of the optional plotting dependencies cannot be imported.
    """
    import inspect

    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    # `inspect.getargspec` was removed in Python 3.11; `signature` is the
    # supported replacement.
    if 'max_df' not in inspect.signature(topic_modeling_model).parameters:
        raise ValueError('topic_modeling_model must have `max_df` parameter')

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not (fuzzy_ratio > 0 and fuzzy_ratio <= 100):
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100')
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
        from fuzzywuzzy import fuzz
    except ImportError as err:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx, fuzzywuzzy not installed. Please install it and try again.'
        ) from err
    # Kept outside the try so a styling failure is not misreported as a
    # missing package.
    sns.set()

    if not isinstance(corpus, str):
        corpus = '. '.join(corpus)
    corpus = split_into_sentences(corpus)
    corpus = [string for string in corpus if len(string) > 5]

    if not colors:
        colors = sns.color_palette(n_colors=len(accepted_entities) + 1)
    elif len(colors) != (len(accepted_entities) + 1):
        raise ValueError('len of colors must same as %d' %
                         (len(accepted_entities) + 1))

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    # Top `topic_length` features of every component, highest weight first.
    topics = []
    for topic in topic_model.comp.components_:
        for i in topic.argsort()[:-topic_length - 1:-1]:
            topics.append(topic_model.features[i])

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity])))

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    # One pseudo-document per topic/entity: every sentence mentioning it.
    topics_corpus = []
    for topic in topics:
        nested_corpus = [
            string
            for string in corpus
            if (topic in string
                or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio)
        ]
        topics_corpus.append(' '.join(nested_corpus))
    corpus = topics_corpus

    if cleaning:
        corpus = [cleaning(text) for text in corpus]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stopwords)
        for text in corpus
    ]

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        # ndarray instead of np.matrix; feature names are not needed here.
        transformed_text_clean = np.asarray(
            vectorizer.transform(text_clean).todense())
    else:
        batches = []
        for i in range(0, len(text_clean), batch_size):
            batches.append(vectorizer.vectorize(text_clean[i:i + batch_size]))
        transformed_text_clean = np.concatenate(batches, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j != i and DxD[i, j] >= threshold:
                G.add_edge(i, j, weight=DxD[i, j])

    # Node ids are indices into `topics`, so read labels from there rather
    # than through `G.node`, which NetworkX 2.4 removed.
    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(color_dict[topics[node]])
        node_labels[node] = topics[node]

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(len(G.nodes())))
    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    # Dummy lines exist only to populate the legend.
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
def cluster_graph(
    corpus: List[str],
    vectorizer,
    threshold: float = 0.9,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    clustering=KMeans,
    figsize: Tuple[int, int] = (17, 9),
    with_labels: bool = True,
    batch_size: int = 20,
):
    """
    plot undirected graph with similar texts.

    Parameters
    ----------
    corpus: List[str]
        not modified; cleaning is applied to a copy.
    vectorizer: class
        vectorizer class, with either `fit` / `transform` or `vectorize`.
    threshold: float, (default=0.9)
        0.9 means, 90% above absolute pearson correlation.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must same with corpus. When omitted, titles
        are built from the top `ngram[1]` features or attention words.
    colors: List[str], (default=None)
        list of colors, length must same with num_clusters.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str] or List[str] or Tuple[str].
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    clustering: class, (default=KMeans)
        instantiated as `clustering(n_clusters=num_clusters)`; must expose
        `fit` and `labels_`.
    figsize: Tuple[int, int], (default=(17, 9))
        figure size passed to matplotlib.
    with_labels: bool, (default=True)
        draw node titles on the graph.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}

    Raises
    ------
    ValueError
        on mismatched `titles` / `colors` length or invalid `threshold`.
    ModuleNotFoundError
        if one of the optional plotting dependencies cannot be imported.
    """
    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if titles and len(titles) != len(corpus):
        raise ValueError('length of titles must be same with corpus')
    if colors and len(colors) != num_clusters:
        raise ValueError(
            'size of colors must be same with number of clusters')
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
    except ImportError as err:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx not installed. Please install it and try again.'
        ) from err
    # Kept outside the try so a styling failure is not misreported as a
    # missing package.
    sns.set()

    # Build a new list instead of overwriting the caller's corpus in place.
    if cleaning is not None:
        corpus = [cleaning(text) for text in corpus]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stopwords)
        for text in corpus
    ]

    uses_fit = hasattr(vectorizer, 'fit')
    if uses_fit:
        vectorizer.fit(text_clean)
        # ndarray rather than np.matrix: recent scikit-learn estimators
        # reject np.matrix input.
        transformed_text_clean = np.asarray(
            vectorizer.transform(text_clean).todense())
        # scikit-learn 1.2 removed `get_feature_names`; fall back to it only
        # for vectorizers that lack the new accessor.
        get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                     or vectorizer.get_feature_names)
        features = get_names()
    else:
        batches, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            batch = text_clean[i:i + batch_size]
            batches.append(vectorizer.vectorize(batch))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(batch))
            else:
                # No attention available: weight every word equally.
                attentions.extend(
                    [(w, 1.0) for w in s.split()] for s in batch)
        transformed_text_clean = np.concatenate(batches, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        titles = []
        for row in range(DxT.shape[0]):
            if uses_fit:
                # Highest-weighted features first.
                indices = np.argsort(DxT[row])[::-1]
                words = [features[k] for k in indices[:ngram[1]]]
            else:
                ranked = sorted(attentions[row], key=lambda pair: pair[1])
                words = [pair[0] for pair in ranked[-ngram[1]:]]
            titles.append(' '.join(words))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j != i and DxD[i, j] >= threshold:
                G.add_edge(i, j, weight=DxD[i, j])

    # Node ids are row indices, so look up cluster and title directly rather
    # than through `G.node`, which NetworkX 2.4 removed.
    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(colors[clusters[node]])
        node_labels[node] = titles[node]

    pos = nxlayout.fruchterman_reingold_layout(
        G, k=1.5 / np.sqrt(len(G.nodes())))
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
def cluster_dendogram(
    corpus: List[str],
    vectorizer,
    titles: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    random_samples: float = 0.3,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    plot hierarchical dendogram with similar texts.

    Parameters
    ----------
    corpus: List[str]
    vectorizer: class
        vectorizer class, with either `fit` / `transform` or `vectorize`.
    titles: List[str], (default=None)
        list of titles, length must same with corpus. The titles are
        sub-sampled together with the corpus so they stay aligned.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    figsize: Tuple[int, int], (default=(17, 9))
        figure size passed to matplotlib.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}

    Raises
    ------
    ValueError
        on mismatched `titles` length, `random_samples` outside (0, 1), or a
        sample of fewer than two documents.
    ModuleNotFoundError
        if one of the optional plotting dependencies cannot be imported.
    """
    if titles and len(titles) != len(corpus):
        raise ValueError('length of titles must be same with corpus')

    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram
    except ImportError as err:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        ) from err
    # Kept outside the try so a styling failure is not misreported as a
    # missing package.
    sns.set()

    sample_size = int(random_samples * len(corpus))
    if sample_size < 2:
        raise ValueError(
            'random_samples * len(corpus) must be at least 2 to build a dendogram')

    # Sample positions rather than items, so user-supplied titles can be
    # sub-sampled identically and stay aligned with their documents.
    picked = random.sample(range(len(corpus)), k=sample_size)
    corpus = [corpus[i] for i in picked]
    if titles:
        titles = [titles[i] for i in picked]

    if cleaning is not None:
        corpus = [cleaning(text) for text in corpus]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stopwords)
        for text in corpus
    ]

    uses_fit = hasattr(vectorizer, 'fit')
    if uses_fit:
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        # scikit-learn 1.2 removed `get_feature_names`; fall back to it only
        # for vectorizers that lack the new accessor.
        get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                     or vectorizer.get_feature_names)
        features = get_names()
    else:
        batches, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            batch = text_clean[i:i + batch_size]
            batches.append(vectorizer.vectorize(batch))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(batch))
            else:
                # No attention available: weight every word equally.
                attentions.extend(
                    [(w, 1.0) for w in s.split()] for s in batch)
        transformed_text_clean = np.concatenate(batches, axis=0)

    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)

    if not titles:
        titles = []
        for row in range(transformed_text_clean.shape[0]):
            if uses_fit:
                weights = np.asarray(
                    transformed_text_clean[row].todense())[0]
                # Highest-weighted features first.
                indices = np.argsort(weights)[::-1]
                words = [features[k] for k in indices[:ngram[1]]]
            else:
                ranked = sorted(attentions[row], key=lambda pair: pair[1])
                words = [pair[0] for pair in ranked[-ngram[1]:]]
            titles.append(' '.join(words))

    plt.figure(figsize=figsize)
    dendrogram(linkage_matrix, orientation='right', labels=titles)
    # tick_params expects booleans; the string 'off' is truthy.
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()

    return {'linkage_matrix': linkage_matrix, 'titles': titles}
def cluster_scatter(
    corpus: List[str],
    vectorizer,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    clustering=KMeans,
    decomposition=MDS,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    plot scatter plot on similar text clusters.

    Parameters
    ----------
    corpus: List[str]
        not modified; cleaning is applied to a copy.
    vectorizer: class
        vectorizer class, with either `fit` / `transform` or `vectorize`.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must same with corpus. When omitted, titles
        are built from the top `ngram[1]` features or attention words.
    colors: List[str], (default=None)
        list of colors, length must same with num_clusters.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    clustering: class, (default=KMeans)
        instantiated as `clustering(n_clusters=num_clusters)`; must expose
        `fit` and `labels_`.
    decomposition: class, (default=MDS)
        instantiated with `n_components=2` and fitted on the cosine distance
        matrix. MDS (or a subclass) is additionally given
        `dissimilarity='precomputed'`.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    figsize: Tuple[int, int], (default=(17, 9))
        figure size passed to matplotlib.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {
        'X': X,
        'Y': Y,
        'labels': clusters,
        'vector': transformed_text_clean,
        'titles': titles,
    }

    Raises
    ------
    ValueError
        on mismatched `titles` or `colors` length.
    ModuleNotFoundError
        if matplotlib or seaborn cannot be imported.
    """
    if titles and len(titles) != len(corpus):
        raise ValueError('length of titles must be same with corpus')
    if colors and len(colors) != num_clusters:
        raise ValueError(
            'size of colors must be same with number of clusters')

    validator.validate_object_methods(
        vectorizer, ['vectorize', 'fit'], 'vectorizer'
    )
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError as err:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        ) from err
    # Kept outside the try so a styling failure is not misreported as a
    # missing package.
    sns.set()

    # Build a new list instead of overwriting the caller's corpus in place.
    if cleaning:
        corpus = [cleaning(text) for text in corpus]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stopwords)
        for text in corpus
    ]

    uses_fit = hasattr(vectorizer, 'fit')
    if uses_fit:
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        # scikit-learn 1.2 removed `get_feature_names`; fall back to it only
        # for vectorizers that lack the new accessor.
        get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                     or vectorizer.get_feature_names)
        features = get_names()
    else:
        batches, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            batch = text_clean[i:i + batch_size]
            batches.append(vectorizer.vectorize(batch))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(batch))
            else:
                # No attention available: weight every word equally.
                attentions.extend(
                    [(w, 1.0) for w in s.split()] for s in batch)
        transformed_text_clean = np.concatenate(batches, axis=0)

    km = clustering(n_clusters=num_clusters)
    dist = 1 - cosine_similarity(transformed_text_clean)
    km.fit(transformed_text_clean)
    clusters = km.labels_.tolist()

    # `decomposition` is a class, not an instance, so test with issubclass.
    # `dist` is already a distance matrix, which MDS must be told about.
    if isinstance(decomposition, type) and issubclass(decomposition, MDS):
        decomposed = decomposition(n_components=2, dissimilarity='precomputed')
    else:
        decomposed = decomposition(n_components=2)
    pos = decomposed.fit_transform(dist)

    if not titles:
        titles = []
        for row in range(transformed_text_clean.shape[0]):
            if uses_fit:
                weights = np.asarray(
                    transformed_text_clean[row].todense())[0]
                # Highest-weighted features first.
                indices = np.argsort(weights)[::-1]
                words = [features[k] for k in indices[:ngram[1]]]
            else:
                ranked = sorted(attentions[row], key=lambda pair: pair[1])
                words = [pair[0] for pair in ranked[-ngram[1]:]]
            titles.append(' '.join(words))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)

    X, Y = pos[:, 0], pos[:, 1]
    # Comparing the plain list with an int gives a scalar False; an ndarray
    # is needed to get an element-wise mask.
    labels = np.array(clusters)
    plt.figure(figsize=figsize)
    for i in np.unique(labels):
        mask = labels == i
        plt.scatter(
            X[mask],
            Y[mask],
            color=colors[i],
            label='cluster %d' % (i),
        )
    for i in range(len(X)):
        plt.text(X[i], Y[i], titles[i], size=8)
    plt.legend()
    plt.show()

    return {
        'X': X,
        'Y': Y,
        'labels': clusters,
        'vector': transformed_text_clean,
        'titles': titles,
    }