def test_get_term_idx_and_x(self):
    docs = [whitespace_nlp('aa aa bb.'), whitespace_nlp('bb aa a.')]
    df = pd.DataFrame({'category': ['a', 'b'], 'parsed': docs})
    corpus_fact = CorpusFromParsedDocuments(df, category_col='category', parsed_col='parsed')
    corpus = corpus_fact.build()
    kvs = list(corpus_fact._term_idx_store.items())
    keys = [k for k, v in kvs]
    values = [v for k, v in kvs]
    self.assertEqual(sorted(keys), list(range(7)))
    self.assertEqual(sorted(values),
                     ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

    def assert_word_in_doc_cnt(doc, word, count):
        self.assertEqual(corpus_fact._X[doc, corpus_fact._term_idx_store.getidx(word)],
                         count)

    assert_word_in_doc_cnt(0, 'aa', 2)
    assert_word_in_doc_cnt(0, 'bb', 1)
    assert_word_in_doc_cnt(0, 'aa aa', 1)
    assert_word_in_doc_cnt(0, 'aa bb', 1)
    assert_word_in_doc_cnt(0, 'bb aa', 0)
    assert_word_in_doc_cnt(1, 'bb', 1)
    assert_word_in_doc_cnt(1, 'aa', 1)
    assert_word_in_doc_cnt(1, 'a', 1)
    assert_word_in_doc_cnt(1, 'bb aa', 1)
    assert_word_in_doc_cnt(1, 'aa aa', 0)
    assert_word_in_doc_cnt(1, 'aa a', 1)
    self.assertTrue(isinstance(corpus, ParsedCorpus))
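# For contrast with the private _term_idx_store/_X attributes checked above, a
# minimal sketch of the public way to see the same unigram and bigram counts.
# This is illustrative only (helper name is ours); it assumes the standard
# scattertext imports used throughout this file.
def _demo_term_freq_df():
    docs = [whitespace_nlp('aa aa bb.'), whitespace_nlp('bb aa a.')]
    df = pd.DataFrame({'category': ['a', 'b'], 'parsed': docs})
    corpus = CorpusFromParsedDocuments(df, category_col='category', parsed_col='parsed').build()
    # One row per term ('aa', 'bb', 'aa aa', ...), one frequency column per category.
    print(corpus.get_term_freq_df())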
def test_hamlet(self):
    raw_docs = get_hamlet_docs()
    categories = [get_hamlet_snippet_binary_category(doc) for doc in raw_docs]
    docs = [whitespace_nlp(doc) for doc in raw_docs]
    df = pd.DataFrame({'category': categories, 'parsed': docs})
    corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
    corpus = corpus_fact.build()
    tdf = corpus.get_term_freq_df()
    # .loc replaces the long-deprecated .ix indexer.
    self.assertEqual(list(tdf.loc['play']), [37, 5])
    self.assertFalse(any(corpus.search('play')
                         .apply(lambda x: 'plfay' in str(x['parsed']), axis=1)))
    self.assertTrue(all(corpus.search('play')
                        .apply(lambda x: 'play' in str(x['parsed']), axis=1)))
    # Verify the term-document matrix: the 'play' column should sum to the
    # total frequency across both categories.
    play_term_idx = corpus_fact._term_idx_store.getidx('play')
    play_X = corpus_fact._X.todok()[:, play_term_idx]
    self.assertEqual(play_X.sum(), 37 + 5)
def setUp(cls):
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = []
    for doc in cls.documents:
        cls.parsed_docs.append(whitespace_nlp(doc))
    cls.df = pd.DataFrame({'category': cls.categories, 'parsed': cls.parsed_docs})
    cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
def main():
    # Adapted from scattertext's Convention-Visualization-Empath demo (see the
    # convention main() below), applied here to drug-review data instead.
    feat_builder = FeatsFromOnlyEmpath()

    # Each row is [sentiment, feature, review text]; 1402 rows in total.
    all_satisfaction_score_comment_in_all_conds = \
        utils_data.get_all_satisfaction_score_comment_in_all_conds()

    columns = ['senti_on_Metfor_oral', 'feature', 'review']
    all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
        all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)

    corpus = CorpusFromParsedDocuments(
        all_satisfaction_score_comment_in_all_conds_df,
        category_col='senti_on_Metfor_oral',
        parsed_col='review',
        feats_from_spacy_doc=feat_builder).build()

    html = produce_scattertext_explorer(
        corpus,
        category='negative',
        category_name='Negative',
        not_category_name='Positive',
        width_in_pixels=1000,
        metadata=all_satisfaction_score_comment_in_all_conds_df['feature'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())

    open('/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/Convention-Visualization-Empath.html',
         'wb').write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
def setUp(cls):
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = []
    for doc in cls.documents:
        cls.parsed_docs.append(whitespace_nlp(doc))
    cls.df = pd.DataFrame({'category': cls.categories,
                           'author': ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e'],
                           'parsed': cls.parsed_docs,
                           'document_lengths': [len(doc) for doc in cls.documents]})
    cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
def _make_political_corpus(self):
    clean = clean_function_factory()
    data = []
    for party, speech in iter_party_speech_pairs():
        cleaned_speech = clean(speech)
        if cleaned_speech:  # truthiness already excludes both None and ''
            parsed_speech = whitespace_nlp(cleaned_speech)
            data.append({'party': party, 'text': parsed_speech})
    corpus = CorpusFromParsedDocuments(pd.DataFrame(data),
                                       category_col='party',
                                       parsed_col='text').build()
    return corpus
def build_hamlet_jz_corpus():
    # type: () -> Corpus
    categories, documents = get_docs_categories()
    # Treat bracketed lines as empty, then drop the empty documents.
    clean_function = lambda text: '' if text.startswith('[') else text
    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(df=df,
                                     category_col='category',
                                     parsed_col='parsed').build()
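# Usage sketch (helper name is ours; it relies only on the Corpus API already
# exercised in the tests above): the built corpus exposes per-category term
# frequencies directly.
def _demo_hamlet_jz_corpus():
    corpus = build_hamlet_jz_corpus()
    tdf = corpus.get_term_freq_df()  # index: terms; columns: per-category counts
    print(tdf.head())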
def main():
    df = pd.read_csv('https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
    df['text'] = df['text'].apply(chinese_nlp)
    corpus = CorpusFromParsedDocuments(df, category_col='novel', parsed_col='text').build()
    html = produce_scattertext_explorer(corpus,
                                        category='Tale of Two Cities',
                                        category_name='Tale of Two Cities',
                                        not_category_name='Ulysses',
                                        width_in_pixels=1000,
                                        metadata=df['novel'],
                                        chinese_mode=True)
    open('./demo_chinese.html', 'w').write(html)
    print('Open ./demo_chinese.html in Chrome or Firefox.')
def setUp(cls):
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = []
    for doc in cls.documents:
        cls.parsed_docs.append(whitespace_nlp(doc))
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'parsed': cls.parsed_docs,
        'orig': [d.upper() for d in cls.documents]
    })
    cls.parsed_corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
    cls.corpus = CorpusFromPandas(cls.df, 'category', 'orig', nlp=whitespace_nlp).build()
def main():
    convention_df = SampleCorpora.ConventionData2012.get_data()
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='text',
                                       feats_from_spacy_doc=FeatsFromOnlyEmpath()).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True)
    open('./Convention-Visualization-Empath.html', 'wb').write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
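# Hedged aside (helper name is ours): with FeatsFromOnlyEmpath, the plotted
# "terms" are Empath topics rather than words; they are stored as non-text
# (metadata) features, which is why use_non_text_features=True is passed
# above. A minimal sketch for inspecting them on the corpus built in main():
def _inspect_empath_topics(corpus):
    # Topic names live in the metadata store, not the term store.
    print(corpus.get_metadata()[:10])
    # Per-category topic frequencies, analogous to get_term_freq_df().
    print(corpus.get_metadata_freq_df().head())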
def build_hamlet_jz_corpus_with_meta():
    # type: () -> Corpus
    def empath_mock(doc, **kwargs):
        # Stand-in for the Empath analyzer: returns fake topic scores keyed
        # by the lengths of the document's first few tokens.
        toks = doc.split()
        num_toks = min(3, len(toks))
        return {'cat' + str(len(tok)): val
                for val, tok in enumerate(toks[:num_toks])}

    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(
        df=df,
        category_col='category',
        parsed_col='parsed',
        feats_from_spacy_doc=FeatsFromSpacyDocAndEmpath(
            empath_analyze_function=empath_mock)).build()
0: "parsed_text" }).reset_index(drop=True) original_data = df.reset_index(drop=True) df = pd.concat([original_data, cleaned_texts], axis=1) df['parsed_text'] = df['parsed_text'].apply(chinese_nlp) for i in np.arange(len(df['text'])): df['text'][i] = re.sub(pattern, '', df['text'][i]) df['text'] = df['text'].apply(chinese_nlp) corpus = CorpusFromParsedDocuments(df, category_col='file_name', parsed_col='parsed_text').build() html = produce_scattertext_explorer(corpus, category='安利蛋白粉评论.txt', category_name='安利蛋白粉评论.txt', not_category_name='汤臣倍健蛋白粉评论.txt', width_in_pixels=1000, metadata=df['file_name'], asian_mode=True, alternative_text_field="text") open( 'C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein_review_compare.html', 'w', encoding='utf-8').write(html) print(
def get_scattertext_html():
    file_names = os.listdir('C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein review')
    # Map each file name to its text.
    file_name_and_text = {}
    for file in file_names:
        with open('C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein review/' + file,
                  "r", encoding="UTF-8") as target_file:
            file_name_and_text[file] = target_file.read()
    file_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
                 .reset_index()
                 .rename(index=str, columns={'index': 'file_name', 0: 'text'}))
    df = file_data
    # Remove duplicate lines within each file.
    for i in np.arange(len(df)):
        df['text'][i] = "\n".join(list(dict.fromkeys(df['text'][i].split("\n"))))
    comment = df.text.values.tolist()

    # Load the user-defined dictionary, then segment with jieba.
    jieba.load_userdict('C:/users/CNU074VP/dict_out.csv')
    comment_s = []
    # Alternative pattern r'[\u4e00-\u9fa5]+' would keep only Chinese characters,
    # but the newline '\n' must be preserved, so it is not used here.
    pattern = re.compile(r'\<.*?\>')  # strip the <...> markup inside Taobao reviews
    for line in comment:
        line = line.replace(' ', '')  # assignment was missing in the original
        line = re.sub(pattern, '', line)
        comment_cut = jieba.lcut(line)
        comment_s.append(comment_cut)

    # Load the user-defined stop-word list and filter it out.
    stopwords = pd.read_excel('C:/users/CNU074VP/PycharmProjects/tmall_spider/stopwords.xlsx')
    stopwords = stopwords.stopword.values.tolist()
    comment_clean = []
    for line in comment_s:
        comment_clean.append([word for word in line if word not in stopwords])

    def get_single_doc(num):
        # Join one document's tokens back into a whitespace-delimited string.
        return ' '.join(str(item) for item in comment_clean[num])

    l_series = []
    for i in np.arange(len(df)):
        l_series.append(pd.Series(get_single_doc(i)))
    cleaned_texts = (pd.concat(l_series, ignore_index=True)
                     .to_frame()
                     .rename(columns={0: "parsed_text"})
                     .reset_index(drop=True))
    original_data = df.reset_index(drop=True)
    df = pd.concat([original_data, cleaned_texts], axis=1)
    df['parsed_text'] = df['parsed_text'].apply(chinese_nlp)
    for i in np.arange(len(df['text'])):
        df['text'][i] = re.sub(pattern, '', df['text'][i])
    df['text'] = df['text'].apply(chinese_nlp)

    corpus = CorpusFromParsedDocuments(df,
                                       category_col='file_name',
                                       parsed_col='parsed_text').build()
    html = produce_scattertext_explorer(corpus,
                                        category='安利蛋白粉评论.txt',
                                        category_name='安利蛋白粉评论.txt',
                                        not_category_name='汤臣倍健蛋白粉评论.txt',
                                        width_in_pixels=1000,
                                        metadata=df['file_name'],
                                        asian_mode=True,
                                        alternative_text_field="text")
    result = open('D:/scattertext/protein_review_compare.html',
                  'w', encoding='utf-8').write(html)
    return result
def setUp(cls):
    categories, documents = get_docs_categories()
    cls.df = pd.DataFrame({'category': categories, 'text': documents})
    cls.df['parsed'] = cls.df.text.apply(whitespace_nlp)
    cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
import spacy
import numpy as np
import pytextrank

nlp = spacy.load('en')  # on spaCy 3, load 'en_core_web_sm' and add the 'textrank' pipe instead
convention_df = SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply({'democrat': 'Democratic',
                                     'republican': 'Republican'}.get))
corpus = CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=PyTextRankPhrases()
).build().compact(AssociationCompactor(2000, use_non_text_features=True))

print('Aggregate PyTextRank phrase scores')
term_category_scores = corpus.get_metadata_freq_df('')
print(term_category_scores)

# Rank each phrase within its category (1 = highest TextRank score).
term_ranks = np.argsort(np.argsort(-term_category_scores, axis=0), axis=0) + 1
metadata_descriptions = {
    term: '<br/>' + '<br/>'.join(
        '<b>%s</b> TextRank score rank: %s/%s'
        % (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
        for cat in corpus.get_categories())
    for term in corpus.get_metadata()
}
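# The metadata_descriptions dict built above is typically handed to the
# explorer via its metadata_descriptions parameter, so each phrase's
# per-category rank appears in the term-details pane. A sketch of the
# hand-off; the output file name and category labels are assumptions, not
# taken from this snippet:
html = produce_scattertext_explorer(corpus,
                                    category='Democratic',
                                    not_category_name='Republican',
                                    use_non_text_features=True,
                                    use_full_doc=True,
                                    metadata_descriptions=metadata_descriptions,
                                    width_in_pixels=1000)
open('./demo_pytextrank_phrases.html', 'wb').write(html.encode('utf-8'))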
def build_hamlet_jz_corpus_with_alt_text():
    # type: () -> Corpus
    df = build_hamlet_jz_df_with_alt_text()
    return CorpusFromParsedDocuments(df=df,
                                     category_col='category',
                                     parsed_col='parsed').build()