def gitc(self, dataframe):
    """Render the General Inquirer frequency explorer and open it in a browser.

    A corpus is built from ``dataframe`` (category column ``'Document Type'``,
    text column ``'Text'``) with General Inquirer features, the frequency
    explorer is rendered for the ``'submission'`` category, the HTML is
    written to ``self.gitc_file`` and that file is opened with ``webbrowser``.

    :param dataframe: frame providing the ``'Document Type'``, ``'Text'`` and
        ``'Document'`` columns (``'Document'`` is used as per-document
        metadata).
    """
    general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
    corpus = st.CorpusFromPandas(
        dataframe,
        category_col='Document Type',
        text_col='Text',
        nlp=st.whitespace_nlp_with_sentences,
        feats_from_spacy_doc=general_inquirer_feature_builder).build()
    html = st.produce_frequency_explorer(
        corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        use_non_text_features=True,
        use_full_doc=True,
        term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
        grey_threshold=1.96,
        width_in_pixels=1000,
        metadata=dataframe['Document'],
        topic_model_term_lists=general_inquirer_feature_builder.
        get_top_model_term_lists())
    logger.getLogger().info("Opening GITC-Visual")
    # Close (and therefore flush) the file before the browser reads it.
    with open(self.gitc_file, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.gitc_file)
def plot_distinctive_words(x_label='', x_files=None, y_label='', y_files=None,
                           max_words=10000, max_files=100):
    '''
    Create a scatterplot that shows the distinctive words among x_files and
    y_files.

    Use x_label as the x axis label and y_label as the y axis label. At most
    max_files files are read per group and only the first max_words
    whitespace-separated words of each file are kept. Omitting x_files or
    y_files is the same as passing an empty list.

    Return HTML content that can be rendered to show the distinctive words.
    '''
    # None instead of a shared mutable [] default.
    x_files = [] if x_files is None else x_files
    y_files = [] if y_files is None else y_files

    def _leading_words(path):
        # Context manager so every input file is closed after reading.
        with open(path) as handle:
            return ' '.join(handle.read().split()[:max_words])

    rows = [[x_label, _leading_words(path)] for path in x_files[:max_files]]
    rows += [[y_label, _leading_words(path)] for path in y_files[:max_files]]
    df = pandas.DataFrame(rows, columns=['Group', 'Text'])

    nlp = spacy.load('en')
    # Lift spaCy's document-length guard so long files are not rejected.
    nlp.max_length = 2**64
    corpus = scattertext.CorpusFromPandas(df,
                                          category_col='Group',
                                          text_col='Text',
                                          nlp=nlp).build()
    html = scattertext.produce_scattertext_html(corpus,
                                                category=y_label,
                                                category_name=y_label,
                                                not_category_name=x_label,
                                                minimum_term_frequency=5,
                                                width_in_pixels=1000)
    return html
def create_scattertext_plot(df, category_col:str, text_col:str, nlp, filename:str,
                            label_match:str, label_name:str, label_other_name:str,
                            metadata_col:str, **kwargs):
    """
    Create an HTML file containing an interactive scattertext plot.

    An 'index' column, if present, is dropped from ``df`` IN PLACE because
    the scattertext function needs to create it.

    :param df: frame holding the category, text and metadata columns
    :param category_col: name of the column holding the two categories
    :param text_col: name of the column holding the text
    :param nlp: tokenizer/pipeline handed to ``st.CorpusFromPandas``
    :param filename: path of the HTML file to write
    :param label_match: must be one of the 2 entries in ``category_col``
    :param label_name: user-friendly name given to a match, e.g. if
        label_match is 'Yes', you might want 'A good week'
    :param label_other_name: label for the other entry, e.g. 'A bad week'
    :param metadata_col: column of the corpus frame used as per-document metadata
    :param kwargs: forwarded to ``st.produce_scattertext_explorer``,
        e.g. minimum_term_frequency=8
    :returns: nothing, but creates a HTML file
    """
    if 'index' in df.columns:
        df.drop('index', axis=1, inplace=True)
    corpus = st.CorpusFromPandas(df,
                                 category_col=category_col,
                                 text_col=text_col,
                                 nlp=nlp).build()
    html = st.produce_scattertext_explorer(corpus,
                                           category=label_match,
                                           category_name=label_name,
                                           not_category_name=label_other_name,
                                           metadata=corpus.get_df()[metadata_col],
                                           save_svg_button=True,
                                           **kwargs)
    # ``with`` guarantees the handle is closed even if the write raises.
    with open(filename, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
def getReviewPosNegPhrases(yelpScraperResult):
    """Return the terms most and least associated with 5-star reviews.

    The corpus is built from column ``1`` (text) and column ``2`` (category)
    of ``yelpScraperResult``. Terms are ranked by their scaled F-score for the
    ``'5.0 star rating'`` category.

    :param yelpScraperResult: scraped reviews frame; it is copied, not mutated.
    :returns: ``(positive, negative)`` where ``positive`` is the first 10 rows
        and ``negative`` the last 10 rows of the ranking, each with ``term``
        and ``score`` columns. Two empty frames are returned for empty input
        so the result can always be unpacked as a pair.
    """
    if yelpScraperResult.empty:
        # Same shape as the normal return value (previously a single frame).
        return pd.DataFrame(), pd.DataFrame()
    df = yelpScraperResult.copy()

    # Extends the stop words of the module-level ``nlp`` pipeline in place.
    nlp.Defaults.stop_words |= {
        'will', 'because', 'not', 'friends', 'amazing', 'awesome', 'first',
        'he', 'check-in', '=', '= =', 'male', 'u', 'want', 'u want', 'cuz',
        'him', "i've", 'deaf', 'on', 'her', 'told', 'told him', 'ins',
        'check-ins', 'check-in', 'check', 'I', 'i"m', 'i', ' ', 'it', "it's",
        'it.', 'they', 'coffee', 'place', 'they', 'the', 'this', 'its', 'l',
        '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m',
        '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(df, category_col=2, text_col=1,
                                 nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')

    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[['highratingscore', 'poorratingscore']]
    dh = dh.reset_index(drop=False)
    dh = dh.rename(columns={'highratingscore': 'score'})
    dh = dh.drop(columns='poorratingscore')

    # positive dataframe, negative dataframe
    return dh.head(10), dh.tail(10)
def vis():
    """Write a subjective-vs-objective scattertext explorer to an HTML file.

    Downloads the Cornell ``rotten_imdb`` tarball, labels every line of
    ``quote.tok.gt9.5000`` as ``'subjective'`` and every line of
    ``plot.tok.gt9.5000`` as ``'objective'``, builds a scattertext corpus with
    spaCy and writes the explorer page to ``Convention-Visualization.html``.
    """
    # ``urllib.urlopen`` only exists on Python 2; import the Python 3 name
    # locally so the function does not depend on how ``urllib`` was imported.
    from urllib.request import urlopen

    SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
    with urlopen(SUBJECTIVITY_URL) as response:
        data = io.BytesIO(response.read())
    with tarfile.open(fileobj=data, mode='r:gz') as tarball:
        quote = tarball.extractfile('quote.tok.gt9.5000').read()
        plot = tarball.extractfile('plot.tok.gt9.5000').read()

    # Construct subjective vs. objective pandas dataframe, treating review
    # quotes as subjective, and plot points as objective.
    df = pd.DataFrame(
        [{
            'text': text.strip(),
            'label': 'subjective'
        } for text in quote.decode('utf-8', errors='ignore').split('\n')] +
        [{
            'text': text.strip(),
            'label': 'objective'
        } for text in plot.decode('utf-8', errors='ignore').split('\n')])

    # Convert Pandas dataframe to a term-document matrix, indicating the
    # category column is "label" and the text column name is "text".
    nlp = spacy.load('en')
    corpus = ST.CorpusFromPandas(
        data_frame=df,
        category_col='label',
        text_col='text',
        # Note: use nlp=spacy.en.English() for text that's not pre-tokenized
        nlp=nlp).build()

    # ``category`` must be one of the values of the category column
    # ('subjective' / 'objective'), not the column name.
    html = ST.produce_scattertext_explorer(corpus,
                                           category='subjective',
                                           category_name='subjective',
                                           not_category_name='objective',
                                           width_in_pixels=1000)
    with open("Convention-Visualization.html", 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
def get_sct_html(rest_name, city_name):
    """Write the scattertext explorer for a restaurant's reviews to disk.

    Reviews come from ``get_rest_reviews(rest_name, city_name)``; the frame's
    ``'class'`` column is the category and ``'text'`` the document text. The
    page is written to ``rest_reviews-Vis.html``.

    :returns: the value returned by the file's ``write`` call (the number of
        bytes written), not the HTML itself.
    """
    rest_reviews = get_rest_reviews(rest_name, city_name)
    nlp = spacy.load('en_core_web_sm')
    corpus = sct.CorpusFromPandas(rest_reviews,
                                  category_col='class',
                                  text_col='text',
                                  nlp=nlp).build()
    html = sct.produce_scattertext_explorer(corpus,
                                            category='good',
                                            category_name='Positive',
                                            not_category_name='Negative',
                                            width_in_pixels=900,
                                            metadata=rest_reviews['class'])
    # Context manager closes the handle deterministically.
    with open("rest_reviews-Vis.html", 'wb') as html_file:
        return html_file.write(html.encode('utf-8'))
def getReviewPosNegPhrases(df_reviews, topk=10):
    """Rank review terms by scaled F-score for the best and worst star class.

    The ``'stars'`` column (cast to ``str``) is the category and ``'text'``
    the document text. The "high" ranking uses category ``'5'`` when present,
    otherwise ``'4'``; the "poor" ranking uses ``'1'`` when present, otherwise
    ``'2'``. A ranking whose categories are all absent is an empty frame.

    :param df_reviews: reviews frame; it is copied, not mutated.
    :param topk: number of rows kept per ranking.
    :returns: ``(df_high, df_poor)``, each with ``term`` and ``score`` columns.
    """
    if df_reviews.empty:
        return pd.DataFrame(), pd.DataFrame()

    reviews = df_reviews.copy()
    reviews['stars'] = reviews['stars'].astype(str)

    nlp = spacy.load("en_core_web_sm")
    nlp.Defaults.stop_words |= {
        'will', 'because', 'not', 'friends', 'amazing', 'awesome', 'first',
        'he', 'check-in', 'and', 'some', '=', '= =', 'male', 'u', 'want',
        'u want', 'cuz', 'also', 'find', 'him', "i've", 'deaf', 'on', 'her',
        'told', 'told him', 'ins', 'check-ins', 'check-in', 'check', 'I',
        'i"m', 'i', ' ', 'it', "it's", 'it.', 'they', 'coffee', 'place',
        "it 's", "'s", 'they', 'the', 'this', 'its', 'l', '-', 'they', 'this',
        'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m', '!', '&', '1', '2', '3',
        '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(reviews,
                                 category_col='stars',
                                 text_col='text',
                                 nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()
    present = reviews['stars'].unique()

    def scores_for_first_present(labels):
        # Scores of the first listed star label that occurs in the data.
        for label in labels:
            if label in present:
                return corpus.get_scaled_f_scores(label)
        return np.array([])

    def ranked_terms(scores):
        # Best-scoring ``topk`` terms, or an empty frame without scores.
        if not scores.shape[0] > 0:
            return pd.DataFrame()
        table = pd.DataFrame([term_freq_df.index.tolist(), scores]).T
        table = table.sort_values(1, ascending=False).head(topk)
        table.columns = ['term', 'score']
        return table

    high = scores_for_first_present(('5', '4'))
    poor = scores_for_first_present(('1', '2'))
    df_high = ranked_terms(high)
    df_poor = ranked_terms(poor)

    # positive dataframe, negative dataframe
    return df_high.head(topk), df_poor.tail(topk)
def create_corpus(category, speeches_df):
    """
    Build a scattertext corpus from the speeches frame and strip stop words.

    :param category: name of the column of ``speeches_df`` used as category
    :param speeches_df: frame with a ``'text'`` column holding the documents
    :return: the corpus with every ``STOP_WORDS`` entry it contains removed
    """
    built = st.CorpusFromPandas(speeches_df,
                                category_col=category,
                                text_col='text',
                                nlp=nlp).build()
    # Only terms present in the corpus are passed to remove_terms.
    known_terms = built._term_idx_store
    present_stop_words = [word for word in STOP_WORDS if word in known_terms]
    return built.remove_terms(present_stop_words)
def create_scatterplot(df, return_corpus=False):
    '''Creates an HTML file to visualize differences in corpora.

    The corpus uses the 'author' column as category and 'text' as text.

    :param df: frame with 'author' and 'text' columns
    :param return_corpus: when true, return the built corpus and write no file
    :returns: the corpus if ``return_corpus`` is true, otherwise None after
        writing "Author-Visualization.html"
    '''
    corpus = st.CorpusFromPandas(df,
                                 category_col='author',
                                 text_col='text',
                                 nlp=nlp).build()
    if return_corpus:
        return corpus
    html = st.produce_scattertext_explorer(corpus,
                                           category='EAP',
                                           category_name='Edger Allen Poe',
                                           not_category_name='HPL/MWS',
                                           width_in_pixels=1000,
                                           metadata=df['author'])
    # Context manager closes the handle deterministically.
    with open("Author-Visualization.html", 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
def standard(self, dataframe):
    """Write the standard scattertext explorer for two documents.

    The corpus uses ``'Document Type'`` as category and ``'Text'`` as text,
    tokenized with ``self.nlp``. The page compares ``'1st Document'`` against
    the rest and is written to ``self.std_file``.

    :param dataframe: frame with ``'Document Type'`` and ``'Text'`` columns.
    """
    corpus = st.CorpusFromPandas(dataframe,
                                 category_col='Document Type',
                                 text_col='Text',
                                 nlp=self.nlp).build()
    html = st.produce_scattertext_explorer(corpus,
                                           category='1st Document',
                                           category_name='1st Document',
                                           not_category_name='2nd Document',
                                           width_in_pixels=1000)
    logger.getLogger().info("Opening Standard Visual")
    # Close the file before checking for it on disk.
    with open(self.std_file, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    if os.path.isfile(self.std_file):
        logger.getLogger().info("Graph file created")
def standard(self, dataframe):
    """Render the standard scattertext explorer and open it in a browser.

    The corpus uses ``'Document Type'`` as category and ``'Text'`` as text,
    tokenized with ``self.nlp``. The page compares the ``'submission'``
    category against the rest, is written to ``self.std_file`` and is then
    opened with ``webbrowser``.

    :param dataframe: frame with ``'Document Type'``, ``'Text'`` and
        ``'Document'`` columns (``'Document'`` is per-document metadata).
    """
    corpus = st.CorpusFromPandas(dataframe,
                                 category_col='Document Type',
                                 text_col='Text',
                                 nlp=self.nlp).build()
    html = st.produce_scattertext_explorer(corpus,
                                           category='submission',
                                           category_name='Submission',
                                           not_category_name='Standard',
                                           width_in_pixels=1000,
                                           metadata=dataframe['Document'])
    logger.getLogger().info("Opening Standard Visual")
    # Close (and therefore flush) the file before the browser reads it.
    with open(self.std_file, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.std_file)
def __init__(self, list_directory, list_author, language: str = 'fr', encoding='utf-8'):
    """Load the texts, pair them with their authors and build the corpus.

    :param list_directory: passed to ``self.read_directory`` to obtain the texts
    :param list_author: author label for each text (the corpus category)
    :param language: name handed to ``spacy.load``
    :param encoding: passed to ``self.read_directory``
    """
    texts = self.read_directory(list_directory, encoding)

    frame = pd.DataFrame()
    frame["text"] = texts
    frame["author"] = list_author

    pipeline = spacy.load(language)

    self.list_text = texts
    self.list_author = list_author
    self.df = frame
    self.language = language
    self.nlp = pipeline
    self.corpus = st.CorpusFromPandas(frame,
                                      category_col='author',
                                      text_col='text',
                                      nlp=pipeline).build()
def word_similarity_graph(self, dataframe, word):
    """Write the word-similarity explorer for ``word`` to ``self.term_file``.

    The corpus uses ``'Document Type'`` as category and ``'Text'`` as text,
    tokenized with ``self.nlp``.

    :param dataframe: frame with ``'Document Type'`` and ``'Text'`` columns.
    :param word: target term passed to ``word_similarity_explorer``.
    """
    corpus = st.CorpusFromPandas(dataframe,
                                 category_col='Document Type',
                                 text_col='Text',
                                 nlp=self.nlp).build()
    html = word_similarity_explorer(corpus,
                                    category='1st Document',
                                    category_name='1st Document',
                                    not_category_name='2nd Document',
                                    target_term=word,
                                    minimum_term_frequency=5,
                                    pmi_threshold_coefficient=4,
                                    width_in_pixels=1000,
                                    alpha=0.01,
                                    max_p_val=0.05,
                                    save_svg_button=True)
    logger.getLogger().info("Opening Word Similarity Visual")
    # Context manager closes the handle deterministically.
    with open(self.term_file, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
def processor(df_reviews):
    """Return the five terms most tied to 5-star and to 1-star reviews.

    Stop words are read from ``stopwords.txt`` (one per line), added to the
    spaCy defaults and removed from the corpus. The corpus uses ``'rating'``
    as category and ``'text'`` as text.

    :param df_reviews: reviews frame with ``'rating'`` and ``'text'`` columns.
    :returns: ``None`` for empty input; otherwise a frame of 10 rows (top 5 by
        ``highratingscore`` followed by top 5 by ``poorratingscore``) holding
        the term and both scores rounded to 2 decimals.
    """
    if len(df_reviews) == 0:
        return None

    nlp = spacy.load("en_core_web_sm-2.1.0/en_core_web_sm/en_core_web_sm-2.1.0")

    # add stop words (local name no longer shadows the builtin ``str``)
    with open('stopwords.txt', 'r') as f:
        set_stopwords = set(f.read().split('\n'))
    nlp.Defaults.stop_words |= set_stopwords

    corpus = (scattertext.CorpusFromPandas(df_reviews,
                                           category_col='rating',
                                           text_col='text',
                                           nlp=nlp)
              .build()
              .remove_terms(nlp.Defaults.stop_words, ignore_absences=True))

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores('5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores('1.0 star rating')

    score_columns = ['highratingscore', 'poorratingscore']

    def _top_five(sort_column):
        # One pipeline for both rankings; ``round`` returns a new frame, so
        # nothing is assigned into a slice of ``term_freq_df``.
        ranked = term_freq_df.sort_values(by=sort_column, ascending=False)
        return ranked[score_columns].round(2).reset_index(drop=False).head(5)

    df_high = _top_five('highratingscore')
    df_poor = _top_five('poorratingscore')
    return pd.concat([df_high, df_poor], ignore_index=True)
def getYelpWords(yelpScraperResult):
    """Return the terms most and least associated with 5-star reviews.

    The corpus is built from column ``1`` (text) and column ``2`` (category)
    of ``yelpScraperResult``; terms are ranked by their scaled F-score for
    ``'5.0 star rating'``.

    :returns: ``{'positive': [...], 'negative': [...]}`` where each list holds
        ``{'term': ..., 'score': ...}`` dicts for the first 10 and the last 10
        rows of the ranking respectively.
    """
    nlp.Defaults.stop_words |= {
        'will', 'because', 'not', 'friends', 'amazing', 'awesome', 'first',
        'he', 'check-in', '=', '= =', 'male', 'u', 'want', 'u want', 'cuz',
        'him', "i've", 'deaf', 'on', 'her', 'told', 'told him', 'ins',
        'check-ins', 'check-in', 'check', 'I', 'i"m', 'i', ' ', 'it', "it's",
        'it.', 'they', 'coffee', 'place', 'they', 'the', 'this', 'its', 'l',
        '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m',
        '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(yelpScraperResult,
                                 category_col=2,
                                 text_col=1,
                                 nlp=nlp).build()
    scores = corpus.get_term_freq_df()
    scores['highratingscore'] = corpus.get_scaled_f_scores('5.0 star rating')
    scores['poorratingscore'] = corpus.get_scaled_f_scores('1.0 star rating')

    ranking = (scores.sort_values(by='highratingscore', ascending=False)
               [['highratingscore', 'poorratingscore']]
               .reset_index(drop=False)
               .rename(columns={'highratingscore': 'score'})
               .drop(columns='poorratingscore'))

    def as_records(frame):
        # One {'term', 'score'} dict per row, in row order.
        return [{'term': term, 'score': score}
                for term, score in zip(frame['term'], frame['score'])]

    return {
        'positive': as_records(ranking.head(10)),
        'negative': as_records(ranking.tail(10)),
    }
def wordfreqdf(df):
    """Return the six highest-scoring terms for every party.

    The corpus uses ``'party'`` as category and ``'work_for'`` as text. For
    each party (sorted), terms are ranked by their scaled F-score for that
    party and the first six are kept.

    :param df: frame with ``'party'`` and ``'work_for'`` columns.
    :returns: frame indexed by ``Party`` with columns ``"1"`` to ``"6"``.
    """
    corpus = st.CorpusFromPandas(df,
                                 category_col='party',
                                 text_col='work_for',
                                 nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()
    result_df = pd.DataFrame(columns=["1", "2", "3", "4", "5", "6"])
    parties = df['party'].sort_values().unique()

    for row_number, party in enumerate(parties):
        score_column = party + "_Score"
        term_freq_df[score_column] = corpus.get_scaled_f_scores(party)
        ranked_terms = term_freq_df.sort_values(by=score_column,
                                                ascending=False).index
        result_df.loc[row_number] = list(ranked_terms[:6])

    result_df["Party"] = parties
    result_df.set_index("Party", inplace=True)
    return result_df
def chrctrstc(self, dataframe):
    """Render the characteristic-terms explorer and open it in a browser.

    The corpus uses ``'Document Type'`` as category and ``'Text'`` as text,
    is reduced to unigrams and compacted with a ``ClassPercentageCompactor``.
    The page is written to ``self.chr_file`` and opened with ``webbrowser``.

    :param dataframe: frame with ``'Document Type'``, ``'Text'`` and
        ``'Document'`` columns (``'Document'`` is per-document metadata).
    """
    corpus = (st.CorpusFromPandas(dataframe,
                                  category_col='Document Type',
                                  text_col='Text',
                                  nlp=st.whitespace_nlp_with_sentences).
              build().get_unigram_corpus().compact(
                  st.ClassPercentageCompactor(
                      term_count=2,
                      term_ranker=st.OncePerDocFrequencyRanker)))
    html = st.produce_characteristic_explorer(
        corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        metadata=dataframe['Document'])
    logger.getLogger().info("Opening Characteristic Visual")
    # Close (and therefore flush) the file before the browser reads it.
    with open(self.chr_file, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.chr_file)
def create_visual_corpus(category, speeches_df):
    """
    Build a scattertext corpus from the speeches frame for visualisation.

    The module-level ``STOP_WORDS`` set is extended in place with a few extra
    tokens before the stop words found in the corpus are removed.

    :param category: name of the column of ``speeches_df`` used as category
    :param speeches_df: frame with a ``'text'`` column holding the documents
    :return: the corpus with every ``STOP_WORDS`` entry it contains removed
    """
    built = st.CorpusFromPandas(speeches_df,
                                category_col=category,
                                text_col='text',
                                nlp=nlp).build()
    STOP_WORDS.update([
        "»", "—", "«", "cuyas", "cuyos", "100", "fué", "ido", "hubieran",
        "hagan", "–", "hubiera", "tuve"
    ])
    # Only terms present in the corpus are passed to remove_terms.
    known_terms = built._term_idx_store
    present_stop_words = [word for word in STOP_WORDS if word in known_terms]
    return built.remove_terms(present_stop_words)
def scatterplot(df):
    '''
    input: a dataframe with text, CEO, and quarter
    output: a scatterplot, written to '../Charts/scattertext_demo.html'

    The 'ceo' column is the category, 'text' the document text and 'quarter'
    the per-document metadata.
    '''
    corpus = st.CorpusFromPandas(df,
                                 category_col='ceo',
                                 text_col='text',
                                 nlp=st.whitespace_nlp_with_sentences).build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Ballmer',
        category_name='Steve Ballmer Era',
        not_category_name='Satya Nadella Era',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df['quarter'],
    )
    # Context manager closes the handle deterministically.
    with open('../Charts/scattertext_demo.html', 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
def generate_visual(data, category, category_name, not_category_name, filename='index.html'):
    """Return the scattertext explorer HTML for a frame of abstracts.

    The corpus uses ``'label'`` as category and ``'abstract'`` as text; the
    ``'journal'`` column is the per-document metadata.

    :param data: frame with ``'label'``, ``'abstract'`` and ``'journal'`` columns
    :param category: value of ``'label'`` to contrast against the rest
    :param category_name: display name for ``category``
    :param not_category_name: display name for the other documents
    :param filename: accepted but not used by this function
    :returns: the HTML page as a string
    """
    import spacy
    import scattertext as st

    pipeline = spacy.load('en_core_web_sm')
    builder = st.CorpusFromPandas(data,
                                  category_col='label',
                                  text_col='abstract',
                                  nlp=pipeline)
    return st.produce_scattertext_explorer(builder.build(),
                                           category=category,
                                           category_name=category_name,
                                           not_category_name=not_category_name,
                                           width_in_pixels=1000,
                                           metadata=data['journal'])
def ValuePredictor(yelp_url, from_isbn=False):
    '''Scrape a Yelp business page for reviews and score the review terms.

    Ten pages of the review feed are fetched concurrently, reviews are split
    into sentences, tokenized, stripped of stop words and punctuation, and a
    scattertext corpus is built with 'rating' as category. Terms get a scaled
    F-score for '5.0 star rating' and for '1.0 star rating'.

    :param yelp_url: URL of the form https://www.yelp.com/biz/<business id>,
        optionally followed by a query string
    :param from_isbn: accepted but not used by this function
    :returns: dict keyed by term (``DataFrame.to_dict('index')``) for the 75
        first and 75 last terms of the ranking by highratingscore
    '''
    base_url = "https://www.yelp.com/biz/"  # add business id
    api_url = "/review_feed?sort_by=date_desc&start="
    bid = yelp_url.replace('https://www.yelp.com/biz/', '')
    if '?' in bid:
        # Delete everything after "?". This must split the extracted id, not
        # the full URL, otherwise the base URL ends up in the id.
        bid = bid.split('?')[0]

    class Scraper():
        def __init__(self):
            self.data = pd.DataFrame()

        def get_data(self, n, bid=bid):
            """Fetch review page ``n`` and return it as a 3-column frame."""
            with Session() as s:
                # makes an http get request to given url and returns response as json
                with s.get(base_url + bid + api_url + str(n * 20)) as resp:
                    r = dict(resp.json())  # converts json response into a dictionary
                    _html = html.fromstring(r['review_list'])  # loads from dictionary
                    dates = _html.xpath(
                        "//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()"
                    )
                    reviews = [
                        el.text for el in _html.xpath(
                            "//div[@class='review-content']/p")
                    ]
                    ratings = _html.xpath(
                        "//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title"
                    )
            return pd.DataFrame([dates, reviews, ratings]).T

        def scrape(self):
            # Concurrent fetching makes it faster. Workers return their page
            # and the pages are combined once here, so no worker rebinds
            # ``self.data`` while another one is reading it.
            with Executor(max_workers=40) as e:
                pages = list(e.map(self.get_data, range(10)))
            self.data = pd.concat([self.data] + pages)

    s = Scraper()
    s.scrape()
    df = s.data
    df.columns = ['date', 'review', 'rating']
    # One row per sentence: split each review on '.' and stack the pieces.
    df = df.set_index(df.columns.drop('review').tolist()).review.str.split(
        '.', expand=True).stack().reset_index().rename(columns={
            0: 'review'
        }).loc[:, df.columns]
    # NOTE(review): DataFrame.replace with plain strings replaces whole cells
    # equal to the string, not substrings - confirm that is intended.
    df = df.replace(',', '')
    df = df.replace('!', '')
    df = df.replace('#', '')
    df = df.replace('.', '')

    tokenizer = Tokenizer(nlp.vocab)
    STOP_WORDS = nlp.Defaults.stop_words.union([
        'gets', 'incredible', 'disappoint', 'from', 'perfection', 'loved',
        'definitely', 'happy', 'find', 'found', 'simply', 'fantastic',
        'recommend', 'feel', 'little', 'i', 'wow', 'absolute', 'favorite',
        'excellent', 'delicious', 'great', 'maybe', 'very', 'enjoy', 'list',
        'gave', 'date', 'went', 'disappointed', 'nyc', 'got', '#', 'crazy',
        'other', 'fairness', 'fair', 'mid', 'from', 'highly', 'perfect',
        'perfectly', 'come', 'lovely', 'visit', 'ny', 'nyc', 'best',
        'amazing', 'love', 'absolutely', 'like', 'good', 'other', 'from',
        'ny', 'restaurant', 'we', 'will', 'because', 'not', 'friends',
        'amazing', 'awesome', 'first', 'he', 'check-in', '=', '= =', 'male',
        'u', 'want', 'u want', 'cuz', 'him', "i've", 'deaf', 'on', 'her',
        'told', 'told him', 'ins', 'check-ins', 'check-in', 'check', 'I',
        'i"m', 'i', 'it', "it's", 'it.', 'they', 'coffee', 'place', 'they',
        'the', 'this', 'its', 'l', '-', 'they', 'this', 'don"t', 'the ',
        ' the', 'it', 'i"ve', 'i"m', '!', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', '0', '(', ')', '/', '.', ',', '!'
    ])
    # ``!= None`` compares element-wise and never filters anything; notna()
    # actually drops missing reviews before they reach the tokenizer.
    df = df[df['review'].notna()].copy()

    tokens = []
    for doc in tokenizer.pipe(df['review'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if token.text not in STOP_WORDS and not token.is_punct:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)
    df['review'] = tokens
    df['review'] = df['review'].apply(lambda words: ' '.join(map(str, words)))
    # Assign the result instead of a chained inplace replace, which may act on
    # a temporary copy and leave ``df`` unchanged.
    df['review'] = df['review'].replace(' ', np.nan)
    df = df.dropna()

    corpus = (st.CorpusFromPandas(df,
                                  category_col='rating',
                                  text_col='review',
                                  nlp=nlp).build().remove_terms(
                                      STOP_WORDS, ignore_absences=True))
    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')

    dp = term_freq_df.sort_values(by='poorratingscore', ascending=False)
    dp = dp[~dp.index.str.contains('-')]
    dp = dp[~dp.index.str.contains("'")]
    dp = dp[~dp.index.str.contains('/')]

    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[~dh.index.str.contains('-')]
    dh = dh[~dh.index.str.contains("'")]
    dh = dh[~dh.index.str.contains('/')]

    dhi = dh.head(75)
    # NOTE(review): ``dp`` (ranked by poorratingscore) is computed but never
    # used; the "poor" half is the tail of the high ranking. Confirm whether
    # ``dp.head(75)`` was intended.
    dpo = dh.tail(75)
    dfinal = pd.concat([dhi, dpo])
    return dfinal.to_dict('index')
from sklearn.decomposition import KernelPCA, NMF
from sklearn.preprocessing import RobustScaler
from statsmodels.multivariate.pca import PCA

import scattertext as st

# Pair plot of the 2012 convention speeches using General Inquirer
# categories as (metadata) features, one point per speaker.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=general_inquirer_feature_builder.
    get_top_model_term_lists(),
    topic_model_preview_size=100,
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_geninq.html'
# Context manager closes the handle deterministically.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
print('./' + file_name)
# Join the two dataframes along the column axis
convention_df = pd.concat([df1, df2], axis=1)

# Place all text in the same column ('value') and tag each row with its
# source column name ('variable', i.e. CNN or Fox)
convention_df = pd.melt(convention_df)
convention_df = convention_df.dropna(axis=0, how='any')

# Tokenizer used to parse the corpus
English = st.whitespace_nlp_with_sentences

# Word count per source (notebook display only; the result is not stored)
convention_df.groupby('variable').apply(lambda x: x.value.apply(lambda x: len(x.split())).sum())
# Parse the text and create a new column with the parsed values
convention_df['parsed'] = convention_df.value.apply(English)
convention_df.iloc[:3]

# Generate corpus of language from pandas dataframe
corpus = st.CorpusFromPandas(convention_df,
                             category_col='variable',
                             text_col='value',
                             nlp=English).build()

# Output html doc for visualization
# The output/ folder must already exist: open() creates the file but not
# missing directories.
html = st.produce_scattertext_explorer(corpus,
                                       category='CNN',
                                       category_name='CNN',
                                       not_category_name='Fox',
                                       width_in_pixels=1000)
file_name = 'output/Trump.html'
# Close the file before the IFrame points at it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
import spacy  # Used by scattertext itself for preprocessing
import pandas  # Used to shape the dataset for building the scattertext corpus
import pickle  # Only used to load the dataset; you can load it any way you like
import scattertext  # Was used below without being imported

# Load the dataset.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data1", 'rb') as data_file:
    documentos, classes = pickle.load(data_file)

# Convert the dataset to a pandas DataFrame, the table from which the
# scattertext corpus is built. The texts must not be preprocessed, because
# scattertext uses spaCy for that.
columns = {"texto": documentos, "classes": classes}  # no longer shadows builtin ``dict``
data = pandas.DataFrame(columns)

# Build the scattertext corpus; it gives access to a lot of useful
# information about the dataset.
nlp = spacy.load('en')
corpus = scattertext.CorpusFromPandas(data,
                                      category_col='classes',
                                      text_col='texto',
                                      nlp=nlp).build()

# Examples
print("Número de documentos: " + str(corpus.get_num_docs()))
print("Tamanho de documentos: " + str(corpus.get_doc_lengths()))
print("Número de termos: " + str(corpus.get_num_terms()))
print("Palavras que diferem dos corpus comuns: ")
x = corpus.get_scaled_f_scores_vs_background()
print(list(x.index[0:10]))

# Word frequencies per class
term_freq_df = corpus.get_term_freq_df()
term_freq_df['positivo'] = corpus.get_scaled_f_scores('positivo')
term_freq_df['negativo'] = corpus.get_scaled_f_scores('negativo')
#%% import scattertext as st import spacy from pprint import pprint import feather as fea #%% df = fea.read_dataframe("C:/Users/au615270/Dropbox/CROW_FAR/First_Repository_CROW_FAR/full model files/GloVe Model/dataframe.feather") df.iloc[0] # %% nlp = spacy.load('en_core_web_sm') corpus = st.CorpusFromPandas(df, category_col='RU', text_col='referat', nlp=nlp).build() #%% print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))
def _reviews_for_terms(df, terms, positions, ascending):
    """Pick, per term, the first review containing it after sorting by stars."""
    picked = []
    for position in positions:
        matches = df[df['only_alphabets'].str.contains(terms.iloc[position])]
        if matches.empty:
            # No review contains this term (previously an ignored IndexError).
            continue
        review = matches.sort_values(by='stars',
                                     ascending=ascending)['text'].iloc[0]
        if review not in picked:  # prevent duplicates
            picked.append(review)
    return picked


def _phrases_around_terms(terms, positions, reviews, joiner):
    """Collect a window (2 words before, 3 after) around each term match."""
    phrases = []
    for position in positions:
        term = terms.iloc[position]
        for review in reviews:
            word_list = review.split(' ')
            for word in word_list:
                if term == word:
                    index = word_list.index(word)
                    phrases.append(joiner.join(
                        word_list[max(0, index - 2):min(index + 4,
                                                        len(word_list))]))
    return phrases


def _score_phrases(phrases, replacements, offset=0.0):
    """Turn phrases into a cleaned, scored frame with 'score' and 'term'."""
    frame = pd.DataFrame(phrases, columns=['term']).reset_index(drop=False)
    frame = frame.rename(columns={'index': 'score'})
    unique = frame.drop_duplicates(subset='term')
    if len(unique) <= 10:
        # Pad with blank phrases (score .5) up to 10 unique entries.
        missing = 10 - len(unique)
        if missing > 0:
            padding = pd.DataFrame([[.5, '']] * missing,
                                   columns=['score', 'term'],
                                   index=[0] * missing)
            frame = padding if frame.empty else pd.concat([frame, padding])
    else:
        frame = unique.copy()
    for old, new in replacements.items():
        frame['term'] = frame['term'].str.replace(old, new, regex=False)
    # The score is the order of appearance, normalised into a band 0.5 wide.
    frame['score'] = frame['score'].div(frame['score'].max() * 2,
                                        axis=0) + offset
    frame = frame.sort_values(by=['score'], ascending=False)
    frame['score'] = frame['score'].round(decimals=4)
    return frame


def getPosNegLongPhrases(df_reviews, topk=10):
    """Extract short positive and negative phrases from reviews.

    Terms are scored with scattertext (``'stars'`` as category, the
    letters-only lower-cased ``'text'`` as document). For the first half of
    the ranked terms the highest-star review containing the term is picked,
    for the second half the lowest-star one; a window of words around each
    exact term occurrence becomes a phrase.

    :param df_reviews: reviews frame with ``'stars'`` and ``'text'`` columns;
        it is copied, not mutated.
    :param topk: number of rows returned per frame.
    :returns: ``(positive_df, negative_df)`` with ``score`` and ``term``
        columns; positive scores are normalised into 0.5-1, negative scores
        into 0-0.5. Two empty frames are returned for empty input.
    """
    if df_reviews.empty:
        # Same shape as the normal return; checked before loading spaCy.
        return pd.DataFrame(), pd.DataFrame()
    nlp = spacy.load("en_core_web_sm")

    df = df_reviews.copy()
    df['stars'] = df['stars'].astype(str)
    df = df.dropna()
    df['only_alphabets'] = df['text'].apply(
        lambda x: ' '.join(re.findall("[a-zA-Z]+", x)))
    # The original looped over ``replace_dict_phrase_count`` here before that
    # name was assigned (UnboundLocalError); the loop body only lower-cased.
    df['only_alphabets'] = df['only_alphabets'].str.lower()

    stopwords = [
        'maybe', 'from', 'first', 'here', 'only', 'put', 'where', 'got',
        'sure', 'their', 'us', 'definitely', 'food', 'yet', 'our', 'go',
        'since', 'really', 'very', 'two', "don t", 'with', 'if', "hers",
        'which', 'came', 'all', 'me', 'makes', 'make', 'were', 'immediately',
        'get', 'been', 'ahead', 'also', 'that', 'were', 'one', 'have', 'see',
        'what', 'to', 'we', 'had', 'the', "re", 'it', 'or', 'he', 'she', 'we',
        'us', 'how', 'went', 'no', 'of', 'has', 'by', 'bit', 'thing', 'place',
        'so', 'ok', 'and', 'they', 'none', 'was', 'you', "ve", 'was', 'did',
        'be', 'and', 'but', 'is', 'as', 'you', 'has', 'and', 'had', 'was',
        'him', 'so', 'my', 'did', 'our', 'there', 'would', 'her', 'him', 'it',
        'is', 'by', 'bit', 'thing', 'place', 'while', 'check in', 'they',
        'them', 'want', 'good', 'husband', 'want', 'love', 'something',
        'your', 'they', 'your', 'cuz', 'him', "i ll", 'her', 'told', 'check',
        'im', "his", 'they', 'this', 'it s', 'they', 'this', "won t", 'the',
        'it', 'i ve'
    ]
    # Drop reviews whose whole cleaned text is exactly one of the stop words.
    df = df[~df['only_alphabets'].isin(stopwords)].copy()

    try:
        corpus = st.CorpusFromPandas(df,
                                     category_col='stars',
                                     text_col='only_alphabets',
                                     nlp=nlp).build()
        term_freq_df = corpus.get_term_freq_df()
        # flatten the index so the terms become a regular 'term' column
        term_freq_df = pd.DataFrame(term_freq_df.to_records())
        term_freq_df = term_freq_df.rename(columns={
            '5 freq': '5.0',
            '4 freq': '4.0',
            '2 freq': '2.0',
            '1 freq': '1.0'
        })
        categories = df['stars'].unique()
        freq_word_list = np.array([])
        if '5' in categories:
            freq_word_list = corpus.get_scaled_f_scores('5')
        elif '4' in categories:
            freq_word_list = corpus.get_scaled_f_scores('4')
        # NOTE(review): when a low category exists its scores replace the
        # high-category scores computed above - confirm this is intended.
        if '1' in categories:
            freq_word_list = corpus.get_scaled_f_scores('1')
        elif '2' in categories:
            freq_word_list = corpus.get_scaled_f_scores('2')
        df_wordFreq = pd.DataFrame()
        columns = ['term', 'score']
        if freq_word_list.shape[0] > 0:
            df_wordFreq = pd.DataFrame(
                [term_freq_df.term.tolist(), freq_word_list]).T
            df_wordFreq = df_wordFreq.sort_values(1, ascending=True)
            df_wordFreq.columns = columns
    except Exception:
        # Best-effort fallback when the corpus cannot be built: use the raw
        # words of each review as terms and the star label as score.
        df['word_list'] = df['only_alphabets'].apply(
            lambda x: x[1:-1].split(' '))
        df['word_list'] = df['word_list'].astype(str)
        df['word_list'] = df['word_list'].apply(
            lambda x: ''.join([str(i) for i in x]))
        df['word_list'] = df['word_list'].str.lower()
        df_wordFreq = pd.DataFrame(df['word_list'].str.split(',').tolist(),
                                   index=df['stars']).stack()
        df_wordFreq = df_wordFreq.reset_index()[[0, 'stars']]
        df_wordFreq.columns = ['term', 'score']
        df_wordFreq = df_wordFreq.reset_index(drop=False)
        # NOTE(review): the source's indentation was lost; this clean-up is
        # assumed to belong to the fallback, whose terms carry the
        # brackets/quotes of a stringified list - confirm.
        replace_dict_phrase_count = {
            '[': '',
            ']': '',
            '-': '',
            '!': '',
            '.': '',
            "'": '',
            ' ': ''
        }
        for key in replace_dict_phrase_count.keys():
            df_wordFreq['term'] = df_wordFreq['term'].str.replace(
                key, replace_dict_phrase_count[key], regex=False)
        df_wordFreq['term'] = df_wordFreq['term'].str.lower()

    x, y = df_wordFreq.shape
    if x > 100:
        df_wordFreq = pd.concat([df_wordFreq.head(50), df_wordFreq.tail(50)])
        x, y = df_wordFreq.shape  # updated size

    terms = df_wordFreq['term'] if x else pd.Series([], dtype=object)
    half = math.ceil(x / 2)
    first_half = range(half)
    second_half = list(reversed(range(half, x)))

    # Highest-star review per first-half term, lowest-star review per
    # second-half term.
    top_terms_list = _reviews_for_terms(df, terms, first_half,
                                        ascending=False)
    worst_terms_list = _reviews_for_terms(df, terms, second_half,
                                          ascending=True)
    del df  # release the review frame before building the phrase frames

    replace_dict_phrase = {
        ',': ' ',
        '\u00a0': '',
        '\n': '',
        '!': '',
        '.': '',
        "'": ''
    }

    negative_list = _phrases_around_terms(terms, second_half,
                                          worst_terms_list, ' ')
    # negative phrases are normalised into 0 to 0.5
    negative_df = _score_phrases(negative_list, replace_dict_phrase)

    positive_list = _phrases_around_terms(terms, first_half, top_terms_list,
                                          ',')
    # positive phrases are normalised into 0.5 to 1
    positive_df = _score_phrases(positive_list, replace_dict_phrase,
                                 offset=0.5)

    return positive_df.head(topk), negative_df.tail(topk)
# Pair-plot demo: builds a unigram corpus from the RottenTomatoes sample data,
# using one category per movie name, and asks scattertext for a pair plot.
from sklearn.decomposition import (
    PCA,
    FastICA,
    SparsePCA,
)

import scattertext as st
from scattertext import (
    CategoryProjector,
    RankDifference,
    ScaledFScorePresetsNeg1To1,
)
from scattertext.cartegoryprojector.OptimalProjection import (
    get_optimal_category_projection,
)
from scattertext.termcompaction.AssociationCompactor import (
    AssociationCompactor,
    ScorePercentileCompactor,
)
from scattertext.termscoring import ScaledFScore

# Raw label -> display label; a label missing from this table raises KeyError.
_DISPLAY_LABELS = {
    'rotten': 'Negative',
    'fresh': 'Positive',
    'plot': 'Plot',
}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(
    lambda raw_label: _DISPLAY_LABELS[raw_label]
)

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build()
corpus = corpus.get_unigram_corpus()

# Disabled alternative: search for an explicit category projection.
# category_projection = get_optimal_category_projection(
#     corpus,
#     n_dims=2,
#     n_steps=20,
#     projector=lambda n_terms, n_dims: CategoryProjector(AssociationCompactor(n_terms, scorer=RankDifference),
#                                                          projector=PCA(n_dims)))

# Per-document metadata strings of the form "<category>: <movie_name>".
pairplot_metadata = movie_df['category'] + ': ' + movie_df['movie_name']
html = st.produce_pairplot(
    corpus,
    # category_projection=category_projection,
    metadata=pairplot_metadata,
)
# Pair-plot demo over the ConventionData2012 sample data, one category per
# speaker; the resulting HTML is written to ``convention_pair_plot.html``.
import scattertext as st
import scattertext.categoryprojector.pairplot

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus()

# Per-document metadata strings of the form "<party>: <speaker>".
html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus,
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot.html'
# Context manager guarantees the handle is flushed and closed even if the
# write fails (the bare open(...).write(...) form never closed it).
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Fightin'-words explorer demo on the RottenTomatoes sample data, comparing
# the 'fresh' category against 'rotten'; output goes to ``rotten_fresh2.html``.
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build()

# Priors handed to the log-odds-ratio term scorer below.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          .use_general_term_frequencies()
          .use_all_categories()
          .get_priors())

# Build the HTML before touching the output file so that a failure here does
# not leave behind a truncated/empty file.
explorer_html = st.produce_fightin_words_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
)

# Context manager closes the handle deterministically (the original
# open(...).write(...) expression never closed it).
with open(fn, 'wb') as out_file:
    out_file.write(explorer_html.encode('utf-8'))
print(fn)
def ValuePredictor(yelp_url, from_isbn=False):
    '''Scrape Yelp reviews and return the top 10 "poor rating" index entries.

    Ten pages of the business' review feed (``start`` offsets 0, 20, ..., 180)
    are fetched concurrently and combined into one frame whose columns are
    0 = date, 1 = review text, 2 = rating title.  Up to 100 reviews are
    sampled, a scattertext corpus is built with the rating as category, and
    the term-frequency frame is sorted by the scaled F-score for the
    '1.0 star rating' category, highest first.

    NOTE(review): ``yelp_url`` and ``from_isbn`` are accepted but never read;
    the business id is hard-coded below.  Confirm with callers before wiring
    the URL through.

    :param yelp_url: currently unused.
    :param from_isbn: currently unused.
    :returns: JSON array string holding the first 10 index entries of the
        sorted term-frequency frame (presumably the terms themselves).  The
        score columns are not part of the output.
    '''
    base_url = "https://www.yelp.com/biz/"  # business id is appended to this
    api_url = "/review_feed?sort_by=date_desc&start="
    bid = "flower-child-addison-2"  # business id

    def fetch_page(n, bid=bid):
        # Fetch feed page ``n`` and return its reviews as a 3-column frame.
        with Session() as s:
            # HTTP GET on the review feed; the body is JSON.
            with s.get(base_url + bid + api_url + str(n * 20)) as resp:
                r = loads(resp.content)  # JSON response -> dictionary
        _html = html.fromstring(r['review_list'])  # parse the HTML fragment
        dates = _html.xpath(
            "//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()"
        )
        reviews = [
            el.text for el in _html.xpath("//div[@class='review-content']/p")
        ]
        ratings = _html.xpath(
            "//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title"
        )
        return pd.DataFrame([dates, reviews, ratings]).T

    # Fetch the pages in parallel.  Each worker returns its own frame and the
    # frames are combined once here; the previous version had every thread
    # rebinding a shared attribute via concat, which could drop pages.
    with Executor(max_workers=40) as e:
        pages = list(e.map(fetch_page, range(10)))
    df = pd.concat(pages)

    # Cap the sample size: sampling 100 rows without replacement raises
    # ValueError when fewer than 100 reviews were scraped.
    df = df.sample(min(100, len(df)))

    # Extend the shared stop-word set of the module-level ``nlp`` pipeline.
    nlp.Defaults.stop_words |= {
        'he', 'check-in', '=', '= =', 'male', 'u', 'want', 'u want', 'cuz',
        'him', "i've", 'deaf', 'on', 'her', 'told', 'told him', 'ins',
        '1 check', 'I', 'i"m', 'i', ' ', 'it', "it's", 'it.', 'they',
        'coffee', 'place', 'they', 'the', 'this', 'its', 'l', '-', 'they',
        'this', 'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m', '!', '1', '2',
        '3', '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(df, category_col=2, text_col=1,
                                 nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')

    # Only the index entries are returned, so the score columns are not
    # rounded here: rounding after the sort never affected the output.
    ranked = term_freq_df.sort_values(by='poorratingscore', ascending=False)
    return json.dumps(list(ranked.index[:10]))