parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)) corpus = st.CorpusWithoutCategoriesFromParsedDocuments( df, parsed_col='parse').build().get_unigram_corpus().remove_infrequent_words( minimum_term_count=6) dispersion = st.Dispersion(corpus) dispersion_df = dispersion.get_df().assign( X=lambda df: df.Frequency, Xpos=lambda df: st.Scalers.log_scale(df.X), Y=lambda df: dispersion.rosengrens(), Ypos=lambda df: st.Scalers.scale(df.Y), ) html = st.dataframe_scattertext( corpus, plot_df=dispersion_df, metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')', ignore_categories=True, x_label='Log Frequency', y_label="Rosengren's S", y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'], ) fn = 'demo_dispersion_basic.html' open(fn, 'w').write(html) print('open ./%s in Chrome' % fn)
ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual)) line_df = pd.DataFrame({ 'x': dispersion_df.Xpos.values, 'y': dispersion_df.Expected.values, }).sort_values(by='x') html = st.dataframe_scattertext( corpus, plot_df=dispersion_df, metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')', ignore_categories=True, x_label='Log Frequency', y_label='DA', y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'], color_score_column='ColorScore', tooltip_columns=['Frequency', 'DA'], header_names={ 'upper': 'Lower than Expected', 'lower': 'More than Expected' }, left_list_column='Residual', background_color='#e5e5e3', line_coordinates=line_df.to_dict('records')) fn = 'demo_dispersion.html' open(fn, 'w').write(html) print('open ./%s in Chrome' % fn) residual_dispersion_df = dispersion_df.assign( Expected=lambda df: Lowess().fit_predict(df.X.values, df.Y.values),
Xpos=lambda df: st.Scalers.log_scale(df.X), Y=lambda df: dispersion.rosengrens(), Ypos=lambda df: st.Scalers.scale(df.Y), ) dispersion_df = dispersion_df.assign( Expected=lambda df: KNeighborsRegressor(n_neighbors=10).fit( df.X.values.reshape(-1, 1), df.Y ).predict(df.X.values.reshape(-1, 1)), Residual=lambda df: df.Y - df.Expected, ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual) ) html = st.dataframe_scattertext( corpus, plot_df=dispersion_df, metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')', ignore_categories=True, x_label='Log Frequency', y_label="Rosengren's S", y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'], color_score_column='ColorScore', header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'}, left_list_column='Residual', background_color='#e5e5e3' ) fn = 'demo_dispersion.html' open(fn, 'w').write(html) print('open ./%s in Chrome' % fn)
Ypos=lambda df: st.Scalers.dense_rank(df.Y), Xpos=lambda df: st.Scalers.dense_rank(df.X), SuppressDisplay=False, ColorScore=lambda df: st.Scalers.scale_center_zero(df.Ypos - df.Xpos), ) html = st.dataframe_scattertext( corpus, plot_df=plot_df, category='democrat', category_name='Democratic', not_category_name='Republican', width_in_pixels=1000, suppress_text_column='Display', metadata=corpus.get_df()['speaker'], use_non_text_features=True, ignore_categories=False, use_offsets=True, unified_context=False, color_score_column='ColorScore', left_list_column='ColorScore', y_label='Democarats', x_label='Republicans', header_names={'upper': 'Top Democratic', 'lower': 'Top Republican', 'right': 'Most Frequent'}, subword_encoding='RoBERTa' ) fn = 'roberta_sentence_piece.html' with open(fn, 'w') as of: of.write(html) print("Open ./" + fn + ' in Chrome.')
ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.AdjustedDA)) line_df = pd.DataFrame({ 'x': dispersion_df.Xpos.values, 'y': dispersion_df.Expected.values, }).sort_values(by='x') html = st.dataframe_scattertext( corpus, plot_df=dispersion_df, metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')', ignore_categories=True, x_label='Log Frequency', y_label='DA', y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'], color_score_column='ColorScore', tooltip_columns=['Frequency', 'DA'], header_names={ 'upper': 'Lower than Expected', 'lower': 'More than Expected' }, left_list_column='AdjustedDA', background_color='#e5e5e3', line_coordinates=line_df.to_dict('records')) fn = 'demo_dispersion.html' open(fn, 'w').write(html) print('open ./%s in Chrome' % fn) residual_dispersion_df = dispersion_df.assign( Expected=lambda df: Lowess().fit_predict(df.X.values, df.Y.values),