def __init__(self,
             selector=AssociationCompactor(1000, RankDifference),
             normalizer=LengthNormalizeScaleStandardize(),
             projector=PCA(2)):
    '''
    Store the components used to build a category projection.

    NOTE: the default objects are created once, when this function is
    defined, so every instance built without arguments shares them.

    :param selector: instance of a compactor class; if None, no compaction
        will be done.
    :param normalizer: instance of an sklearn-style class with fit_transform,
        used to normalize the term X category corpus.
    :param projector: instance of an sklearn-style class with fit_transform.
    '''
    self.selector, self.projector_, self.normalizer_ = (
        selector, projector, normalizer)
def get_optimal_category_projection(
        corpus,
        n_dims=3,
        n_steps=10,
        projector=lambda n_terms, n_dims: CategoryProjector(
            AssociationCompactor(n_terms, scorer=RankDifference),
            projector=PCA(n_dims)),
        verbose=False):
    '''
    Search for the category projection whose 2-D layout deviates least from
    a Poisson point process according to Ripley's K function.

    For each candidate number of terms ``k`` and each pair of projected
    dimensions, the two selected columns are rescaled with
    ``stretch_0_to_1`` and scored as the summed absolute difference between
    the Ripley estimate and its Poisson expectation. The lowest score wins.

    :param corpus: corpus providing get_num_categories() and get_num_terms();
        it is handed to the projector's project() method.
    :param n_dims: number of dimensions requested from the projector; every
        pair (dim_1 < dim_2) is evaluated, so at least 2 are needed.
    :param n_steps: number of candidate term counts, spaced evenly on a log2
        scale between the number of categories and the number of terms and
        truncated to integers.
    :param projector: callable ``(n_terms, n_dims)`` returning an object with
        ``project(corpus)`` and ``project(corpus, x_dim, y_dim)``.
    :param verbose: if true, print the score of every candidate and the
        final choice.
    :return: ``best_projector.project(corpus, best_x, best_y)`` for the
        best-scoring candidate.
    :raises ImportError: if astropy is not installed.
    :raises ValueError: if no candidate could be evaluated.
    '''
    try:
        from astropy.stats import RipleysKEstimator
    except ImportError:
        # Only a missing dependency is translated; anything else propagates.
        raise ImportError("Please install astropy")

    ripley = RipleysKEstimator(area=1., x_max=1., y_max=1., x_min=0., y_min=0.)
    # Radii from 0 up to the diagonal of the unit square. Both the radii and
    # the Poisson expectation are loop-invariant, so compute them once.
    r = np.linspace(0, np.sqrt(2), 100)
    poisson_expectation = ripley.poisson(r)

    candidate_term_counts = np.power(
        2,
        np.linspace(
            np.log(corpus.get_num_categories()) / np.log(2),
            np.log(corpus.get_num_terms()) / np.log(2),
            n_steps)).astype(int)

    min_dev = None
    best_k = None
    best_x = None
    best_y = None
    best_projector = None
    for k in candidate_term_counts:
        category_projector = projector(k, n_dims)
        category_projection = category_projector.project(corpus)
        for dim_1 in range(0, n_dims):
            for dim_2 in range(dim_1 + 1, n_dims):
                proj = category_projection.projection[:, [dim_1, dim_2]]
                scaled_proj = np.array(
                    [stretch_0_to_1(proj.T[0]),
                     stretch_0_to_1(proj.T[1])]).T
                dev = np.sum(
                    np.abs(ripley(scaled_proj, r, mode='ripley')
                           - poisson_expectation))
                if min_dev is None or dev < min_dev:
                    min_dev = dev
                    best_k = k
                    best_projector = category_projector
                    best_x, best_y = (dim_1, dim_2)
                if verbose:
                    print(k, dim_1, dim_2, dev, best_k, best_x, best_y, min_dev)

    if best_projector is None:
        # Happens when there is no dimension pair or no candidate term count.
        raise ValueError(
            'No candidate projection was evaluated; n_dims must be at least 2 '
            'and n_steps at least 1.')
    if verbose:
        print(best_k, best_x, best_y)
    return best_projector.project(corpus, best_x, best_y)
def get_optimal_category_projection(
        corpus,
        n_dims=3,
        n_steps=10,
        projector=lambda n_terms, n_dims: CategoryProjector(
            selector=AssociationCompactor(n_terms, scorer=RankDifference),
            projector=PCA(n_dims)),
        optimizer=ripley_poisson_difference,
        verbose=False):
    '''
    Search for the category projection that minimizes ``optimizer``.

    For each candidate number of terms ``k`` and each pair of projected
    dimensions, the two selected columns are rescaled with
    ``stretch_0_to_1`` and scored by ``optimizer``. The lowest score wins;
    ties keep the candidate seen first.

    :param corpus: corpus providing get_num_categories() and get_num_terms();
        it is handed to the projector's project() method.
    :param n_dims: number of dimensions requested from the projector; every
        pair (dim_1 < dim_2) is evaluated, so at least 2 are needed.
    :param n_steps: number of candidate term counts, spaced evenly on a log2
        scale between the number of categories and the number of terms and
        truncated to integers.
    :param projector: callable ``(n_terms, n_dims)`` returning an object with
        ``project(corpus)`` and ``project(corpus, x_dim, y_dim)``.
    :param optimizer: callable taking the scaled two-column projection and
        returning a comparable score (lower is better).
    :param verbose: if true, print the score of every candidate and the
        final choice.
    :return: ``best_projector.project(corpus, best_x, best_y)`` for the
        best-scoring candidate.
    :raises ValueError: if no candidate could be evaluated.
    '''
    candidate_term_counts = np.power(
        2,
        np.linspace(
            np.log(corpus.get_num_categories()) / np.log(2),
            np.log(corpus.get_num_terms()) / np.log(2),
            n_steps)).astype(int)

    min_dev = None
    best_k = None
    best_x = None
    best_y = None
    best_projector = None
    for k in candidate_term_counts:
        category_projector = projector(k, n_dims)
        category_projection = category_projector.project(corpus)
        for dim_1 in range(0, n_dims):
            for dim_2 in range(dim_1 + 1, n_dims):
                proj = category_projection.projection[:, [dim_1, dim_2]]
                scaled_proj = np.array(
                    [stretch_0_to_1(proj.T[0]),
                     stretch_0_to_1(proj.T[1])]).T
                dev = optimizer(scaled_proj)
                if min_dev is None or dev < min_dev:
                    min_dev = dev
                    best_k = k
                    best_projector = category_projector
                    best_x, best_y = (dim_1, dim_2)
                if verbose:
                    print(k, dim_1, dim_2, dev, best_k, best_x, best_y, min_dev)

    if best_projector is None:
        # Happens when there is no dimension pair or no candidate term count.
        raise ValueError(
            'No candidate projection was evaluated; n_dims must be at least 2 '
            'and n_steps at least 1.')
    if verbose:
        print(best_k, best_x, best_y)
    return best_projector.project(corpus, best_x, best_y)
def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    '''
    Build a pair plot: a scatterplot of projected categories ('cat-plot')
    next to a term scatterplot ('d3-div-1') for the initial category.

    :param corpus: corpus to visualize; its categories, terms and term (or
        metadata) frequencies are read here.
    :param asian_mode: forwarded to both ScatterplotStructure objects.
    :param category_width_in_pixels: width of the category plot.
    :param category_height_in_pixels: height of the category plot.
    :param term_width_in_pixels: width of the term plot.
    :param term_height_in_pixels: height of the term plot.
    :param terms_to_show: passed to AssociationCompactor; terms missing from
        the compacted corpus are hidden in the term plot.
    :param scaler: applied to the projection's 'x' and 'y' columns before
        they are injected as category coordinates.
    :param term_ranker: forwarded to both ScatterChartExplorer objects.
    :param use_metadata: if true, project with project_with_metadata and
        score metadata frequencies instead of term frequencies.
    :param category_projector: used only when category_projection is None.
    :param category_projection: precomputed projection; computed if None.
    :param topic_model_term_lists: injected into the term explorer if given.
    :param topic_model_preview_size: forwarded to the term plot structure.
    :param metadata_descriptions: injected into the term explorer if given.
    :param initial_category: category compared against the rest; defaults to
        the first entry of corpus.get_categories().
    :param x_dim: projection dimension used for the x axis.
    :param y_dim: projection dimension used for the y axis.
    :param show_halo: forwarded to PairPlotFromScatterplotStructure.
    :param num_terms_in_halo: forwarded as num_terms.
    :param category_color_func: JavaScript source used as the category
        plot's color function.
    :param protocol: forwarded to PairPlotFromScatterplotStructure.
    :param d3_url_struct: forwarded to PairPlotFromScatterplotStructure.
    :param kwargs: forwarded to the term explorer's to_dict().
    :return: result of PairPlotFromScatterplotStructure(...).to_html().
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot ---
    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )
    category_tooltip_func = '(function(d) {return d.term})'
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func=
        '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot')

    # --- term plot ---
    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)

    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(
            topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(
            metadata_descriptions)

    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    # Score the initial category against the summed counts of all others.
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[[c for c in corpus.get_categories()
             if c != initial_category]].sum(axis=1))

    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs)
    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(
        category_scatterplot_structure,
        term_scatterplot_structure,
        category_projection,
        category_width_in_pixels,
        category_height_in_pixels,
        num_terms=num_terms_in_halo,
        show_halo=show_halo,
        d3_url_struct=d3_url_struct,
        x_dim=x_dim,
        y_dim=y_dim,
        protocol=protocol).to_html()
def produce_pairplot(
        corpus,
        asian_mode=False,
        category_width_in_pixels=500,
        category_height_in_pixels=700,
        term_width_in_pixels=500,
        term_height_in_pixels=700,
        terms_to_show=3000,
        scaler=scale_neg_1_to_1_with_zero_mean,
        term_ranker=AbsoluteFrequencyRanker,
        use_metadata=False,
        category_projector=CategoryProjector(),
        category_projection=None,
        topic_model_term_lists=None,
        topic_model_preview_size=10,
        metadata_descriptions=None,
        initial_category=None,
        x_dim=0,
        y_dim=1,
        show_halo=True,
        num_terms_in_halo=5,
        category_color_func='(function(x) {return "#5555FF"})',
        protocol='https',
        d3_url_struct=D3URLs(),
        category_focused=False,
        verbose=False,
        use_full_doc=True,
        default_to_term_comparison=True,
        category_x_label='',
        category_y_label='',
        category_show_axes_and_cross_hairs=False,
        highlight_selected_category=True,
        term_x_label=None,  # used only when default_to_term_comparison is False
        term_y_label=None,  # used only when default_to_term_comparison is False
        wordfish_style=False,
        **kwargs):
    '''
    Build a pair plot: a scatterplot of projected categories ('cat-plot')
    next to a term scatterplot ('d3-div-1').

    The term plot has two modes. With ``default_to_term_comparison`` it
    compares the initial category against all others using RankDifference
    scores; otherwise terms are placed at the coordinates returned by
    ``category_projection.get_term_projection()``.

    :param corpus: corpus to visualize; its categories, terms and term (or
        metadata) frequencies are read here.
    :param asian_mode: forwarded to both ScatterplotStructure objects.
    :param category_width_in_pixels: width of the category plot.
    :param category_height_in_pixels: height of the category plot.
    :param term_width_in_pixels: width of the term plot.
    :param term_height_in_pixels: height of the term plot.
    :param terms_to_show: passed to AssociationCompactor; terms missing from
        the compacted corpus are hidden in the term plot.
    :param scaler: applied to projected coordinates before injection.
    :param term_ranker: forwarded to the scatter chart explorers.
    :param use_metadata: if true, project with project_with_metadata and work
        on metadata frequencies instead of term frequencies.
    :param category_projector: used only when category_projection is None.
    :param category_projection: precomputed projection; computed if None.
    :param topic_model_term_lists: injected into the term explorer if given.
    :param topic_model_preview_size: forwarded to the term plot structure.
    :param metadata_descriptions: injected into the term explorer if given.
    :param initial_category: category selected initially; defaults to the
        first entry of corpus.get_categories().
    :param x_dim: projection dimension used for the x axis.
    :param y_dim: projection dimension used for the y axis.
    :param show_halo: forwarded to PairPlotFromScatterplotStructure.
    :param num_terms_in_halo: forwarded as num_terms.
    :param category_color_func: JavaScript source used as the category
        plot's color function.
    :param protocol: forwarded to PairPlotFromScatterplotStructure.
    :param d3_url_struct: forwarded to PairPlotFromScatterplotStructure.
    :param category_focused: forwarded to _get_term_plot_change_js_func.
    :param verbose: print term counts and forward to the explorers.
    :param use_full_doc: the term plot uses full documents when this or
        use_metadata is true.
    :param default_to_term_comparison: selects the term plot mode (see above).
    :param category_x_label: x-axis label of the category plot.
    :param category_y_label: y-axis label of the category plot.
    :param category_show_axes_and_cross_hairs: forwarded to the category
        plot structure as show_axes_and_cross_hairs.
    :param highlight_selected_category: forwarded to both plot structures.
    :param term_x_label: term plot x label in projection mode ('' if None).
    :param term_y_label: term plot y label in projection mode ('' if None).
    :param wordfish_style: forwarded to _get_term_plot_change_js_func.
    :param kwargs: forwarded to the term explorer's to_dict() in
        term-comparison mode only.
    :return: result of PairPlotFromScatterplotStructure(...).to_html().
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot ---
    category_scatter_chart_explorer = _get_category_scatter_chart_explorer(
        category_projection, scaler, term_ranker, verbose)
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )
    category_tooltip_func = '(function(d) {return d.term})'
    initial_category_idx = corpus.get_categories().index(initial_category)
    term_plot_change_func = _get_term_plot_change_js_func(
        wordfish_style, category_focused, initial_category_idx)
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_characteristic=False,
        x_label=category_x_label,
        y_label=category_y_label,
        show_axes_and_cross_hairs=category_show_axes_and_cross_hairs,
        full_data='getCategoryDataAndInfo()',
        show_top_terms=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        div_name='cat-plot',
        alternative_term_func=term_plot_change_func,
        highlight_selected_category=highlight_selected_category)

    # --- term plot ---
    compacted_corpus = AssociationCompactor(
        terms_to_show,
        use_non_text_features=use_metadata).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    if verbose:
        print('num terms to hide', len(terms_to_hide))
        print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.get_corpus(),
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
        verbose=verbose).hide_terms(terms_to_hide)

    if default_to_term_comparison:
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)

        if use_metadata:
            tdf = corpus.get_metadata_freq_df('')
        else:
            tdf = corpus.get_term_freq_df('')
        # Score the initial category against the summed counts of all others.
        scores = RankDifference().get_scores(
            tdf[initial_category],
            tdf[[c for c in corpus.get_categories()
                 if c != initial_category]].sum(axis=1))

        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            scores=scores,
            include_term_category_counts=True,
            transform=dense_rank,
            **kwargs)
        # Plain strings: a trailing comma here would turn each label into a
        # one-element tuple.
        y_label = initial_category
        x_label = 'Not ' + initial_category
        color_func = None
        show_top_terms = True
        show_axes = False
    else:
        term_projection = category_projection.get_term_projection()
        original_x = term_projection['x']
        original_y = term_projection['y']
        x_coords = scaler(term_projection['x'])
        y_coords = scaler(term_projection['y'])
        x_label = term_x_label if term_x_label is not None else ''
        y_label = term_y_label if term_y_label is not None else ''
        show_axes = True
        term_scatter_chart_explorer.inject_coordinates(
            x_coords,
            y_coords,
            original_x=original_x,
            original_y=original_y)
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)
        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            category_name=initial_category,
            include_term_category_counts=True,
        )
        color_func = '(function(x) {return "#5555FF"})'
        show_top_terms = False

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        use_full_doc=use_metadata or use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_characteristic=False,
        x_label=x_label,
        y_label=y_label,
        full_data='getTermDataAndInfo()',
        show_top_terms=show_top_terms,
        get_tooltip_content=None,
        color_func=color_func,
        show_axes=show_axes,
        topic_model_preview_size=topic_model_preview_size,
        show_category_headings=False,
        div_name='d3-div-1',
        unified_context=True,
        highlight_selected_category=highlight_selected_category)

    return PairPlotFromScatterplotStructure(
        category_scatterplot_structure,
        term_scatterplot_structure,
        category_projection,
        category_width_in_pixels,
        category_height_in_pixels,
        num_terms=num_terms_in_halo,
        show_halo=show_halo,
        d3_url_struct=d3_url_struct,
        x_dim=x_dim,
        y_dim=y_dim,
        protocol=protocol).to_html()
def test_compact(self):
    '''Check the term counts of the Hamlet matrix and of its 213-term compaction.'''
    hamlet_tdm = get_hamlet_term_doc_matrix()
    compactor = AssociationCompactor(max_terms=213)
    compacted_tdm = compactor.compact(hamlet_tdm)

    # The source matrix is inspected after compaction has run.
    source_terms = hamlet_tdm.get_terms()
    self.assertEqual(len(source_terms), 26875)

    compacted_terms = compacted_tdm.get_terms()
    self.assertEqual(len(compacted_terms), 213)
# Demo: write a scattertext explorer of the 2012 convention speeches,
# scored with RankDifference and positioned by dense rank.
from scattertext.Scalers import dense_rank
from scattertext.termscoring.RankDifference import RankDifference
from scattertext.termcompaction.AssociationCompactor import AssociationCompactor
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()

# Build a unigram corpus split by party and compact it with
# AssociationCompactor(4000).
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences
).build().get_unigram_corpus().compact(AssociationCompactor(4000))

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    term_scorer=RankDifference(),
    transform=dense_rank)

# Use a context manager so the file is flushed and closed deterministically.
with open('./demo_dense_rank.html', 'wb') as output_file:
    output_file.write(html.encode('utf-8'))
print('Open ./demo_dense_rank.html in Chrome or Firefox.')
def test_get_term_ranks(self):
    '''Term ranks cover every term of the matrix and are never negative.'''
    hamlet_tdm = get_hamlet_term_doc_matrix()
    compactor = AssociationCompactor(max_terms=213)
    term_ranks = compactor.get_term_ranks(hamlet_tdm)

    self.assertEqual(len(term_ranks), hamlet_tdm.get_num_terms())

    # min() applied twice: reduce once, then reduce the result again.
    lowest_rank = term_ranks.min().min()
    self.assertGreaterEqual(lowest_rank, 0)
def test_compact(self):
    '''Check the term counts of the Hamlet matrix and of its 213-term compaction.'''
    full_matrix = get_hamlet_term_doc_matrix()
    reduced_matrix = AssociationCompactor(max_terms=213).compact(full_matrix)

    # The full matrix is inspected after compaction has run.
    full_term_count = len(full_matrix.get_terms())
    self.assertEqual(full_term_count, 26875)

    reduced_term_count = len(reduced_matrix.get_terms())
    self.assertEqual(reduced_term_count, 213)
def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    '''
    Build a pair plot: a scatterplot of projected categories ('cat-plot')
    next to a term scatterplot ('d3-div-1') for the initial category.

    :param corpus: corpus to visualize; its categories, terms and term (or
        metadata) frequencies are read here.
    :param asian_mode: forwarded to both ScatterplotStructure objects.
    :param category_width_in_pixels: width of the category plot.
    :param category_height_in_pixels: height of the category plot.
    :param term_width_in_pixels: width of the term plot.
    :param term_height_in_pixels: height of the term plot.
    :param terms_to_show: passed to AssociationCompactor; terms missing from
        the compacted corpus are hidden in the term plot.
    :param scaler: applied to the projection's 'x' and 'y' columns before
        they are injected as category coordinates.
    :param term_ranker: forwarded to both ScatterChartExplorer objects.
    :param use_metadata: if true, project with project_with_metadata and
        score metadata frequencies instead of term frequencies.
    :param category_projector: used only when category_projection is None.
    :param category_projection: precomputed projection; computed if None.
    :param topic_model_term_lists: injected into the term explorer if given.
    :param topic_model_preview_size: forwarded to the term plot structure.
    :param metadata_descriptions: injected into the term explorer if given.
    :param initial_category: category compared against the rest; defaults to
        the first entry of corpus.get_categories().
    :param x_dim: projection dimension used for the x axis.
    :param y_dim: projection dimension used for the y axis.
    :param show_halo: forwarded to PairPlotFromScatterplotStructure.
    :param num_terms_in_halo: forwarded as num_terms.
    :param category_color_func: JavaScript source used as the category
        plot's color function.
    :param protocol: forwarded to PairPlotFromScatterplotStructure.
    :param d3_url_struct: forwarded to PairPlotFromScatterplotStructure.
    :param kwargs: forwarded to the term explorer's to_dict().
    :return: result of PairPlotFromScatterplotStructure(...).to_html().
    '''
    if category_projection is None:
        project = (category_projector.project_with_metadata
                   if use_metadata else category_projector.project)
        category_projection = project(corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot ---
    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )
    category_tooltip_func = '(function(d) {return d.term})'
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func='(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot'
    )

    # --- term plot ---
    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)

    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(metadata_descriptions)

    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    # Score the initial category against the summed counts of all others.
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[[c for c in corpus.get_categories() if c != initial_category]].sum(axis=1)
    )

    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs
    )
    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(
        category_scatterplot_structure,
        term_scatterplot_structure,
        category_projection,
        category_width_in_pixels,
        category_height_in_pixels,
        num_terms=num_terms_in_halo,
        show_halo=show_halo,
        d3_url_struct=d3_url_struct,
        x_dim=x_dim,
        y_dim=y_dim,
        protocol=protocol
    ).to_html()