def __init__(self, data: pd.DataFrame):
    """Precompute the Facets Overview statistics proto for ``data``.

    Args:
        data: The dataframe whose feature statistics are generated.
    """
    # The generator derives every input the Facets Overview plots need:
    #   - numeric variables: histogram bins, mean, min, median, max, etc.
    #   - categorical variables: number of unique values, counts per
    #     category for the bar chart, top category, etc.
    dataset_spec = {
        'name': 'data',
        'table': data,
    }
    generator = GenericFeatureStatisticsGenerator()
    self._proto = generator.ProtoFromDataFrames([dataset_spec])
def _display_overview(self, data, update=None):
    """Render the Facets Overview widget for ``data``.

    Args:
        data: Dataframe handed to the statistics generator.
        update: When truthy, it is used as the ``display_id`` of an already
            rendered widget and only a refresh script is emitted. Otherwise
            a new widget is rendered with ``self._overview_display_id``.
    """
    generator = GenericFeatureStatisticsGenerator()
    stats_proto = generator.ProtoFromDataFrames(
        [{'name': 'data', 'table': data}])
    serialized = stats_proto.SerializeToString()
    protostr = base64.b64encode(serialized).decode('utf-8')

    if not update:
        # First render: emit the full HTML widget.
        markup = _OVERVIEW_HTML_TEMPLATE.format(
            display_id=self._overview_display_id, protostr=protostr)
        display(HTML(markup))
        return

    # Refresh: push the new proto into the existing widget via JavaScript.
    refresh_script = _OVERVIEW_SCRIPT_TEMPLATE.format(
        display_id=update, protostr=protostr)
    display_javascript(Javascript(refresh_script))
def _display_overview(self, data, update=None):
    """Render the Facets Overview widget for ``data``.

    When window info is included and all window columns are present in a
    non-empty dataframe, those columns are dropped before statistics are
    generated.

    Args:
        data: Dataframe handed to the statistics generator.
        update: When truthy, its ``_overview_display_id`` identifies the
            already rendered widget to refresh via JavaScript. Otherwise a
            new widget is rendered with ``self._overview_display_id``.
    """
    window_columns = ('event_time', 'windows', 'pane_info')
    has_window_columns = (
        not data.empty
        and self._include_window_info
        and all(name in data.columns for name in window_columns))
    if has_window_columns:
        data = data.drop(list(window_columns), axis=1)

    generator = GenericFeatureStatisticsGenerator()
    stats_proto = generator.ProtoFromDataFrames(
        [{'name': 'data', 'table': data}])
    serialized = stats_proto.SerializeToString()
    protostr = base64.b64encode(serialized).decode('utf-8')

    if not update:
        # First render: emit the full HTML widget.
        markup = _OVERVIEW_HTML_TEMPLATE.format(
            display_id=self._overview_display_id, protostr=protostr)
        display(HTML(markup))
        return

    # Refresh: push the new proto into the widget owned by ``update``.
    refresh_script = _OVERVIEW_SCRIPT_TEMPLATE.format(
        display_id=update._overview_display_id, protostr=protostr)
    display_javascript(Javascript(refresh_script))
def overview(tables: typing.Union[pandas.DataFrame,
                                  typing.Mapping[str, pandas.DataFrame]]) -> HTML:
    """Build a Facets Overview HTML object for one or more dataframes.

    Args:
        tables: Either a single dataframe (shown under the name
            ``"default"``) or a mapping from dataset name to dataframe.

    Returns:
        An ``HTML`` object produced from ``FACETS_OVERVIEW_TEMPLATE`` with a
        freshly generated element id and the base64-encoded statistics proto.
    """
    # Element ID MUST be unique
    elem_id = _generate_element_id()

    if isinstance(tables, pandas.DataFrame):
        tables = {"default": tables}

    table_list = []
    for name, table in tables.items():
        # Convert PandasExtensionDType column to object column because facets
        # currently doesn't support it. Work on a copy so the caller's
        # dataframe is left untouched.
        view = table.copy()
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that exists in both old and new pandas.
        for column, dtype in view.dtypes.items():
            if not isinstance(dtype, numpy.dtype):
                view[column] = view[column].astype(object)
        table_list.append({'name': name, 'table': view})

    proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(table_list)
    proto_str = base64.b64encode(proto.SerializeToString()).decode("utf-8")

    return HTML(FACETS_OVERVIEW_TEMPLATE.format(elem_id=elem_id,
                                                proto_str=proto_str))
def generate_facets(config, df):
    """Return an HTML snippet embedding Facets Overview statistics for ``df``.

    Args:
        config: Accepted but not used by this function.
        df: Dataframe handed to the statistics generator under the dataset
            name ``'facets-iss'``.

    Returns:
        The HTML template with the base64-encoded statistics proto
        substituted for ``{protostr}``.
    """
    generator = GenericFeatureStatisticsGenerator()
    stats_proto = generator.ProtoFromDataFrames(
        [{'name': 'facets-iss', 'table': df}])
    encoded = base64.b64encode(stats_proto.SerializeToString()).decode("utf-8")

    # Adjacent literals are concatenated into the exact template string.
    page_template = (
        ' <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>'
        ' <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >'
        ' <facets-overview id="elem"></facets-overview>'
        ' <script>'
        ' document.querySelector("#elem").protoInput = "{protostr}";'
        ' </script>'
    )
    return page_template.format(protostr=encoded)
def generate_html(self, datasets: List[Dict[Text, pd.DataFrame]]) -> str:
    """Produce the facets HTML page for the given datasets.

    Args:
        datasets: List of dicts of dataframes to be visualized as stats.

    Returns:
        The contents of ``stats.html`` (located next to this module) with
        every occurrence of ``protostr`` replaced by the base64-encoded
        statistics proto.
    """
    generator = GenericFeatureStatisticsGenerator()
    stats_proto = generator.ProtoFromDataFrames(datasets)
    encoded = base64.b64encode(stats_proto.SerializeToString()).decode("utf-8")

    module_dir = os.path.abspath(os.path.dirname(__file__))
    template_path = os.path.join(module_dir, "stats.html")
    page = fileio.read_file_contents_as_string(template_path)

    return page.replace("protostr", encoded)
def tables1():
    """Store the uploaded CSV and render its Facets Overview statistics.

    Reads the upload from ``request.files["file"]``, saves it in the target
    directory, loads it with pandas and passes the base64-encoded statistics
    proto to the ``examples/tables.html`` template as ``data``.

    Returns:
        The rendered ``examples/tables.html`` template.

    Raises:
        ValueError: If the upload carries no usable file name.
    """
    # NOTE(review): the second component is absolute, so os.path.join
    # discards APP_ROOT and this is always "/home/aayushi/ml-simu". Kept
    # as-is to preserve behaviour; confirm whether a relative directory
    # under APP_ROOT was intended.
    target = os.path.join(APP_ROOT, "/home/aayushi/ml-simu")
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(target, exist_ok=True)

    upload = request.files["file"]

    # The file name is supplied by the client: keep only its final path
    # component so names such as "../../x" or "/etc/x" cannot escape `target`.
    raw_name = (upload.filename or "").replace("\\", "/")
    filename = os.path.basename(raw_name)
    if filename in ("", ".", ".."):
        raise ValueError("uploaded file has no usable file name")

    destination = os.path.join(target, filename)
    print(destination)
    upload.save(destination)

    data = pd.read_csv(destination)

    from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
    import base64

    gfsg = GenericFeatureStatisticsGenerator()
    proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': data}])
    protostr1 = base64.b64encode(proto.SerializeToString()).decode("utf-8")

    return render_template("examples/tables.html", data=protostr1)
def setUp(self):
    """Create a fresh statistics generator for each test."""
    generator = GenericFeatureStatisticsGenerator()
    self.gfsg = generator
class GenericFeatureStatisticsGeneratorTest(googletest.TestCase):
    """Tests for GenericFeatureStatisticsGenerator."""

    def setUp(self):
        """Create a fresh generator for each test."""
        self.gfsg = GenericFeatureStatisticsGenerator()

    def testProtoFromDataFrames(self):
        """A dataframe with an int and a string column yields both features."""
        data = [[1, 'hi'], [2, 'hello'], [3, 'hi']]
        df = pd.DataFrame(data, columns=['testFeatureInt', 'testFeatureString'])
        dataframes = [{'table': df, 'name': 'testDataset'}]
        p = self.gfsg.ProtoFromDataFrames(dataframes)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(2, len(test_data.features))

        # Feature order in the proto is not relied upon.
        if test_data.features[0].name == 'testFeatureInt':
            numfeat = test_data.features[0]
            stringfeat = test_data.features[1]
        else:
            numfeat = test_data.features[1]
            stringfeat = test_data.features[0]

        self.assertEqual('testFeatureInt', numfeat.name)
        self.assertEqual(self.gfsg.fs_proto.INT, numfeat.type)
        self.assertEqual(1, numfeat.num_stats.min)
        self.assertEqual(3, numfeat.num_stats.max)
        self.assertEqual('testFeatureString', stringfeat.name)
        self.assertEqual(self.gfsg.fs_proto.STRING, stringfeat.type)
        self.assertEqual(2, stringfeat.string_stats.unique)

    def testNdarrayToEntry(self):
        """Missing values are counted for float and string arrays."""
        arr = np.array([1.0, 2.0, None, float('nan'), 3.0], dtype=float)

        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual(2, entry['missing'])

        arr = np.array(['a', 'b', float('nan'), 'c'], dtype=str)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual(1, entry['missing'])

    def testNdarrayToEntryTimeTypes(self):
        """datetime64 and timedelta64 values are converted to integers."""
        arr = np.array(
            [np.datetime64('2005-02-25'),
             np.datetime64('2006-02-25')],
            dtype=np.datetime64)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual([1109289600000000000, 1140825600000000000],
                         entry['vals'])

        arr = np.array(
            [np.datetime64('2009-01-01') - np.datetime64('2008-01-01')],
            dtype=np.timedelta64)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual([31622400000000000], entry['vals'])

    def testDTypeToType(self):
        """NumPy dtypes map onto the expected feature statistics types."""
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.int32)))
        # Boolean and time types treated as int
        # np.bool_ / np.str_ are used because the bare np.bool / np.str
        # aliases were removed in NumPy 1.24.
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.bool_)))
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.datetime64)))
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.timedelta64)))
        self.assertEqual(self.gfsg.fs_proto.FLOAT,
                         self.gfsg.DtypeToType(np.dtype(np.float32)))
        self.assertEqual(self.gfsg.fs_proto.STRING,
                         self.gfsg.DtypeToType(np.dtype(np.str_)))
        # Unsupported types treated as string for now
        self.assertEqual(self.gfsg.fs_proto.STRING,
                         self.gfsg.DtypeToType(np.dtype(np.void)))

    def testGetDatasetsProtoFromEntriesLists(self):
        """Entries given as plain lists produce numeric stats and histogram."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeature', numfeat.name)
        self.assertEqual(self.gfsg.fs_proto.INT, numfeat.type)
        self.assertEqual(1, numfeat.num_stats.min)
        self.assertEqual(3, numfeat.num_stats.max)
        hist = numfeat.num_stats.common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(self.gfsg.histogram_proto.QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.3, buckets[0].sample_count)
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(.3, buckets[9].sample_count)

    def testGetDatasetsProtoSequenceExampleHistogram(self):
        """'feat_lens' entries fill the feature list length histogram."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 2, 3],
            'counts': [1, 2, 1],
            'feat_lens': [1, 2, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets)
        hist = p.datasets[0].features[
            0].num_stats.common_stats.feature_list_length_histogram
        buckets = hist.buckets
        self.assertEqual(self.gfsg.histogram_proto.QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.3, buckets[0].sample_count)
        self.assertEqual(1.8, buckets[9].low_value)
        self.assertEqual(2, buckets[9].high_value)
        self.assertEqual(.3, buckets[9].sample_count)

    def testGetDatasetsProtoWithWhitelist(self):
        """Only features named in ``features`` appear in the proto."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        entries['ignoreFeature'] = {
            'vals': [5, 6],
            'counts': [1, 1],
            'missing': 1,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets, features=['testFeature'])

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeature', numfeat.name)
        self.assertEqual(1, numfeat.num_stats.min)

    def testGetDatasetsProtoWithMaxHistigramLevelsCount(self):
        """histogram_categorical_levels_count caps the rank histogram size."""
        # Selected entries' lengths make it easy to compute average length
        data = [['hi'], ['good'], ['hi'], ['hi'], ['a'], ['a']]
        df = pd.DataFrame(data, columns=['testFeatureString'])
        dataframes = [{'table': df, 'name': 'testDataset'}]
        # Getting proto from ProtoFromDataFrames instead of GetDatasetsProto
        # directly to avoid any hand written values ex: size of dataset.
        p = self.gfsg.ProtoFromDataFrames(dataframes,
                                          histogram_categorical_levels_count=2)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(6, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeatureString', numfeat.name)

        top_values = numfeat.string_stats.top_values
        self.assertEqual(3, top_values[0].frequency)
        self.assertEqual('hi', top_values[0].value)

        self.assertEqual(3, numfeat.string_stats.unique)
        self.assertEqual(2, numfeat.string_stats.avg_length)

        rank_hist = numfeat.string_stats.rank_histogram
        buckets = rank_hist.buckets
        self.assertEqual(2, len(buckets))
        self.assertEqual('hi', buckets[0].label)
        self.assertEqual(3, buckets[0].sample_count)
        self.assertEqual('a', buckets[1].label)
        self.assertEqual(2, buckets[1].sample_count)
# -*- coding: utf-8 -*-
# Dash app that shows a Facets Overview of dataset.csv inside an iframe.
import pandas as pd
import dash
import dash_html_components as html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

DEBUG = True

data = pd.read_csv("dataset.csv")
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Compute the feature statistics once at import time and base64-encode the
# serialized proto so it can be embedded in the page.
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': data}])
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

# Iframe document; adjacent literals concatenate into the exact template.
_FACETS_SRCDOC_TEMPLATE = (
    ' <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>'
    ' <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >'
    ' <facets-overview id="elem"></facets-overview>'
    ' <script>'
    ' document.querySelector("#elem").protoInput = "{protostr}";'
    ' </script>'
)

app = dash.Dash('')

_facets_frame = html.Iframe(
    width="1200",
    height="800",
    srcDoc=_FACETS_SRCDOC_TEMPLATE.format(protostr=protostr),
)
app.layout = html.Div(children=[
    _facets_frame,
])

server = app.server
#%%
# Fit an impute-then-random-forest pipeline and report its mean absolute
# error on the held-out split.
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())
my_pipeline.fit(train_X, train_y)
predictions = my_pipeline.predict(test_X)
print("Error:" + str(mean_absolute_error(predictions, test_y)))

#%%
train = pd.read_csv('ML/train.csv')
test = pd.read_csv('ML/test.csv')
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

#%%
# Compute Facets Overview statistics for the test dataframe.
proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames([{
    'name': 'test',
    'table': test
}])

#%%
# IPython.display is the public import location; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
import base64

protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")
# Adjacent literals concatenate into the exact template string.
HTML_TEMPLATE = (
    '<link rel="import" href="/nbextensions/facets-dist/facets-jupyter.html" >'
    ' <facets-overview id="elem"></facets-overview>'
    ' <script>'
    ' document.querySelector("#elem").protoInput = "{protostr}";'
    ' </script>'
)
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))

#%%