Пример #1
0
 def __init__(self, data: pd.DataFrame):
     """Compute and store the Facets Overview statistics proto for ``data``.

     The generator derives every input the Facets Overview plots need:
       - numeric variables: histogram bins, mean, min, median, max, etc.
       - categorical variables: number of unique values, per-category
         counts for the bar chart, top category, etc.

     Args:
         data: The dataframe to summarise; it is registered under the
             dataset name ``'data'``.
     """
     named_tables = [{'name': 'data', 'table': data}]
     self._proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
         named_tables)
Пример #2
0
 def _display_overview(self, data, update=None):
     """Render, or refresh, the Facets Overview for ``data``.

     Args:
         data: Table handed to ``ProtoFromDataFrames`` under the dataset
             name ``'data'``.
         update: When truthy, it is used as the ``display_id`` in the
             script template and the output is emitted as JavaScript.
             When falsy, a new HTML block is displayed using
             ``self._overview_display_id``.
     """
     stats_proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
         [{'name': 'data', 'table': data}])
     # The templates receive the serialized proto as base64 text.
     protostr = base64.b64encode(
         stats_proto.SerializeToString()).decode('utf-8')

     if not update:
         fresh_html = _OVERVIEW_HTML_TEMPLATE.format(
             display_id=self._overview_display_id, protostr=protostr)
         display(HTML(fresh_html))
         return

     refresh_script = _OVERVIEW_SCRIPT_TEMPLATE.format(
         display_id=update, protostr=protostr)
     display_javascript(Javascript(refresh_script))
Пример #3
0
  def _display_overview(self, data, update=None):
    """Render, or refresh, the Facets Overview for ``data``.

    When ``self._include_window_info`` is set and ``data`` is non-empty, the
    ``event_time``, ``windows`` and ``pane_info`` columns are dropped before
    statistics are computed, provided all three are present.

    Args:
      data: Dataframe to summarise.
      update: When truthy, its ``_overview_display_id`` is used as the
        ``display_id`` in the script template and the output is emitted as
        JavaScript. When falsy, a new HTML block is displayed using
        ``self._overview_display_id``.
    """
    window_columns = ['event_time', 'windows', 'pane_info']
    strip_window_info = (
        not data.empty and self._include_window_info and
        all(name in data.columns for name in window_columns))
    if strip_window_info:
      data = data.drop(window_columns, axis=1)

    stats_proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
        [{'name': 'data', 'table': data}])
    # The templates receive the serialized proto as base64 text.
    protostr = base64.b64encode(
        stats_proto.SerializeToString()).decode('utf-8')

    if not update:
      fresh_html = _OVERVIEW_HTML_TEMPLATE.format(
          display_id=self._overview_display_id, protostr=protostr)
      display(HTML(fresh_html))
      return

    refresh_script = _OVERVIEW_SCRIPT_TEMPLATE.format(
        display_id=update._overview_display_id, protostr=protostr)
    display_javascript(Javascript(refresh_script))
Пример #4
0
def overview(tables: typing.Union[pandas.DataFrame, typing.Mapping[str, pandas.DataFrame]]) -> HTML:
    """Build the Facets Overview HTML for one or more dataframes.

    Args:
        tables: Either a single dataframe (shown under the name
            ``"default"``) or a mapping from dataset name to dataframe.

    Returns:
        An ``HTML`` object wrapping ``FACETS_OVERVIEW_TEMPLATE`` filled in
        with a freshly generated element id and the base64-encoded
        statistics proto.
    """
    # Element ID MUST be unique
    elem_id = _generate_element_id()

    if isinstance(tables, pandas.DataFrame):
        tables = {"default": tables}

    table_list = []
    for name, table in tables.items():
        # Work on a copy so the caller's dataframe is never modified.
        view = table.copy()
        # Convert pandas extension-dtype columns to object columns because
        # facets currently doesn't support them.
        # Series.items() is used instead of Series.iteritems(), which was
        # deprecated in pandas 1.5 and removed in pandas 2.0.
        for column, dtype in view.dtypes.items():
            if not isinstance(dtype, numpy.dtype):
                view[column] = view[column].astype(object)

        table_list.append({'name': name, 'table': view})

    proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(table_list)
    proto_str = base64.b64encode(proto.SerializeToString()).decode("utf-8")
    return HTML(FACETS_OVERVIEW_TEMPLATE.format(elem_id=elem_id, proto_str=proto_str))
Пример #5
0
def generate_facets(config, df):
    """Return a Facets Overview HTML snippet for ``df``.

    Args:
        config: Accepted for interface compatibility; not used here.
        df: Dataframe to summarise; registered under the dataset name
            ``'facets-iss'``.

    Returns:
        The HTML string with the base64-encoded statistics proto embedded
        as the ``protoInput`` of the ``facets-overview`` element.
    """
    named_tables = [{'name': 'facets-iss', 'table': df}]
    stats_proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
        named_tables)
    encoded_stats = base64.b64encode(
        stats_proto.SerializeToString()).decode("utf-8")

    page_template = """
            <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
            <facets-overview id="elem"></facets-overview>
            <script>
            document.querySelector("#elem").protoInput = "{protostr}";
            </script>"""
    return page_template.format(protostr=encoded_stats)
Пример #6
0
    def generate_html(self, datasets: List[Dict[Text, pd.DataFrame]]) -> str:
        """Produce the facets statistics page for the given datasets.

        The statistics proto for ``datasets`` is serialized and base64
        encoded, then substituted for every occurrence of the literal text
        ``protostr`` in the ``stats.html`` file that sits next to this
        module.

        Args:
            datasets: List of dicts of dataframes to be visualized as stats.

        Returns:
            The contents of ``stats.html`` with the encoded proto embedded.
        """
        stats_proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
            datasets)
        encoded_stats = base64.b64encode(
            stats_proto.SerializeToString()).decode("utf-8")

        # The template ships alongside this source file.
        module_dir = os.path.abspath(os.path.dirname(__file__))
        template_path = os.path.join(module_dir, "stats.html")
        page = fileio.read_file_contents_as_string(template_path)

        return page.replace("protostr", encoded_stats)
Пример #7
0
def tables1():
    """Handle a CSV upload and render its Facets Overview statistics.

    The uploaded ``file`` form field is saved into the upload directory,
    read back with pandas, summarised into a feature-statistics proto and
    passed, base64-encoded, to the ``examples/tables.html`` template as
    ``data``.

    Returns:
        The result of ``render_template`` for ``examples/tables.html``.
    """
    # NOTE(review): the second component is absolute, so os.path.join
    # discards APP_ROOT and the target is always "/home/aayushi/ml-simu".
    # Left unchanged; confirm whether a path relative to APP_ROOT was meant.
    target = os.path.join(APP_ROOT, "/home/aayushi/ml-simu")

    # exist_ok avoids the check-then-create race and also creates any
    # missing parent directories.
    os.makedirs(target, exist_ok=True)

    upload = request.files["file"]
    # The filename is supplied by the client. Keep only its final path
    # component (treating backslashes as separators too) so that names such
    # as "../../x" or "/etc/x" cannot escape the upload directory.
    safe_name = os.path.basename((upload.filename or "").replace("\\", "/"))
    destination = os.path.join(target, safe_name)
    print(destination)
    upload.save(destination)
    data = pd.read_csv(destination)
    from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
    import base64

    gfsg = GenericFeatureStatisticsGenerator()
    proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': data}])
    protostr1 = base64.b64encode(proto.SerializeToString()).decode("utf-8")

    return render_template("examples/tables.html", data=protostr1)
 def setUp(self):
     """Create the GenericFeatureStatisticsGenerator stored on ``self.gfsg``."""
     self.gfsg = GenericFeatureStatisticsGenerator()
class GenericFeatureStatisticsGeneratorTest(googletest.TestCase):
    """Tests for GenericFeatureStatisticsGenerator."""

    def setUp(self):
        """Create the generator instance used by every test."""
        self.gfsg = GenericFeatureStatisticsGenerator()

    def testProtoFromDataFrames(self):
        """A two-column dataframe yields one dataset with INT and STRING features."""
        data = [[1, 'hi'], [2, 'hello'], [3, 'hi']]
        df = pd.DataFrame(data,
                          columns=['testFeatureInt', 'testFeatureString'])
        dataframes = [{'table': df, 'name': 'testDataset'}]
        p = self.gfsg.ProtoFromDataFrames(dataframes)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(2, len(test_data.features))

        # Feature order in the proto is not relied upon; pick by name.
        if test_data.features[0].name == 'testFeatureInt':
            numfeat = test_data.features[0]
            stringfeat = test_data.features[1]
        else:
            numfeat = test_data.features[1]
            stringfeat = test_data.features[0]

        self.assertEqual('testFeatureInt', numfeat.name)
        self.assertEqual(self.gfsg.fs_proto.INT, numfeat.type)
        self.assertEqual(1, numfeat.num_stats.min)
        self.assertEqual(3, numfeat.num_stats.max)
        self.assertEqual('testFeatureString', stringfeat.name)
        self.assertEqual(self.gfsg.fs_proto.STRING, stringfeat.type)
        self.assertEqual(2, stringfeat.string_stats.unique)

    def testNdarrayToEntry(self):
        """NdarrayToEntry reports the expected ``missing`` counts."""
        arr = np.array([1.0, 2.0, None, float('nan'), 3.0], dtype=float)

        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual(2, entry['missing'])

        arr = np.array(['a', 'b', float('nan'), 'c'], dtype=str)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual(1, entry['missing'])

    def testNdarrayToEntryTimeTypes(self):
        """datetime64 and timedelta64 arrays are converted to integer values."""
        arr = np.array(
            [np.datetime64('2005-02-25'),
             np.datetime64('2006-02-25')],
            dtype=np.datetime64)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual([1109289600000000000, 1140825600000000000],
                         entry['vals'])

        arr = np.array(
            [np.datetime64('2009-01-01') - np.datetime64('2008-01-01')],
            dtype=np.timedelta64)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual([31622400000000000], entry['vals'])

    def testDTypeToType(self):
        """DtypeToType maps numpy dtypes onto the proto feature types."""
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.int32)))
        # Boolean and time types treated as int
        # np.bool_ / np.str_ are used because the np.bool / np.str aliases
        # were removed in NumPy 1.24; the resulting dtypes are the same.
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.bool_)))
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.datetime64)))
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.timedelta64)))
        self.assertEqual(self.gfsg.fs_proto.FLOAT,
                         self.gfsg.DtypeToType(np.dtype(np.float32)))
        self.assertEqual(self.gfsg.fs_proto.STRING,
                         self.gfsg.DtypeToType(np.dtype(np.str_)))
        # Unsupported types treated as string for now
        self.assertEqual(self.gfsg.fs_proto.STRING,
                         self.gfsg.DtypeToType(np.dtype(np.void)))

    def testGetDatasetsProtoFromEntriesLists(self):
        """GetDatasetsProto builds numeric stats and a quantile histogram."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeature', numfeat.name)
        self.assertEqual(self.gfsg.fs_proto.INT, numfeat.type)
        self.assertEqual(1, numfeat.num_stats.min)
        self.assertEqual(3, numfeat.num_stats.max)
        hist = numfeat.num_stats.common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(self.gfsg.histogram_proto.QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.3, buckets[0].sample_count)
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(.3, buckets[9].sample_count)

    def testGetDatasetsProtoSequenceExampleHistogram(self):
        """``feat_lens`` entries populate the feature-list-length histogram."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 2, 3],
            'counts': [1, 2, 1],
            'feat_lens': [1, 2, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets)
        hist = p.datasets[0].features[
            0].num_stats.common_stats.feature_list_length_histogram
        buckets = hist.buckets
        self.assertEqual(self.gfsg.histogram_proto.QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.3, buckets[0].sample_count)
        self.assertEqual(1.8, buckets[9].low_value)
        self.assertEqual(2, buckets[9].high_value)
        self.assertEqual(.3, buckets[9].sample_count)

    def testGetDatasetsProtoWithWhitelist(self):
        """Only features named in ``features`` appear in the proto."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        entries['ignoreFeature'] = {
            'vals': [5, 6],
            'counts': [1, 1],
            'missing': 1,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets, features=['testFeature'])

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeature', numfeat.name)
        self.assertEqual(1, numfeat.num_stats.min)

    def testGetDatasetsProtoWithMaxHistigramLevelsCount(self):
        """``histogram_categorical_levels_count`` caps the rank histogram size."""
        # Selected entries' lengths make it easy to compute average length
        data = [['hi'], ['good'], ['hi'], ['hi'], ['a'], ['a']]
        df = pd.DataFrame(data, columns=['testFeatureString'])
        dataframes = [{'table': df, 'name': 'testDataset'}]
        # Getting proto from ProtoFromDataFrames instead of GetDatasetsProto
        # directly to avoid any hand written values ex: size of dataset.
        p = self.gfsg.ProtoFromDataFrames(dataframes,
                                          histogram_categorical_levels_count=2)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(6, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeatureString', numfeat.name)

        top_values = numfeat.string_stats.top_values
        self.assertEqual(3, top_values[0].frequency)
        self.assertEqual('hi', top_values[0].value)

        self.assertEqual(3, numfeat.string_stats.unique)
        self.assertEqual(2, numfeat.string_stats.avg_length)

        rank_hist = numfeat.string_stats.rank_histogram
        buckets = rank_hist.buckets
        self.assertEqual(2, len(buckets))
        self.assertEqual('hi', buckets[0].label)
        self.assertEqual(3, buckets[0].sample_count)
        self.assertEqual('a', buckets[1].label)
        self.assertEqual(2, buckets[1].sample_count)
Пример #10
0
# -*- coding: utf-8 -*-
import pandas as pd
import dash
import dash_html_components as html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator


# Minimal Dash app that embeds a Facets Overview of dataset.csv in an iframe.
# Everything below runs at import time, including reading the CSV.

# NOTE(review): DEBUG is not referenced anywhere in the visible script.
DEBUG = True
data = pd.read_csv("dataset.csv")
# NOTE(review): external_stylesheets is defined but never passed to dash.Dash
# in the visible script.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
# Compute the feature statistics proto and base64-encode it so it can be
# embedded in the HTML below as the facets-overview element's protoInput.
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': data}])
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

app = dash.Dash('')

# The whole page is a single iframe whose inline document loads the
# webcomponents polyfill and the Facets bundle, then assigns the encoded proto.
app.layout = html.Div(children=[
    html.Iframe(
        width="1200",
        height="800",
        srcDoc= """
       <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>""".format(protostr=protostr)
    ),
])
# Module-level handle to the app's underlying server object.
server = app.server
Пример #11
0
#%%

# Fit an impute-then-random-forest pipeline and print its mean absolute error.
# make_pipeline, Imputer, RandomForestRegressor, mean_absolute_error and the
# train_X / train_y / test_X / test_y splits are defined outside this excerpt.
# NOTE(review): Imputer is presumably sklearn.preprocessing.Imputer, which
# newer scikit-learn releases replaced with sklearn.impute.SimpleImputer - confirm.
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())
my_pipeline.fit(train_X, train_y)
predictions = my_pipeline.predict(test_X)
print("Error:" + str(mean_absolute_error(predictions, test_y)))

#%%
# Load the raw CSVs; only `test` is used by the cells below.
train = pd.read_csv('ML/train.csv')
test = pd.read_csv('ML/test.csv')

from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

#%%
# Compute the Facets Overview statistics proto for the test set.
proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames([{
    'name': 'test',
    'table': test
}])

#%%
from IPython.core.display import display, HTML
import base64
# Base64-encode the serialized proto and hand it to the facets-overview
# element; the Facets bundle is loaded from the notebook's nbextensions path.
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")
HTML_TEMPLATE = """<link rel="import" href="/nbextensions/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))

#%%