Example No. 1
    def test_store_key_used(self):
        summary_link = DailySummary(read_key='test_input',
                                    store_key='test_output',
                                    feature_cols=['a'],
                                    datetime_col='dt')

        summary_link.initialize()
        summary_link.execute()

        ds = process_manager.service(DataStore)
        self.assertIn('test_output', ds)
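
All of these test excerpts assume a fixture, not shown here, that registers a Spark DataFrame in the DataStore under the key 'test_input'. Below is a minimal sketch of such a setUp, assuming `process_manager` and `DataStore` are imported from eskapade and `f` is pyspark.sql.functions; the rows are purely illustrative, chosen to be consistent with the assertions in Examples No. 3 and No. 5, and may differ from the real test data.

    def setUp(self):
        from datetime import datetime
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        # Three rows over two days: day 1 has a = 1.0 and 2.0, day 2 has
        # a = 5.0; column b marks the partitions used in Examples No. 2 and 5.
        df = spark.createDataFrame(
            [(datetime(2017, 1, 1, 10), 1.0, 1),
             (datetime(2017, 1, 1, 11), 2.0, 2),
             (datetime(2017, 1, 2, 10), 5.0, 1)],
            schema=['dt', 'a', 'b'])
        process_manager.service(DataStore)['test_input'] = df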
Example No. 2
    def test_partitionby_cols_kept(self):
        summary_link = DailySummary(read_key='test_input',
                                    store_key='test_output',
                                    feature_cols=['a'],
                                    datetime_col='dt',
                                    partitionby_cols=['b'])

        summary_link.initialize()
        summary_link.execute()

        ds = process_manager.service(DataStore)
        self.assertIn('b', ds['test_output'].columns)
Example No. 3
    def test_function_execution(self):
        summary_link = DailySummary(read_key='test_input',
                                    store_key='test_output',
                                    feature_cols={'a': [f.sum, f.count]},
                                    datetime_col='dt')

        summary_link.initialize()
        summary_link.execute()

        ds = process_manager.service(DataStore)
        pdf = ds['test_output'].toPandas()

        self.assertEqual(list(pdf['a_sum_0d']), [3, 5])
        self.assertEqual(list(pdf['a_count_0d']), [2, 1])
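
With the illustrative fixture sketched under Example No. 1, day one contributes a = 1.0 and 2.0 (sum 3, count 2) and day two contributes a = 5.0 (sum 5, count 1), which is exactly what the a_sum_0d and a_count_0d assertions check.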
Example No. 4
    def test_specific_summary_column_dict(self):
        summary_link = DailySummary(read_key='test_input',
                                    store_key='test_output',
                                    feature_cols={'a': [f.sum]},
                                    datetime_col='dt')

        summary_link.initialize()
        summary_link.execute()

        ds = process_manager.service(DataStore)
        self.assertNotIn('a_min_0d', ds['test_output'].columns)
        self.assertNotIn('a_mean_0d', ds['test_output'].columns)
        self.assertNotIn('a_max_0d', ds['test_output'].columns)
        self.assertNotIn('a_stddev_0d', ds['test_output'].columns)
        self.assertNotIn('a_count_0d', ds['test_output'].columns)
        self.assertIn('a_sum_0d', ds['test_output'].columns)
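
Note that the assertions above cover exactly the six default aggregations (min, mean, max, stddev, count, sum; see the comment in Example No. 6 below): passing a dict such as {'a': [f.sum]} restricts the output to the listed functions only.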
Example No. 5
    def test_partitionby_partitions(self):
        summary_link = DailySummary(read_key='test_input',
                                    store_key='test_output',
                                    feature_cols=['a'],
                                    datetime_col='dt',
                                    partitionby_cols=['b'])

        summary_link.initialize()
        summary_link.execute()

        ds = process_manager.service(DataStore)
        pdf = ds['test_output'].toPandas()

        # Membership checks on a pandas Series test the index, so use .values.
        self.assertIn(1, pdf['b'].values)
        self.assertIn(2, pdf['b'].values)
        self.assertEqual(len(pdf[pdf['b'] == 1]), 2)
        self.assertEqual(len(pdf[pdf['b'] == 2]), 1)
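
Following the same illustrative fixture, partitioning by b yields one output row per (date, b) pair: b = 1 occurs on both days (two rows), while b = 2 occurs only on day one (one row), matching the length assertions.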
Example No. 6
# Forward keyword arguments to Spark's csv reader: treat the first line as a
# header and infer the column types from the data.
read_link.read_meth_kwargs['csv'] = {'header': True, 'inferSchema': True}

read_chain.add(read_link)

# STEP 2: Resample the data to make it daily
daily_chain = Chain('daily_aggregation')

# Option 1: Use the default aggregations, giving the columns to aggregate as a
# list. The default aggregations are min, mean, max, stddev, count, and sum.
# We also give `machine` as the column to partition by, since we want to get the
# summaries of individual machines separately.

summary_link1 = DailySummary(
    name='daily_summary1',
    read_key='df_input',
    store_key='df_daily',
    feature_cols=[settings['feature']],
    new_date_col='date',  # output column holds dates rather than timestamps
    datetime_col='ts',  # input column with the timestamps
    partitionby_cols=['machine'])  # column to partition by

# Option 2: Choose our own aggregations for resampling, given with a dictionary
# of the format `{'col_name':[list of aggregation functions]}`. We can still use
# the default aggregations for some columns just by passing an empty list.

aggregation_dict = {
    settings['feature']: [],  # use all the default aggregations
    'failure': [f.sum],  # only care about the number of failures
}

summary_link2 = DailySummary(name='daily_summary2',
                             read_key='df_input',