def standardize_train_test_data(train_df, columns):
    """Append z-score-normalised columns to the training dataframe.

    formula = (X - mean) / std_dev

    Parameters
    ----------
    train_df : pyspark.sql.DataFrame
        Training dataframe.
    columns : list of str
        Names of the numeric columns to normalise.

    Returns
    -------
    tuple
        ``(train_df, averages, std_devs)`` — the dataframe with one new
        ``<col>_norm`` column per input column, plus the per-column means
        and standard deviations (pyspark ``Row`` objects) so callers can
        apply the identical transform to held-out/test data.
    """
    # Function-scope import so the module stays importable where pyspark
    # is optional; `stddev` is assumed imported at file level already.
    from pyspark.sql.functions import mean as sql_mean

    # Build one aggregation expression per column.
    # BUGFIX: the original used np.mean here — np.mean is not a Spark
    # aggregate and its result has no .alias(); the SQL mean is required.
    agg_mean = [sql_mean(train_df[column]).alias(column) for column in columns]
    agg_std = [stddev(train_df[column]).alias(column + '_stddev')
               for column in columns]

    # Single pass each to collect the training statistics.
    averages = train_df.agg(*agg_mean).collect()[0]
    std_devs = train_df.agg(*agg_std).collect()[0]

    # Standardise the TRAINING data, column by column. Test data should
    # be standardised by the caller using the returned training stats.
    for column in columns:
        train_df = train_df.withColumn(
            column + '_norm',
            (train_df[column] - averages[column]) / std_devs[column + '_stddev'])

    return train_df, averages, std_devs
def test_summary_stddev(pyspark, summarizers, tests_utils, price, forecast):
    """The stddev summarizer over the price/forecast join yields the known value."""
    expected = make_pdf(
        [(0, 1.802775638,)],
        ["time", "price_stddev"])
    with_forecast = price.leftJoin(forecast, key="id")
    summarized = with_forecast.summarize(summarizers.stddev("price")).toPandas()
    pdt.assert_frame_equal(summarized, expected)
def test_summary_stddev(self):
    """The stddev summarizer over the price/forecast join yields the known value."""
    from ts.flint import summarizers
    price_df = self.price()
    forecast_df = self.forecast()
    expected = test_utils.make_pdf(
        [(0, 1.802775638,)],
        ["time", "price_stddev"])
    joined_df = price_df.leftJoin(forecast_df, key="id")
    actual = joined_df.summarize(summarizers.stddev("price")).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_stddev(self):
    """The stddev summarizer over the price/forecast join yields the known value."""
    from ts.flint import summarizers
    price_df = self.price()
    forecast_df = self.forecast()
    expected = make_pdf(
        [(0, 1.802775638,)],
        ["time", "price_stddev"])
    joined_df = price_df.leftJoin(forecast_df, key="id")
    actual = joined_df.summarize(summarizers.stddev("price")).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_compose(self):
    """Composing max/min/mean/stddev summarizers yields one row with all four stats."""
    from ts.flint import summarizers
    price_df = self.price()
    expected = make_pdf(
        [(0, 6.0, 0.5, 3.25, 1.802775638,)],
        ["time", "price_max", "price_min", "price_mean", "price_stddev"])
    stats = [summarizers.max("price"),
             summarizers.min("price"),
             summarizers.mean("price"),
             summarizers.stddev("price")]
    actual = price_df.summarize(stats).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_compose(pyspark, summarizers, tests_utils, price):
    """Composing max/min/mean/stddev summarizers yields one row with all four stats."""
    expected = make_pdf(
        [(0, 6.0, 0.5, 3.25, 1.802775638,)],
        ["time", "price_max", "price_min", "price_mean", "price_stddev"])
    stats = [summarizers.max("price"),
             summarizers.min("price"),
             summarizers.mean("price"),
             summarizers.stddev("price")]
    actual = price.summarize(stats).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_compose(self):
    """Composing max/min/mean/stddev summarizers yields one row with all four stats."""
    from ts.flint import summarizers
    price_df = self.price()
    expected = test_utils.make_pdf(
        [(0, 6.0, 0.5, 3.25, 1.802775638,)],
        ["time", "price_max", "price_min", "price_mean", "price_stddev"])
    stats = [summarizers.max("price"),
             summarizers.min("price"),
             summarizers.mean("price"),
             summarizers.stddev("price")]
    actual = price_df.summarize(stats).toPandas()
    pdt.assert_frame_equal(actual, expected)