class TestFeatureAdder(unittest2.TestCase):
    """Tests for the features that ``FeatureAdder`` derives from the training data.

    Every test loads ``data/train.tsv`` and passes it through
    ``Preprocessor.split_features`` and ``Preprocessor.f_to_int`` before
    handing it to the feature adder.
    """

    def setUp(self):
        """Create fresh collaborators and the list of feature column names."""
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.feature_adder = FeatureAdder()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        """Drop the references created in ``setUp``."""
        self.stats_calc = None
        self.preprocessor = None
        self.feature_adder = None

    def _get_df(self):
        """Read the training TSV and return the preprocessed DataFrame."""
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_max_index_feature(self):
        """
        Test that new feature 'max_feature_2_index' lies in proper range
        and has dtype 'int64'
        """
        df = self._get_df()
        new_feature = 'max_feature_2_index'
        df = self.feature_adder.max_index_feature(df)

        # NOTE(review): the upper bound is presumably FEATURES - 1 -- confirm.
        valid_range, valid_dtype = (0, 255), 'int64'

        # Checked separately (and via unittest assertions, which survive
        # ``python -O``) so that a failure says which property was violated.
        self.assertTrue(
            df[new_feature].dtype == valid_dtype,
            "max_feature_2_index feature has wrong dtype"
        )
        self.assertTrue(
            df[new_feature].between(*valid_range).all(),
            "max_feature_2_index feature not in range"
        )

    def test_abs_mean_diff_feature(self):
        """
        Test that new feature 'max_feature_2_abs_mean_diff' is valid:
        every value must lie between 0 and the train std of the column
        selected by 'max_feature_2_index' for that row.
        """
        df = self._get_df()
        df = self.feature_adder.max_index_feature(df)
        new_feature = 'max_feature_2_abs_mean_diff'

        # Per-row name of the column that 'max_feature_2_index' points at.
        cols = np.array(self.col_names)[df['max_feature_2_index'].values]

        train_stats = find_train_stats('data/train.tsv', chunksize=10000)
        df = self.feature_adder.abs_mean_diff_feature(
            df.loc[:, df.columns != 'id_job'], train_stats)

        # Positional access: rows are matched to ``cols`` by position, so do
        # not rely on the returned frame's index labels being 0..n-1.
        values = df[new_feature].values
        self.assertEqual(
            len(cols), len(values),
            "max_feature_2_abs_mean_diff feature has unexpected number of rows"
        )

        # keep in mind outliers in test data
        results = [
            0 <= value <= train_stats[col]['std']
            for col, value in zip(cols, values)
        ]
        self.assertTrue(
            np.all(results),
            "max_feature_2_abs_mean_diff feature not in expected range"
        )
class TestStatsCalculator(unittest2.TestCase):
    """Tests for ``StatsCalculator.calc_mean`` and ``calc_std``.

    Each test picks one feature column at random from the preprocessed
    ``data/train.tsv`` and compares the calculator's result with NumPy's.
    """

    def setUp(self):
        """Create fresh collaborators and the list of feature column names."""
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        """Drop the references created in ``setUp``."""
        self.stats_calc = None
        self.preprocessor = None

    def _get_df(self):
        """Read the training TSV and return the preprocessed DataFrame."""
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def _assert_close(self, actual, expected, msg, col):
        """Assert two floats agree up to rounding error.

        Exact ``==`` on floats is brittle: a different summation order gives
        results that differ in the last bits. The randomly chosen column is
        appended to the message so a failure can be reproduced.
        """
        self.assertTrue(
            np.isclose(actual, expected, rtol=1e-9, atol=1e-12),
            f"{msg} (column {col!r}: got {actual!r}, expected {expected!r})"
        )

    def test_mean_calc(self):
        """Test that ``calc_mean`` agrees with ``np.mean`` on a random column."""
        df = self._get_df()
        col = random.choice(self.col_names)
        res = self.stats_calc.calc_mean(df, col)
        valid_res = np.mean(df[col])
        self._assert_close(res, valid_res, "Wrong mean calculation", col)

    def test_std_calc(self):
        """Test that ``calc_std`` agrees with ``np.std`` on a random column."""
        df = self._get_df()
        col = random.choice(self.col_names)
        res = self.stats_calc.calc_std(df, col)
        valid_res = np.std(df[col])
        self._assert_close(res, valid_res, "Wrong std calculation", col)

    def test_speed(self):
        """
        Time the mean calculation twice, print both timings and check the
        results against ``np.mean``.

        NOTE(review): both runs call the same ``calc_mean``; the ``multiproc``
        flag only selects which timing label is printed. If a separate
        parallel entry point exists it should be wired in here -- confirm.
        """
        df = self._get_df()
        col = random.choice(self.col_names)

        def wrapper(func):
            def inner(df, col, multiproc=False):
                # perf_counter is monotonic and high-resolution, unlike the
                # wall clock, so the printed interval is a reliable duration.
                start = time.perf_counter()
                result = func(df, col)
                end = time.perf_counter()
                print(f'\nResult of calculation: {result}')
                if multiproc:
                    print(f'Timing of calc in parallel: {end - start}')
                else:
                    print(f'Timing of sequential calc: {end - start}')
                return result
            return inner

        seq_calc = wrapper(self.stats_calc.calc_mean)
        res = seq_calc(df, col)

        parallel_calc = wrapper(self.stats_calc.calc_mean)
        parallel_res = parallel_calc(df, col, multiproc=True)

        true_value = np.mean(df[col])
        self._assert_close(res, true_value, "Wrong mean calculation", col)
        # The second run's result used to be discarded; verify it as well.
        self._assert_close(
            parallel_res, true_value, "Wrong mean calculation", col)