def test_fit_and_transform(self):
    augmenter = FeatureAugmenter(column_value="val", column_id="id",
                                 column_sort="sort", column_kind="kind",
                                 kind_to_fc_parameters=self.kind_to_fc_parameters)

    # Fit should do nothing
    returned_df = augmenter.fit()
    six.assertCountEqual(self, returned_df.__dict__, augmenter.__dict__)
    self.assertRaises(RuntimeError, augmenter.transform, None)

    augmenter.set_timeseries_container(self.test_df)

    # Add features to all time series
    X_with_index = pd.DataFrame([{"feature_1": 1}] * 2, index=[10, 500])
    X_transformed = augmenter.transform(X_with_index)

    # Require same shape
    for i in X_transformed.index:
        self.assertIn(i, X_with_index.index)

    for i in X_with_index.index:
        self.assertIn(i, X_transformed.index)

    self.assertEqual(X_transformed.shape, (2, 3))

    # Preserve old features
    six.assertCountEqual(self, list(X_transformed.columns),
                         ["feature_1", "a__length", "b__length"])

    # Features are not allowed to be NaN
    for index, row in X_transformed.iterrows():
        print((index, row))
        self.assertFalse(np.isnan(row["a__length"]))
        self.assertFalse(np.isnan(row["b__length"]))
def test_add_features_to_only_a_part(self):
    augmenter = FeatureAugmenter(column_value="val", column_id="id",
                                 column_sort="sort", column_kind="kind",
                                 kind_to_fc_parameters=self.kind_to_fc_parameters,
                                 n_jobs=0, disable_progressbar=True)

    augmenter.set_timeseries_container(self.test_df)

    X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[10])
    X_transformed = augmenter.transform(X_with_not_all_ids)

    for i in X_transformed.index:
        self.assertIn(i, X_with_not_all_ids.index)

    for i in X_with_not_all_ids.index:
        self.assertIn(i, X_transformed.index)

    self.assertEqual(X_transformed.shape, (1, 3))
    self.assertEqual(X_transformed.index, [10])

    # Features are not allowed to be NaN
    for index, row in X_transformed.iterrows():
        print((index, row))
        self.assertFalse(np.isnan(row["a__length"]))
        self.assertFalse(np.isnan(row["b__length"]))
def test_no_ids_present(self):
    augmenter = FeatureAugmenter(column_value="val", column_id="id",
                                 column_sort="sort", column_kind="kind",
                                 kind_to_fc_parameters=self.kind_to_fc_parameters,
                                 n_jobs=0, disable_progressbar=True)

    augmenter.set_timeseries_container(self.test_df)

    X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[-999])

    self.assertRaisesRegex(AttributeError, r"The ids of the time series container",
                           augmenter.transform, X_with_not_all_ids)
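# For reference, a minimal sketch of the fixture these tests assume; the
# class's real setUp lives outside this excerpt, and all concrete values here
# are assumptions inferred from the assertions above (ids 10 and 500, kinds
# "a" and "b", one "length" feature per kind). The method name is hypothetical.
def _example_fixture_sketch(self):
    # Two entity ids (10 and 500), each with two time series kinds.
    self.test_df = pd.DataFrame({
        "id": [10, 10, 500, 500],      # matches the indices the tests transform
        "kind": ["a", "b", "a", "b"],  # the two kinds that yield a__*/b__* columns
        "sort": [0, 0, 0, 0],          # sort order within each series
        "val": [1.0, 2.0, 3.0, 4.0],   # the actual time series values
    })
    # Request only the "length" feature per kind, producing exactly the
    # "a__length" and "b__length" columns the assertions check.
    self.kind_to_fc_parameters = {"a": {"length": None},
                                  "b": {"length": None}}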
def run(filename='data/clean_data.csv', city_regions_file='data/CityRegions.csv',
        load_from_file=True, grid_search=False, baseline=False):
    if city_regions_file is None:
        temp = [['Abilene', 'Texas', 'South'],
                ['West Jordan', 'Utah', 'West'],
                ['Yonkers', 'New York', 'Northeast']]
        city_regions = pd.DataFrame(temp, columns=['City', 'State', 'Region'])
    else:
        city_regions = pd.read_csv(city_regions_file, header=0).reset_index(drop=True)

    FEATURE_EXTRACTION = 'data/data_with_features.csv'
    if not os.path.isfile(FEATURE_EXTRACTION):
        df = pd.read_csv(filename, header=0)
        df.dropna(inplace=True)
        X_labels = ['City', 'State', 'dt', 'AverageTemperature', 'CityIndex']
        df = df[X_labels]
        df = df.dropna()
        # city_state = df[['City', 'State']]
        # Sadness because multiple cities with the same name.......
        # df['CityIndex'] = city_state.apply(number_cities, axis=1)
        # df.to_csv('data/clean_data.csv', index=False)

        orig_cities = city_regions[['City', 'State']]
        print("Total cities", len(orig_cities))

        y_regions = city_regions['Region']
        y_regions = y_regions.apply(number_regions)

        feature_extraction_settings = FeatureExtractionSettings()
        # Assumes tsfresh's impute utility is imported above:
        # from tsfresh.utilities.dataframe_functions import impute
        feature_extraction_settings.IMPUTE = impute
        feat_extractor = FeatureAugmenter(feature_extraction_settings,
                                          column_id='CityIndex', column_sort='dt',
                                          column_value='AverageTemperature')

        empty_df = pd.DataFrame(index=y_regions.index)
        feat_extractor.set_timeseries_container(df)
        output = feat_extractor.fit_transform(empty_df, y_regions)
        output['City'] = city_regions['City']
        output['State'] = city_regions['State']
        output['Region'] = city_regions['Region']
        output.to_csv(FEATURE_EXTRACTION, index=False)
    else:
        output = pd.read_csv(FEATURE_EXTRACTION)

    output = output.drop(['City', 'State', 'Region'], axis=1)

    if baseline:
        output = output['AverageTemperature__mean'].to_frame()

    train, test, validation = split_data(output, city_regions)

    """
    aug = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex',
                           column_sort='dt', column_value='AverageTemperature',
                           timeseries_container=train['df'])
    output = aug.fit_transform(train['X'], train['y'])
    output['City_Name'] = train['city_names']
    output.to_csv('data/features_from_tsfresh.csv', index=False)
    """

    if load_from_file:
        clf = joblib.load('./model.joblib.pkl')
    else:
        clf = DecisionTreeClassifier(criterion='entropy', max_features=None,
                                     min_samples_split=0.1, max_depth=50,
                                     class_weight=None)
        # feat_extractor = RelevantFeatureAugmenter(column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')
        # For the fit on the training set, we would set the fresh__timeseries_container to `df_train`
        if grid_search and not baseline:
            grid = {'max_features': [2, 10, 20, 30, 50, 100, 200, None],
                    'max_depth': [1, 25, 50, 100],
                    'class_weight': [None, 'balanced'],
                    'min_samples_split': [0.1, 0.25, 0.75, 1.0]}
            scorer = metrics.make_scorer(metrics.accuracy_score)
            clf = GridSearchCV(clf, grid, scoring=scorer,
                               n_jobs=multiprocessing.cpu_count())

        clf.fit(train['X'], train['y'])
        # pipeline.set_params(augmenter__timeseries_container=train['df'])
        # pipeline.fit(train['X'], train['y'])

        y_pred = pd.Series(clf.predict(train['X']))
        y_true = pd.Series(np.array(train['y']))
        result = train['city_names']
        result.reset_index(drop=True, inplace=True)
        result['Orig'] = y_true
        result['Pred'] = y_pred
        result['Correct'] = y_true == y_pred
        result.to_csv('data/results_train.csv', index=False)

        if grid_search and not baseline:
            print("Best Parameters found from grid search: ")
            print(clf.best_params_)

        print("train accuracy", accuracy_score(y_true, y_pred))
        cm_train = confusion_matrix(y_true, y_pred)
        print("Confusion matrix for training\n", cm_train)
        # For the predict on the test set, we would set the fresh__timeseries_container to `df_test`
        joblib.dump(clf, './model.joblib.pkl')

    y_pred = pd.Series(clf.predict(test['X']))
    y_true = pd.Series(np.array(test['y']))
    result = test['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = y_true == y_pred
    result.to_csv('data/results_test.csv', index=False)

    print("test accuracy", accuracy_score(y_true, y_pred))
    cm_test = confusion_matrix(y_true, y_pred)
    print("Confusion matrix for testing\n", cm_test)

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    if not load_from_file:
        plot_confusion_matrix(cm_train, class_names)
        plt.tight_layout()
        plt.savefig('train_cm.png')

    plt.figure()  # start a fresh figure (plt.hold was removed from matplotlib)
    plot_confusion_matrix(cm_test, class_names)
    plt.tight_layout()
    plt.savefig('test_cm.png')

    if not load_from_file and not grid_search:
        features = output.columns.values
        importances = clf.feature_importances_
        with open("tree_viz.dot", "w") as f:
            tree.export_graphviz(clf, out_file=f)

        top_n = 20
        ndx = np.argsort(importances)[::-1]
        sorted_features = features[ndx][:top_n]
        sorted_importances = importances[ndx][:top_n]
        print('%80s & %s' % ('Feature', 'Importance'))
        for feat, imp in zip(sorted_features, sorted_importances):
            # Strip the 'AverageTemperature__' prefix (20 characters) for readability
            print('%s & %.2f \\\\' % (feat[20:], imp))

    y_pred = pd.Series(clf.predict(validation['X']))
    y_true = pd.Series(np.array(validation['y']))
    result = validation['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = y_true == y_pred
    result.to_csv('data/results_val.csv', index=False)

    print("validation accuracy", accuracy_score(y_true, y_pred))
    cm_val = confusion_matrix(y_true, y_pred)
    print("Confusion matrix for validation\n", cm_val)
    print("done")

    plt.figure()
    plot_confusion_matrix(cm_val, class_names)
    plt.tight_layout()
    plt.savefig('val_cm.png')
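# The helpers run() relies on (number_cities, number_regions, split_data,
# plot_confusion_matrix) live outside this excerpt. Below is a minimal sketch,
# purely an assumption inferred from how run() calls them, of what the first
# two might look like: number_regions encodes a region name as an integer
# label, and split_data returns train/test/validation dicts keyed by
# 'X', 'y', and 'city_names'. All names and split fractions are hypothetical.

REGIONS = ['Northeast', 'Midwest', 'West', 'South']  # assumed label order

def number_regions(region):
    # Hypothetical: encode a region name as its index in REGIONS.
    return REGIONS.index(region)

def split_data(X, city_regions, train_frac=0.7, test_frac=0.15):
    # Hypothetical sketch: shuffle row positions, then carve out
    # train/test/validation splits. Assumes X and city_regions are
    # positionally aligned (one row per city).
    ndx = np.random.permutation(len(X))
    n_train = int(train_frac * len(X))
    n_test = int(test_frac * len(X))
    parts = (ndx[:n_train],
             ndx[n_train:n_train + n_test],
             ndx[n_train + n_test:])
    splits = []
    for part in parts:
        splits.append({
            'X': X.iloc[part],
            'y': city_regions['Region'].apply(number_regions).iloc[part],
            'city_names': city_regions[['City', 'State']].iloc[part].copy(),
        })
    return splits[0], splits[1], splits[2]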