def test_functional_equality(self): """ `extract_relevant_features` should be equivalent to running first `extract_features` with impute and `select_features` afterwards. Meaning it should produce the same relevant features and the values of these features should be identical. :return: """ df, y = self.create_test_data_sample_with_target() relevant_features = extract_relevant_features(df, y, column_id='id', column_value='val', column_kind='kind', column_sort='sort') extraction_settings = FeatureExtractionSettings() extraction_settings.IMPUTE = impute extracted_features = extract_features( df, feature_extraction_settings=extraction_settings, column_id='id', column_value='val', column_kind='kind', column_sort='sort') selected_features = select_features(extracted_features, y) self.assertEqual( set(relevant_features.columns), set(selected_features.columns), "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format( relevant_features.columns, selected_features.columns)) self.assertTrue( (relevant_features.values == selected_features.values).all().all(), "Should calculate the same feature values")
def __init__(self, evaluate_only_added_features=True, feature_selection_settings=None,
             feature_extraction_settings=None, column_id=None, column_sort=None, column_kind=None,
             column_value=None, timeseries_container=None):
    """
    Create a new RelevantFeatureAugmenter instance.

    :param evaluate_only_added_features: Whether to touch the manually-created features during
        feature selection or not.
    :type evaluate_only_added_features: bool

    :param feature_selection_settings: The feature selection settings. Leave empty to use the default ones.
    :type feature_selection_settings: tsfresh.feature_selection.settings.FeatureSelectionSettings

    :param feature_extraction_settings: The feature extraction settings. Leave empty to use the default ones.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The column with the id. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_id: basestring

    :param column_sort: The column with the sort data. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_sort: basestring

    :param column_kind: The column with the kind data. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_kind: basestring

    :param column_value: The column with the values. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_value: basestring

    :param timeseries_container: The time series to extract the features from. Can also be set later
        with `set_timeseries_container`.
    """
    # We require imputation, so make sure IMPUTE is always set.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()

    # Range imputation is our default imputation strategy
    feature_extraction_settings.IMPUTE = impute_dataframe_range

    self.feature_extractor = FeatureAugmenter(feature_extraction_settings,
                                              column_id, column_sort, column_kind, column_value)
    self.feature_selector = FeatureSelector(feature_selection_settings)

    self.evaluate_only_added_features = evaluate_only_added_features
    self.timeseries_container = timeseries_container
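# Hedged usage sketch (illustrative only; `df_ts`, `X` and `y` are hypothetical placeholders): the
# augmenter is meant to be dropped into a scikit-learn Pipeline, with the raw time series attached
# via set_params, mirroring the commented-out pipeline further down in this document.
#
#     from sklearn.pipeline import Pipeline
#     from sklearn.tree import DecisionTreeClassifier
#
#     pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
#                          ('classifier', DecisionTreeClassifier())])
#     pipeline.set_params(augmenter__timeseries_container=df_ts)
#     pipeline.fit(X, y)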
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series or
    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    Here the time series are given as a flat DataFrame, so passing a column_kind is not allowed;
    apart from that, the same rules for leaving out the optional columns apply.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for,
        or a dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """
    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id,
                                                                       column_sort, column_kind, column_value)

    # Use the standard settings if the user did not supply any.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Extract the time series features for every type of time series and concatenate them together.
    all_possible_unique_id_values = set(id_value for kind, df in kind_to_df_map.items()
                                        for id_value in df[column_id])
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    pool = Pool(feature_extraction_settings.n_processes)
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=feature_extraction_settings)
    extracted_features = pool.map(partial_extract_features_for_one_time_series, kind_to_df_map.items())
    # Shut down the worker pool once all kinds have been processed
    pool.close()
    pool.join()

    # Add time series features to result
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer',
                       join_axes=[df_with_ids.index]).astype(np.float64)

    # Impute the result if requested
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
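# Hedged usage sketch (illustrative only; `df` is a hypothetical time series DataFrame): the IMPUTE
# hook checked above is simply a callable stored on the settings object. Assigning e.g. the impute
# helper (assumed to live in tsfresh.utilities.dataframe_functions) makes extract_features return a
# feature matrix without NaN or infinite values:
#
#     from tsfresh.utilities.dataframe_functions import impute
#
#     settings = FeatureExtractionSettings()
#     settings.IMPUTE = impute
#     X = extract_features(df, feature_extraction_settings=settings,
#                          column_id='id', column_sort='time')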
def run(filename='data/clean_data.csv', city_regions_file='data/CityRegions.csv',
        load_from_file=True, grid_search=False, baseline=False):
    if city_regions_file is None:
        temp = [['Abiline', 'Texas', 'South'],
                ['West Jordon', 'Utah', 'West'],
                ['Yonkers', 'New York', 'Northeast']]
        city_regions = pd.DataFrame(temp, columns=['City', 'State', 'Region'])
    else:
        city_regions = pd.read_csv(city_regions_file, header=0).reset_index(drop=True)

    FEATURE_EXTRACTION = 'data/data_with_features.csv'
    if not os.path.isfile(FEATURE_EXTRACTION):
        df = pd.read_csv(filename, header=0)
        df.dropna(inplace=True)
        X_labels = ['City', 'State', 'dt', 'AverageTemperature', 'CityIndex']
        df = df[X_labels]
        df = df.dropna()
        # city_state = df[['City', 'State']]
        # CityIndex is needed because multiple cities share the same name
        # df['CityIndex'] = city_state.apply(number_cities, axis=1)
        # df.to_csv('data/clean_data.csv', index=False)

        orig_cities = city_regions[['City', 'State']]
        print "Total cities ", len(orig_cities)

        y_regions = city_regions['Region']
        y_regions = y_regions.apply(number_regions)

        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute
        feat_extractor = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex',
                                          column_sort='dt', column_value='AverageTemperature')
        empty_df = pd.DataFrame(index=y_regions.index)
        feat_extractor.set_timeseries_container(df)
        output = feat_extractor.fit_transform(empty_df, y_regions)

        output['City'] = city_regions['City']
        output['State'] = city_regions['State']
        output['Region'] = city_regions['Region']
        output.to_csv(FEATURE_EXTRACTION, index=False)
    else:
        output = pd.read_csv(FEATURE_EXTRACTION)

    output = output.drop(['City', 'State', 'Region'], axis=1)
    if baseline:
        output = output['AverageTemperature__mean'].to_frame()

    train, test, validation = split_data(output, city_regions)

    """
    aug = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex', column_sort='dt',
                           column_value='AverageTemperature', timeseries_container=train['df'])
    output = aug.fit_transform(train['X'], train['y'])
    output['City_Name'] = train['city_names']
    output.to_csv('data/features_from_tsfresh.csv', index=False)
    """

    if load_from_file:
        clf = joblib.load('./model.joblib.pkl')
    else:
        clf = DecisionTreeClassifier(criterion='entropy', max_features=None, min_samples_split=0.1,
                                     max_depth=50, class_weight=None)
        # feat_extractor = RelevantFeatureAugmenter(column_id='CityIndex', column_sort='dt',
        #                                           column_value='AverageTemperature')
        # For the fit on the training set, we set the fresh__timeseries_container to `df_train`
        if grid_search and not baseline:
            grid = {'max_features': [2, 10, 20, 30, 50, 100, 200, None],
                    'max_depth': [1, 25, 50, 100],
                    'class_weight': [None, 'balanced'],
                    'min_samples_split': [0.1, 0.25, 0.75, 1.0]}
            scorer = metrics.make_scorer(metrics.accuracy_score)
            clf = GridSearchCV(clf, grid, scoring=scorer, n_jobs=multiprocessing.cpu_count())
        clf.fit(train['X'], train['y'])
        # pipeline.set_params(augmenter__timeseries_container=train['df'])
        # pipeline.fit(train['X'], train['y'])

        y_pred = pd.Series(clf.predict(train['X']))
        y_true = pd.Series(np.array(train['y']))
        result = train['city_names']
        result.reset_index(drop=True, inplace=True)
        result['Orig'] = y_true
        result['Pred'] = y_pred
        result['Correct'] = y_true == y_pred
        result.to_csv('data/results_train.csv', index=False)

        if grid_search and not baseline:
            print "Best Parameters found from grid search: "
            print clf.best_params_
        print "train accuracy ", accuracy_score(y_true, y_pred)
        cm_train = confusion_matrix(y_true, y_pred)
        print "Confusion matrix for training\n", cm_train
        joblib.dump(clf, './model.joblib.pkl')

    # For the predict on the test set, we set the fresh__timeseries_container to `df_test`
    y_pred = pd.Series(clf.predict(test['X']))
    y_true = pd.Series(np.array(test['y']))
    result = test['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = y_true == y_pred
    result.to_csv('data/results_test.csv', index=False)

    print "test accuracy ", accuracy_score(y_true, y_pred)
    cm_test = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for testing\n", cm_test

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    if not load_from_file:
        plot_confusion_matrix(cm_train, class_names)
        plt.tight_layout()
        plt.savefig('train_cm.png')
        plt.hold(False)
        plot_confusion_matrix(cm_test, class_names)
        plt.tight_layout()
        plt.savefig('test_cm.png')

    if not load_from_file and not grid_search:
        features = output.columns.values
        importances = clf.feature_importances_
        with open("tree_viz.dot", "w") as f:
            tree.export_graphviz(clf, out_file=f)

        top_n = 20
        ndx = np.argsort(importances)[::-1]
        sorted_features = features[ndx][:top_n]
        sorted_importances = importances[ndx][:top_n]
        print '%80s & %s' % ('Feature', 'Importance')
        for f, i in zip(sorted_features, sorted_importances):
            print '%s & %.2f \\\\' % (f[20:], i)

    y_pred = pd.Series(clf.predict(validation['X']))
    y_true = pd.Series(np.array(validation['y']))
    result = validation['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = y_true == y_pred
    result.to_csv('data/results_val.csv', index=False)

    print "validation accuracy ", accuracy_score(y_true, y_pred)
    cm_val = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for validation\n", cm_val
    print "done"

    plt.hold(False)
    plot_confusion_matrix(cm_val, class_names)
    plt.tight_layout()
    plt.savefig('val_cm.png')
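# Hedged invocation sketch (assumption: the default data files exist under data/): a typical way to
# retrain the model from scratch and run the grid search over the decision tree parameters would be
#
#     run(load_from_file=False, grid_search=True)
#
# while run() with the defaults simply evaluates the model stored in ./model.joblib.pkl.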
print X.shape
print X_empty.shape

"""
pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                     ('classifier', DecisionTreeClassifier())])
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)
quit()
"""

print y_regions.shape

feature_extraction_settings = FeatureExtractionSettings()
feature_extraction_settings.IMPUTE = impute
pipeline = Pipeline([('augmenter', FeatureAugmenter(feature_extraction_settings, column_id='City',
                                                    column_sort='dt',
                                                    column_value='AverageTemperature')),
                     ('classifier', DecisionTreeClassifier(criterion='entropy'))])
pipeline.set_params(augmenter__timeseries_container=X_train)
pipeline.fit(X_empty, y_regions)

"""
aug = RelevantFeatureAugmenter(column_id='City', column_sort='dt', column_value="AverageTemperature",
                               timeseries_container=X_train)
new_X = aug.fit_transform(X_empty, y_regions)
clf = DecisionTreeClassifier(criterion='entropy')
"""

y_pred = pipeline.predict(X_empty)
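# Hedged sketch (X_test and X_empty_test are hypothetical counterparts of the objects above): to
# score unseen cities, the same pipeline is pointed at the test time series first and then asked to
# predict on the corresponding (empty) design matrix, as the comments in run() describe.
#
#     pipeline.set_params(augmenter__timeseries_container=X_test)
#     y_pred_test = pipeline.predict(X_empty_test)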
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series or
    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    Here the time series are given as a flat DataFrame, so passing a column_kind is not allowed;
    apart from that, the same rules for leaving out the optional columns apply.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for,
        or a dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: Either ``'per_sample'`` or ``'per_kind'``, see
        :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
        :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
        :ref:`parallelization-label` for details.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id,
                                                                       column_sort, column_kind, column_value)

    # Use the standard settings if the user did not supply any.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # Choose the parallelization strategy according to a rule of thumb
    if parallelization is None:
        parallelization = 'per_sample' if (feature_extraction_settings.n_processes / 2) > len(kind_to_df_map) \
            else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Calculate the result
    if parallelization == 'per_kind':
        result = _extract_features_parallel_per_kind(kind_to_df_map, feature_extraction_settings,
                                                     column_id, column_value)
    elif parallelization == 'per_sample':
        result = _extract_features_parallel_per_sample(kind_to_df_map, feature_extraction_settings,
                                                       column_id, column_value)
    else:
        raise ValueError("Argument parallelization must be one of: 'per_kind', 'per_sample'")

    # Impute the result if requested
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
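# Hedged usage sketch (illustrative only; `df` is a hypothetical time series DataFrame): the rule of
# thumb above can be overridden by passing the parallelization argument explicitly, e.g. when there
# are many kinds of time series but only a few ids:
#
#     X = extract_features(df, column_id='id', column_sort='time',
#                          column_kind='kind', parallelization='per_kind')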