def test_functional_equality(self):
        """
        Check that `extract_relevant_features` matches the manual two-step pipeline.

        `extract_relevant_features` should be equivalent to running `extract_features` with an imputing
        function configured first and `select_features` afterwards: both ways must produce the same relevant
        feature columns, and the values of these features must be identical.
        """
        df, y = self.create_test_data_sample_with_target()

        relevant_features = extract_relevant_features(df,
                                                      y,
                                                      column_id='id',
                                                      column_value='val',
                                                      column_kind='kind',
                                                      column_sort='sort')

        extraction_settings = FeatureExtractionSettings()
        extraction_settings.IMPUTE = impute
        extracted_features = extract_features(
            df,
            feature_extraction_settings=extraction_settings,
            column_id='id',
            column_value='val',
            column_kind='kind',
            column_sort='sort')
        selected_features = select_features(extracted_features, y)

        self.assertEqual(
            set(relevant_features.columns), set(selected_features.columns),
            "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
                relevant_features.columns, selected_features.columns))

        # The assertion above only guarantees the same *set* of columns. Bring both frames into the same
        # column order by name, otherwise the positional comparison below could compare different features.
        aligned_selected_features = selected_features[relevant_features.columns]

        # A shape mismatch would make the element-wise comparison meaningless, so report it explicitly.
        self.assertEqual(
            relevant_features.shape, aligned_selected_features.shape,
            "Should produce feature matrices of the same shape")
        self.assertTrue(
            (relevant_features.values == aligned_selected_features.values).all(),
            "Should calculate the same feature values")
Exemplo n.º 2
0
    def __init__(self,
                 evaluate_only_added_features=True,
                 feature_selection_settings=None,
                 feature_extraction_settings=None,
                 column_id=None,
                 column_sort=None,
                 column_kind=None,
                 column_value=None,
                 timeseries_container=None):
        """
        Create a new RelevantFeatureAugmenter instance.

        The instance wraps a :class:`FeatureAugmenter` (stored as ``feature_extractor``) and a
        :class:`FeatureSelector` (stored as ``feature_selector``).

        Note that ``IMPUTE`` of the extraction settings is always set to ``impute_dataframe_range``. This also
        happens on a settings object handed in by the caller, which is modified in place.

        :param evaluate_only_added_features: Whether to touch the manually-created features during feature
                                             selection or not.
        :type evaluate_only_added_features: bool
        :param feature_selection_settings: The feature selection settings, passed on to the feature selector.
        :type feature_selection_settings: tsfresh.feature_selection.settings.FeatureSelectionSettings
        :param feature_extraction_settings: The feature extraction settings. Leave empty to use a freshly
                                            created ``FeatureExtractionSettings`` object.
        :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings
        :param column_id: The column with the id. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_id: basestring
        :param column_sort: The column with the sort data. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_sort: basestring
        :param column_kind: The column with the kind data. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_kind: basestring
        :param column_value: The column with the values. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_value: basestring
        :param timeseries_container: Stored unchanged on the instance as ``timeseries_container``.
        """

        # Imputing is mandatory here, so make sure there is a settings object to configure.
        extraction_settings = (FeatureExtractionSettings()
                               if feature_extraction_settings is None
                               else feature_extraction_settings)

        # The range based imputing is the strategy used by this class.
        extraction_settings.IMPUTE = impute_dataframe_range

        augmenter_arguments = (extraction_settings,
                               column_id, column_sort,
                               column_kind, column_value)
        self.feature_extractor = FeatureAugmenter(*augmenter_arguments)
        self.feature_selector = FeatureSelector(feature_selection_settings)

        self.evaluate_only_added_features = evaluate_only_added_features
        self.timeseries_container = timeseries_container
Exemplo n.º 3
0
def extract_features(timeseries_container,
                     feature_extraction_settings=None,
                     column_id=None,
                     column_sort=None,
                     column_kind=None,
                     column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    The features of the different kinds of time series are calculated in a pool of
    ``feature_extraction_settings.n_processes`` worker processes. The pool is shut down again before the
    function returns or raises.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated. If left
            empty, a new settings object with the default parameters for every kind of time series is used.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_sort: The name of the sort column.
    :type column_sort: str
    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id, column_sort,
                                                                       column_kind, column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Collect every id occurring in any kind of time series; they form the index of the result.
    all_possible_unique_id_values = {id_value
                                     for df in kind_to_df_map.values()
                                     for id_value in df[column_id]}
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    # Extract the time series features for every type of time series in parallel.
    partial_extract_features_for_one_time_series = partial(
        _extract_features_for_one_time_series,
        column_id=column_id,
        column_value=column_value,
        settings=feature_extraction_settings)

    pool = Pool(feature_extraction_settings.n_processes)
    try:
        extracted_features = pool.map(partial_extract_features_for_one_time_series,
                                      kind_to_df_map.items())
    finally:
        # Release the worker processes in every case. Without this each call leaves its workers behind.
        pool.close()
        pool.join()

    # Add time series features to result
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer', join_axes=[df_with_ids.index])\
        .astype(np.float64)

    # Impute the result if requested. The return value of the callable is ignored.
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(
            profiler,
            filename=feature_extraction_settings.PROFILING_FILENAME,
            sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
Exemplo n.º 4
0
def run(filename='data/clean_data.csv', city_regions_file='data/CityRegions.csv', load_from_file=True, grid_search=False, baseline=False):
    if city_regions_file == None:
        temp = [['Abiline', 'Texas','South'],['West Jordon', 'Utah', 'West' ], ['Yonkers','New York', 'Northeast']]
        city_regions = pd.DataFrame(temp, columns=['City', 'State','Region'])
    else:
        city_regions = pd.read_csv(city_regions_file, header=0).reset_index(drop=True)

    FEATURE_EXTRACTION='data/data_with_features.csv'
    if not os.path.isfile(FEATURE_EXTRACTION):
        df = pd.read_csv(filename, header=0)
        df.dropna(inplace=True)

        X_labels = ['City', 'State', 'dt', 'AverageTemperature', 'CityIndex']
        df = df[X_labels]
        df = df.dropna()
        #city_state = df[['City', 'State']]
        # Sadness because multiple cities with same name.......
        #df['CityIndex'] = city_state.apply(number_cities, axis=1)
        #df.to_csv('data/clean_data.csv', index=False)

        orig_cities = city_regions[['City','State']]
        print "Total cities ", len(orig_cities)
        y_regions = city_regions['Region']
        y_regions = y_regions.apply(number_regions)

        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute
        feat_extractor = FeatureAugmenter(feature_extraction_settings,
                                          column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')

        empty_df = pd.DataFrame(index=y_regions.index)
        feat_extractor.set_timeseries_container(df)
        output = feat_extractor.fit_transform(empty_df,y_regions)
        output['City'] = city_regions['City']
        output['State'] = city_regions['State']
        output['Region'] = city_regions['Region']

        output.to_csv(FEATURE_EXTRACTION, index=False)
    else:
        output = pd.read_csv(FEATURE_EXTRACTION)

    output = output.drop(['City', 'State', 'Region'], axis=1)

    if baseline:
        output = output['AverageTemperature__mean'].to_frame()

    train, test, validation = split_data(output, city_regions)

    """
    aug = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex',
                    column_sort='dt', column_value='AverageTemperature',
                    timeseries_container=train['df'])
    output = aug.fit_transform(train['X'], train['y'])
    output['City_Name'] = train['city_names']
    output.to_csv('data/features_from_tsfresh.csv', index=False)
    """
    if load_from_file:
        clf = joblib.load('./model.joblib.pkl')
    else:
        clf = DecisionTreeClassifier(criterion='entropy', max_features=None,
                                     min_samples_split=0.1, max_depth=50, class_weight=None)
        # feat_extractor = RelevantFeatureAugmenter(column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')

        # for the fit on the train test set, we set the fresh__timeseries_container to `df_train`
        if grid_search and not baseline:
            grid = {'max_features': [2, 10, 20, 30, 50, 100, 200, None],
                    'max_depth': [1, 25, 50, 100],
                    'class_weight': [None, 'balanced'],
                    'min_samples_split': [0.1, 0.25, 0.75, 1.0]}
            scorer = metrics.make_scorer(partial(metrics.accuracy_score))
            clf = GridSearchCV(clf, grid, scoring=scorer, n_jobs=multiprocessing.cpu_count())

        clf.fit(train['X'], train['y'])
        # pipeline.set_params(augmenter__timeseries_container=train['df'])
        # pipeline.fit(train['X'], train['y'])

        y_pred = pd.Series(clf.predict(train['X']))
        y_true = pd.Series(np.array(train['y']))
        result = train['city_names']
        result.reset_index(drop=True, inplace=True)
        result['Orig'] = y_true
        result['Pred'] = y_pred
        wrongs = y_true == y_pred
        result['Correct'] = wrongs
        result.to_csv('data/results_train.csv', index=False)
        

        if grid_search and not baseline:
            print "Best Parameters found from grid search: "
            print clf.best_params_

        print "train accuracy ", accuracy_score(y_true, y_pred)
        cm_train = confusion_matrix(y_true, y_pred)
        print "Confusion matrix for training\n", cm_train
        # for the predict on the test test set, we set the fresh__timeseries_container to `df_test`
        joblib.dump(clf, './model.joblib.pkl')
    #### ENDIF

    y_pred = pd.Series(clf.predict(test['X']))
    y_true = pd.Series(np.array(test['y']))
    result = test['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    wrongs = y_true == y_pred
    result['Correct'] = wrongs
    result.to_csv('data/results_test.csv', index=False)
    
    print "test accuracy ", accuracy_score(y_true, y_pred)
    cm_test = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for testing\n", cm_test

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    if not load_from_file:
        plot_confusion_matrix(cm_train, class_names)
        plt.tight_layout()
        plt.savefig('train_cm.png')
    plt.hold(False)
    plot_confusion_matrix(cm_test, class_names)
    plt.tight_layout()
    plt.savefig('test_cm.png')

    if not load_from_file and not grid_search:
        features = output.columns.values
        importances = clf.feature_importances_
        with open("tree_viz.dot", "w") as f:
            f = tree.export_graphviz(clf, out_file=f)
        top_n = 20
        ndx = np.argsort(importances)[::-1]
        sorted_features = features[ndx][:20]
        sorted_importances = importances[ndx][:20]
        print '%80s & %s' %('Feature', 'Importance')
        for f, i in zip(sorted_features, sorted_importances):
            # print '%80s & %.2f \\\\' % (f[20:], i)
            print '%s & %.2f \\\\' % (f[20:], i)

    y_pred = clf.predict(validation['X'])
    y_true = np.array(validation['y'])

    y_pred = pd.Series(clf.predict(validation['X']))
    y_true = pd.Series(np.array(validation['y']))
    result = validation['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    wrongs = y_true == y_pred
    result['Correct'] = wrongs
    result.to_csv('data/results_val.csv', index=False)
    

    print "validation accuracy ", accuracy_score(y_true, y_pred)
    cm_val = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for validation\n", cm_val
    print "done"

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    plt.hold(False)
    plot_confusion_matrix(cm_val, class_names)
    plt.tight_layout()
    plt.savefig('val_cm.png')
Exemplo n.º 5
0
# Print the shapes of X and X_empty (both are defined outside of this excerpt).
print X.shape
print X_empty.shape

# The string below is disabled code: a pipeline variant using RelevantFeatureAugmenter.
"""
pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                ('classifier', DecisionTreeClassifier())])
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)
quit()

"""


print y_regions.shape
# Extraction settings with `impute` configured as the IMPUTE callable.
feature_extraction_settings = FeatureExtractionSettings()
feature_extraction_settings.IMPUTE = impute
# Two-step pipeline: a FeatureAugmenter step named 'augmenter', followed by a decision tree
# classifier (entropy criterion) named 'classifier'.
pipeline = Pipeline([('augmenter', FeatureAugmenter(feature_extraction_settings, column_id='City', column_sort='dt', column_value='AverageTemperature')),
                ('classifier', DecisionTreeClassifier(criterion='entropy'))])

# Hand X_train to the augmenter step as its timeseries_container parameter, then fit the pipeline.
pipeline.set_params(augmenter__timeseries_container=X_train)
pipeline.fit(X_empty, y_regions)

# The string below is disabled code: using RelevantFeatureAugmenter directly instead of the pipeline.
"""
aug = RelevantFeatureAugmenter(column_id='City', column_sort='dt', column_value="AverageTemperature", timeseries_container=X_train)
new_X = aug.fit_transform(X_empty, y_regions)

clf = DecisionTreeClassifier(criterion='entropy')
"""


# NOTE(review): the prediction runs on the same X_empty the pipeline was fitted with above,
# so this is not an evaluation on held-out data; confirm this is intended.
y_pred = pipeline.predict(X_empty)
Exemplo n.º 6
0
def extract_features(timeseries_container,
                     feature_extraction_settings=None,
                     column_id=None,
                     column_sort=None,
                     column_kind=None,
                     column_value=None,
                     parallelization=None):
    """
    Calculate the time series features for the time series given as either

    * one :class:`pandas.DataFrame` holding the different time series

    or

    * a dictionary of :class:`pandas.DataFrame`, each holding one type of time series.

    The calculated features are returned as a :class:`pandas.DataFrame` in both cases.

    Which features are calculated with which parameters is controlled by the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class, please see there for a
    list of all calculated time series features.

    The different parameters and data formats are explained in detail in :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated. If left
            empty, a new settings object with the default parameters for every kind of time series is used.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: Either ``'per_sample'`` or ``'per_kind'``, see
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
                            :ref:`parallelization-label` for details. If left empty, it is chosen by comparing
                            half the number of processes with the number of kinds of time series.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    :raises ValueError: if `parallelization` is neither None nor one of the two supported values.
    """
    # Bring the input into the standardized internal representation first.
    # The function normalize_input_to_internal_representation explains the details.
    normalized_input = dataframe_functions.normalize_input_to_internal_representation(
        timeseries_container, column_id, column_sort, column_kind, column_value)
    kind_to_df_map, column_id, column_value = normalized_input

    # Without settings from the user, fall back to the defaults for each kind of time series.
    settings = feature_extraction_settings
    if settings is None:
        settings = FeatureExtractionSettings()
        for kind in kind_to_df_map:
            settings.set_default_parameters(kind)

    # Rule-of-thumb for picking the parallelization if the user did not choose one.
    if parallelization is None:
        if (settings.n_processes / 2) > len(kind_to_df_map):
            parallelization = 'per_sample'
        else:
            parallelization = 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # Profiling is an advanced feature and only started on request.
    if settings.PROFILING:
        profiler = profiling.start_profiling()

    # Select the implementation matching the requested parallelization and run it.
    if parallelization == 'per_sample':
        extract_in_parallel = _extract_features_parallel_per_sample
    elif parallelization == 'per_kind':
        extract_in_parallel = _extract_features_parallel_per_kind
    else:
        raise ValueError(
            "Argument parallelization must be one of: 'per_kind', 'per_sample'")
    result = extract_in_parallel(kind_to_df_map, settings, column_id, column_value)

    # Imputing only happens if an imputing function is configured.
    if settings.IMPUTE is not None:
        settings.IMPUTE(result)

    # Stop the profiling again, if it was started above.
    if settings.PROFILING:
        profiling.end_profiling(profiler,
                                filename=settings.PROFILING_FILENAME,
                                sorting=settings.PROFILING_SORTING)

    return result