def _extract_features_per_kind(kind_to_df_map, settings, column_id, column_value, serial=False):
    """
    Parallelize the feature extraction per kind.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame
    :param settings: settings object that controls which features are calculated
    :type settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings
    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str
    :param serial: Do not parallelize the extraction. This can be handy if (1) you want to debug something,
        (2) you want to profile something, or (3) your environment does not support multiprocessing.
    :type serial: bool
    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=settings)

    # Only spin up a worker pool when we actually parallelize; this keeps the serial path usable in
    # environments without multiprocessing support.
    if serial:
        pool = None
        map_function = map
    else:
        pool = Pool(settings.n_processes)
        chunksize = helper_functions.calculate_best_chunksize(kind_to_df_map, settings)
        map_function = partial(pool.imap_unordered, chunksize=chunksize)

    total_number_of_expected_results = len(kind_to_df_map)

    extracted_features = tqdm(map_function(partial_extract_features_for_one_time_series,
                                           kind_to_df_map.items()),
                              total=total_number_of_expected_results,
                              desc="Feature Extraction", disable=settings.disable_progressbar)

    if pool is not None:
        pool.close()

    # Concatenate all partial results
    result = pd.concat(extracted_features, axis=1, join='outer').astype(np.float64)

    # Impute the result if requested
    if settings.IMPUTE is not None:
        settings.IMPUTE(result)

    if pool is not None:
        pool.join()

    return result
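# --- Usage sketch (illustrative only, not part of the library) ------------------------------------
# The helper below shows the input format _extract_features_per_kind expects: a dict mapping each
# time series kind to a flat DataFrame with an id column and a value column. The column names "id"
# and "value", the toy data and the default FeatureExtractionSettings are assumptions made purely
# for demonstration; serial=True is used so the sketch also runs without a worker pool.
def _example_extract_features_per_kind():
    import pandas as pd
    from tsfresh.feature_extraction.settings import FeatureExtractionSettings

    kind_to_df_map = {
        "temperature": pd.DataFrame({"id": [1, 1, 2, 2], "value": [0.1, 0.2, 0.3, 0.4]}),
        "pressure": pd.DataFrame({"id": [1, 1, 2, 2], "value": [1.0, 1.1, 0.9, 1.2]}),
    }
    settings = FeatureExtractionSettings()

    # Expected result: one row per id, one column per extracted feature.
    return _extract_features_per_kind(kind_to_df_map, settings,
                                      column_id="id", column_value="value", serial=True)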
def _extract_features_parallel_per_sample(kind_to_df_map, settings, column_id, column_value):
    """
    Parallelize the feature extraction per kind and per sample.

    As the splitting of the dataframes per kind along column_id is quite costly, we settled for an
    asynchronous map in this function. The result objects are temporarily stored in a FIFO queue from
    which they can be retrieved in order of submission.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame
    :param settings: settings object that controls which features are calculated
    :type settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings
    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str
    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=settings)
    pool = Pool(settings.n_processes)

    total_number_of_expected_results = 0

    # Submit map jobs per kind per sample
    results_fifo = Queue()
    for kind, df_kind in kind_to_df_map.items():
        df_grouped_by_id = df_kind.groupby(column_id)
        total_number_of_expected_results += len(df_grouped_by_id)

        chunksize = helper_functions.calculate_best_chunksize(df_grouped_by_id, settings)

        results_fifo.put(
            pool.imap_unordered(
                partial_extract_features_for_one_time_series,
                [(kind, df_group) for _, df_group in df_grouped_by_id],
                chunksize=chunksize
            )
        )

    pool.close()

    # Wait for the jobs to complete and concatenate the partial results,
    # updating a progress bar as the results come in.
    with tqdm(total=total_number_of_expected_results, desc="Feature Extraction",
              disable=settings.disable_progressbar) as progress_bar:

        # We need some way to notice when a new result arrives, so we wrap the map results into
        # another iterable which updates the progress bar each time a new result is ready.
        def iterable_with_tqdm_update(queue, progress_bar):
            for element in queue:
                progress_bar.update(1)
                yield element

        result = pd.DataFrame()
        while not results_fifo.empty():
            map_result = results_fifo.get()
            dfs_kind = iterable_with_tqdm_update(map_result, progress_bar)
            df_tmp = pd.concat(dfs_kind, axis=0).astype(np.float64)
            result = pd.concat([result, df_tmp], axis=1).astype(np.float64)

    # Impute the result if requested
    if settings.IMPUTE is not None:
        settings.IMPUTE(result)

    pool.join()

    return result
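# --- Work-item sketch (illustrative only, not used by the library) --------------------------------
# The per-sample variant above fans out one task per (kind, id) group instead of one task per kind,
# which is why splitting along column_id is the costly part. The helper below only mirrors that
# grouping step to make the shape of the submitted work list explicit; the default column name "id"
# is an assumption for demonstration.
def _example_per_sample_work_items(kind_to_df_map, column_id="id"):
    work_items = []
    for kind, df_kind in kind_to_df_map.items():
        # One (kind, single-sample DataFrame) tuple per distinct value of column_id.
        for _, df_group in df_kind.groupby(column_id):
            work_items.append((kind, df_group))
    return work_items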
def check_fs_sig_bh(X, y, settings=None):
    """
    The wrapper function that calls the significance test functions in this package.
    In total, for each feature from the input pandas.DataFrame a univariate feature significance test
    is conducted. Those tests generate p values that are then evaluated by the Benjamini Hochberg
    procedure to decide which features to keep and which to delete.

    We are testing

        :math:`H_0` = the Feature is not relevant and cannot be added

    against

        :math:`H_1` = the Feature is relevant and should be kept

    or in other words

        :math:`H_0` = Target and Feature are independent / the Feature has no influence on the target

        :math:`H_1` = Target and Feature are associated / dependent

    When the target is binary this becomes

        :math:`H_0 = \\left( F_{\\text{target}=1} = F_{\\text{target}=0} \\right)`

        :math:`H_1 = \\left( F_{\\text{target}=1} \\neq F_{\\text{target}=0} \\right)`

    where :math:`F` is the distribution of the feature.

    In the same way we can state the hypotheses when the feature is binary

        :math:`H_0 = \\left( T_{\\text{feature}=1} = T_{\\text{feature}=0} \\right)`

        :math:`H_1 = \\left( T_{\\text{feature}=1} \\neq T_{\\text{feature}=0} \\right)`

    Here :math:`T` is the distribution of the target.

    TODO: And for real valued?

    :param X: The DataFrame containing all the features and the target
    :type X: pandas.DataFrame
    :param y: The target vector
    :type y: pandas.Series
    :param settings: The feature selection settings to use for performing the tests.
    :type settings: FeatureSignificanceTestsSettings
    :return: A pandas.DataFrame with each column of the input DataFrame X as index with information on the
             significance of this particular feature. The DataFrame has the columns "Feature",
             "type" (binary, real or const), "p_value" (the significance of this feature as a p-value,
             lower means more significant) and "rejected" (if the Benjamini Hochberg procedure rejected
             this feature).
    :rtype: pandas.DataFrame
    """
    if settings is None:
        settings = FeatureSignificanceTestsSettings()

    target_is_binary = len(set(y)) == 2

    # TODO: Solve the multi-classification case. For more than two classes the algorithm currently treats
    # the target as a regression target. Instead one could perform a binary one-vs-all classification.

    # Only allow entries for which the target is known! Drop the corresponding target entries as well,
    # so that features and target stay aligned.
    y = y.astype(np.float64)
    target_is_known = ~y.isnull()
    y = y[target_is_known]
    X = X.copy().loc[target_is_known, :]

    # Create the DataFrame df_features containing the information about the different hypotheses
    # Every row contains information over one feature column from X
    df_features = pd.DataFrame()
    df_features['Feature'] = list(set(X.columns))
    df_features = df_features.set_index('Feature', drop=False)

    # Add relevant columns to df_features
    df_features["rejected"] = np.nan
    df_features["type"] = np.nan
    df_features["p_value"] = np.nan

    # Calculate the feature significance in parallel
    pool = Pool(settings.n_processes)

    # Helper function which wraps the _calculate_p_value with many arguments already set
    f = partial(_calculate_p_value, y=y, settings=settings, target_is_binary=target_is_binary)

    chunksize = helper_functions.calculate_best_chunksize(df_features, settings)
    total_number_of_features = len(df_features)

    results = tqdm(pool.imap_unordered(f, [X[feature] for feature in df_features['Feature']],
                                       chunksize=chunksize),
                   total=total_number_of_features, desc="Feature Selection")

    p_values_of_features = pd.DataFrame(list(results))
    df_features.update(p_values_of_features)

    pool.close()
    pool.join()

    # Perform the real feature rejection
    if "const" in set(df_features.type):
        df_features_bh = benjamini_hochberg_test(df_features.loc[~(df_features.type == "const")], settings)
        df_features = pd.concat([df_features_bh, df_features.loc[df_features.type == "const"]])
    else:
        df_features = benjamini_hochberg_test(df_features, settings)

    # It is very important that we have a boolean "rejected" column, so we do a cast here to be sure
    df_features["rejected"] = df_features["rejected"].astype("bool")

    if settings.write_selection_report:
        # Write the results of the BH test to file
        if not os.path.exists(settings.result_dir):
            os.mkdir(settings.result_dir)

        with open(os.path.join(settings.result_dir, "fs_bh_results.txt"), 'w') as file_out:
            file_out.write(("Performed BH Test to control the false discovery rate (FDR);\n"
                            "FDR-Level={0}; Hypotheses independent={1}\n"
                            ).format(settings.fdr_level, settings.hypotheses_independent))
            df_features.to_csv(index=False, path_or_buf=file_out, sep=';', float_format='%.4f')

    return df_features
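# --- Usage sketch (illustrative only, not part of the library) ------------------------------------
# Minimal example of calling check_fs_sig_bh on a small feature matrix with a binary target. The
# feature names, the random toy data and the tweaked FeatureSignificanceTestsSettings below are
# assumptions made for demonstration; the returned DataFrame has one row per feature with its
# "type", "p_value" and whether the Benjamini Hochberg procedure "rejected" the null hypothesis.
def _example_check_fs_sig_bh():
    rng = np.random.RandomState(42)
    y = pd.Series(rng.randint(0, 2, size=100))
    X = pd.DataFrame({
        "informative": y + rng.normal(scale=0.1, size=100),  # depends on the target
        "noise": rng.normal(size=100),                       # independent of the target
    })

    settings = FeatureSignificanceTestsSettings()
    settings.write_selection_report = False  # keep the sketch free of file output

    return check_fs_sig_bh(X, y, settings=settings)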