def test_retail_binary(ftens_file='retail_binary_files/ftens.csv',
                       labels_file='retail_binary_files/labels.csv',
                       fl_file='retail_binary_files/fl.p'):
    ftens, labels, fl = construct_retail_example(ftens_file, labels_file, fl_file)
    baseline_ftens = (ftens.reset_index('customer_id', drop=False)
                           .drop_duplicates('customer_id', keep='last')
                           .set_index('customer_id'))
    baseline_ftens, baseline_fl = ft.encode_features(baseline_ftens, fl)
    baseline_ftens, baseline_fl = remove_low_information_features(
        baseline_ftens, baseline_fl)
    train_customers, test_customers = train_test_split(
        baseline_ftens.index.values, shuffle=True, test_size=0.1)
    train_labels = labels.loc[train_customers]
    test_labels = labels.loc[test_customers]
    train_ftens = ftens.loc[(train_customers, slice(None)), :]
    test_ftens = ftens.loc[(test_customers, slice(None)), :]
    baseline_train_fm = baseline_ftens.loc[train_customers, :]
    baseline_test_fm = baseline_ftens.loc[test_customers, :]
    dl_model = DLDB(regression=False,
                    classes=[False, True],
                    recurrent_layer_sizes=(32,),
                    dense_layer_sizes=(32, 32),
                    categorical_max_vocab=10)
    dl_model.fit(train_ftens, train_labels, fl=fl, epochs=1, batch_size=32)
    predictions = dl_model.predict(test_ftens)
    score = roc_auc_score(test_labels, predictions)
    baseline_scores = score_baseline_pipeline(baseline_train_fm, train_labels,
                                              baseline_test_fm, test_labels)
    return score, baseline_scores
def remove_li_features(df):
    """Remove low information features."""
    old_shape = df.shape[1]
    df = selection.remove_low_information_features(df)
    print('Removed features from df: {}'.format(old_shape - df.shape[1]))
    return df
def create_feature_set(data: pd.DataFrame, train_table: str, test_table: str):
    es = create_entity_set(data, train_table, test_table)
    print("\nBeginning automated feature engineering using entity set")
    print(f" MAX_FEATURES={MAX_FEATURES}")
    print(f" MAX_FT_DEPTH={MAX_FT_DEPTH}")
    start = time.monotonic()
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                           target_entity='combined_train_test',
                                           max_depth=MAX_FT_DEPTH,
                                           max_features=MAX_FEATURES,
                                           verbose=True)
    end = time.monotonic()
    print(f"Automated feature engineering completed in {round(end - start)} seconds")
    feature_matrix = selection.remove_low_information_features(feature_matrix)
    print(f" Found {feature_matrix.shape[1]} features")
    train_data: pd.DataFrame = pd.DataFrame(
        feature_matrix[feature_matrix['DATA_SET'] == 0])
    test_data: pd.DataFrame = pd.DataFrame(
        feature_matrix[feature_matrix['DATA_SET'] == 1])
    return train_data, test_data
def autoFeatureEngineering(es, target_entityName):
    from featuretools.selection import remove_low_information_features
    fm, features = ft.dfs(
        entityset=es,
        target_entity=target_entityName,
        # agg_primitives=['Sum', 'Mean', 'Percent_True'],
        trans_primitives=['divide_numeric', 'multiply_numeric'],  # 'add_numeric',
        # trans_primitives=['Hour'],
        max_depth=1,
        # approximate='2m',
        # cutoff_time=cutoff_times[1000:],
        ignore_variables={'toolgkpi': ['MFG_DATE', targetColumn]},
        verbose=True)

    # One-hot encode the feature matrix
    fm_enc, f_enc = ft.encode_features(fm, features)
    # print("Number of features after one-hot encoding: %s" % len(fm_enc))

    # Replace NaN with 0
    fm_enc = fm_enc.fillna(0)
    # print("Number of features after fillna: %s" % len(fm_enc))

    # Remove low information features
    fm_enc = remove_low_information_features(fm_enc)
    # print("Number of features after removal: %s" % len(fm_enc))

    # feature = feature_names[14]
    # ft.graph_feature(feature)
    # ft.describe_feature(feature)

    # Replace +/-inf with NaN (assign the result back, otherwise replace() has no effect)
    fm_enc = fm_enc.replace([np.inf, -np.inf], np.nan)
    print(fm_enc.isnull().sum())
    # print(fm_enc, f_enc)
    print(fm_enc.columns)
    return fm_enc
def _fit_and_return_result(self, *, timeout: float = None, iterations: int = None):
    if self._entityset is None:
        raise ValueError('Must call .set_training_data() before calling .fit()')

    ignore_variables = {self._target_entity: [self._target]}
    time_index = self._entityset[self._target_entity].time_index
    index = self._entityset[self._target_entity].index
    cutoff_time = None
    if time_index:
        target_df = self._entityset[self._target_entity].df
        cutoff_time = target_df[[index, time_index]]
        ignore_variables = None

    features_only = (not self.hyperparams['encode']
                     and not self.hyperparams['remove_low_information'])

    # Primitive names are derived from hyperparameter flags such as
    # 'aggregation_sum' or 'transform_day'.
    agg_primitives = [
        name[len('aggregation_'):] for name, value in self.hyperparams.items()
        if name.startswith('aggregation_') and value
    ]
    trans_primitives = [
        name[len('transform_'):] for name, value in self.hyperparams.items()
        if name.startswith('transform_') and value
    ]

    res = ft.dfs(entityset=self._entityset,
                 target_entity=self._target_entity,
                 cutoff_time=cutoff_time,
                 cutoff_time_in_index=False,
                 features_only=features_only,
                 ignore_variables=ignore_variables,
                 max_depth=self.hyperparams['max_depth'],
                 agg_primitives=agg_primitives,
                 trans_primitives=trans_primitives)

    if not features_only:
        # Unpack first so fm is defined even when only
        # remove_low_information is enabled.
        fm, self._features = res
        if self.hyperparams['encode']:
            fm, self._features = ft.encode_features(
                fm, self._features,
                top_n=self.hyperparams['top_n'],
                include_unknown=self.hyperparams['include_unknown'])
        if self.hyperparams['remove_low_information']:
            fm, self._features = remove_low_information_features(fm, self._features)
        self._fitted = True
        return fm
    else:
        self._fitted = True
        self._features = res
def test_remove_low_information_features(es, feature_matrix):
    features = [Feature(v) for v in es['test'].variables]
    feature_matrix, features = remove_low_information_features(feature_matrix, features)
    assert feature_matrix.shape == (3, 5)
    assert len(features) == 5
    for f in features:
        assert f.get_name() in feature_matrix.columns
    assert 'one_value' not in feature_matrix.columns
    assert 'all_null' not in feature_matrix.columns
def test_remove_low_information_features(test_es, feature_matrix):
    features = [Feature(v) for v in test_es['test'].variables]
    feature_matrix, features = remove_low_information_features(
        feature_matrix, features)
    assert feature_matrix.shape == (3, 5)
    assert len(features) == 5
    for f in features:
        assert f.get_name() in feature_matrix.columns
    assert 'one_value' not in feature_matrix.columns
    assert 'all_null' not in feature_matrix.columns
def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None):
    if not entities and not entityset:
        target_entity = 'X'
    else:
        target_entity = target_entity or self.target_entity

    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)

    if self.training_window is not None:
        entityset.add_last_time_indexes()

    cutoff_time = None
    if self.time_index:
        cutoff_time = X[[self.index, self.time_index]]

    self.features = ft.dfs(
        cutoff_time=cutoff_time,
        max_depth=self.max_depth,
        entityset=entityset,
        target_entity=target_entity,
        features_only=True,
        agg_primitives=self.agg_primitives,
        trans_primitives=self.trans_primitives,
        max_features=self.max_features,
        training_window=self.training_window,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
    )

    if self.encode or self.remove_low_information:
        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )
        if self.encode:
            X, self.features = ft.encode_features(X, self.features)
        if self.remove_low_information:
            X, self.features = remove_low_information_features(X, self.features)
def create_features(es, label='Outcome', custom_agg=[]):
    cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', label]]
    fm, features = ft.dfs(entityset=es,
                          target_entity='transactions',
                          agg_primitives=[Sum, Mean] + custom_agg,
                          trans_primitives=[Hour],
                          max_depth=3,
                          approximate='2m',
                          cutoff_time=cutoff_times,
                          verbose=True)
    fm_enc, _ = ft.encode_features(fm, features)
    fm_enc = fm_enc.fillna(0)
    fm_enc = remove_low_information_features(fm_enc)
    labels = fm.pop(label)
    return (fm_enc, labels)
def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None):
    if not entities and not entityset:
        target_entity = 'X'
    else:
        target_entity = target_entity or self.target_entity

    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)

    instance_ids = None
    cutoff_time = None
    if self.time_index:
        cutoff_time = X[[self.index, self.time_index]]
    elif self.index:
        instance_ids = X[self.index]
    else:
        instance_ids = X.index.values

    self.features = ft.dfs(
        cutoff_time=cutoff_time,
        instance_ids=instance_ids,
        max_depth=self.max_depth,
        entityset=entityset,
        target_entity=target_entity,
        features_only=True,
        agg_primitives=self.agg_primitives,
        trans_primitives=self.trans_primitives
    )

    X = ft.calculate_feature_matrix(
        self.features,
        entityset=entityset,
        cutoff_time=cutoff_time,
        instance_ids=instance_ids,
    )

    if self.encode:
        X, self.features = ft.encode_features(X, self.features)
    if self.remove_low_information:
        X, self.features = remove_low_information_features(X, self.features)
def produce(self, X, instance_ids=None, include_unknown=True,
            remove_low_information=True, **kwargs):
    if instance_ids is not None:
        feature_matrix = ft.calculate_feature_matrix(
            self.features, instance_ids=instance_ids, **kwargs)
        feature_matrix = (feature_matrix.reset_index('time')
                          .loc[instance_ids, :]
                          .set_index('time', append=True))
    else:
        feature_matrix = ft.calculate_feature_matrix(self.features,
                                                     cutoff_time=X,
                                                     **kwargs)

    # Coerce each column to the dtype implied by its feature's variable type.
    for f in self.features:
        if issubclass(f.variable_type, vtypes.Discrete):
            feature_matrix[f.get_name()] = feature_matrix[f.get_name()].astype(object)
        elif issubclass(f.variable_type, vtypes.Numeric):
            feature_matrix[f.get_name()] = pd.to_numeric(feature_matrix[f.get_name()])
        elif issubclass(f.variable_type, vtypes.Datetime):
            feature_matrix[f.get_name()] = pd.to_datetime(feature_matrix[f.get_name()])

    encoded_fm, encoded_fl = ft.encode_features(feature_matrix, self.features)
    if remove_low_information:
        encoded_fm, encoded_fl = remove_low_information_features(encoded_fm, encoded_fl)

    encoded_fm.reset_index('time', drop=True, inplace=True)
    return encoded_fm.fillna(0)
def dfs(self, X=None, target_entity='X', entityset=None, entities=None, relationships=None):
    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)

    target = entityset[target_entity]
    time_index = target.time_index
    index = target.index

    cutoff_time = None
    if time_index:
        cutoff_time = target.df[[index, time_index]]

    instance_ids = X[index].values.copy()

    self.features = ft.dfs(cutoff_time=cutoff_time,
                           max_depth=self.max_depth,
                           entityset=entityset,
                           target_entity=target_entity,
                           features_only=True,
                           instance_ids=instance_ids)

    X = ft.calculate_feature_matrix(self.features,
                                    entityset=entityset,
                                    instance_ids=instance_ids)

    if self.encode:
        X, self.features = ft.encode_features(X, self.features)
    if self.remove_low_information:
        X, self.features = remove_low_information_features(X, self.features)
def feature_tool(df_x):
    """
    :param df_x: df
    :return: feature matrix with up to 80,089 features: 283 + (283/2*283) * 2
    https://danwertheimer.github.io/rapid-model-prototyping-with-deep-feature-synthesis-and-xgboost
    """
    print('start featuretools')

    # Make an entityset and add the entity
    es = ft.EntitySet(id='sp500')
    es = es.entity_from_dataframe(entity_id='sp500',
                                  dataframe=df_x,
                                  make_index=True,
                                  index='index')
    # es.normalize_entity(base_entity_id='sp500',
    #                     new_entity_id='sessions',
    #                     index='session')

    # Aggregation primitives create a single value per group. Others available:
    # 'std', 'min', 'count', 'max', 'mean', 'median', 'mode', 'num_true',
    # 'num_unique', 'sum', 'skew', 'percent_true', 'last', 'trend',
    # 'n_most_common', 'time_since_last', 'avg_time_between'
    primitives_aggregate = [Std, Count]

    primitives_where = ['std', 'min', 'max', 'mean', 'count']

    # Group-by primitives are applied per id, e.g.
    # cum_sum([1, 2, 3, 4, 5]) -> [1, 3, 6, 10, 15]
    primitives_groupby = ['cum_sum', 'cum_count', 'cum_mean', 'cum_min', 'cum_max']

    primitives_transform = [
        # Pairwise primitives ('add_numeric', 'multiply_numeric',
        # 'subtract_numeric', 'modulo_numeric', 'and', 'or') each create
        # roughly 283/2*283 = 40,044 new features from element-wise
        # combinations of two columns, so they are left disabled here.
        'absolute',
        'percentile',
        # 'cum_count', 'cum_sum', 'cum_mean', 'cum_min', 'cum_max'
    ]
    # Other transform primitives: 'subtract', 'divide', 'time_since_previous',
    # 'latitude', 'longitude', 'isin', 'is_null', 'is_weekend', 'year',
    # 'week', 'log'

    # Run deep feature synthesis with transformation primitives
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity='sp500',
        agg_primitives=primitives_aggregate,
        trans_primitives=primitives_transform,
        groupby_trans_primitives=primitives_groupby,
        where_primitives=primitives_where,
        max_features=89000,
        # drop_contains='target',
        # seed_features=['sepal length'],
        max_depth=1,
        n_jobs=1,  # -1 would use all cores
        verbose=True)

    print(f'finished featuretools. feature_matrix=\n{feature_matrix.head()}')
    # print(f'feature_matrix.columns.tolist()={feature_matrix.columns.tolist()}')
    # print(f'ft.list_primitives()={ft.list_primitives()}')

    feature_matrix = selection.remove_low_information_features(feature_matrix)
    return feature_matrix
def test_remove_low_information_feature_names(feature_matrix):
    feature_matrix = remove_low_information_features(feature_matrix)
    assert feature_matrix.shape == (3, 5)
    assert 'one_value' not in feature_matrix.columns
    assert 'all_null' not in feature_matrix.columns
def _reduce_feats(self, df):
    df = remove_low_information_features(df)
    df = remove_single_value_features(df, count_nan_as_value=True)
    df.drop(duplicate_columns(df), axis=1, inplace=True)
    return df
def run_dfs(self, max_depth=1, features_only=True, ignore_variables=None,
            reduce_mem=False, reduce_feats=True, trans_primitives=None,
            agg_primitives=None, chunk_size=None, n_jobs=1, **kwargs):
    """Deep Feature Synthesis

    agg_primitives (list[str or AggregationPrimitive], optional):
        List of Aggregation Feature types to apply.
        Default: ["sum", "std", "max", "skew", "min", "mean", "count",
        "percent_true", "num_unique", "mode"]
        DateTime: ['time_since_last', 'time_since_first', 'trend']

    trans_primitives (list[str or TransformPrimitive], optional):
        List of Transform Feature functions to apply.
        Default: ["day", "year", "month", "weekday", "haversine",
        "num_words", "num_characters"]

    groupby_trans_primitives (list[str or :class:`.primitives.TransformPrimitive`], optional):
        List of Transform primitives to make GroupByTransformFeatures with.
    """
    if ignore_variables is None:
        # ignore_variables = [self.target_entity_id, self.index]
        # ignore_variables = ["__id"]  # ignoring the single-value id loses some count features
        ignore_variables = []

    if trans_primitives is None:
        trans_primitives = [
            "year", "month", "day", "hour", "minute", "week", "weekday",
            "is_weekend",
            "time_since_previous",  # diff
            # https://stackoverflow.com/questions/60324672/how-is-time-since-previous-computed-in-featuretools
            Quarter(),
        ]

    _ = ft.dfs(
        entityset=self.es,
        # Target entity must have unique ids: either the base entity or one
        # created by normalize_entity.
        target_entity=self.target_entity_id,
        features_only=features_only,
        max_depth=max_depth,
        ignore_variables={self.entity_id: ignore_variables},
        chunk_size=chunk_size,
        n_jobs=n_jobs,
        verbose=1,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        **kwargs)

    if features_only:
        return _

    df_ = _[0].add_prefix(f'{self.entity_id}_').reset_index()
    if reduce_feats:
        cprint("remove_low_information_features")
        df_ = remove_low_information_features(df_)
        cprint("remove_single_value_features")
        df_ = remove_single_value_features(df_, count_nan_as_value=True)
        cprint("remove_duplicate_features")
        dups = duplicate_columns(df_)
        df_ = df_.drop(dups, axis=1)
    if reduce_mem:
        df_ = reduce_mem_usage(df_)
    return df_
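# A minimal, self-contained sketch of the date-part defaults described in the
# docstring above, assuming the legacy featuretools 0.x API
# (entity_from_dataframe / target_entity) used throughout these snippets; the
# entity and column names below are made up for illustration.
import pandas as pd
import featuretools as ft
from featuretools.selection import remove_low_information_features

events = pd.DataFrame({
    "event_id": [1, 2, 3, 4],
    "user_id": [10, 10, 20, 20],
    "timestamp": pd.to_datetime(
        ["2021-01-01", "2021-01-02", "2021-01-05", "2021-01-09"]),
    "amount": [5.0, 7.5, 1.0, 3.0],
})

es = ft.EntitySet(id="events_es")
es = es.entity_from_dataframe(entity_id="events",
                              dataframe=events,
                              index="event_id",
                              time_index="timestamp")

# Date-part transform primitives similar to the defaults in run_dfs above.
fm, feature_defs = ft.dfs(entityset=es,
                          target_entity="events",
                          agg_primitives=[],
                          trans_primitives=["year", "month", "day",
                                            "weekday", "is_weekend"],
                          max_depth=1)

# Drop columns that carry little or no information (e.g. a constant YEAR column).
fm = remove_low_information_features(fm)
print(fm.columns.tolist())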
def transform(
    self,
    groups: Optional[Dict[str, Sequence[str]]] = None,
    use_forgotten: bool = False,
    trans_primitives: Optional[Sequence[str]] = None,
    max_depth: int = 1,
    entity_set_folder_name: Optional[str] = None,
    features_file_name: Optional[str] = None,
    n_jobs: int = 1,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Create new features. Wraps Featuretools Deep Feature Synthesis.

    Default Featuretools trans primitives are:
    - "add_numeric"
    - "subtract_numeric"
    - "multiply_numeric"
    - "divide_numeric"
    - "greater_than"
    - "less_than"
    - "and"
    - "or"

    Use relationship groups to relate variables. This avoids wasting time
    creating features from totally unrelated features, which is especially
    useful when working with datasets with many features. Be careful with
    bias. This method does not support multiple entities (and consequently
    agg_primitives) yet. Groups are not entities, but only clusters of
    related features.

    Args:
        groups: Dict of related feature groups. None to not use
            relationships. (default: None)
        use_forgotten: Create a relationship group for the features left out
            of the groups arg. (default: False)
        trans_primitives: Featuretools trans primitives to use. None to use
            the defaults. (default: None)
        max_depth: Number of iterations in the feature creation process.
            (default: 1)
        entity_set_folder_name: Folder name to store the entity set with
            created features. (default: None)
        features_file_name: File name to store created feature names. Must
            be JSON. (default: None)
        n_jobs: Number of parallel workers. (default: 1)
        verbose: Verbosity. (default: True)

    Returns:
        DataFrame with new features.
    """
    # Manage groups.
    if not groups:
        groups = self._set_group(self.features)
    groups = self._fix_groups(features=self.features,
                              groups=groups,
                              use_forgotten=use_forgotten)

    es = self._set_entity_set(data=self._x, groups=groups)
    old_n_features = self._x.shape[1]  # For comparing later.

    if not trans_primitives:
        trans_primitives = self._TRANS_PRIMITIVES

    index_name = self._index_name(self._x)

    # Define kwargs outside the call just to improve readability.
    dfs_kwargs = {
        "entityset": es,
        "ignore_variables": {group: [index_name] for group in groups},
        "trans_primitives": trans_primitives,
        "max_depth": max_depth,
        "n_jobs": n_jobs,
        "verbose": False,
    }

    # Create features for each group.
    dfs = [ft.dfs(target_entity=key, **dfs_kwargs) for key in groups.keys()]

    # DFS returns a tuple (matrix, features). Split them.
    features = [feature for _, group_features in dfs for feature in group_features]
    dfs = [matrix for matrix, _ in dfs]

    # Concat all parts from all groups to form the new dataset.
    self._x = pd.concat(dfs, axis=1)

    # Do a little cleaning just to remove useless features.
    self._x = selection.remove_low_information_features(self._x)

    # Keep only feature names that are still in the dataset.
    # noinspection PyProtectedMember
    features = [
        feature for feature in features if feature._name in self._x.columns
    ]

    # Update property.
    # noinspection PyProtectedMember
    self.features = [feature._name for feature in features]

    # Export params.
    if entity_set_folder_name:
        es.to_csv(entity_set_folder_name)
    if features_file_name:
        ft.save_features(features, features_file_name)

    # Compare number of features.
    n_new_features = self._x.shape[1] - old_n_features
    if verbose:
        print(f"{n_new_features} features created.")

    return self._x
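# A hedged sketch of the group-wise pattern the method above wraps: run DFS
# separately on clusters of related columns so pairwise primitives such as
# "add_numeric" only combine columns within the same group, then concatenate
# the per-group matrices and prune them. The column and group names here are
# illustrative only and the featuretools 0.x API is assumed.
import pandas as pd
import featuretools as ft
from featuretools import selection

data = pd.DataFrame({
    "income": [100.0, 250.0, 80.0],
    "expenses": [60.0, 90.0, 70.0],
    "height": [1.6, 1.8, 1.7],
    "weight": [55.0, 80.0, 65.0],
})
groups = {"money": ["income", "expenses"], "body": ["height", "weight"]}

matrices = []
for group, columns in groups.items():
    # One single-entity EntitySet per group of related columns.
    es = ft.EntitySet(id=group)
    es = es.entity_from_dataframe(entity_id=group,
                                  dataframe=data[columns].copy(),
                                  make_index=True,
                                  index="row_id")
    fm, _ = ft.dfs(entityset=es,
                   target_entity=group,
                   agg_primitives=[],
                   trans_primitives=["add_numeric", "subtract_numeric"],
                   max_depth=1)
    matrices.append(fm)

# Pairwise features only exist within each group, never across groups.
combined = pd.concat(matrices, axis=1)
combined = selection.remove_low_information_features(combined)
print(combined.columns.tolist())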
def pillar(name='busi', countries=['Chad']):
    url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
    df = pd.read_csv(url + name + '_train.csv')
    df = df.drop(['Unnamed: 0'], axis=1)
    for i in df.columns:
        if i.find('year') > -1:
            df = df.drop([i], axis=1)
    y = df[name]
    df = df.drop(['rank_' + name, name], axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    X = df

    problem_type = 'regression'
    objective = 'auto'
    automl = evalml.automl.AutoMLSearch(problem_type=problem_type, objective=objective)
    best_pipeline = automl.load(name + '_best_pipeline')

    df = pd.read_csv(url + name + '_test.csv')
    df = df.drop(['Unnamed: 0'], axis=1)
    for i in df.columns:
        if i.find('year') > -1:
            df = df.drop([i], axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)

    predictions = best_pipeline.predict(df)
    result = pd.DataFrame()
    result[name] = predictions

    df = pd.read_csv(url + name + '_test.csv')
    temp = df[['country', 'year']]
    result = pd.merge(left=temp, right=result, how="left",
                      on=[temp.index, result.index])
    result = result.drop(['key_0', 'key_1'], axis=1)
    result['rank_' + name] = result.groupby("year")[name].rank("dense", ascending=False)
    result['rank_' + name] = result['rank_' + name].astype('int')
    result = result[result['country'].isin(countries)]

    metric = pd.read_csv(
        'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/'
        + name + '_metrics.csv')
    return result, metric
def prosperity(countries=['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']):
    url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
    df = pd.read_csv(url + 'merged.csv')
    df = df.drop(['Unnamed: 0'], axis=1)
    metrics = ['educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove', 'envi']
    ranks = ['rank_' + metric for metric in metrics]
    drop = metrics + ranks + ['year', 'prosperity_score']
    y = df['prosperity_score']
    df = df.drop(drop, axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    X = df

    problem_type = 'regression'
    objective = 'auto'
    automl = evalml.automl.AutoMLSearch(problem_type=problem_type, objective=objective)
    # automl.search(X, y)
    # best_pipeline = automl.best_pipeline
    # best_pipeline.fit(X, y)
    # best_pipeline.save('prosperity_best_pipeline')
    best_pipeline = automl.load('prosperity_best_pipeline')

    test = pd.read_csv(url + 'test.csv', index_col=0)
    drop = ['year']
    df = test.copy()
    df = df.drop(drop, axis=1)
    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)
    X = df

    predictions = best_pipeline.predict(X)
    result = pd.DataFrame()
    result['prosperity'] = predictions

    df = pd.read_csv(url + 'test.csv')
    temp = df[['country', 'year']]
    result = pd.merge(left=temp, right=result, how="left",
                      on=[temp.index, result.index])
    result = result.drop(['key_0', 'key_1'], axis=1)
    result['rank_prosperity'] = result.groupby("year")["prosperity"].rank("dense", ascending=False)
    result['rank_prosperity'] = result['rank_prosperity'].astype('int')
    result = result[result['country'].isin(countries)]

    metric = pd.read_csv(
        'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/prosperity_metrics.csv')
    return result, metric
# Feature importances can be used for dimensionality reduction. They can also
# be used to help us better understand a problem. For example, we could use the
# most important features to concentrate on those aspects of a client when
# evaluating a potential loan. Let's look at the number of features with 0
# importance, which can almost certainly be removed from the feature set.

# In[32]:

print('There are %d features with 0 importance' % sum(fi['importance'] == 0.0))

# ## Remove Low Importance Features
#
# Feature selection is an entire topic by itself, but one thing we can do is
# remove any features that have only a single unique value or are all null.
# Featuretools has a default method for doing this available in the
# `selection` module.

# In[33]:

from featuretools import selection

# Remove features with only one unique value
feature_matrix2 = selection.remove_low_information_features(feature_matrix)
print('Removed %d features' % (feature_matrix.shape[1] - feature_matrix2.shape[1]))

# ## Align Train and Test Sets
#
# We also want to make sure the train and test sets have the exact same
# features. We can first one-hot encode the data (we'll have to do this anyway
# for our model) and then align the dataframes on their columns, as shown in
# the sketch after this cell.

# In[34]:

# Separate out the train and test sets
train = feature_matrix2[feature_matrix2['set'] == 'train']
test = feature_matrix2[feature_matrix2['set'] == 'test']

# One hot encoding
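# A minimal sketch of the one-hot encode / align step described above, assuming
# `train` and `test` are the unencoded feature matrices from the previous cell;
# pd.get_dummies and DataFrame.align are standard pandas calls.
import pandas as pd

train_enc = pd.get_dummies(train)
test_enc = pd.get_dummies(test)

# Keep only the columns present in both frames so the model sees an identical
# feature set at train and test time.
train_enc, test_enc = train_enc.align(test_enc, join='inner', axis=1)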
# Adjust Entity Set
cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', 'Outcome']]
pd.options.display.max_columns = 500

fm, features = ft.dfs(entityset=es,
                      target_entity='transactions',
                      agg_primitives=aggPrimitives,
                      trans_primitives=transPrimitives,
                      max_depth=maxDepth,
                      cutoff_time=cutoff_times[1000:],
                      verbose=True)

if encodeOutput == "1":
    # Encode the feature matrix using one-hot encoding
    fm_enc, f_enc = ft.encode_features(fm, features)
    fm_enc = fm_enc.fillna(0)
    fm_enc = remove_low_information_features(fm_enc)
    # Write output to CSV
    fm_enc.to_csv("output.csv")
else:
    # Write output to CSV
    fm.to_csv("output.csv")

# Remove pickle directory
shutil.rmtree(dir_name)

# Close the input file
inFile.close()