def test_pickle_features_with_custom_primitive(es):
    """Round-trip DFS features that include a custom aggregation primitive.

    Builds features for the ``sessions`` entity with ``Last``, ``Mean`` and a
    ``NewMean`` primitive created through ``make_agg_primitive``, saves them
    with ``ft.save_features``, loads them back and checks that each loaded
    feature has the same hash and entityset as its original. Also checks that
    the saved feature file is smaller than the entityset, both in memory and
    pickled.

    Args:
        es: Entityset the features are built from.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Remove the scratch files even when an assertion above fails, so a
        # failing run does not leave artifacts next to the test module.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def test_pickle_features(es):
    """Round-trip DFS features through ``ft.save_features``/``ft.load_features``.

    Builds features for the ``sessions`` entity with ``Last`` and ``Mean``,
    saves and reloads them, and checks that each loaded feature has the same
    hash and entityset as its original. Also checks that the saved feature
    file is smaller than the entityset, both in memory and pickled.

    Args:
        es: Entityset the features are built from.
    """
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)
    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Clean up even when an assertion fails, so a failing run does not
        # leave scratch files next to the test module.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def test_s3_test_profile(es, s3_client, s3_bucket, setup_test_profile):
    """Save features to S3 with the ``test`` profile and load them back."""
    original = ft.dfs(target_entity='sessions',
                      entityset=es,
                      features_only=True)
    ft.save_features(original, TEST_S3_URL, profile_name='test')

    # Set a public read/write ACL on the first object found in the bucket
    # before reading the features back.
    first_key = list(s3_bucket.objects.all())[0].key
    object_acl = s3_client.ObjectAcl(BUCKET_NAME, first_key)
    object_acl.put(ACL='public-read-write')

    reloaded = ft.load_features(TEST_S3_URL, profile_name='test')
    assert_features(original, reloaded)
def construct_retail_example(ftens_file='retail_binary_files/ftens.csv',
                             labels_file='retail_binary_files/labels.csv',
                             fl_file='retail_binary_files/fl.p'):
    """Return the retail example feature tensor, labels and feature list.

    When ``ftens_file`` already exists, the three artifacts are read back
    from ``ftens_file``, ``labels_file`` and ``fl_file``. Otherwise they are
    computed from the retail demo entityset and written to those paths
    before being returned.

    Returns:
        Tuple ``(ftens, labels, fl)``.
    """
    es = ft.demo.load_retail()

    # Cached path: everything was produced by an earlier run.
    if os.path.exists(ftens_file):
        ftens = pd.read_csv(ftens_file,
                            index_col=['customer_id', 'time'],
                            parse_dates=['time'])
        label_frame = pd.read_csv(labels_file, index_col='customer_id')
        labels = label_frame['label']
        fl = ft.load_features(fl_file, es)
        return ftens, labels, fl

    # Build labels, binarize them at 300 and draw the cutoff sample.
    raw_labels = create_labels(es,
                               min_training_data='8 days',
                               lead='7 days',
                               window='30 days',
                               reduce='sum',
                               binarize=None,
                               iterate_by=None)
    binary_labels = raw_labels.copy()
    binary_labels['label'] = binary_labels['label'] > 300
    cutoffs = sample_labels(binary_labels, n=1)
    cutoffs = cutoffs[['customer_id', 'time', 'label']]
    cutoffs = cutoffs.sample(300)

    ftens, fl = ft.tdfs(target_entity='customers',
                        entityset=es,
                        cutoffs=cutoffs,
                        window_size='30d',
                        num_windows=5,
                        verbose=True)

    # Attach each customer's label, then restore the (customer_id, time) index.
    flat = ftens.reset_index('customer_id', drop=False)
    flat = flat.reset_index(drop=False)
    with_labels = flat.merge(cutoffs[['customer_id', 'label']],
                             on='customer_id',
                             how='left')
    ftens = with_labels.set_index('customer_id')
    ftens = ftens.set_index('time', append=True)

    # One label row per customer, indexed by customer_id.
    label_frame = ftens['label'].reset_index('customer_id', drop=False)
    label_frame = label_frame.drop_duplicates('customer_id')
    label_frame = label_frame.set_index('customer_id')
    del ftens['label']

    ftens.to_csv(ftens_file)
    label_frame.to_csv(labels_file)
    labels = label_frame['label']
    ft.save_features(fl, fl_file)
    return ftens, labels, fl
def test_pickle_features(es):
    """Round-trip features from ``ft.dfs`` through save/load.

    Builds feature definitions for the ``sessions`` entity, saves them with
    ``ft.save_features``, loads them back and checks that each loaded feature
    has the same unique name and entityset as its original. Also checks that
    the saved file is smaller than the in-memory entityset.

    Args:
        es: Entityset the features are built from.
    """
    features_original = ft.dfs(target_entity='sessions',
                               entityset=es,
                               features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    try:
        ft.save_features(features_original, filepath)
        features_deserialized = ft.load_features(filepath)
        for feat_1, feat_2 in zip(features_original, features_deserialized):
            assert feat_1.unique_name() == feat_2.unique_name()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)
    finally:
        # Clean up even when an assertion fails, so a failing run does not
        # leave the scratch file next to the test module.
        if os.path.exists(filepath):
            os.remove(filepath)
def test_serialize(self, es):
    """Check that a feature using ``self.primitive`` survives save/load.

    Finds a feature (or a dependency of one) whose primitive class is
    ``self.primitive``, serializes and deserializes it, and, unless the
    primitive is on the skip list, checks that the feature matrix computed
    from the reloaded feature equals the one from the original.
    """
    primitive_cls = self.primitive
    candidates = dfs(
        entityset=es,
        target_dataframe_name="log",
        trans_primitives=[primitive_cls],
        max_features=-1,
        max_depth=3,
        features_only=True,
    )

    target = None
    for candidate in candidates:
        if candidate.primitive.__class__ == primitive_cls:
            target = candidate
            break
        # Not a direct match: look through everything the feature depends on.
        # A match here only ends this inner scan; the outer loop keeps going.
        for dependency in candidate.get_dependencies(deep=True):
            if dependency.primitive.__class__ == primitive_cls:
                target = dependency
                break
    assert target is not None

    # Skip calculating feature matrix for long running primitives
    long_running = ["elmo"]
    compute_matrix = self.primitive.name not in long_running

    if compute_matrix:
        expected = calculate_feature_matrix([target], entityset=es)

    serialized = save_features([target])
    reloaded = load_features(serialized)[0]
    assert isinstance(reloaded, ft.FeatureBase)

    if compute_matrix:
        actual = calculate_feature_matrix([reloaded], entityset=es)
        assert expected.equals(actual)
def get_train_data(project, train_file, prediction_key, prediction_target,
                   variable_types=None, drop_columns=None):
    """Read a training CSV and build an encoded featuretools feature matrix.

    The target column is split off as the label array, optional columns are
    dropped, entities are built with ``get_ft_entities``, and ``ft.dfs``
    followed by ``ft.encode_features`` produces the returned matrix. The
    feature definitions from ``ft.dfs`` are saved to
    ``data/<project>/ft_features``.

    Args:
        project: Name used for the entity set, as the DFS target entity and
            in the output path.
        train_file: Path of the CSV file to read.
        prediction_key: Passed through to ``get_ft_entities``.
        prediction_target: Name of the label column in the CSV.
        variable_types: Passed through to ``get_ft_entities``; ``None``
            (the default) is treated as an empty dict.
        drop_columns: Columns to drop from the training data, or ``None`` to
            keep everything.

    Returns:
        Tuple ``(feature_matrix_enc, train_labels)``.
    """
    # A fresh dict per call: a literal ``{}`` default would be shared
    # between calls and could leak state if it were ever mutated.
    if variable_types is None:
        variable_types = {}

    # Read the training data
    print("==========Reading the training file {}".format(train_file))
    train_data = pd.read_csv(train_file)

    print("==========Preparing training labels for target {}".format(
        prediction_target))
    train_labels = train_data[prediction_target].values
    train_data = train_data.drop(prediction_target, axis=1)

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        train_data = train_data.drop(drop_columns, axis=1)

    print("==========Generating the feature with featuretools")
    es = ft.EntitySet(project)
    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=train_data,
                               variable_types=variable_types)
    print("==========entities are:")
    print(entities)

    feature_matrix, feature_defs = ft.dfs(entityset=entities,
                                          target_entity=project)
    feature_matrix_enc, _ = ft.encode_features(feature_matrix, feature_defs)
    print("==========columns are:")
    print(feature_matrix_enc.columns)

    print("==========saving features to {}".format(project))
    # NOTE(review): the un-encoded definitions are saved while the encoded
    # matrix is returned -- confirm that consumers re-encode after loading.
    ft.save_features(feature_defs, "data/{}/ft_features".format(project))
    return feature_matrix_enc, train_labels
def build_card_one_hot():
    """
    Builds one-hot encoded feature matrices for the card data.

    Reads data/raw/train.csv, runs featuretools DFS on a single
    'transactions' entity and encodes the result. The encoded feature
    definitions are written to "feature_definitions", loaded back, and used
    to compute the matching matrix for data/raw/test.csv. The test frame
    gets a 'target' column of zeros first, which is dropped from the
    resulting matrix again.

    :return: Tuple of (encoded training feature matrix,
             encoded test feature matrix)
    """
    log = logging.getLogger(__name__)

    # --- training data ---
    log.info("Reading in data.")
    train_df = pd.read_csv('data/raw/train.csv')
    train_df['first_active_month'] = pd.to_datetime(
        train_df['first_active_month'] + "-01")

    log.info("Creating entity set")
    train_es = ft.EntitySet().entity_from_dataframe(
        entity_id='transactions',
        dataframe=train_df,
        index='card_id',
        time_index="first_active_month",
        variable_types=CARD_TYPES)
    train_matrix, train_defs = ft.dfs(entityset=train_es,
                                      target_entity="transactions")

    log.info("Creating one-hot training data")
    train_encoded, encoded_defs = ft.encode_features(train_matrix, train_defs)

    # Round-trip the definitions through disk before applying them to test.
    ft.save_features(encoded_defs, "feature_definitions")
    reloaded_defs = ft.load_features('feature_definitions')

    # --- test data ---
    log.info("Creating one-hot test data")
    test_df = pd.read_csv('data/raw/test.csv')
    test_df['first_active_month'] = pd.to_datetime(
        test_df['first_active_month'] + "-01")
    test_df['target'] = 0

    test_es = ft.EntitySet().entity_from_dataframe(
        entity_id='transactions',
        dataframe=test_df,
        index='card_id',
        time_index="first_active_month",
        variable_types=CARD_TYPES)
    test_encoded = ft.calculate_feature_matrix(reloaded_defs, test_es)
    test_encoded.drop(columns='target', inplace=True)

    return train_encoded, test_encoded
def test_custom_feature_names_retained_during_serialization(pd_es, tmpdir):
    """Custom names set on multi-output features must survive save/load.

    Covers a transform feature, a groupby transform feature, an aggregation
    feature, and a feature stacked on one output slice of the transform
    feature.
    """
    class MultiCumulative(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

    log_frame = pd_es["log"]

    trans_feat = ft.Feature(log_frame.ww["value"], primitive=MultiCumulative)
    groupby_feat = ft.GroupByTransformFeature(
        log_frame.ww["value"],
        primitive=MultiCumulative,
        groupby=log_frame.ww["product_id"],
    )
    agg_feat = ft.Feature(
        log_frame.ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    # Stack a Negate on the second output of the multi-output transform.
    second_output = FeatureOutputSlice(trans_feat, 1)
    stacked_feat = ft.Feature(second_output, primitive=Negate)

    trans_names = ["cumulative_sum", "cumulative_max", "cumulative_min"]
    trans_feat.set_feature_names(trans_names)
    groupby_trans_names = ["grouped_sum", "grouped_max", "grouped_min"]
    groupby_feat.set_feature_names(groupby_trans_names)
    agg_names = ["first_most_common", "second_most_common"]
    agg_feat.set_feature_names(agg_names)

    features_path = os.path.join(tmpdir, "features.json")
    ft.save_features(
        [trans_feat, agg_feat, groupby_feat, stacked_feat],
        features_path,
    )
    reloaded = ft.load_features(features_path)

    new_trans, new_agg, new_groupby, new_stacked = reloaded
    assert new_trans.get_feature_names() == trans_names
    assert new_agg.get_feature_names() == agg_names
    assert new_groupby.get_feature_names() == groupby_trans_names
    assert new_stacked.get_feature_names() == ["-(cumulative_max)"]
def pickle_features_test_helper(es_size, features_original, dir_path):
    """Round-trip features through every save/load form and compare them.

    Three forms are exercised: saving to and loading from a file path,
    saving to and loading from open file objects, and saving with no
    location (using the value ``ft.save_features`` returns). Each serialized
    form must be smaller than ``es_size`` and each reloaded list is checked
    with ``assert_features``.

    Args:
        es_size: Upper bound the serialized size must stay below.
        features_original: Features to serialize.
        dir_path: Directory in which the scratch file is created.
    """
    filepath = os.path.join(dir_path, 'test_feature')

    # 1) Save to / load from a path.
    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    # 2) Save to / load from open file objects. The read handle is closed
    # explicitly so it is not still open when the file is removed below.
    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    with open(filepath) as f:
        features_deserializedB = ft.load_features(f)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    # 3) No location: use the value returned by save_features.
    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [features_deserializedA,
                                     features_deserializedB,
                                     features_deserializedC]
    for features_deserialized in features_deserialized_options:
        assert_features(features_original, features_deserialized)
def test_pickle_features_with_custom_primitive(es):
    """Round-trip DFS features that include a custom aggregation primitive.

    Builds features for the ``sessions`` entity with ``Last``, ``Mean`` and a
    ``NewMean`` primitive created through ``make_agg_primitive``, saves them
    with ``ft.save_features``, loads them back against ``es`` and checks that
    each loaded feature has the same hash and entityset as its original.
    Also checks that the saved feature file is smaller than the entityset,
    both in memory and pickled.

    Args:
        es: Entityset the features are built from and loaded against.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   filters=[],
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(features_no_pickle[0].entityset, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath, es)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        # (feat_1 is the last feature visited by the loop above)
        assert os.path.getsize(filepath) < getsize(feat_1.entityset)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Remove the scratch files even when an assertion above fails, so a
        # failing run does not leave artifacts next to the test module.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def pickle_features_test_helper(es_size, features_original):
    """Round-trip features through every save/load form and compare them.

    Three forms are exercised: saving to and loading from a file path,
    saving to and loading from open file objects, and saving with no
    location (using the value ``ft.save_features`` returns). Each serialized
    form must be smaller than ``es_size`` and every reloaded feature must
    have the same unique name and entityset as its original.

    Args:
        es_size: Upper bound the serialized size must stay below.
        features_original: Features to serialize.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    # 1) Save to / load from a path.
    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    # 2) Save to / load from open file objects. The read handle is closed
    # explicitly so it is not still open when the file is removed below.
    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    with open(filepath) as f:
        features_deserializedB = ft.load_features(f)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    # 3) No location: use the value returned by save_features.
    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [features_deserializedA,
                                     features_deserializedB,
                                     features_deserializedC]
    for features_deserialized in features_deserialized_options:
        for feat_1, feat_2 in zip(features_original, features_deserialized):
            assert feat_1.unique_name() == feat_2.unique_name()
            assert feat_1.entityset == feat_2.entityset
def test_serialize_url(es):
    """Saving features to a URL must raise ``ValueError``."""
    expected_message = "Writing to URLs is not supported"
    features = ft.dfs(target_entity='sessions',
                      entityset=es,
                      features_only=True)

    with pytest.raises(ValueError, match=expected_message):
        ft.save_features(features, URL)
def deploy_features_create(features_enc, model_path):
    '''
    Save feature definitions so they can be reused when deploying.

    Passes both arguments straight to ``ft.save_features``.

    :param features_enc: feature definitions to save (presumably the encoded
        feature list -- confirm against the caller).
    :param model_path: destination handed to ``ft.save_features``, e.g. the
        path of a feature_definitions.json file.
    '''
    ft.save_features(features_enc, model_path)
target_entity="users", cutoff_time=label_times, training_window=ft.Timedelta("60 days"), entities=es, verbose=True) fm_encode, f_encode = es.feature_encoder(feature_matrix=feature_matrix, features=features) print("Number of featyres %s" % len(fm_encode)) print(fm_encode.head(10)) # 机器学习 X = merge_features_labels(fm_encode, label_times) X.drop(["user_id", "time"], axis=1, inplace=True) X = X.fillna(0) y = X.pop("label") # 随机森林的使用 clf = RandomForestClassifier(n_estimators=400, n_jobs=-1) # 树的个数 scores = cross_val_score( estimator=clf, X=X, y=y, cv=3, # cv 3折交叉验证 scoring="roc_auc", verbose=True) print("AUC %.2f +/- %.2f" % (scores.mean(), scores.std())) clf.fit(X, y) #训练 top_feature = feature_importtance(clf, f_encode, n=20) ft.save_features(top_feature, "top_features")
def main(users_from, users_till):
    """Run the pipeline for users in ``[users_from, users_till]``.

    Steps: define parameters, connect to the database, build the entity
    tables and labels, create the entity set, run DFS over all features,
    train a model and save the most important features to "top_features",
    rebuild the feature matrix from those features only, retrain, then
    optionally dump the model and show a report.
    """
    # ### PIPELINE PARAMETERS
    show_report = False
    save_model = True

    # the timeframe of extracted users
    # users_from = '2016-10-01'
    # users_till = '2017-09-30'
    cohort_size = 3000

    # the timeframe of extracted behavioral data
    interval = '3 weeks'

    # the type of the prediction problem
    # 'regression', 'binary classification', 'multiclass classification'
    prediction_problem_type = 'binary classification'

    # multiclass values
    medium_value = 5
    high_value = 50

    # number of the most important features to extract
    number_of_features = 20

    print("Pipeline parameters defined")

    # ### DATABASE CONNECTION
    conn, cur = utils.connect_to_db()

    # ### ENTITY TABLES AND LABELS
    cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                             users_from=users_from,
                                             users_till=users_till)
    users = utils_bux.build_users_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till,
                                         interval=interval,
                                         cohorts=cohorts,
                                         cohort_size=cohort_size)
    transactions = utils_bux.build_transactions_entity(cur=cur,
                                                       interval=interval)
    labels = utils_bux.build_target_values(cur=cur,
                                           medium_value=medium_value,
                                           high_value=high_value)

    # ### ENTITY SET
    es = utils_bux.create_bux_entity_set(cohorts, users, transactions)

    # ### FEATURE ENGINEERING (DFS) FOR ALL FEATURES
    from featuretools.primitives import (Sum, Std, Max, Min, Mean, Count,
                                         PercentTrue, NUnique, Day, Week,
                                         Month, Weekday, Weekend)

    fm_encoded, features_encoded = utils.calculate_feature_matrix(
        es,
        "users",
        trans_primitives=[Day, Week, Month, Weekday, Weekend],
        agg_primitives=[Sum, Std, Max, Min, Mean, Count, PercentTrue,
                        NUnique],
        max_depth=2)
    all_features_frame = fm_encoded.reset_index().merge(labels)

    # ### TRAINING ON ALL FEATURES
    # define the labels based on the prediction problem type
    X, y = utils.make_labels(all_features_frame, prediction_problem_type)

    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # train the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)

    # extract and save the most important features
    top_features = utils.feature_importances(model,
                                             features_encoded,
                                             n=number_of_features)
    ft.save_features(top_features, "top_features")
    print("All features built and the most important features saved")

    # ### FEATURE ENGINEERING (DFS) FOR TOP FEATURES
    fm = utils.calculate_feature_matrix_top_features(es, top_features)
    top_features_frame = fm.reset_index().merge(labels)
    print("Top features built")

    # ### TRAINING AND PREDICTION ON TOP FEATURES
    # define the labels based on the prediction problem type
    X, y = utils.make_labels(top_features_frame, prediction_problem_type)

    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # fit the model
    model = utils.rf_train(X_train, y_train, prediction_problem_type)
    print("Model trained on top features")

    # ### SAVE THE MODEL
    if save_model:
        joblib.dump(model, 'models/model.pkl')
        print("Model saved")
    else:
        print("Model not saved")

    # ### REPORT
    if show_report:
        utils.show_report(model, X, y, X_train, y_train, X_test, y_test,
                          prediction_problem_type, top_features)
def build_transaction_data():
    """
    Builds training and testing feature matrices from the raw card and
    historical transaction CSVs using featuretools.

    A 'customer' entity (cards) is related to a 'transactions' entity through
    card_id and DFS targets 'customer'. The encoded feature definitions are
    saved to "feature_definitions", reloaded, and applied to the test cards.
    Every column is then passed through normalize_series using the training
    column's min/max; columns with no variation in training are dropped from
    both matrices, as are all columns whose name contains "SKEW".

    :return: training, testing feature matrices
    """
    log = logging.getLogger(__name__)

    log.info("Reading in card data")
    train_cards = pd.read_csv("data/raw/train.csv")
    train_cards['first_active_month'] = pd.to_datetime(
        train_cards['first_active_month'] + "-01")
    train_cards.drop(columns='target', inplace=True)

    log.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    is_authorized = transactions_df['authorized_flag'] == 'Y'
    transactions_df['authorized_flag'] = np.where(is_authorized, 1, 0)
    # Materialize the row number as an 'index' column; it is used as the
    # transactions entity index below.
    transactions_df.reset_index(inplace=True)

    log.info("Creating training entity set")
    train_es = ft.EntitySet()
    train_es = train_es.entity_from_dataframe(
        entity_id='customer',
        dataframe=train_cards,
        index='card_id',
        time_index='first_active_month',
        variable_types=CARD_TYPES)
    train_es = train_es.entity_from_dataframe(
        entity_id='transactions',
        dataframe=transactions_df,
        index='index',
        variable_types=TRANSACTION_TYPES)

    # Drop this function's reference to the training cards frame.
    del train_cards
    gc.collect()

    log.info("Defining relationships")
    relationship = ft.Relationship(train_es['customer']['card_id'],
                                   train_es['transactions']['card_id'])
    train_es = train_es.add_relationship(relationship)

    raw_matrix, raw_defs = ft.dfs(entityset=train_es,
                                  target_entity='customer')
    train_matrix, encoded_defs = ft.encode_features(raw_matrix, raw_defs)

    # Round-trip the definitions through disk before applying them to test.
    ft.save_features(encoded_defs, "feature_definitions")
    reloaded_defs = ft.load_features('feature_definitions')

    log.info("Loading test data")
    test_cards = pd.read_csv("data/raw/test.csv")
    test_cards['first_active_month'] = pd.to_datetime(
        test_cards['first_active_month'] + "-01")

    log.info("Creating testing entity set")
    test_es = ft.EntitySet()
    test_es = test_es.entity_from_dataframe(
        entity_id='customer',
        dataframe=test_cards,
        index='card_id',
        time_index='first_active_month',
        variable_types=CARD_TYPES)
    test_es = test_es.entity_from_dataframe(
        entity_id='transactions',
        dataframe=transactions_df,
        index='index',
        variable_types=TRANSACTION_TYPES)
    test_es = test_es.add_relationship(relationship)

    test_matrix = ft.calculate_feature_matrix(reloaded_defs, test_es)

    for column in train_matrix.columns:
        log.debug(f"Normalizing feature [{column}]")
        low, high = train_matrix[column].agg(['min', 'max'])

        if low == high:
            log.debug(f"Droping feature [{column}] due to lack of variation")
            train_matrix.drop(columns=column, inplace=True)
            test_matrix.drop(columns=column, inplace=True)
        else:
            # Both matrices are scaled with the training min/max.
            train_matrix[column] = normalize_series(
                series=train_matrix[column],
                min_max=(low, high))
            assert column in test_matrix.columns
            test_matrix[column] = normalize_series(
                series=test_matrix[column],
                min_max=(low, high))

    log.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    skew_columns = [name for name in train_matrix.columns if "SKEW" in name]
    train_matrix.drop(columns=skew_columns, inplace=True)
    test_matrix.drop(columns=skew_columns, inplace=True)

    return train_matrix, test_matrix
def save_features_to_json(self):
    """Save ``self.feature_names`` to ``self.feature_names_file`` under ``self.path``."""
    destination = os.path.join(self.path, self.feature_names_file)
    ft.save_features(self.feature_names, destination)
# Bare expression: shows the entity set in a notebook; has no effect when
# this runs as a plain script.
es
# Result of ft.list_primitives(); not used again in the code visible here.
primitives = ft.list_primitives()
# Widen pandas' column display to 100 characters.
pd.options.display.max_colwidth = 100
# Aggregation and transform primitives handed to ft.dfs below.
default_agg_primitives = ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
default_trans_primitives = ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]
# features_only=True: only the feature definitions are requested here.
feature_names = ft.dfs(entityset = es, target_entity = 'app',
                       trans_primitives = default_trans_primitives,
                       agg_primitives=default_agg_primitives,
                       where_primitives = [], seed_features = [],
                       max_depth = 2, n_jobs = -1, verbose = 1,
                       features_only=True)
# Save the feature definitions to disk.
ft.save_features(feature_names, '../input/features.txt')
# Running the full Deep Feature Synthesis takes a long time -- use parallel
# processes. The prints below report the resources available for that run.
# NOTE: sys.getsizeof is shallow; it does not follow references into the
# data the entity set holds.
print('Total size of entityset: {:.5f} gb.'.format(sys.getsizeof(es) / 1e9))
import psutil
print('Total number of cpus detected: {}.'.format(psutil.cpu_count()))
print('Total size of system memory: {:.5f} gb.'.format(psutil.virtual_memory().total / 1e9))
# feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='app',
# agg_primitives = agg_primitives,
# trans_primitives = trans_primitives,
def transform(
        self,
        groups: Optional[Dict[str, Sequence[str]]] = None,
        use_forgotten: bool = False,
        trans_primitives: Optional[Sequence[str]] = None,
        max_depth: int = 1,
        entity_set_folder_name: Optional[str] = None,
        features_file_name: Optional[str] = None,
        n_jobs: int = 1,
        verbose: bool = True,
) -> pd.DataFrame:
    """
    Create new features with Featuretools Deep Feature Synthesis.

    Default Featuretools trans primitives are:
        - "add_numeric"
        - "subtract_numeric"
        - "multiply_numeric"
        - "divide_numeric"
        - "greater_than"
        - "less_than"
        - "and"
        - "or"

    Relationship groups restrict feature creation to variables that are
    related to each other, which avoids wasting time combining unrelated
    features. This is especially useful for datasets with many features.
    Be careful with bias.

    Multiple entities (and therefore agg_primitives) are not supported yet.
    Groups are not entities, only clusters of related features.

    Args:
        groups: Dict of related feature groups. None to not use
            relationships. (default: None)
        use_forgotten: Create a relationship group for the features that
            are missing from ``groups``. (default: False)
        trans_primitives: Featuretools trans primitives to use. None to use
            the default ones. (default: None)
        max_depth: Number of iterations in the feature creation process.
            (default: 1)
        entity_set_folder_name: Folder name to store the entity set with the
            created features. (default: None)
        features_file_name: File name to store the created feature
            definitions. Must be JSON. (default: None)
        n_jobs: Number of parallel workers. (default: 1)
        verbose: Print how many features were created. DFS itself always
            runs with verbose off. (default: True)

    Returns:
        DataFrame with new features.
    """
    # Resolve groups and primitives, falling back to the defaults.
    if not groups:
        groups = self._set_group(self.features)
    groups = self._fix_groups(features=self.features,
                              groups=groups,
                              use_forgotten=use_forgotten)
    if not trans_primitives:
        trans_primitives = self._TRANS_PRIMITIVES

    es = self._set_entity_set(data=self._x, groups=groups)
    n_features_before = self._x.shape[1]  # For comparing later.
    index_name = self._index_name(self._x)

    # Arguments shared by every per-group DFS call.
    shared_kwargs = {
        "entityset": es,
        "ignore_variables": {group: [index_name] for group in groups},
        "trans_primitives": trans_primitives,
        "max_depth": max_depth,
        "n_jobs": n_jobs,
        "verbose": False,
    }

    # One DFS run per group; each returns (feature matrix, feature list).
    results = [ft.dfs(target_entity=group, **shared_kwargs) for group in groups]
    matrices = [matrix for matrix, _ in results]
    candidates = [
        definition
        for _, definitions in results
        for definition in definitions
    ]

    # Join all group matrices into the new dataset, then drop the columns
    # that carry little information.
    self._x = pd.concat(matrices, axis=1)
    self._x = selection.remove_low_information_features(self._x)

    # Keep only the definitions whose column is still in the dataset.
    # noinspection PyProtectedMember
    kept = [
        definition
        for definition in candidates
        if definition._name in self._x.columns
    ]

    # Update property.
    # noinspection PyProtectedMember
    self.features = [definition._name for definition in kept]

    # Optional exports.
    if entity_set_folder_name:
        es.to_csv(entity_set_folder_name)
    if features_file_name:
        ft.save_features(kept, features_file_name)

    # Compare number of features.
    n_new_features = self._x.shape[1] - n_features_before
    if verbose:
        print(f"{n_new_features} features created.")

    return self._x
def test_deserializer_uses_common_primitive_instances_with_args(es, tmp_path):
    """Features sharing a primitive instance must still share one after load.

    Checked for a primitive with a single argument, one with multiple
    arguments (pandas entitysets only) and one with a list argument. The
    primitive's arguments must also survive the round trip.
    """
    def scalar_features(candidates, marker):
        # multiply_numeric_scalar features whose name contains ``marker``.
        return [
            f
            for f in candidates
            if f.primitive.name == "multiply_numeric_scalar"
            and marker in f.get_name()
        ]

    # Single argument
    scalar1 = MultiplyNumericScalar(value=1)
    scalar5 = MultiplyNumericScalar(value=5)
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="products",
        features_only=True,
        agg_primitives=["sum"],
        trans_primitives=[scalar1, scalar5],
    )
    scalar1_features = scalar_features(features, " * 1")
    scalar5_features = scalar_features(features, " * 5")

    # Make sure we have multiple features of each type
    assert len(scalar1_features) > 1
    assert len(scalar5_features) > 1

    # DFS should use the passed in primitive instance for all features
    assert all(f.primitive is scalar1 for f in scalar1_features)
    assert all(f.primitive is scalar5 for f in scalar5_features)

    scalar_path = os.path.join(tmp_path, "features.json")
    ft.save_features(features, scalar_path)
    deserialized_features = ft.load_features(scalar_path)

    new_scalar1_features = scalar_features(deserialized_features, " * 1")
    new_scalar5_features = scalar_features(deserialized_features, " * 5")

    # After deserialization all features that share a primitive should use
    # the same primitive instance
    new_scalar1_primitive = new_scalar1_features[0].primitive
    new_scalar5_primitive = new_scalar5_features[0].primitive
    assert all(f.primitive is new_scalar1_primitive for f in new_scalar1_features)
    assert all(f.primitive is new_scalar5_primitive for f in new_scalar5_features)
    assert new_scalar1_primitive.value == 1
    assert new_scalar5_primitive.value == 5

    # Test primitive with multiple args - pandas only due to primitive
    # compatibility
    if es.dataframe_type == Library.PANDAS.value:
        distance_to_holiday = DistanceToHoliday(
            holiday="Victoria Day", country="Canada"
        )
        features = ft.dfs(
            entityset=es,
            target_dataframe_name="customers",
            features_only=True,
            agg_primitives=[],
            trans_primitives=[distance_to_holiday],
        )
        distance_features = [
            f for f in features if f.primitive.name == "distance_to_holiday"
        ]
        assert len(distance_features) > 1

        # DFS should use the passed in primitive instance for all features
        assert all(f.primitive is distance_to_holiday for f in distance_features)

        distance_path = os.path.join(tmp_path, "distance_features.json")
        ft.save_features(distance_features, distance_path)
        new_distance_features = ft.load_features(distance_path)

        # After deserialization all features that share a primitive should
        # use the same primitive instance
        new_distance_primitive = new_distance_features[0].primitive
        assert all(
            f.primitive is new_distance_primitive for f in new_distance_features
        )
        assert new_distance_primitive.holiday == "Victoria Day"
        assert new_distance_primitive.country == "Canada"

    # Test primitive with list arg
    is_in = IsIn(list_of_outputs=[5, True, "coke zero"])
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="customers",
        features_only=True,
        agg_primitives=[],
        trans_primitives=[is_in],
    )
    is_in_features = [f for f in features if f.primitive.name == "isin"]
    assert len(is_in_features) > 1

    # DFS should use the passed in primitive instance for all features
    assert all(f.primitive is is_in for f in is_in_features)

    is_in_path = os.path.join(tmp_path, "distance_features.json")
    ft.save_features(is_in_features, is_in_path)
    new_is_in_features = ft.load_features(is_in_path)

    # After deserialization all features that share a primitive should use
    # the same primitive instance
    new_is_in_primitive = new_is_in_features[0].primitive
    assert all(f.primitive is new_is_in_primitive for f in new_is_in_features)
    assert new_is_in_primitive.list_of_outputs == [5, True, "coke zero"]