def pickle_features_test_helper(es_size, features_original, dir_path):
    filepath = os.path.join(dir_path, "test_feature")

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [
        features_deserializedA,
        features_deserializedB,
        features_deserializedC,
    ]
    for features_deserialized in features_deserialized_options:
        assert_features(features_original, features_deserialized)
def pickle_features_test_helper(es_size, features_original):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [
        features_deserializedA,
        features_deserializedB,
        features_deserializedC
    ]
    for features_deserialized in features_deserialized_options:
        for feat_1, feat_2 in zip(features_original, features_deserialized):
            assert feat_1.unique_name() == feat_2.unique_name()
            assert feat_1.entityset == feat_2.entityset
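For context, a hedged sketch of how a test might drive the helper variants above, assuming a pytest `es` fixture and pympler's `asizeof` as used throughout these tests; this driver is illustrative, not from the source:

def test_pickle_features_round_trip(es, tmp_path):
    # Hypothetical driver for the helpers above: serialized features must
    # come out smaller than the in-memory entityset.
    features_original = ft.dfs(target_entity='sessions', entityset=es,
                               features_only=True)
    # three-argument variant; the two-argument variant derives its own dir_path
    pickle_features_test_helper(asizeof(es), features_original, str(tmp_path))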
def test_pickle_features(es):
    features_no_pickle = ft.dfs(target_entity='sessions', entityset=es,
                                features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)

    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
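Several tests here compare the feature file against a pickled entityset written by `save_obj_pickle`. A minimal sketch of what such a helper presumably looks like (assumed, not from the source):

import pickle

def save_obj_pickle(obj, filepath):
    # Hypothetical helper: pickle any object to a path so its on-disk size
    # can be compared against the serialized feature definitions.
    with open(filepath, 'wb') as f:
        pickle.dump(obj, f)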
def test_pickle_features_with_custom_primitive(es):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate the maximum value")

    features_no_pickle = ft.dfs(target_entity='sessions', entityset=es,
                                agg_primitives=["Last", "Mean", NewMax],
                                features_only=True)
    assert any(
        [isinstance(feat.primitive, NewMax) for feat in features_no_pickle])

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   filters=[],
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(features_no_pickle[0].entityset, es_filepath)

    ft.save_features(features_no_pickle, filepath)

    features_pickle = ft.load_features(filepath, es)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(feat_1.entityset)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_deserializer_uses_common_primitive_instances_no_args(es, tmp_path):
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="products",
        features_only=True,
        agg_primitives=["sum"],
        trans_primitives=["is_null"],
    )
    is_null_features = [f for f in features if f.primitive.name == "is_null"]
    sum_features = [f for f in features if f.primitive.name == "sum"]

    # Make sure we have multiple features of each type
    assert len(is_null_features) > 1
    assert len(sum_features) > 1

    # DFS should use the same primitive instance for all features that share a primitive
    is_null_primitive = is_null_features[0].primitive
    sum_primitive = sum_features[0].primitive
    assert all([f.primitive is is_null_primitive for f in is_null_features])
    assert all([f.primitive is sum_primitive for f in sum_features])

    file = os.path.join(tmp_path, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)

    new_is_null_features = [
        f for f in deserialized_features if f.primitive.name == "is_null"
    ]
    new_sum_features = [f for f in deserialized_features if f.primitive.name == "sum"]

    # After deserialization all features that share a primitive should use the same primitive instance
    new_is_null_primitive = new_is_null_features[0].primitive
    new_sum_primitive = new_sum_features[0].primitive
    assert all([f.primitive is new_is_null_primitive for f in new_is_null_features])
    assert all([f.primitive is new_sum_primitive for f in new_sum_features])
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")

    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_deserialize_features_s3(pd_es, url, profile_name):
    agg_primitives = [
        Sum,
        Std,
        Max,
        Skew,
        Min,
        Mean,
        Count,
        PercentTrue,
        NumUnique,
        Mode,
    ]
    trans_primitives = [Day, Year, Month, Weekday, Haversine, NumWords, NumCharacters]
    features_original = ft.dfs(
        target_dataframe_name="sessions",
        entityset=pd_es,
        features_only=True,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
    )
    features_deserialized = ft.load_features(url, profile_name=profile_name)
    assert_features(features_original, features_deserialized)
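Many of these tests call an `assert_features` helper. A minimal sketch consistent with the inline comparison loops in the older tests in this section (it compares `unique_name` and entityset equality; the exact upstream implementation may differ):

def assert_features(original, deserialized):
    # Hypothetical sketch mirroring the inline checks used elsewhere here.
    for feat_1, feat_2 in zip(original, deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
        assert feat_1.entityset == feat_2.entityset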
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")

    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_feature_serialization(universal_sentence_encoder, tmpdir):
    sentences = pd.Series([
        "",
        "I like to eat pizza",
        "The roller coaster was built in 1885.",
        "When will humans go to mars?",
        "Mitochondria is the powerhouse of the cell",
    ])
    es = ft.EntitySet("es")
    df = pd.DataFrame({"id": [0, 1, 2, 3, 4], "sentences": sentences})
    es.add_dataframe(
        dataframe=df,
        dataframe_name="dataframe",
        index="id",
        logical_types={"sentences": NaturalLanguage},
    )
    fm, features = ft.dfs(
        entityset=es,
        target_dataframe_name="dataframe",
        trans_primitives=[universal_sentence_encoder],
    )

    filename = str(tmpdir.join("features.txt"))
    ft.save_features(features, filename)
    loaded_features = ft.load_features(filename)
    fm_serialized = ft.calculate_feature_matrix(loaded_features, entityset=es)

    pd.testing.assert_frame_equal(fm, fm_serialized)
def test_serialize(self, es):
    features = dfs(
        entityset=es,
        target_dataframe_name="log",
        trans_primitives=[self.primitive],
        max_features=-1,
        max_depth=3,
        features_only=True,
    )

    feat_to_serialize = None
    for feature in features:
        if feature.primitive.__class__ == self.primitive:
            feat_to_serialize = feature
            break
        for base_feature in feature.get_dependencies(deep=True):
            if base_feature.primitive.__class__ == self.primitive:
                feat_to_serialize = base_feature
                break
    assert feat_to_serialize is not None

    # Skip calculating feature matrix for long running primitives
    skip_primitives = ["elmo"]

    if self.primitive.name not in skip_primitives:
        df1 = calculate_feature_matrix([feat_to_serialize], entityset=es)

    new_feat = load_features(save_features([feat_to_serialize]))[0]
    assert isinstance(new_feat, ft.FeatureBase)

    if self.primitive.name not in skip_primitives:
        df2 = calculate_feature_matrix([new_feat], entityset=es)
        assert df1.equals(df2)
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def _load_features(features, profile_name=None):
    """
    :param features: str or file object; the location of the saved features
    :param profile_name: str or bool
    :return: list
    """
    feature = ft.load_features(features=features, profile_name=profile_name)
    return feature
def load_features_create(model_path):
    '''
    Load saved feature definitions (e.g. the encoded features written
    with ft.save_features).

    Example
    ----------
    1. features = load_features_create('feature_definitions.json')
    2. feature_matrix = make_features(features)
    '''
    return ft.load_features(model_path)
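A short round-trip sketch for the loaders above; the file name is hypothetical, and `ft.save_features` is the matching serializer (the `target_entity` argument assumes the older featuretools API used elsewhere in this section):

# Hypothetical round-trip: serialize feature definitions, then restore them.
features = ft.dfs(entityset=es, target_entity='sessions', features_only=True)
ft.save_features(features, 'feature_definitions.json')
same_features = load_features_create('feature_definitions.json')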
def test_deserialize_features_s3(es, url, profile_name):
    features_original = sorted(ft.dfs(target_entity='sessions',
                                      entityset=es,
                                      features_only=True),
                               key=lambda x: x.unique_name())
    features_deserialized = sorted(ft.load_features(url,
                                                    profile_name=profile_name),
                                   key=lambda x: x.unique_name())
    assert_features(features_original, features_deserialized)
def select_features(self):
    all_features = ft.load_features(
        os.path.join(self.path, self.feature_names_file))
    features_from_selector = list(
        pd.read_csv(self.file_name_with_selected_features,
                    sep=self.sep).columns)
    for feature in all_features:
        if feature.get_name() in features_from_selector:
            self.selected_features.append(feature)
def test_s3_test_profile(es, s3_client, s3_bucket, setup_test_profile):
    features_original = ft.dfs(target_entity='sessions', entityset=es,
                               features_only=True)
    ft.save_features(features_original, TEST_S3_URL, profile_name='test')

    obj = list(s3_bucket.objects.all())[0].key
    s3_client.ObjectAcl(BUCKET_NAME, obj).put(ACL='public-read-write')

    features_deserialized = ft.load_features(TEST_S3_URL, profile_name='test')
    assert_features(features_original, features_deserialized)
def test_serialize_features_mock_anon_s3(es, s3_client, s3_bucket):
    features_original = ft.dfs(
        target_dataframe_name="sessions", entityset=es, features_only=True
    )
    ft.save_features(features_original, TEST_S3_URL, profile_name=False)

    obj = list(s3_bucket.objects.all())[0].key
    s3_client.ObjectAcl(BUCKET_NAME, obj).put(ACL="public-read-write")

    features_deserialized = ft.load_features(TEST_S3_URL, profile_name=False)
    assert_features(features_original, features_deserialized)
def construct_retail_example(ftens_file='retail_binary_files/ftens.csv',
                             labels_file='retail_binary_files/labels.csv',
                             fl_file='retail_binary_files/fl.p'):
    es = ft.demo.load_retail()
    if os.path.exists(ftens_file):
        ftens = pd.read_csv(ftens_file,
                            index_col=['customer_id', 'time'],
                            parse_dates=['time'])
        labels = pd.read_csv(labels_file, index_col='customer_id')['label']
        fl = ft.load_features(fl_file, es)
    else:
        labels = create_labels(es,
                               min_training_data='8 days',
                               lead='7 days',
                               window='30 days',
                               reduce='sum',
                               binarize=None,
                               iterate_by=None)
        labels_binary = labels.copy()
        labels_binary['label'] = labels_binary['label'] > 300
        sampled = sample_labels(labels_binary, n=1)
        sampled = sampled[['customer_id', 'time', 'label']]
        sampled = sampled.sample(300)

        ftens, fl = ft.tdfs(target_entity='customers',
                            entityset=es,
                            cutoffs=sampled,
                            window_size='30d',
                            num_windows=5,
                            verbose=True)

        ftens = (ftens.reset_index('customer_id', drop=False)
                 .reset_index(drop=False)
                 .merge(sampled[['customer_id', 'label']],
                        on='customer_id',
                        how='left')
                 .set_index('customer_id')
                 .set_index('time', append=True))

        labels = (ftens['label']
                  .reset_index('customer_id', drop=False)
                  .drop_duplicates('customer_id')
                  .set_index('customer_id'))

        del ftens['label']

        ftens.to_csv(ftens_file)
        labels.to_csv(labels_file)
        labels = labels['label']
        ft.save_features(fl, fl_file)
    return ftens, labels, fl
def test_pickle_features(es):
    features_original = ft.dfs(target_entity='sessions', entityset=es,
                               features_only=True)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserialized = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_original, features_deserialized):
        assert feat_1.unique_name() == feat_2.unique_name()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)
    os.remove(filepath)
def feature_matrix_from_entity_set(es: ft.EntitySet, dp: str, mt: str) -> None:
    """
    Calculate the feature matrix and save it to the given directory.

    :param es: entity set
    :param dp: directory path
    :param mt: main table
    :return: None
    """
    feature_defs = Data_Val.feature_defs
    feature_defs = ft.load_features(open(feature_defs, 'rb'))
    feature_matrix = ft.calculate_feature_matrix(feature_defs,
                                                 entityset=es,
                                                 n_jobs=1,
                                                 verbose=0)
    feature_matrix.to_csv(os.path.join(dp, 'p.csv'), index=True)
def load(entity_set_folder_name: str, features_file_name: str,
         verbose: bool = True) -> pd.DataFrame:
    """
    Load a feature matrix from saved featuretools artifacts.

    Args:
        entity_set_folder_name: Entity set folder path.
        features_file_name: Features file path.
        verbose: Verbosity.

    Returns:
        Dataframe.
    """
    es = ft.read_entityset(entity_set_folder_name)
    features = ft.load_features(features_file_name)
    return ft.calculate_feature_matrix(features, es, verbose=verbose)
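A hedged usage sketch for `load` above, assuming the entity set was previously written in a `ft.read_entityset`-compatible format (e.g. `EntitySet.to_pickle`) and the features with `ft.save_features`; paths are illustrative:

# Assumed save-side counterpart for load(): persist the entityset and
# feature definitions, then rebuild the matrix from disk.
es.to_pickle('entity_set_folder')
ft.save_features(features, 'features.json')
df = load('entity_set_folder', 'features.json', verbose=False)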
def build_card_one_hot():
    """
    Reads in the raw data from train.csv and creates one-hot encodings for the
    feature and date fields.

    :return: Data frame with one-hot encoding
    """
    logger = logging.getLogger(__name__)

    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")

    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)
    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")

    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)
    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
def test_custom_feature_names_retained_during_serialization(pd_es, tmpdir):
    class MultiCumulative(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

    multi_output_trans_feat = ft.Feature(
        pd_es["log"].ww["value"], primitive=MultiCumulative
    )
    groupby_trans_feat = ft.GroupByTransformFeature(
        pd_es["log"].ww["value"],
        primitive=MultiCumulative,
        groupby=pd_es["log"].ww["product_id"],
    )
    multi_output_agg_feat = ft.Feature(
        pd_es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    slice = FeatureOutputSlice(multi_output_trans_feat, 1)
    stacked_feat = ft.Feature(slice, primitive=Negate)

    trans_names = ["cumulative_sum", "cumulative_max", "cumulative_min"]
    multi_output_trans_feat.set_feature_names(trans_names)
    groupby_trans_names = ["grouped_sum", "grouped_max", "grouped_min"]
    groupby_trans_feat.set_feature_names(groupby_trans_names)
    agg_names = ["first_most_common", "second_most_common"]
    multi_output_agg_feat.set_feature_names(agg_names)

    features = [
        multi_output_trans_feat,
        multi_output_agg_feat,
        groupby_trans_feat,
        stacked_feat,
    ]
    file = os.path.join(tmpdir, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)

    new_trans, new_agg, new_groupby, new_stacked = deserialized_features
    assert new_trans.get_feature_names() == trans_names
    assert new_agg.get_feature_names() == agg_names
    assert new_groupby.get_feature_names() == groupby_trans_names
    assert new_stacked.get_feature_names() == ["-(cumulative_max)"]
def test_deserialize_features_s3(es, url, profile_name):
    agg_primitives = [Sum, Std, Max, Skew, Min, Mean,
                      Count, PercentTrue, NumUnique, Mode]
    trans_primitives = [Day, Year, Month, Weekday,
                        Haversine, NumWords, NumCharacters]

    features_original = sorted(ft.dfs(target_entity='sessions',
                                      entityset=es,
                                      features_only=True,
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives),
                               key=lambda x: x.unique_name())
    features_deserialized = sorted(ft.load_features(url,
                                                    profile_name=profile_name),
                                   key=lambda x: x.unique_name())
    assert_features(features_original, features_deserialized)
def get_test_data(project, testfile, prediction_key, prediction_target,
                  variable_types={}, drop_columns=None):
    print("==========Reading test data file {}".format(testfile))
    test_data = pd.read_csv(testfile)
    print(test_data.describe())

    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        test_data = test_data.drop(drop_columns, axis=1)

    es = ft.EntitySet(project)
    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=test_data,
                               variable_types=variable_types)
    print("==========entities are:")
    print(entities)

    print("==========Reading features from {}".format(project))
    saved_features = ft.load_features("data/{}/ft_features".format(project))
    print("==========saved_features are:")
    print(saved_features)

    feature_matrix = ft.calculate_feature_matrix(saved_features, entities)
    feature_matrix_enc, _ = ft.encode_features(feature_matrix, saved_features)
    index_column = test_data[prediction_key]
    return feature_matrix_enc, index_column
def main(users_from, users_till):
    # ### DEFINE PIPELINE PARAMETERS
    load_to_database = False
    save_as_csv = False

    # the timeframe of extracted users
    # users_from = '2018-04-01'
    # users_till = '2018-04-30'

    # include all users in each of the cohorts
    cohort_size = 1000000000

    # the timeframe of extracted behavioral data
    interval = '3 weeks'

    # the type of the prediction problem:
    # 'regression', 'binary classification', 'multiclass classification'
    prediction_problem_type = 'binary classification'

    print("Pipeline parameters defined")
    print("Extraction of scoring for users from", users_from, "till", users_till)

    # ### CONNECT TO THE DATABASE
    conn, cur = utils.connect_to_db()

    # ### BUILD ENTITIES
    # #### Cohorts entity
    cohorts = utils_bux.build_cohorts_entity(cur=cur,
                                             users_from=users_from,
                                             users_till=users_till)

    # #### Users entity
    users = utils_bux.build_users_entity(cur=cur,
                                         users_from=users_from,
                                         users_till=users_till,
                                         interval=interval,
                                         cohorts=cohorts,
                                         cohort_size=cohort_size)

    # #### Transactions entity
    transactions = utils_bux.build_transactions_entity(cur=cur,
                                                       interval=interval)

    # ### CREATE THE ENTITY SET
    es = utils_bux.create_bux_entity_set(cohorts, users, transactions)

    # ### FEATURE ENGINEERING (DFS)
    top_features = ft.load_features("top_features", es)
    fm = utils.calculate_feature_matrix_top_features(es, top_features)
    X = fm.reset_index(drop=True).fillna(0)
    print("Features built:\n", list(fm.columns))

    # ### LOADING THE MODEL
    model = joblib.load('models/model.pkl')
    print("Model loaded")

    # ### SCORING
    y_pred = utils.rf_predict(model, X, prediction_problem_type)
    print("Prediction done")

    # save predictions in a csv
    predictions = pd.DataFrame()
    # user_details is expected to be provided by the surrounding context
    predictions["user_id"] = user_details["user_id"]
    predictions["topic_type"] = "clv_prediction"
    predictions['report_date'] = pd.to_datetime('today').strftime("%Y-%m-%d")
    predictions["model_type"] = "randomforest"
    predictions["class_prediction"] = y_pred
    predictions["prob"] = 0
    predictions = predictions[["topic_type", "report_date", "model_type",
                               "user_id", "class_prediction", "prob"]]

    # ### SAVE AS CSV AND/OR LOAD RESULTS INTO THE DATABASE
    if save_as_csv:
        predictions.to_csv("scoring/results" + users_from + "-" + users_till,
                           index=False)

    if load_to_database:
        utils_bux.copy_to_database(predictions, 'db_table_name', conn)
def build_transaction_data():
    """
    Builds a data set from raw card and transaction data using the
    featuretools package. The resulting data set will be strictly concerned
    with transactions shown in the historical transactions CSV, linking them
    to the proper card.

    :return: training, testing feature matrices
    """
    logger = logging.getLogger(__name__)

    logger.info("Reading in card data")
    customer_df = pd.read_csv("data/raw/train.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")
    customer_df.drop(columns='target', inplace=True)

    logger.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    transactions_df['authorized_flag'] = np.where(
        transactions_df['authorized_flag'] == 'Y', 1, 0)
    transactions_df.reset_index(inplace=True)

    logger.info("Creating training entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='customer',
                                              dataframe=customer_df,
                                              index='card_id',
                                              time_index='first_active_month',
                                              variable_types=CARD_TYPES)
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=transactions_df,
                                              index='index',
                                              variable_types=TRANSACTION_TYPES)
    del customer_df
    gc.collect()

    logger.info("Defining relationships")
    relationship = ft.Relationship(es_train['customer']['card_id'],
                                   es_train['transactions']['card_id'])
    es_train = es_train.add_relationship(relationship)

    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity='customer')
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')

    logger.info("Loading test data")
    customer_df = pd.read_csv("data/raw/test.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")

    logger.info("Creating testing entity set")
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='customer',
                                            dataframe=customer_df,
                                            index='card_id',
                                            time_index='first_active_month',
                                            variable_types=CARD_TYPES)
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=transactions_df,
                                            index='index',
                                            variable_types=TRANSACTION_TYPES)
    es_test = es_test.add_relationship(relationship)
    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)

    for col in train_feature_matrix_enc.columns:
        logger.debug(f"Normalizing feature [{col}]")
        old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max'])
        if old_min == old_max:
            logger.debug(f"Dropping feature [{col}] due to lack of variation")
            train_feature_matrix_enc.drop(columns=col, inplace=True)
            test_feature_matrix_enc.drop(columns=col, inplace=True)
            continue
        train_feature_matrix_enc[col] = normalize_series(
            series=train_feature_matrix_enc[col], min_max=(old_min, old_max))
        assert col in test_feature_matrix_enc.columns
        test_feature_matrix_enc[col] = normalize_series(
            series=test_feature_matrix_enc[col], min_max=(old_min, old_max))

    logger.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c]
    train_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    test_feature_matrix_enc.drop(columns=drop_cols, inplace=True)

    return train_feature_matrix_enc, test_feature_matrix_enc
def test_deserializer_uses_common_primitive_instances_with_args(es, tmp_path):
    # Single argument
    scalar1 = MultiplyNumericScalar(value=1)
    scalar5 = MultiplyNumericScalar(value=5)
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="products",
        features_only=True,
        agg_primitives=["sum"],
        trans_primitives=[scalar1, scalar5],
    )
    scalar1_features = [
        f for f in features
        if f.primitive.name == "multiply_numeric_scalar" and " * 1" in f.get_name()
    ]
    scalar5_features = [
        f for f in features
        if f.primitive.name == "multiply_numeric_scalar" and " * 5" in f.get_name()
    ]

    # Make sure we have multiple features of each type
    assert len(scalar1_features) > 1
    assert len(scalar5_features) > 1

    # DFS should use the passed in primitive instance for all features
    assert all([f.primitive is scalar1 for f in scalar1_features])
    assert all([f.primitive is scalar5 for f in scalar5_features])

    file = os.path.join(tmp_path, "features.json")
    ft.save_features(features, file)
    deserialized_features = ft.load_features(file)
    new_scalar1_features = [
        f for f in deserialized_features
        if f.primitive.name == "multiply_numeric_scalar" and " * 1" in f.get_name()
    ]
    new_scalar5_features = [
        f for f in deserialized_features
        if f.primitive.name == "multiply_numeric_scalar" and " * 5" in f.get_name()
    ]

    # After deserialization all features that share a primitive should use the same primitive instance
    new_scalar1_primitive = new_scalar1_features[0].primitive
    new_scalar5_primitive = new_scalar5_features[0].primitive
    assert all([f.primitive is new_scalar1_primitive for f in new_scalar1_features])
    assert all([f.primitive is new_scalar5_primitive for f in new_scalar5_features])
    assert new_scalar1_primitive.value == 1
    assert new_scalar5_primitive.value == 5

    # Test primitive with multiple args - pandas only due to primitive compatibility
    if es.dataframe_type == Library.PANDAS.value:
        distance_to_holiday = DistanceToHoliday(
            holiday="Victoria Day", country="Canada"
        )
        features = ft.dfs(
            entityset=es,
            target_dataframe_name="customers",
            features_only=True,
            agg_primitives=[],
            trans_primitives=[distance_to_holiday],
        )
        distance_features = [
            f for f in features if f.primitive.name == "distance_to_holiday"
        ]
        assert len(distance_features) > 1

        # DFS should use the passed in primitive instance for all features
        assert all([f.primitive is distance_to_holiday for f in distance_features])

        file = os.path.join(tmp_path, "distance_features.json")
        ft.save_features(distance_features, file)
        new_distance_features = ft.load_features(file)

        # After deserialization all features that share a primitive should use the same primitive instance
        new_distance_primitive = new_distance_features[0].primitive
        assert all(
            [f.primitive is new_distance_primitive for f in new_distance_features]
        )
        assert new_distance_primitive.holiday == "Victoria Day"
        assert new_distance_primitive.country == "Canada"

    # Test primitive with list arg
    is_in = IsIn(list_of_outputs=[5, True, "coke zero"])
    features = ft.dfs(
        entityset=es,
        target_dataframe_name="customers",
        features_only=True,
        agg_primitives=[],
        trans_primitives=[is_in],
    )
    is_in_features = [f for f in features if f.primitive.name == "isin"]
    assert len(is_in_features) > 1

    # DFS should use the passed in primitive instance for all features
    assert all([f.primitive is is_in for f in is_in_features])

    file = os.path.join(tmp_path, "is_in_features.json")
    ft.save_features(is_in_features, file)
    new_is_in_features = ft.load_features(file)

    # After deserialization all features that share a primitive should use the same primitive instance
    new_is_in_primitive = new_is_in_features[0].primitive
    assert all([f.primitive is new_is_in_primitive for f in new_is_in_features])
    assert new_is_in_primitive.list_of_outputs == [5, True, "coke zero"]
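As the helper at the top of this section exercises, `ft.save_features`/`ft.load_features` accept three forms; a compact summary, assuming an existing `features` list:

# The three serialization modes exercised by pickle_features_test_helper
# above: file path, open file handle, and in-memory string.
ft.save_features(features, 'features.json')          # to a path
features_a = ft.load_features('features.json')

with open('features.json', 'w') as f:                # to a file object
    ft.save_features(features, f)
with open('features.json') as f:
    features_b = ft.load_features(f)

s = ft.save_features(features)                       # to a string
features_c = ft.load_features(s)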
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import json

# featuretools for automated feature engineering
import featuretools as ft
import featuretools.variable_types as vtypes

featurenames = ft.load_features('../input/features.txt')
print('Number of features: {}'.format(len(featurenames)))

print('Reading in data')

# Read in the datasets and replace the anomalous values
app_train = pd.read_csv('../input/application_train.csv').replace({365243: np.nan})
app_test = pd.read_csv('../input/application_test.csv').replace({365243: np.nan})
bureau = pd.read_csv('../input/bureau.csv').replace({365243: np.nan})
bureau_balance = pd.read_csv('../input/bureau_balance.csv').replace({365243: np.nan})
cash = pd.read_csv('../input/POS_CASH_balance.csv').replace({365243: np.nan})
credit = pd.read_csv('../input/credit_card_balance.csv').replace({365243: np.nan})
previous = pd.read_csv('../input/previous_application.csv').replace({365243: np.nan})
installments = pd.read_csv('../input/installments_payments.csv').replace({365243: np.nan})

app_test['TARGET'] = np.nan

# Join together training and testing
app = app_train.append(app_test, ignore_index=True, sort=True)

def convert_types(df):
def gen_feature_matrix(entityset,
                       features_only=False,
                       feature_matrix_encode=False,
                       saved_features=None):
    '''A function to compute and return (feature_matrix, feature_defs) from a
    featuretools EntitySet.

    entityset: the EntitySet to compute features from
    features_only: only return feature_defs, do not actually compute the feature_matrix
    feature_matrix_encode: whether to return the encoded feature_matrix
        (categorical variables one-hot encoded)
    saved_features: load a predefined feature file and compute the
        feature_matrix based on it
    '''
    if 'goldstandard' in entityset.entity_dict.keys():
        goldstandard_exist = True
        goldstandard_id = 'goldstandard'
    else:
        goldstandard_exist = False
        goldstandard_id = None

    ##FIX manual partition by person_id does NOT improve Dask computing performance
    # ignore 'partition' columns in every entity when building features
    # ignore_variables = dict()
    # for entity in entityset.entities:
    #     if 'partition' in [v.name for v in entity.variables]:
    #         ignore_variables[entity.id] = ['partition']

    ##CAUTION when the entityset is backed by Dask dataframes, only a limited set of primitives is supported
    # agg_primitives_all = ['avg_time_between', 'count', 'all', 'entropy', 'last', 'num_unique', 'n_most_common',
    #                       'min', 'std', 'median', 'mean', 'percent_true', 'trend', 'sum', 'time_since_last', 'any',
    #                       'num_true', 'time_since_first', 'first', 'max', 'mode', 'skew']
    # agg_primitives_dask = ['count', 'all', 'num_unique',  # 'n_most_common',
    #                        'min', 'std', 'mean', 'percent_true', 'sum', 'any',
    #                        'num_true', 'max']

    ## define features per entity (table)
    agg_primitives = [
        'mean', 'max', 'min', 'std', 'last', 'skew', 'time_since_last'
    ]  # 'trend' omitted: it takes extremely long to compute
    include_variables = {
        'measurement':
        ['measurement_datetime', 'value_as_number', 'measurement_concept_id'],
        'observation':
        ['observation_concept_id', 'observation_datetime', 'value_as_number']
    }
    agg_primitives_device_exposure = [
        'count', 'avg_time_between', 'time_since_first'
    ]
    include_entities_device_exposure = ['device_exposure']
    trans_primitives = ['age']
    groupby_trans_primitives = []
    include_entities = ['person']
    primitive_options = {
        tuple(trans_primitives): {
            'include_entities': include_entities
        },
        tuple(agg_primitives): {
            'include_variables': include_variables
        },
        tuple(agg_primitives_device_exposure): {
            'include_entities': include_entities_device_exposure
        },
    }
    ignore_entities = [
        goldstandard_id, 'condition_occurrence', 'drug_exposure',
        'observation_period', 'procedure_occurrence', 'visit_occurrence'
    ]
    ignore_variables = {}
    where_primitives = agg_primitives
    entityset['measurement'][
        'measurement_concept_id'].interesting_values = entityset[
            'measurement'].df['measurement_concept_id'].unique()
    entityset['observation'][
        'observation_concept_id'].interesting_values = entityset[
            'observation'].df['observation_concept_id'].unique()

    # if isinstance(entityset.entities[0].df, pandas.DataFrame):
    #     agg_primitives = agg_primitives_all
    # else:
    #     agg_primitives = agg_primitives_dask

    # build features
    if saved_features is None:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "No features definition file specified, calculating feature "
                "matrix from ground zero ...")
            feature_defs = ft.dfs(
                entityset=entityset,
                target_entity="person",
                features_only=True,
                agg_primitives=agg_primitives + agg_primitives_device_exposure,
                trans_primitives=trans_primitives,
                groupby_trans_primitives=groupby_trans_primitives,
                primitive_options=primitive_options,
                ignore_entities=ignore_entities,
                ignore_variables=ignore_variables,
                where_primitives=where_primitives,
                max_depth=2)
            spinner.write("> generated {} features".format(len(feature_defs)))
            if features_only:
                return feature_defs

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculation completed in {toc - tic:0.4f} seconds")

            if feature_matrix_encode:
                feature_matrix_enc, features_enc = ft.encode_features(
                    feature_matrix, feature_defs)
                spinner.write(
                    "> generated {} encoded features and the feature matrix".format(
                        len(features_enc)))
            spinner.ok("Done")
    else:
        with yaspin(color="yellow") as spinner:
            spinner.write("Using saved features from {} ...".format(saved_features))
            feature_defs = ft.load_features(saved_features)
            spinner.write("> {} features loaded from {}".format(
                len(feature_defs), saved_features))

            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculation completed in {toc - tic:0.4f} seconds")
            spinner.ok("Done")

    if goldstandard_exist:
        if isinstance(entityset.entities[0].df, dd.DataFrame):
            goldstandard = entityset['goldstandard'].df.compute()
        else:
            goldstandard = entityset['goldstandard'].df
    # the encoded matrix is only produced when building features from scratch
    if feature_matrix_encode and saved_features is None:
        feature_matrix = feature_matrix_enc
    if goldstandard_exist:
        feature_matrix = feature_matrix.merge(goldstandard,
                                              on='person_id',
                                              how='right')
    return feature_matrix, feature_defs