def run():
    print("============================================")
    print("Testing Multi Table Pipeline")
    print("============================================")

    orders = pd.read_csv("data/Retail/orders.csv")
    order_products = pd.read_csv("data/Retail/order_products.csv")
    label_times = pd.read_csv("data/Retail/label_times.csv")

    X_train = label_times.sample(frac=0.8)
    X_test = label_times.drop(X_train.index)
    y_train = X_train["label"]
    y_test = X_test["label"]

    entity_set = make_entity_set(orders, order_products)

    multitable = MLPipeline(['dfs', 'random_forest_classifier'])

    updated_hyperparam = MLHyperparam('max_depth', 'int', [1, 10])
    updated_hyperparam.block_name = 'dfs'
    # multitable.update_tunable_hyperparams([updated_hyperparam])

    # Check that the hyperparameters are correct.
    for hyperparam in multitable.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the blocks are correct.
    expected_blocks = {'dfs', 'rf_classifier'}
    blocks = set(multitable.blocks.keys())
    assert expected_blocks == blocks

    # Check that we can score properly.
    produce_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'cutoff_time_in_index'): True
    }

    print("\nFitting pipeline...")
    fit_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'target_entity'): "users",
        ('dfs', 'training_window'): ft.Timedelta("60 days")
    }
    multitable.fit(X_train, y_train, fit_params=fit_params,
                   produce_params=produce_params)
    print("\nFit pipeline.")

    print("\nScoring pipeline...")
    predicted_y_val = multitable.predict(X_test, predict_params=produce_params)
    score = f1_score(predicted_y_val, y_test, average='micro')
    print("\nf1 micro score: %f" % score)

    return score
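# make_entity_set is not shown above; this is only a hedged sketch of what it
# might look like, using the legacy featuretools EntitySet API. The column
# names ("order_id", "user_id", "order_time", "order_product_id") and the
# normalization into a "users" entity (the pipeline's target_entity) are
# assumptions, not the project's actual helper.
import featuretools as ft


def make_entity_set(orders, order_products):
    es = ft.EntitySet(id="retail")
    # Transaction-level table; create a surrogate index if none exists.
    es = es.entity_from_dataframe(entity_id="order_products",
                                  dataframe=order_products,
                                  index="order_product_id",
                                  make_index=True,
                                  time_index="order_time")
    es = es.entity_from_dataframe(entity_id="orders",
                                  dataframe=orders,
                                  index="order_id")
    # One order has many order_products.
    es = es.add_relationship(ft.Relationship(es["orders"]["order_id"],
                                             es["order_products"]["order_id"]))
    # Derive the "users" entity targeted by the pipeline.
    es = es.normalize_entity(base_entity_id="orders",
                             new_entity_id="users",
                             index="user_id")
    return es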
def __init__(self, customer_entity, customer_entity_index, training_window_unit,
             training_window, agg_primitives=None, trans_primitives=None,
             ignore_entities=None, ignore_variables=None, n_jobs=1,
             chunk_size=0.1, drop_contains=None, drop_exact=None,
             entity_set_drop_index_list=None, auto_max_values=None,
             manual_interesting_values_info=None, where_primitives=None,
             default_time_col_name='time', str_id_col_threshold=0.9):
    # Avoid a mutable default argument.
    if entity_set_drop_index_list is None:
        entity_set_drop_index_list = []

    self.customer_entity = customer_entity
    self.customer_entity_index = customer_entity_index
    self.agg_primitives = agg_primitives
    self.trans_primitives = trans_primitives
    self.ignore_entities = ignore_entities
    self.ignore_variables = ignore_variables
    self.training_window_unit = training_window_unit
    self.training_window = training_window
    self.n_jobs = n_jobs
    self.chunk_size = chunk_size
    self.drop_contains = drop_contains
    self.drop_exact = drop_exact
    self.entity_set_drop_index_list = entity_set_drop_index_list
    self.auto_max_values = auto_max_values
    self.manual_interesting_values_info = manual_interesting_values_info
    self.where_primitives = where_primitives
    self.default_time_col_name = default_time_col_name
    self.str_id_col_threshold = str_id_col_threshold
    self.feature_engineering_class = self.get_feature_engineering_class()

    # Further processing: convert the numeric training window into a
    # featuretools Timedelta using the configured unit.
    self.training_window = ft.Timedelta(
        self.training_window,
        unit=self._timedelta_mapper[self.training_window_unit])

    if self.drop_contains is None:
        self.drop_contains = entity_set_drop_index_list
    else:
        # Build a new list rather than extending the caller's list in place.
        self.drop_contains = self.drop_contains + entity_set_drop_index_list
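# The __init__ above relies on a class-level _timedelta_mapper that is not
# shown here. A plausible sketch is below; the exact unit keys and the unit
# codes accepted by ft.Timedelta are assumptions and should be checked against
# the installed featuretools version.
_timedelta_mapper = {
    'hours': 'h',
    'days': 'd',
    'months': 'mo',
    'years': 'Y',
}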
def test_serialization(es):
    primitives_deserializer = PrimitivesDeserializer()
    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.Max()
    max1 = ft.AggregationFeature(value, es['customers'], primitive)

    path = next(es.find_backward_paths('customers', 'log'))
    dictionary = {
        'name': None,
        'base_features': [value.unique_name()],
        'relationship_path': [r.to_dictionary() for r in path],
        'primitive': serialize_primitive(primitive),
        'where': None,
        'use_previous': None,
    }

    assert dictionary == max1.get_arguments()
    assert max1 == \
        ft.AggregationFeature.from_dictionary(dictionary, es,
                                              {value.unique_name(): value},
                                              primitives_deserializer)

    is_purchased = ft.IdentityFeature(es['log']['purchased'])
    use_previous = ft.Timedelta(3, 'd')
    max2 = ft.AggregationFeature(value, es['customers'], primitive,
                                 where=is_purchased, use_previous=use_previous)

    dictionary = {
        'name': None,
        'base_features': [value.unique_name()],
        'relationship_path': [r.to_dictionary() for r in path],
        'primitive': serialize_primitive(primitive),
        'where': is_purchased.unique_name(),
        'use_previous': use_previous.get_arguments(),
    }

    assert dictionary == max2.get_arguments()

    dependencies = {
        value.unique_name(): value,
        is_purchased.unique_name(): is_purchased
    }
    assert max2 == \
        ft.AggregationFeature.from_dictionary(dictionary, es, dependencies,
                                              primitives_deserializer)
def test_serialization(es):
    primitives_deserializer = PrimitivesDeserializer()
    value = ft.IdentityFeature(es["log"].ww["value"])
    primitive = ft.primitives.Max()
    max1 = ft.AggregationFeature(value, "customers", primitive)

    path = next(es.find_backward_paths("customers", "log"))
    dictionary = {
        "name": None,
        "base_features": [value.unique_name()],
        "relationship_path": [r.to_dictionary() for r in path],
        "primitive": serialize_primitive(primitive),
        "where": None,
        "use_previous": None,
    }

    assert dictionary == max1.get_arguments()
    deserialized = ft.AggregationFeature.from_dictionary(
        dictionary, es, {value.unique_name(): value}, primitives_deserializer
    )
    _assert_agg_feats_equal(max1, deserialized)

    is_purchased = ft.IdentityFeature(es["log"].ww["purchased"])
    use_previous = ft.Timedelta(3, "d")
    max2 = ft.AggregationFeature(
        value, "customers", primitive, where=is_purchased, use_previous=use_previous
    )

    dictionary = {
        "name": None,
        "base_features": [value.unique_name()],
        "relationship_path": [r.to_dictionary() for r in path],
        "primitive": serialize_primitive(primitive),
        "where": is_purchased.unique_name(),
        "use_previous": use_previous.get_arguments(),
    }

    assert dictionary == max2.get_arguments()

    dependencies = {
        value.unique_name(): value,
        is_purchased.unique_name(): is_purchased,
    }
    deserialized = ft.AggregationFeature.from_dictionary(
        dictionary, es, dependencies, primitives_deserializer
    )
    _assert_agg_feats_equal(max2, deserialized)
def test_serialization(es):
    primitives_deserializer = PrimitivesDeserializer()
    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.Max()
    max1 = ft.AggregationFeature(value, es['sessions'], primitive)

    dictionary = {
        'base_features': [value.unique_name()],
        'parent_entity_id': 'sessions',
        'primitive': serialize_primitive(primitive),
        'where': None,
        'use_previous': None,
    }

    assert dictionary == max1.get_arguments()
    assert max1 == \
        ft.AggregationFeature.from_dictionary(dictionary, es,
                                              {value.unique_name(): value},
                                              primitives_deserializer)

    is_purchased = ft.IdentityFeature(es['log']['purchased'])
    use_previous = ft.Timedelta(3, 'd')
    max2 = ft.AggregationFeature(value, es['sessions'], primitive,
                                 where=is_purchased, use_previous=use_previous)

    dictionary = {
        'base_features': [value.unique_name()],
        'parent_entity_id': 'sessions',
        'primitive': serialize_primitive(primitive),
        'where': is_purchased.unique_name(),
        'use_previous': use_previous.get_arguments(),
    }

    assert dictionary == max2.get_arguments()

    dependencies = {
        value.unique_name(): value,
        is_purchased.unique_name(): is_purchased
    }
    assert max2 == \
        ft.AggregationFeature.from_dictionary(dictionary, es, dependencies,
                                              primitives_deserializer)
def main():
    logger = logging.getLogger(__name__)
    logger.info('creating a bunch of features')

    pbar = ProgressBar()
    pbar.register()

    target_entities = ['ip', 'app', 'device', 'os', 'channel']
    filenames_train = sorted(glob('../data/interim/train_2017-11-*00.csv'))
    training_windows = ['1 hours', '3 hours', '1 day']

    for target_entity in target_entities:
        filenames = glob(
            f"../data/interim/partitioned/{target_entity}/train_*.csv")
        b = bag.from_sequence(filenames)
        entity_sets = b.map(create_entityset, target_entity).compute()
        gc.collect()

        for filename in filenames_train:
            logger.info(f"Processing: {filename}")
            df = pd.read_csv(filename, usecols=['click_time'],
                             parse_dates=to_parse)
            cutoff_time = df['click_time'].min()
            del df

            for training_window in training_windows:
                create_features(filename, entity_sets,
                                target_entity=target_entity,
                                cutoff_time=cutoff_time,
                                training_window=ft.Timedelta(training_window))

        del entity_sets, b
        gc.collect()

    logger.info('finished')
def dfsWindow(self, target_entity, time_scope=None, training_window=None,
              cutoff_times=None, max_depth=1, chunk_size=None, n_jobs=1):
    '''Runs dfs on the target_entity and outputs a feature matrix with
    features based on the training_window and time_scope relative to cutoff
    times. If no training_window, time_scope, or cutoff_times are specified,
    regular dfs runs without using cutoff times.

    target_entity: str. Name of the target entity in the entity set to run
        dfs on. The index of the target_entity must match the instance_id
        column in the cutoff_times table.
    time_scope: 'daily', 'weekly' or 'monthly'. Assumes 7 days in a week and
        30 days in a month.
    training_window: list of integers giving the number of days, weeks, or
        months (depending on time_scope) to look back from each cutoff date.
        Ex. [1, 2] with time_scope='monthly' returns features based on the
        last month and the last 2 months before the cutoff date.
    cutoff_times: pandas DataFrame with instance_id, cutoff_dates, and label
        (label is optional). Any columns after instance_id and cutoff_dates
        are not used for feature synthesis. The instance_id column must match
        the index of the target entity.
    max_depth: integer, how many levels of dfs to run. For example, with
        max_depth=2 on a transactions table, returned features include
        avg. transactions and avg. of avg. transactions.
    chunk_size: integer, float, None, or "cutoff time". Number of rows of the
        output feature matrix to calculate at a time. An integer greater
        than 0 uses that many rows per chunk; a float between 0 and 1 sets
        the chunk size to that fraction of all instances; "cutoff time"
        splits rows per cutoff time.
    n_jobs: integer. The number of parallel processes to use when creating
        the feature matrix.
    '''
    orig_window = training_window

    if (time_scope is None) or (training_window is None) or (cutoff_times is None):
        self.df, feature_defs = ft.dfs(
            entityset=self.es, target_entity=target_entity,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            where_primitives=self.where_primitives,
            max_depth=max_depth, features_only=False, verbose=1,
            chunk_size=chunk_size, n_jobs=n_jobs)
    else:
        self.df, feature_defs = ft.dfs(
            entityset=self.es, target_entity=target_entity,
            cutoff_time=cutoff_times,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            where_primitives=self.where_primitives,
            max_depth=max_depth, features_only=False, verbose=1,
            chunk_size=chunk_size, n_jobs=n_jobs,
            cutoff_time_in_index=True)

        if time_scope == 'daily':
            training_window = [int(x) for x in orig_window]
            for i in range(len(training_window)):
                feature_matrix = ft.calculate_feature_matrix(
                    entityset=self.es, features=feature_defs,
                    cutoff_time=cutoff_times, chunk_size=chunk_size,
                    cutoff_time_in_index=True, n_jobs=n_jobs,
                    training_window=ft.Timedelta(training_window[i], "d"))
                suffix = '_' + str(orig_window[i]) + 'day'
                feature_matrix = feature_matrix.add_suffix(suffix)
                self.df = pd.concat([self.df, feature_matrix],
                                    axis=1, join='inner')
        elif time_scope == 'monthly':
            training_window = [x * 30 for x in orig_window]
            for i in range(len(training_window)):
                feature_matrix = ft.calculate_feature_matrix(
                    entityset=self.es, features=feature_defs,
                    cutoff_time=cutoff_times, chunk_size=chunk_size,
                    cutoff_time_in_index=True, n_jobs=n_jobs,
                    training_window=ft.Timedelta(training_window[i], "d"))
                suffix = '_' + str(orig_window[i]) + 'mos'
                feature_matrix = feature_matrix.add_suffix(suffix)
                self.df = pd.concat([self.df, feature_matrix],
                                    axis=1, join='inner')
        elif time_scope == 'weekly':
            training_window = [x * 7 for x in orig_window]
            for i in range(len(training_window)):
                feature_matrix, feature_defs = ft.dfs(
                    entityset=self.es, target_entity=target_entity,
                    cutoff_time=cutoff_times,
                    agg_primitives=self.agg_primitives,
                    trans_primitives=self.trans_primitives,
                    where_primitives=self.where_primitives,
                    max_depth=max_depth, features_only=False, verbose=1,
                    chunk_size=chunk_size, cutoff_time_in_index=True,
                    n_jobs=n_jobs,
                    training_window=ft.Timedelta(training_window[i], "d"))
                suffix = '_' + str(orig_window[i]) + 'wks'
                feature_matrix = feature_matrix.add_suffix(suffix)
                self.df = pd.concat([self.df, feature_matrix],
                                    axis=1, join='inner')
        else:
            print("ERROR: time_scope entered is not one of the options.")

    drop_duplicates = DropDuplicate()
    self.df = drop_duplicates.fit_transform(self.df)
    for i in self.df.columns:
        self.feature_defs.append(i)

    return self.df
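# A hypothetical usage sketch for dfsWindow. The enclosing class and its
# constructor are not shown above, so the "FeatureBuilder" name and the exact
# cutoff-table columns here are assumptions; the call computes look-back
# features for the last 1 and 3 months before each instance's cutoff date.
import pandas as pd

cutoff_times = pd.DataFrame({
    "instance_id": [101, 102],
    "cutoff_dates": pd.to_datetime(["2019-06-01", "2019-07-01"]),
    "label": [0, 1],
})

fb = FeatureBuilder(es)  # hypothetical: wraps a featuretools EntitySet as self.es
feature_matrix = fb.dfsWindow(target_entity="customers",
                              time_scope="monthly",
                              training_window=[1, 3],
                              cutoff_times=cutoff_times,
                              max_depth=2,
                              n_jobs=1)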
import featuretools as ft
import pandas as pd
import utils, os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

es = utils.load_entityset("./featuretools_part_1/")
print(es)

label_times = utils.make_labels(es=es,
                                product_name="Banana",
                                cutoff_time=pd.Timestamp('March 15, 2015'),
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=ft.Timedelta("60 days"))

feature_matrix, features = ft.dfs(target_entity="users",
                                  cutoff_time=label_times,
                                  training_window=ft.Timedelta("60 days"),  # same as above
                                  entityset=es,
                                  verbose=True)

# Encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

print("Number of features %s" % len(features_encoded))
print(features_encoded)

# Sample the feature by user input
# Train the classifier
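# A minimal sketch of the classifier step left as a TODO above, using the
# RandomForestClassifier and cross_val_score already imported. The "label",
# "user_id", and "time" column names in label_times are assumptions.
X = fm_encoded.reset_index().merge(label_times)        # join features with labels
y = X.pop("label")                                      # assumed label column name
X = X.drop(columns=["user_id", "time"], errors="ignore").fillna(0)

clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3, scoring="roc_auc")
print("AUC %.2f +/- %.2f" % (scores.mean(), scores.std()))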