def valid_dfs(es, aggregations, transforms, feature_substrings,
              target_entity='log', multi_output=False, max_depth=3,
              max_features=-1, instance_ids=None):
    """Assert that DFS produces features whose names contain the given substrings.

    Runs DFS over ``es`` with the given primitives, keeps only the features
    whose names contain at least one of ``feature_substrings``, calculates a
    matrix for ``instance_ids`` and checks encoding plus the matrix shape.

    Args:
        es: featuretools EntitySet to run DFS against.
        aggregations: Aggregation primitives to apply.
        transforms: Transform primitives to apply.
        feature_substrings: Substring or list of substrings expected in
            generated feature names.
        target_entity: Entity to build features for. Defaults to 'log'.
        multi_output: If True, skip the 1-feature-per-column shape check.
        max_depth: Maximum feature depth for DFS.
        max_features: Cap on generated features (-1 for no cap).
        instance_ids: Instances to calculate. Defaults to [0, 1, 2, 3].

    Raises:
        ValueError: If no generated feature name contains any substring.
    """
    # BUG FIX: avoid a mutable default argument.
    if instance_ids is None:
        instance_ids = [0, 1, 2, 3]
    if not isinstance(feature_substrings, list):
        feature_substrings = [feature_substrings]
    features = dfs(entityset=es,
                   target_entity=target_entity,
                   agg_primitives=aggregations,
                   trans_primitives=transforms,
                   max_features=max_features,
                   max_depth=max_depth,
                   features_only=True)
    # BUG FIX: use any() so a feature matching several substrings is kept
    # once; duplicates previously broke the shape assertion below.
    applicable_features = [
        feat for feat in features
        if any(sub in feat.get_name() for sub in feature_substrings)
    ]
    if not applicable_features:
        raise ValueError('No feature names with %s, verify the name attribute '
                         'is defined and/or generate_name() is defined to '
                         'return %s ' % (feature_substrings, feature_substrings))
    df = ft.calculate_feature_matrix(entityset=es,
                                     features=applicable_features,
                                     instance_ids=instance_ids)
    # Smoke-test that the selected features survive encoding.
    ft.encode_features(df, applicable_features)
    # TODO: check the multi_output shape by checking
    # feature.number_output_features for each feature
    # and comparing it with the matrix shape
    if not multi_output:
        assert len(applicable_features) == df.shape[1]
    return
def feature_encoder(feature_matrix, features, top_n=10, include_unknown=True,
                    to_encode=None, inplace=False, drop_first=False,
                    verbose=False):
    """Thin wrapper around ``ft.encode_features``.

    :param feature_matrix: Feature matrix as a DataFrame.
    :param features: Feature definitions for the feature matrix.
    :param top_n: Number of top values to include per feature; an int, or a
        dict mapping feature name to its own top-n count. Defaults to 10.
    :param include_unknown: Add a feature encoding unknown categories.
        Defaults to True.
    :param to_encode: List of feature names to encode, e.g. ["name1", "name2"].
    :param inplace: Whether to encode in place. Defaults to False.
    :param drop_first: Derive k-1 dummies from k categorical levels by
        dropping the first level. Defaults to False.
    :param verbose: Print progress information.
    :return: Tuple of (encoded feature matrix, encoded feature definitions).
    """
    encode_options = dict(feature_matrix=feature_matrix,
                          features=features,
                          top_n=top_n,
                          include_unknown=include_unknown,
                          to_encode=to_encode,
                          inplace=inplace,
                          drop_first=drop_first,
                          verbose=verbose)
    return ft.encode_features(**encode_options)
def generate_feature_matrix(self, es, target, cutoff, verbose=True):
    """Run DFS over ``es`` and one-hot encode the categorical output.

    Args:
        es: A featuretools entityset that holds injested data.
        target: A string of the target entity name.
        cutoff: A pandas dataframe that indicates cutoff_time for each instance.
        verbose: A boolean indicator of verbose option.

    Returns:
        A pandas dataframe of the calculated (encoded) matrix and the encoded
        feature definitions.
    """
    matrix, definitions = ft.dfs(
        entityset=es,
        target_entity=target,
        agg_primitives=self.agg_prim(),
        trans_primitives=self.trans_prim(),
        cutoff_time=cutoff,
        n_jobs=self.n_jobs(),
        max_depth=self.max_depth(),
        verbose=verbose)
    # One-hot encode categorical values before returning.
    return ft.encode_features(matrix, definitions)
def load_train_data():
    """Build an encoded feature matrix for the Home Credit training data.

    Reads the application and previous-application CSVs from ``C_PATH``,
    wires them into a featuretools EntitySet linked on ``SK_ID_CURR``,
    runs DFS against the ``applications`` entity, and one-hot encodes
    the categorical features.

    Returns:
        tuple: (encoded feature matrix, encoded feature definitions).
    """
    print('Loading CSV data...')
    applications_df = pd.read_csv(C_PATH + 'application_train.csv')
    previous_df = pd.read_csv(C_PATH + 'previous_application.csv')
    # bureau_df = pd.read_csv(C_PATH + 'bureau.csv')
    print("Creating entityset...")
    es = ft.EntitySet(id="home-credit")
    print("Loading applications entity...")
    es = es.entity_from_dataframe(entity_id="applications",
                                  dataframe=applications_df,
                                  index="SK_ID_CURR")
    print("Loading previous entity...")
    es = es.entity_from_dataframe(entity_id="previous",
                                  dataframe=previous_df,
                                  index="SK_ID_PREV")
    # print("Loading bureau data...")
    # es = es.entity_from_dataframe(entity_id="bureau", dataframe=bureau_df, index="SK_ID_BUREAU")
    print("Adding relationships...")
    # Parent/child on SK_ID_CURR: one application row to many previous applications.
    applications_previous = ft.Relationship(es["applications"]["SK_ID_CURR"],
                                            es["previous"]["SK_ID_CURR"])
    es = es.add_relationship(applications_previous)
    # applications_bureau = ft.Relationship(es["applications"]["SK_ID_CURR"], es["bureau"]["SK_ID_CURR"])
    # es = es.add_relationship(applications_bureau)
    # return es
    print("Generating DFS...")
    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity="applications",
                                          verbose=True)
    fm_encoded, defs_encoded = ft.encode_features(feature_matrix, feature_defs)
    return fm_encoded, defs_encoded
def generate_feature_matrix(self, es, target, cutoff, verbose=True):
    """Calculate and encode a feature matrix for the given entityset.

    Args:
        es (featuretools.EntitySet): An already initialized entityset.
            Required if entities and relationships are not defined.
        target (str): A string of the target entity name.
        cutoff (pandas.DataFrame): Specified times at which to calculate
            the features for each instance.
        verbose (bool): An indicator of verbose option.

    Returns:
        pandas.DataFrame, list:
            * The generated feature matrix.
            * List of feature definitions in the feature matrix.
    """
    dfs_kwargs = dict(
        entityset=es,
        target_entity=target,
        agg_primitives=self.agg_prim(),
        trans_primitives=self.trans_prim(),
        cutoff_time=cutoff,
        n_jobs=self.n_jobs(),
        max_depth=self.max_depth(),
        verbose=verbose,
    )
    raw_matrix, raw_defs = ft.dfs(**dfs_kwargs)
    # Encode categorical values into one-hot columns.
    encoded_matrix, encoded_defs = ft.encode_features(raw_matrix, raw_defs)
    return encoded_matrix, encoded_defs
def test_retail_binary(ftens_file='retail_binary_files/ftens.csv',
                       labels_file='retail_binary_files/labels.csv',
                       fl_file='retail_binary_files/fl.p'):
    """Train a DLDB recurrent model on a retail feature tensor and compare to baselines.

    Loads the feature tensor, labels and feature list; derives a flat
    per-customer baseline matrix from each customer's last row; splits
    customers 90/10 into train/test; fits a small recurrent DLDB model
    for one epoch; and scores both the model and the baseline pipeline
    with ROC-AUC.

    Returns:
        tuple: (DLDB ROC-AUC score, baseline pipeline scores).
    """
    ftens, labels, fl = construct_retail_example(ftens_file, labels_file, fl_file)
    # Baseline keeps only the last observation of each customer.
    baseline_ftens = (ftens.reset_index('customer_id', drop=False)
                      .drop_duplicates('customer_id', keep='last')
                      .set_index('customer_id'))
    baseline_ftens, baseline_fl = ft.encode_features(baseline_ftens, fl)
    baseline_ftens, baseline_fl = remove_low_information_features(
        baseline_ftens, baseline_fl)
    train_customers, test_customers = train_test_split(
        baseline_ftens.index.values, shuffle=True, test_size=0.1)
    train_labels = labels.loc[train_customers]
    test_labels = labels.loc[test_customers]
    # Full per-customer sequences feed the recurrent model.
    train_ftens = ftens.loc[(train_customers, slice(None)), :]
    test_ftens = ftens.loc[(test_customers, slice(None)), :]
    baseline_train_fm = baseline_ftens.loc[train_customers, :]
    baseline_test_fm = baseline_ftens.loc[test_customers, :]
    dl_model = DLDB(regression=False,
                    classes=[False, True],
                    recurrent_layer_sizes=(32, ),
                    dense_layer_sizes=(32, 32),
                    categorical_max_vocab=10)
    dl_model.fit(train_ftens, train_labels, fl=fl, epochs=1, batch_size=32)
    predictions = dl_model.predict(test_ftens)
    score = roc_auc_score(test_labels, predictions)
    baseline_scores = score_baseline_pipeline(baseline_train_fm, train_labels,
                                              baseline_test_fm, test_labels)
    return score, baseline_scores
def autoFeatureEngineering(es, target_entityName):
    """Run depth-1 DFS with numeric transforms, then encode and clean the matrix.

    Encodes categoricals one-hot, fills NAs with 0, drops low-information
    features, and converts +/-inf to NaN before returning.

    NOTE(review): relies on a module-level ``targetColumn`` name in the
    ``ignore_variables`` option — confirm it is defined at import time.

    Args:
        es: featuretools EntitySet to run DFS against.
        target_entityName: Name of the target entity.

    Returns:
        pandas.DataFrame: The encoded, cleaned feature matrix.
    """
    from featuretools.selection import remove_low_information_features
    fm, features = ft.dfs(
        entityset=es,
        target_entity=target_entityName,
        # agg_primitives=['Sum', 'Mean', 'Percent_True'],
        trans_primitives=['divide_numeric', 'multiply_numeric'],  # 'add_numeric'
        max_depth=1,
        ignore_variables={'toolgkpi': ['MFG_DATE', targetColumn]},
        verbose=True)
    # One-hot encode the categorical features.
    fm_enc, f_enc = ft.encode_features(fm, features)
    # Replace NA with 0.
    fm_enc = fm_enc.fillna(0)
    # Drop features carrying little information.
    fm_enc = remove_low_information_features(fm_enc)
    # BUG FIX: DataFrame.replace returns a new frame; the original code
    # discarded the result, so inf values were never converted to NaN.
    fm_enc = fm_enc.replace([np.inf, -np.inf], np.nan)
    print(fm_enc.isnull().sum())
    print(fm_enc.columns)
    return fm_enc
def _fit_and_return_result(self, *, timeout: float = None, iterations: int = None):
    """Run DFS over the stored entityset and return the fitted feature matrix.

    Builds cutoff times from the target entity's time index when present,
    selects primitives from boolean ``aggregation_*`` / ``transform_*``
    hyperparams, and optionally encodes and prunes the resulting matrix.

    Returns:
        The feature matrix when it was computed; ``None`` when only feature
        definitions were requested (``features_only``).

    Raises:
        ValueError: If ``.set_training_data()`` was not called first.
    """
    if self._entityset is None:
        raise ValueError(
            'Must call .set_training_data() before calling .fit()')

    # Ignore the label column unless cutoff times make leakage impossible.
    ignore_variables = {self._target_entity: [self._target]}
    time_index = self._entityset[self._target_entity].time_index
    index = self._entityset[self._target_entity].index
    cutoff_time = None
    if time_index:
        target_df = self._entityset[self._target_entity].df
        cutoff_time = target_df[[index, time_index]]
        ignore_variables = None

    # Skip matrix computation entirely when no post-processing is requested.
    features_only = (not self.hyperparams['encode']
                     and not self.hyperparams['remove_low_information'])

    agg_primitives = [
        name[12:] for name, value in self.hyperparams.items()
        if name.startswith('aggregation_') and value
    ]
    trans_primitives = [
        name[10:] for name, value in self.hyperparams.items()
        if name.startswith('transform_') and value
    ]

    res = ft.dfs(entityset=self._entityset,
                 target_entity=self._target_entity,
                 cutoff_time=cutoff_time,
                 cutoff_time_in_index=False,
                 features_only=features_only,
                 ignore_variables=ignore_variables,
                 max_depth=self.hyperparams['max_depth'],
                 agg_primitives=agg_primitives,
                 trans_primitives=trans_primitives)

    if not features_only:
        # BUG FIX: unpack the DFS result first so `fm` is always defined.
        # Previously, with encode=False and remove_low_information=True,
        # `fm` was referenced before assignment (NameError).
        fm, self._features = res
        if self.hyperparams['encode']:
            fm, self._features = ft.encode_features(
                fm, self._features,
                top_n=self.hyperparams['top_n'],
                include_unknown=self.hyperparams['include_unknown'])
        if self.hyperparams['remove_low_information']:
            fm, self._features = remove_low_information_features(
                fm, self._features)

        self._fitted = True
        return fm
    else:
        self._fitted = True
        self._features = res
def dfs(self, X=None, target_entity=None, entityset=None, entities=None,
        relationships=None):
    """Build DFS feature definitions (stored on ``self.features``).

    Either an ``entityset`` or raw ``entities``/``relationships`` may be
    given; when neither is present, the input frame ``X`` itself becomes
    the target entity named 'X'.

    NOTE(review): nothing is returned — when ``encode`` or
    ``remove_low_information`` is set, the computed matrix ``X`` is used
    only to refine ``self.features`` and is then discarded; confirm
    callers rely solely on ``self.features``.
    """
    if not entities and not entityset:
        target_entity = 'X'
    else:
        target_entity = target_entity or self.target_entity
    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)
    if self.training_window is not None:
        # Training windows require last-time indexes on the entityset.
        entityset.add_last_time_indexes()
    cutoff_time = None
    if self.time_index:
        cutoff_time = X[[self.index, self.time_index]]
    self.features = ft.dfs(
        cutoff_time=cutoff_time,
        max_depth=self.max_depth,
        entityset=entityset,
        target_entity=target_entity,
        features_only=True,
        agg_primitives=self.agg_primitives,
        trans_primitives=self.trans_primitives,
        max_features=self.max_features,
        training_window=self.training_window,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
    )
    if self.encode or self.remove_low_information:
        # Only materialize the matrix when encoding/pruning needs it.
        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )
        if self.encode:
            X, self.features = ft.encode_features(X, self.features)
        if self.remove_low_information:
            X, self.features = remove_low_information_features(X, self.features)
def compute_features(features, cutoff_time):
    """Calculate the feature matrix and one-hot encode the neighborhood features.

    Args:
        features: List of featuretools feature definitions; shuffled in place.
        cutoff_time: Cutoff-time frame passed to ``calculate_feature_matrix``.

    Returns:
        pandas.DataFrame: The encoded feature matrix.
    """
    # Shuffle so we don't see encoded features in the front or backs.
    np.random.shuffle(features)
    feature_matrix = ft.calculate_feature_matrix(features,
                                                 cutoff_time=cutoff_time,
                                                 approximate='36d',
                                                 verbose=True)
    # BUG FIX: Python 2 print statement replaced with a Python 3 print() call.
    print("Finishing computing...")
    feature_matrix, features = ft.encode_features(
        feature_matrix, features,
        to_encode=["pickup_neighborhood", "dropoff_neighborhood"],
        include_unknown=False)
    return feature_matrix
def create_features(es, label='Outcome', custom_agg=[]):
    """DFS over 'transactions' with per-transaction cutoffs; return encoded features and labels.

    NOTE(review): ``custom_agg=[]`` is a mutable default argument — never
    mutated here, but consider ``None``.
    NOTE(review): the label is popped from the *un-encoded* ``fm``, so
    columns derived from ``label`` may still be present in the returned
    ``fm_enc`` — verify there is no label leakage.

    Args:
        es: featuretools EntitySet with a 'transactions' entity.
        label: Name of the label column. Defaults to 'Outcome'.
        custom_agg: Extra aggregation primitives to add to Sum/Mean.

    Returns:
        tuple: (encoded, NA-filled, pruned feature matrix; label Series).
    """
    cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', label]]
    fm, features = ft.dfs(entityset=es,
                          target_entity='transactions',
                          agg_primitives=[Sum, Mean] + custom_agg,
                          trans_primitives=[Hour],
                          max_depth=3,
                          approximate='2m',
                          cutoff_time=cutoff_times,
                          verbose=True)
    fm_enc, _ = ft.encode_features(fm, features)
    fm_enc = fm_enc.fillna(0)
    fm_enc = remove_low_information_features(fm_enc)
    labels = fm.pop(label)
    return (fm_enc, labels)
def calculate_feature_matrix(es, target_entity, trans_primitives,
                             agg_primitives, max_depth):
    """Run DFS, report the feature count, then encode categoricals and zero-fill NAs.

    Args:
        es: featuretools EntitySet to run DFS against.
        target_entity: Name of the target entity.
        trans_primitives: Transform primitives to apply.
        agg_primitives: Aggregation primitives to apply.
        max_depth: Maximum feature depth.

    Returns:
        tuple: (encoded, NA-filled feature matrix; encoded feature definitions).
    """
    matrix, definitions = ft.dfs(entityset=es,
                                 target_entity=target_entity,
                                 trans_primitives=trans_primitives,
                                 agg_primitives=agg_primitives,
                                 max_depth=max_depth,
                                 verbose=True)
    print("{} features generated".format(len(definitions)))
    encoded_matrix, encoded_defs = ft.encode_features(matrix, definitions)
    encoded_matrix = encoded_matrix.fillna(0)
    return encoded_matrix, encoded_defs
def generate_features(self): """ 06. Run deep feature synthesis . """ # Create new features using specified primitives self.feature_matrix, self.feature_defs = ft.dfs( entityset=self.es, target_entity='users', trans_primitives=self.trans_primitives, agg_primitives=self.agg_primitives, verbose=1, max_depth=self.max_feature_depth) # encode at self.feature_matrix_enc, self.features_enc = ft.encode_features( self.feature_matrix, self.feature_defs) self.next(self.split_training_data)
def get_train_data(project, train_file, prediction_key, prediction_target,
                   variable_types={}, drop_columns=None):
    """Load a training CSV, run DFS, encode the matrix, and persist features.

    NOTE(review): ``variable_types={}`` is a mutable default — not mutated
    here, but consider ``None``.

    Args:
        project: Project name; used as EntitySet id, target entity name, and
            the save path ``data/<project>/ft_features``.
        train_file: Path to the training CSV.
        prediction_key: Index/key column for the entity.
        prediction_target: Label column; extracted then dropped before DFS.
        variable_types: Optional featuretools variable-type overrides.
        drop_columns: Optional list of columns to drop before DFS.

    Returns:
        tuple: (encoded feature matrix, training label array).
    """
    # Read the training data
    print("==========Reading the training file {}".format(train_file))
    train_data = pd.read_csv(train_file)
    train_data.head(5)  # no-op: result unused
    print("==========Preparing training labels for target {}".format(
        prediction_target))
    train_labels = train_data[prediction_target].values
    train_data = train_data.drop(prediction_target, axis=1)
    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        train_data = train_data.drop(drop_columns, axis=1)
    print("==========Generating the feature with featuretools")
    es = ft.EntitySet(project)
    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=train_data,
                               variable_types=variable_types)
    print("==========entities are:")
    print(entities)
    feature_matrix, feature_defs = ft.dfs(entityset=entities,
                                          target_entity=project)
    feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    print("==========columns are:")
    print(feature_matrix_enc.columns)
    print("==========saving features to {}".format(project))
    # Persist the *un-encoded* definitions for reuse at prediction time.
    ft.save_features(feature_defs, "data/{}/ft_features".format(project))
    return feature_matrix_enc, train_labels
def build_card_one_hot():
    """
    Reads in the raw data from train.csv and creates one-hot encodings for the
    feature and date fields.

    DFS runs on a single 'transactions' entity built from train.csv; the
    encoded feature definitions are saved to disk and replayed on test.csv
    so train and test share the same columns.

    :return: Data frame with one-hot encoding (train, test matrices)
    """
    logger = logging.getLogger(__name__)
    logger.info("Reading in data.")
    df = pd.read_csv('data/raw/train.csv')
    # 'YYYY-MM' strings -> first-of-month timestamps.
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    logger.info("Creating entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=df,
                                              index='card_id',
                                              time_index="first_active_month",
                                              variable_types=CARD_TYPES)
    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity="transactions")
    logger.info("Creating one-hot training data")
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    # Round-trip the encoded definitions through disk so test uses the
    # exact same encoded columns.
    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')
    logger.info("Creating one-hot test data")
    df = pd.read_csv('data/raw/test.csv')
    df['first_active_month'] = pd.to_datetime(df['first_active_month'] + "-01")
    # Dummy target so the saved definitions can be calculated; dropped below.
    df['target'] = 0
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=df,
                                            index='card_id',
                                            time_index="first_active_month",
                                            variable_types=CARD_TYPES)
    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    test_feature_matrix_enc.drop(columns='target', inplace=True)
    return train_feature_matrix_enc, test_feature_matrix_enc
def get_final_data(or_df: pd.DataFrame, features_def, **kwds):
    '''Check dtypes (only numeric/categorical/boolean supported) and return numeric data.

    1. drop unsupported columns and their feature definitions
    2. one-hot encode categorical/boolean columns

    :param or_df: Original feature matrix.
    :param features_def: Feature definitions matching ``or_df``.
    :param kwds: Extra keyword arguments forwarded to ``ft.encode_features``.
    :return: Tuple of (numeric feature matrix, matching feature definitions).
    '''
    # Keep only pandas-numeric plus bool/category dtypes.
    unnum = ['bool', 'category']
    numeric_and_boolean_dtypes = vtypes.PandasTypes._pandas_numerics + unnum
    clean_df = or_df.select_dtypes(include=numeric_and_boolean_dtypes)
    # Warn about (and drop definitions for) anything that was filtered out.
    unuse_col = set(or_df.columns) - set(clean_df.columns)
    features_def = [
        feat for feat in features_def if feat.get_name() not in unuse_col
    ]
    warnings.warn(f'{unuse_col} columns will be dropped because the dtype')
    # One-hot encode the remaining categorical/boolean columns.
    clean_df, features_def = ft.encode_features(clean_df, features_def, **kwds)
    return clean_df, features_def
def dfs(self, X=None, target_entity=None, entityset=None, entities=None,
        relationships=None):
    """Build DFS features and an encoded/pruned matrix for ``X``.

    Cutoff times come from ``self.time_index`` when set; otherwise instance
    ids are taken from ``X[self.index]`` or from ``X``'s own index.

    NOTE(review): the function returns ``None`` — the computed matrix ``X``
    is discarded after refining ``self.features``; confirm that is intended.
    """
    if not entities and not entityset:
        target_entity = 'X'
    else:
        target_entity = target_entity or self.target_entity
    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)
    instance_ids = None
    cutoff_time = None
    if self.time_index:
        cutoff_time = X[[self.index, self.time_index]]
    elif self.index:
        instance_ids = X[self.index]
    else:
        instance_ids = X.index.values
    self.features = ft.dfs(
        cutoff_time=cutoff_time,
        instance_ids=instance_ids,
        max_depth=self.max_depth,
        entityset=entityset,
        target_entity=target_entity,
        features_only=True,
        agg_primitives=self.agg_primitives,
        trans_primitives=self.trans_primitives
    )
    X = ft.calculate_feature_matrix(
        self.features,
        entityset=entityset,
        cutoff_time=cutoff_time,
        instance_ids=instance_ids,
    )
    if self.encode:
        X, self.features = ft.encode_features(X, self.features)
    if self.remove_low_information:
        X, self.features = remove_low_information_features(X, self.features)
def produce(self, X, instance_ids=None, include_unknown=True,
            remove_low_information=True, **kwargs):
    """Calculate, type-normalize, encode and optionally prune the feature matrix.

    Args:
        X: Cutoff-time frame, used when ``instance_ids`` is not given.
        instance_ids: Optional explicit instances to compute; the result is
            re-ordered to match their order.
        include_unknown: Forwarded to ``ft.encode_features`` — add an
            "unknown" category column per encoded feature.
        remove_low_information: Drop low-information features after encoding.
        **kwargs: Forwarded to ``ft.calculate_feature_matrix``.

    Returns:
        pandas.DataFrame: Encoded feature matrix with NAs filled with 0.
    """
    if instance_ids is not None:
        feature_matrix = ft.calculate_feature_matrix(
            self.features, instance_ids=instance_ids, **kwargs)
        # Re-order rows to match the requested instance order.
        feature_matrix = (feature_matrix.reset_index('time')
                          .loc[instance_ids, :]
                          .set_index('time', append=True))
    else:
        feature_matrix = ft.calculate_feature_matrix(self.features,
                                                     cutoff_time=X,
                                                     **kwargs)
    # Normalize column dtypes by declared variable type before encoding.
    for f in self.features:
        name = f.get_name()
        if issubclass(f.variable_type, vtypes.Discrete):
            feature_matrix[name] = feature_matrix[name].astype(object)
        elif issubclass(f.variable_type, vtypes.Numeric):
            feature_matrix[name] = pd.to_numeric(feature_matrix[name])
        elif issubclass(f.variable_type, vtypes.Datetime):
            feature_matrix[name] = pd.to_datetime(feature_matrix[name])
    # BUG FIX: `include_unknown` was accepted but never forwarded to
    # encode_features, silently ignoring the caller's choice.
    encoded_fm, encoded_fl = ft.encode_features(
        feature_matrix, self.features, include_unknown=include_unknown)
    if remove_low_information:
        encoded_fm, encoded_fl = remove_low_information_features(
            encoded_fm, encoded_fl)
    encoded_fm.reset_index('time', drop=True, inplace=True)
    return encoded_fm.fillna(0)
def get_test_data(project, testfile, prediction_key, prediction_target,
                  variable_types={}, drop_columns=None):
    """Load a test CSV and compute its feature matrix from saved definitions.

    NOTE(review): ``prediction_target`` is accepted but unused here —
    confirm whether the test file is expected to lack the label column.
    NOTE(review): ``variable_types={}`` is a mutable default — not mutated
    here, but consider ``None``.

    Args:
        project: Project name; used as EntitySet id and the load path
            ``data/<project>/ft_features``.
        testfile: Path to the test CSV.
        prediction_key: Index/key column for the entity.
        prediction_target: Unused (see note above).
        variable_types: Optional featuretools variable-type overrides.
        drop_columns: Optional list of columns to drop before building.

    Returns:
        tuple: (encoded feature matrix, Series of key values).
    """
    print("==========Reading test data file {}".format(testfile))
    test_data = pd.read_csv(testfile)
    print(test_data.describe())
    if drop_columns is not None:
        print("==========dropping columns {}".format(drop_columns))
        test_data = test_data.drop(drop_columns, axis=1)
    es = ft.EntitySet(project)
    entities = get_ft_entities(es=es,
                               project=project,
                               prediction_key=prediction_key,
                               data=test_data,
                               variable_types=variable_types)
    print("==========entities are:")
    print(entities)
    print("==========Reading features from {}".format(project))
    saved_features = ft.load_features("data/{}/ft_features".format(project))
    print("==========saved_features are:")
    print(saved_features)
    feature_matrix = ft.calculate_feature_matrix(saved_features, entities)
    feature_matrix_enc, _ = ft.encode_features(feature_matrix, saved_features)
    index_column = test_data[prediction_key]
    return feature_matrix_enc, index_column
def compute_features(self, df, cutoff_strategy, feature_window):
    """Compute, encode and store per-entity features from generated cutoffs.

    Cutoff times are taken one day before each generated 'cutoff_st'. The
    encoded, zero-filled matrix is stored on ``self.features``.
    """
    assert cutoff_strategy.entity_col == self.entity_col
    cutoffs = cutoff_strategy.generate_cutoffs(df)
    # Build (instance_id, time) pairs, one day before each cutoff.
    pairs = [(row[self.entity_col], row['cutoff_st'] - timedelta(days=1))
             for _, row in cutoffs.iterrows()]
    cutoff_frame = pd.DataFrame(pairs, columns=['instance_id', 'time'])
    matrix, definitions = ft.dfs(target_entity=self.entity_col,
                                 cutoff_time=cutoff_frame,
                                 training_window="%dday" % feature_window,  # same as above
                                 entityset=self.es,
                                 cutoff_time_in_index=True,
                                 verbose=True)
    # Encode categorical values, then fill NAs with 0.
    encoded, _unused_defs = ft.encode_features(matrix, definitions)
    self.features = encoded.fillna(0)
def dfs(self, X=None, target_entity='X', entityset=None, entities=None,
        relationships=None):
    """Build DFS features for the instances present in ``X``.

    Cutoff times come from the target entity's own time index when present.

    NOTE(review): the function returns ``None`` — the computed matrix ``X``
    is discarded after refining ``self.features``; confirm that is intended.
    """
    if entityset is None:
        entityset = self._get_entityset(X, target_entity, entities, relationships)
    target = entityset[target_entity]
    time_index = target.time_index
    index = target.index
    cutoff_time = None
    if time_index:
        cutoff_time = target.df[[index, time_index]]
    # Copy so downstream mutation of X cannot alias the id array.
    instance_ids = X[index].values.copy()
    self.features = ft.dfs(cutoff_time=cutoff_time,
                           max_depth=self.max_depth,
                           entityset=entityset,
                           target_entity=target_entity,
                           features_only=True,
                           instance_ids=instance_ids)
    X = ft.calculate_feature_matrix(self.features,
                                    entityset=entityset,
                                    instance_ids=instance_ids)
    if self.encode:
        X, self.features = ft.encode_features(X, self.features)
    if self.remove_low_information:
        X, self.features = remove_low_information_features(X, self.features)
"GarageFinish": ft.variable_types.Categorical, "GarageQual": ft.variable_types.Categorical, "GarageCond": ft.variable_types.Categorical, "PavedDrive": ft.variable_types.Categorical, "PoolQC": ft.variable_types.Categorical, "Fence": ft.variable_types.Categorical, "MiscFeature": ft.variable_types.Categorical, "MoSold": ft.variable_types.Categorical, "YrSold": ft.variable_types.Categorical, "SaleType": ft.variable_types.Categorical, "SaleCondition": ft.variable_types.Categorical }) #The training Set feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="HousingSet") fm_encoded, features_encoded = ft.encode_features(feature_matrix, feature_defs) #Need to normalize the housing prices house_prices = np.log1p(house_prices) #Lets see labels for each X, y = fm_encoded, house_prices ###Now testing set feature_matrix_test, feature_defs_test = ft.dfs(entityset=es, target_entity="HousingTest") fm_encoded_test, features_encoded_test = ft.encode_features( feature_matrix_test, feature_defs_test) Actual_test = fm_encoded_test ##Fixing the alignment X, Actual_test = X.align(Actual_test, join='left', axis=1, fill_value=0)
def build_transaction_data():
    """
    Builds a data set from raw card and transaction data using the
    featuretools package. The resulting data set will be strictly concerned
    with transactions shown in the historical transactions CSV, and linking
    them to the proper card.

    NOTE(review): the normalization loop drops columns from the frame it is
    iterating over (via a snapshot of ``.columns``) and reuses the training
    ``relationship`` object in the test entityset — confirm both behave as
    intended across featuretools versions.

    :return: training, testing feature matrices
    """
    logger = logging.getLogger(__name__)
    logger.info("Reading in card data")
    customer_df = pd.read_csv("data/raw/train.csv")
    # 'YYYY-MM' strings -> first-of-month timestamps.
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")
    customer_df.drop(columns='target', inplace=True)
    logger.info("Reading in transactions")
    transactions_df = pd.read_csv("data/raw/historical_transactions.csv",
                                  dtype=TRANSACTION_LOAD_DTYPES)
    # 'Y'/'N' flag -> 1/0.
    transactions_df['authorized_flag'] = np.where(
        transactions_df['authorized_flag'] == 'Y', 1, 0)
    transactions_df.reset_index(inplace=True)
    logger.info("Creating training entity set")
    es_train = ft.EntitySet()
    es_train = es_train.entity_from_dataframe(entity_id='customer',
                                              dataframe=customer_df,
                                              index='card_id',
                                              time_index='first_active_month',
                                              variable_types=CARD_TYPES)
    es_train = es_train.entity_from_dataframe(entity_id='transactions',
                                              dataframe=transactions_df,
                                              index='index',
                                              variable_types=TRANSACTION_TYPES)
    del customer_df
    gc.collect()
    logger.info("Defining relationships")
    relationship = ft.Relationship(es_train['customer']['card_id'],
                                   es_train['transactions']['card_id'])
    es_train = es_train.add_relationship(relationship)
    feature_matrix, feature_defs = ft.dfs(entityset=es_train,
                                          target_entity='customer')
    train_feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    # Round-trip the encoded definitions so test uses identical columns.
    ft.save_features(features_enc, "feature_definitions")
    saved_features = ft.load_features('feature_definitions')
    logger.info("Loading test data")
    customer_df = pd.read_csv("data/raw/test.csv")
    customer_df['first_active_month'] = pd.to_datetime(
        customer_df['first_active_month'] + "-01")
    logger.info("Creating testing entity set")
    es_test = ft.EntitySet()
    es_test = es_test.entity_from_dataframe(entity_id='customer',
                                            dataframe=customer_df,
                                            index='card_id',
                                            time_index='first_active_month',
                                            variable_types=CARD_TYPES)
    es_test = es_test.entity_from_dataframe(entity_id='transactions',
                                            dataframe=transactions_df,
                                            index='index',
                                            variable_types=TRANSACTION_TYPES)
    es_test = es_test.add_relationship(relationship)
    test_feature_matrix_enc = ft.calculate_feature_matrix(
        saved_features, es_test)
    # Min-max normalize train, applying the same bounds to test; drop
    # constant columns from both.
    for col in train_feature_matrix_enc.columns:
        logger.debug(f"Normalizing feature [{col}]")
        old_min, old_max = train_feature_matrix_enc[col].agg(['min', 'max'])
        if (old_min == old_max):
            logger.debug(f"Droping feature [{col}] due to lack of variation")
            train_feature_matrix_enc.drop(columns=col, inplace=True)
            test_feature_matrix_enc.drop(columns=col, inplace=True)
            continue
        train_feature_matrix_enc[col] = normalize_series(
            series=train_feature_matrix_enc[col], min_max=(old_min, old_max))
        assert col in test_feature_matrix_enc.columns
        test_feature_matrix_enc[col] = normalize_series(
            series=test_feature_matrix_enc[col], min_max=(old_min, old_max))
    logger.info("Dropping SKEW features.")
    # TODO: Determine why these have lower counts than other features
    drop_cols = [c for c in train_feature_matrix_enc.columns if "SKEW" in c]
    train_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    test_feature_matrix_enc.drop(columns=drop_cols, inplace=True)
    return train_feature_matrix_enc, test_feature_matrix_enc
def generate_feature_matrix(self, es, target, label_times, instance_ids=None,
                            agg_primitives=AGG_PRIMITIVES,
                            trans_primitives=TRANS_PRIMITIVES, max_depth=2,
                            ignore_entities=None, ignore_variables=None,
                            seed_features=None, drop_contains=None,
                            drop_exact=None, max_features=-1,
                            training_window=None, n_jobs=1, verbose=False,
                            include_cutoff_time=True, encode=False):
    """Calculate a feature matrix and feature definitions for an entityset.

    Args:
        es (featuretools.EntitySet): An already initialized entityset.
        target (str): Entity id on which to make predictions.
        label_times (pandas.DataFrame): ``instance_id`` / ``time`` / ``label``
            frame; ``time`` is the cutoff time per instance (only data before
            it is used), ``label`` is the ground truth to predict.
        instance_ids (list): Instances to calculate features for.
        agg_primitives (list): Aggregation primitives to apply.
        trans_primitives (list): Transform primitives to apply.
        max_depth (int): Maximum allowed depth of features.
        ignore_entities (list): Entities to blacklist when creating features.
        ignore_variables (dict): Per-entity variables to blacklist.
        seed_features (list): Manually defined features to use.
        drop_contains (list): Drop features whose names contain these strings.
        drop_exact (list): Drop features whose names exactly match these.
        max_features (int): Cap on generated features; -1 means no limit.
        training_window (ft.Timedelta or str): How much pre-cutoff data may be
            used; ``None`` means all of it. Relative month/year units should
            be passed as a Featuretools Timedelta or a string.
        n_jobs (int): Parallel processes for the matrix calculation.
        verbose (bool): An indicator of verbose option.
        include_cutoff_time (bool): Include data at the cutoff times.
        encode (bool): Whether to one-hot encode categorical features.

    Returns:
        pandas.DataFrame, list:
            * The generated feature matrix.
            * List of feature definitions in the feature matrix.
    """
    dfs_kwargs = dict(
        entityset=es,
        target_entity=target,
        cutoff_time=label_times,
        instance_ids=instance_ids,
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        max_depth=max_depth,
        ignore_entities=ignore_entities,
        ignore_variables=ignore_variables,
        seed_features=seed_features,
        drop_contains=drop_contains,
        drop_exact=drop_exact,
        max_features=max_features,
        training_window=training_window,
        n_jobs=n_jobs,
        verbose=verbose,
        include_cutoff_time=include_cutoff_time,
    )
    matrix, definitions = ft.dfs(**dfs_kwargs)
    if encode:
        # One-hot encode categorical values.
        return ft.encode_features(matrix, definitions)
    return matrix, definitions
es['transactions']['date_of_birth'].interesting_values = [ '1986-08-18', '1986-08-19' ] #'where_primitives' to specify agg primitives in agg_primitives feature_matrix, feature_defs = ft.dfs( entityset=es, target_entity='products', where_primitives=['count'], agg_primitives=['count', 'mean' ], # specified, otherwise defaults primitives will be used max_depth=1) print(feature_matrix.columns.tolist()) print(feature_matrix.head()) print(feature_defs) print('-----------encode category feature-----------') feature_matrix_enc, feature_enc = ft.encode_features(feature_matrix, feature_defs) print(feature_matrix_enc.columns.tolist()) print(feature_matrix_enc.head()) print(feature_enc) print('-----------list primitives---------------------') print(ft.list_primitives().head()) print('----------custom primitives----------------------') from featuretools.primitives import make_agg_primitive, make_trans_primitive from featuretools.variable_types import Text, Numeric def absolute(column): return abs(column)
def gen_feature_matrix(entityset,
                       features_only=False,
                       feature_matrix_encode=False,
                       saved_features=None):
    '''A function compute and return (feature_matrix, feature_defs) from an
    featuretools EntitySet

    entityset: the EntitySet to compute features from
    features_only: only return feature_defs, do not actually compute the feature_matrix
    feature_matrix_encode: whether return encoded feature_matrix (Categorical variable one-hot)
    saved_features: load a pre defined feature file and compute feature_matrix based on it

    NOTE(review): ``feature_matrix_enc`` is only assigned on the
    ``saved_features is None`` path; calling with ``saved_features`` set AND
    ``feature_matrix_encode=True`` raises NameError at the final swap —
    confirm and guard if both options are meant to combine.
    '''
    # 'goldstandard' holds the labels; remember whether it exists so labels
    # can be merged back into the matrix at the end.
    if 'goldstandard' in entityset.entity_dict.keys():
        goldstandard_exist = True
        goldstandard_id = 'goldstandard'
    else:
        goldstandard_exist = False
        goldstandard_id = None
    ##FIX manual partition by person_id does NOT improve Dask computing performance
    # ignore 'partition' columns in every entity when building features
    # ignore_variables = dict()
    # for entity in entityset.entities:
    #     if 'partition' in [v.name for v in entity.variables]:
    #         ignore_variables[entity.id] = ['partition']
    ##CAUTION when the entityset is backed by Dask dataframes, only limited set of primitives are supported
    # agg_primitives_all=['avg_time_between', 'count', 'all', 'entropy', 'last', 'num_unique', 'n_most_common',
    #                     'min', 'std', 'median', 'mean', 'percent_true', 'trend', 'sum', 'time_since_last', 'any',
    #                     'num_true', 'time_since_first', 'first', 'max', 'mode', 'skew']
    # agg_primitives_dask=['count', 'all', 'num_unique', #'n_most_common',
    #                      'min', 'std', 'mean', 'percent_true', 'sum', 'any',
    #                      'num_true', 'max']
    ## define features per entity(table)
    agg_primitives = [
        'mean', 'max', 'min', 'std', 'last', 'skew', 'time_since_last'
    ]  # 'trend'  # trend takes extremely long time to compute
    include_variables = {
        'measurement':
        ['measurement_datetime', 'value_as_number', 'measurement_concept_id'],
        'observation':
        ['observation_concept_id', 'observation_datetime', 'value_as_number']
    }
    agg_primitives_device_exposure = [
        'count', 'avg_time_between', 'time_since_first'
    ]
    include_entities_device_exposure = ['device_exposure']
    trans_primitives = ['age']
    groupby_trans_primitives = []
    include_entities = ['person']
    # Scope each primitive group to specific entities/variables.
    primitive_options = {
        tuple(trans_primitives): {
            'include_entities': include_entities
        },
        tuple(agg_primitives): {
            'include_variables': include_variables
        },
        tuple(agg_primitives_device_exposure): {
            'include_entities': include_entities_device_exposure
        },
    }
    ignore_entities = [
        goldstandard_id, 'condition_occurrence', 'drug_exposure',
        'observation_period', 'procedure_occurrence', 'visit_occurrence'
    ]
    ignore_variables = {}
    where_primitives = agg_primitives
    # Interesting values enable conditional ("where") features per concept id.
    entityset['measurement'][
        'measurement_concept_id'].interesting_values = entityset[
            'measurement'].df['measurement_concept_id'].unique()
    entityset['observation'][
        'observation_concept_id'].interesting_values = entityset[
            'observation'].df['observation_concept_id'].unique()
    # if isinstance(entityset.entities[0].df, pandas.DataFrame):
    #     agg_primitives = agg_primitives_all
    # else:
    #     agg_primitives = agg_primitives_dask
    # build features
    if saved_features is None:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "No features definition file specified, calculating feature matrix from ground zero ... "
            )
            feature_defs = ft.dfs(
                entityset=entityset,
                target_entity="person",
                features_only=True,
                agg_primitives=agg_primitives + agg_primitives_device_exposure,
                trans_primitives=trans_primitives,
                groupby_trans_primitives=groupby_trans_primitives,
                primitive_options=primitive_options,
                ignore_entities=ignore_entities,
                ignore_variables=ignore_variables,
                where_primitives=where_primitives,
                max_depth=2)
            spinner.write("> generated {} features".format(len(feature_defs)))
            if features_only:
                return feature_defs
            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            # Dask-backed matrices are lazy; force computation here.
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate completed in {toc - tic:0.4f} seconds"
            )
            if feature_matrix_encode:
                feature_matrix_enc, features_enc = ft.encode_features(
                    feature_matrix, feature_defs)
                spinner.write(
                    "> generated {} encoded features and the feature matrix".
                    format(len(features_enc)))
            spinner.ok("Done")
    else:
        with yaspin(color="yellow") as spinner:
            spinner.write(
                "Using saved features from {} ... ".format(saved_features))
            feature_defs = ft.load_features(saved_features)
            spinner.write("> {} features loaded from {}".format(
                len(feature_defs), saved_features))
            tic = time.perf_counter()
            feature_matrix = ft.calculate_feature_matrix(
                feature_defs, entityset)
            if isinstance(entityset.entities[0].df, dd.DataFrame):
                feature_matrix = feature_matrix.compute()
            toc = time.perf_counter()
            spinner.write(
                f"> feature matrix calculate complete in {toc - tic:0.4f} seconds"
            )
            spinner.ok("Done")
    if goldstandard_exist:
        if isinstance(entityset.entities[0].df, dd.DataFrame):
            goldstandard = entityset['goldstandard'].df.compute()
        else:
            goldstandard = entityset['goldstandard'].df
    if feature_matrix_encode:
        feature_matrix = feature_matrix_enc
    if goldstandard_exist:
        # Right-merge so every labeled person is present in the result.
        feature_matrix = feature_matrix.merge(goldstandard,
                                              on='person_id',
                                              how='right')
    return feature_matrix, feature_defs
import featuretools as ft
import pandas as pd
import utils, os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Load the entityset prepared in part 1 of the tutorial.
entity_set = utils.load_entityset("./featuretools_part_1/")
print(entity_set)

# Label each user: did they buy a "Banana" within the 4-week prediction
# window after the cutoff time, using a 60-day training window?
label_times = utils.make_labels(es=entity_set,
                                product_name="Banana",
                                cutoff_time=pd.Timestamp('March 15, 2015'),
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=ft.Timedelta("60 days"))

# Deep feature synthesis for the "users" entity, one row per label cutoff.
feature_matrix, feature_defs = ft.dfs(entityset=entity_set,
                                      target_entity="users",
                                      cutoff_time=label_times,
                                      training_window=ft.Timedelta("60 days"),  # same as above
                                      verbose=True)

# Encode categorical values
fm_encoded, features_encoded = ft.encode_features(feature_matrix, feature_defs)

print("Number of features %s" % len(features_encoded))
print(features_encoded)

# Sample the feature by user input

# Train the classifier
def run_featuretools(self, read_in_data_if_needed=True, export_to_csv=False):
    """Build train/test entitysets, synthesize features with featuretools,
    one-hot encode them, and optionally export the matrices to CSV.

    Args:
        read_in_data_if_needed: kept for interface compatibility (data is
            (re)loaded whenever ``self.datasets`` is empty).
        export_to_csv: when True, write the encoded train/test feature
            matrices and the train target to CSV files.

    Side effects:
        Sets ``self.featuretools_feature_set`` / ``self.featuretools_feature_names``.
    """
    # TODO: This should eventually be dynamic.
    dataset_filenames = ['POS_CASH_balance.csv', 'application_test.csv',
                         'application_train.csv', 'bureau.csv',
                         'bureau_balance.csv', 'credit_card_balance.csv',
                         'installments_payments.csv',
                         'previous_application.csv']
    if self.datasets == []:
        self.read_all_data(dataset_filenames=dataset_filenames)

    # Unpack the loaded datasets by name into local frames.
    for data in self.datasets:
        if data.name == 'POS_CASH_balance':
            pos = data.data
        elif data.name == 'application_test':
            test = data.data
        elif data.name == 'application_train':
            train_full = data.data
        elif data.name == 'bureau':
            bureau = data.data
        elif data.name == 'bureau_balance':
            bureau_balance = data.data
        elif data.name == 'credit_card_balance':
            cc_bal = data.data
        elif data.name == 'installments_payments':
            inst = data.data
        elif data.name == 'previous_application':
            prev_app = data.data

    train = train_full.drop('TARGET', axis=1)
    train_y = train_full['TARGET']

    def build_entityset(es_id, base_name, base_df):
        # Assemble an EntitySet rooted at `base_name` (train or test
        # applications) with every child table and relationship. Progress
        # prints reproduce the original per-step log lines.
        entity_set = ft.EntitySet(id=es_id)
        print('Creating {} entity.'.format(base_name))
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id=base_name, dataframe=base_df, index='SK_ID_CURR')
        print('Creating bureau entity.')
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
        print('Creating bureau_bal entity.')
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id='bureau_bal', dataframe=bureau_balance,
            make_index=True, index='bureau_bal_id')
        print('Creating pos entity.')
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id='pos', dataframe=pos, make_index=True, index='pos_id')
        print('Creating cc_bal entity.')
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id='cc_bal', dataframe=cc_bal, make_index=True,
            index='cc_bal_id')
        print('Creating inst entity.')
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id='inst', dataframe=inst, make_index=True,
            index='inst_id')
        print('Creating prev_app entity.')
        print(str(pd.Timestamp.now()))
        entity_set = entity_set.entity_from_dataframe(
            entity_id='prev_app', dataframe=prev_app, index='SK_ID_PREV')

        print('Creating relationships.')
        print(str(pd.Timestamp.now()))

        def link(label, parent, parent_key, child, child_key):
            # Add one parent->child relationship with progress logging.
            print('Creating {}.'.format(label))
            print(str(pd.Timestamp.now()))
            rel = ft.Relationship(entity_set[parent][parent_key],
                                  entity_set[child][child_key])
            return entity_set.add_relationship(rel)

        entity_set = link('r_{}_bureau'.format(base_name),
                          base_name, 'SK_ID_CURR', 'bureau', 'SK_ID_CURR')
        entity_set = link('r_bureau_bureau_bal',
                          'bureau', 'SK_ID_BUREAU', 'bureau_bal', 'SK_ID_BUREAU')
        entity_set = link('r_{}_pos'.format(base_name),
                          base_name, 'SK_ID_CURR', 'pos', 'SK_ID_CURR')
        entity_set = link('r_{}_cc_bal'.format(base_name),
                          base_name, 'SK_ID_CURR', 'cc_bal', 'SK_ID_CURR')
        entity_set = link('r_{}_inst'.format(base_name),
                          base_name, 'SK_ID_CURR', 'inst', 'SK_ID_CURR')
        entity_set = link('r_{}_prev_app'.format(base_name),
                          base_name, 'SK_ID_CURR', 'prev_app', 'SK_ID_CURR')
        entity_set = link('r_prev_app_pos',
                          'prev_app', 'SK_ID_PREV', 'pos', 'SK_ID_PREV')
        entity_set = link('r_prev_app_inst',
                          'prev_app', 'SK_ID_PREV', 'inst', 'SK_ID_PREV')
        entity_set = link('r_prev_app_cc_bal',
                          'prev_app', 'SK_ID_PREV', 'cc_bal', 'SK_ID_PREV')
        return entity_set

    print('Creating entity set.')
    es = build_entityset('train', 'train', train)

    # Create new features using specified primitives
    # Documentation: https://docs.featuretools.com/generated/featuretools.dfs.html
    print('Creating actual features.')
    print(str(pd.Timestamp.now()))
    feature_matrix, feature_defs = ft.dfs(
        entityset=es, target_entity='train',
        agg_primitives=['mean', 'max', 'last']
        # trans_primitives = ['years', 'month', 'subtract', 'divide']
    )
    self.featuretools_feature_set = feature_matrix
    self.featuretools_feature_names = feature_defs

    # One hot encode categorical features
    feature_matrix_enc, feature_defs_enc = ft.encode_features(
        feature_matrix, feature_defs)

    # Create entity set for test
    print('Creating test entity')
    ts = build_entityset('test', 'test', test)

    print('Creating actual features.')
    print(str(pd.Timestamp.now()))
    # FIX: the original passed the encoded train *matrix* (a DataFrame) as
    # `features` and the string 'test' as `entityset`; calculate_feature_matrix
    # needs the feature definitions and the EntitySet object itself.
    # NOTE(review): feature_defs were built against the 'train' entity — for
    # reuse on `ts` the test base entity must carry the same entity id/schema;
    # confirm against featuretools' feature-reuse workflow.
    feature_matrix_test = ft.calculate_feature_matrix(
        features=feature_defs, entityset=ts)

    # One hot encode categorical features
    feature_matrix_test_enc, feature_defs_test_enc = ft.encode_features(
        feature_matrix_test, feature_defs)

    print('Done running featuretools!')
    if export_to_csv:
        # FIX: this message used to print even when export_to_csv was False.
        print('Exporting features to CSV.')
        pd.DataFrame(feature_matrix_enc).to_csv('featuretools_feature.csv')
        train_y.to_csv('train_y.csv')
        pd.DataFrame(feature_matrix_test_enc).to_csv(
            'featuretools_features_test.csv')
def auto_build(self, model_description):
    """Automatically derive an SEM model from generated features.

    Runs featuretools DFS over the factor columns of ``self.data``, prunes
    NaN/all-zero/highly-correlated columns, renames columns to
    identifier-safe names, rewrites ``model_description`` to match the
    surviving columns, and fits the SEM model.

    Args:
        model_description: dict with "factors" (list or dict of lists),
            "observations" (list) and "kpis" (list or dict of lists).

    Returns:
        The fitted SEM result from ``sem.fit_model``.

    Raises:
        Exception: if every generated column for a latent variable (or all
            loading factors) was dropped during pruning.
    """
    columns = []
    if isinstance(model_description["factors"], dict):
        factors_dict = model_description["factors"]
        for k, v in factors_dict.items():
            columns.extend(v)
    else:
        columns.extend(model_description["factors"])
    # print("Data Columns: ", self.data.columns.tolist())
    # print("Extract Columns: ", columns)

    # FIX: copy the slice so adding customer_id does not trigger
    # SettingWithCopy / mutate a view of self.data.
    factors_df = self.data[columns].copy()
    factors_df["customer_id"] = list(range(self.data.shape[0]))

    es = ft.EntitySet(id='customer_experience_entity')
    es = es.entity_from_dataframe(entity_id='c_id',
                                  dataframe=factors_df,
                                  index='customer_id')
    features, feature_names = ft.dfs(entityset=es,
                                     target_entity='c_id',
                                     max_depth=2,
                                     verbose=True)
    feature_matrix_enc, features_enc = ft.encode_features(
        features, feature_names)

    # Drop columns containing NaN.
    original_factors = set(feature_matrix_enc.columns.tolist())
    feature_matrix_enc = feature_matrix_enc.dropna(axis=1)
    after_naelimination = set(feature_matrix_enc.columns.tolist())
    print("Dropped columns with na: ",
          list(original_factors - after_naelimination))

    # Drop columns that are constant zero.
    feature_matrix_enc = feature_matrix_enc.loc[:, (
        feature_matrix_enc != 0).any(axis=0)]
    after_allzeros = set(feature_matrix_enc.columns.tolist())
    print("Dropped columns with all zeros: ",
          after_naelimination - after_allzeros)
    # print(feature_matrix_enc.head())
    # print("Original Columns: ", columns)
    # print("Generated Columns: ", feature_matrix_enc.columns.tolist())

    # Drop columns whose correlations are entirely undefined.
    corr_matrix = feature_matrix_enc.corr()
    corr_matrix = corr_matrix.dropna(axis=1, how='all')
    corr_matrix = corr_matrix.dropna(axis=0, how='all')
    print("Dropped columns with na in correlation matrix: ",
          list(after_naelimination - set(corr_matrix.columns.tolist())))
    feature_matrix_enc = feature_matrix_enc[corr_matrix.columns.tolist()]

    # Iteratively drop one side of every highly-correlated (>0.95) pair.
    for it in range(10):
        willdropped = set()
        corr_matrix = feature_matrix_enc.corr()
        cols = corr_matrix.columns.tolist()
        for i in range(len(cols)):
            row = cols[i]
            if row in willdropped:
                # FIX: was `pass`, which made the guard a no-op — already
                # dropped columns kept being scanned and could pull extra
                # columns into willdropped.
                continue
            for j in range(i + 1, len(cols)):
                col = cols[j]
                if col in willdropped:
                    continue  # FIX: was `pass` (no-op), see above.
                val = corr_matrix.loc[row, col]
                if np.abs(val) > 0.95:
                    print("{} , {} = {}".format(row, col, val))
                    willdropped.add(col)
        if len(willdropped) == 0:
            break
        print("Iteration: ", it + 1,
              " Highly correlated columns have been dropped!: ",
              list(willdropped))
        feature_matrix_enc = feature_matrix_enc.drop(
            columns=list(willdropped))

    correlation_matrix = feature_matrix_enc.corr()
    covariance_matrix = feature_matrix_enc.cov()
    cond_number = np.linalg.cond(correlation_matrix.values)
    print("Condition number: {}".format(cond_number))

    copy_model = copy.deepcopy(model_description)
    current_columns = feature_matrix_enc.columns.tolist()

    def replace_marks(s):
        # Make a column name identifier-safe for the SEM model syntax.
        s = s.replace("=", "equals")
        s = s.replace(".", "dot")
        s = s.replace(",", "comma")
        return s

    def safe_name(c):
        # Identifier-safe name: punctuation replaced, spaces joined with '_'.
        return "_".join(replace_marks(c).split(" "))

    def expand(c):
        # All surviving generated columns derived from original column c.
        prefix = safe_name(c)
        return [x for x in current_columns if x.startswith(prefix)]

    current_columns = [safe_name(c) for c in current_columns]
    feature_matrix_enc.columns = current_columns
    print("Cols: ", current_columns)

    # Rewrite the factor definitions in terms of the surviving columns.
    if isinstance(copy_model["factors"], dict):
        factors_dict = copy_model["factors"]
        new_factors_dict = {}
        for k, v in factors_dict.items():
            newv = []
            for c in v:
                newv.extend(expand(c))
            if len(newv) > 0:
                new_factors_dict[k] = newv
            else:
                raise Exception(
                    "Latent variable {} has been dropped! Rearrange your initial model description."
                    .format(k))
        copy_model["factors"] = new_factors_dict
    else:
        newv = []
        for c in copy_model["factors"]:
            newv.extend(expand(c))
        if len(newv) > 0:
            copy_model["factors"] = newv
        else:
            raise Exception(
                "All loading factors have been dropped! Rearrange your initial model description."
            )

    # Rename observation / KPI columns to the identifier-safe scheme and
    # remember the originals so their raw data can be appended below.
    others = []
    others.extend(copy_model["observations"])
    copy_model["observations"] = [
        safe_name(c) for c in copy_model["observations"]
    ]
    if isinstance(copy_model["kpis"], dict):
        kpis_dict = copy_model["kpis"]
        for k, v in kpis_dict.items():
            others.extend(v)
            copy_model["kpis"][k] = [safe_name(c) for c in v]
    else:
        others.extend(copy_model["kpis"])
        copy_model["kpis"] = [safe_name(c) for c in copy_model["kpis"]]

    feature_matrix_enc = feature_matrix_enc.reset_index(
        inplace=False).drop("customer_id", axis=1)
    # FIX: copy the slice before renaming its columns (SettingWithCopy).
    others_df = self.data[others].copy()
    others_df.columns = [safe_name(c) for c in others_df.columns]
    feature_matrix_enc = pd.concat([feature_matrix_enc, others_df], axis=1)

    feature_matrix_enc.to_csv("/tmp/autodata.csv", sep="\t", index=False)
    print(feature_matrix_enc.head())
    model = sem.build_model(copy_model, "auto_model")
    result = sem.fit_model("/tmp/autodata.csv", model, "auto_model",
                           verbose="FALSE")
    return result
def extraction(entity: str,
               action_type: List[int],
               name_to_save: str,
               interesting_value: dict,
               agg_pre: list,
               depth: int,
               variable_type: dict = None,
               drop_list: list = None,
               sub_entity_list: list = None,
               trans_pre: list = None):
    """Extract aggregated features from the user action log with featuretools.

    Builds an EntitySet around the filtered log, normalizes out the entity
    of interest ("user_id", "user_seller" or "seller_id"), runs DFS, one-hot
    encodes the result and writes it to ``feature_vectors/<name_to_save>``.

    Args:
        entity: target entity id — "user_id", "user_seller" or "seller_id".
        action_type: action_type codes to keep from the log.
        name_to_save: output CSV filename.
        interesting_value: per-column interesting values for `where` features.
        agg_pre: aggregation primitives (also used as where_primitives).
        depth: max DFS depth.
        variable_type: optional override of the featuretools variable types.
        drop_list: log columns to drop before building the EntitySet.
        sub_entity_list: extra columns to normalize into their own entities.
        trans_pre: transform primitives.
    """
    # FIX: mutable default arguments ([]) are shared across calls; use
    # None sentinels instead.
    drop_list = [] if drop_list is None else drop_list
    sub_entity_list = [] if sub_entity_list is None else sub_entity_list
    trans_pre = [] if trans_pre is None else trans_pre

    log_df = get_train_log(None)
    # choose action type which used
    log_df = log_df.loc[log_df['action_type'].isin(action_type)]
    # choose logs by entity used
    log_df = choose_logs_in_train_and_test(log_df, entity=entity)
    log_df = log_df.reset_index(drop=True)
    log_df["index"] = log_df.index  # required by featuretools
    # time_stamp is mmdd as an int: month = ts // 100, day = ts % 100.
    log_df["month"] = log_df["time_stamp"].map(lambda x: int(x / 100))
    # FIX: the day component was `int(x // 100)` — identical to the month —
    # so every date collapsed to 'YYYY-mm-mm'; the day is `x % 100`.
    log_df['data'] = log_df["time_stamp"].map(
        lambda x: '2016-' + str(int(x / 100)) + '-' + str(int(x % 100)))
    user_df = get_user_info()
    log_df = log_df.merge(user_df, on="user_id", how="inner")
    # Flag actions in the pre-promotion window (Nov 2 .. Nov 10).
    log_df["before_pro"] = log_df["time_stamp"].map(lambda x: (1101 < x) and
                                                    (x < 1111))
    # drop useless column
    log_df.drop(labels=drop_list, axis=1, inplace=True)

    es = ft.EntitySet(id="logs")
    # select feature column
    if entity == "user_id":
        log_df.drop(labels=["gender", "age_range"], axis=1, inplace=True)
        es = es.entity_from_dataframe(
            entity_id="logs",
            dataframe=log_df,
            index="index",
            time_index="data",
            variable_types=variable_type if variable_type is not None else {
                "user_id": ft.variable_types.Categorical,
                "item_id": ft.variable_types.Categorical,
                "cat_id": ft.variable_types.Categorical,
                "seller_id": ft.variable_types.Categorical,
                "brand_id": ft.variable_types.Categorical,
                "month": ft.variable_types.Categorical,
                "time_stamp": ft.variable_types.Categorical,
                "data": ft.variable_types.Datetime,
                'action_type': ft.variable_types.Categorical,
                "before_pro": ft.variable_types.Boolean,
            })
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="user_id",
                                 index="user_id")
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="seller_id",
                                 index="seller_id")
    elif entity == "user_seller":
        # Composite "<user_id>_<seller_id>" key.
        log_df["user_seller"] = np.add(
            np.array(log_df["user_id"].map(lambda x: str(x) + "_")),
            np.array(log_df["seller_id"].map(lambda x: str(x))))
        log_df.drop(labels=['user_id', 'seller_id', 'age_range', 'gender'],
                    axis=1,
                    inplace=True)
        es = es.entity_from_dataframe(
            entity_id="logs",
            dataframe=log_df,
            index="index",
            time_index="data",
            variable_types=variable_type if variable_type is not None else {
                "item_id": ft.variable_types.Categorical,
                "cat_id": ft.variable_types.Categorical,
                "brand_id": ft.variable_types.Categorical,
                "data": ft.variable_types.Datetime,
                "user_seller": ft.variable_types.Categorical,
                "time_stamp": ft.variable_types.Categorical,
                'action_type': ft.variable_types.Categorical,
                "month": ft.variable_types.Categorical,
                "before_pro": ft.variable_types.Boolean
            })
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="user_seller",
                                 index="user_seller")
    elif entity == "seller_id":
        es = es.entity_from_dataframe(
            entity_id="logs",
            dataframe=log_df,
            index="index",
            time_index="data",
            variable_types=variable_type if variable_type is not None else {
                "user_id": ft.variable_types.Categorical,
                "item_id": ft.variable_types.Categorical,
                "cat_id": ft.variable_types.Categorical,
                "seller_id": ft.variable_types.Categorical,
                "brand_id": ft.variable_types.Categorical,
                "time_stamp": ft.variable_types.Categorical,
                "month": ft.variable_types.Categorical,
                'action_type': ft.variable_types.Categorical,
                "before_pro": ft.variable_types.Boolean,
                "age_range": ft.variable_types.Categorical,
                "gender": ft.variable_types.Categorical
            })
        if "user_id" not in drop_list:
            es = es.normalize_entity(base_entity_id="logs",
                                     new_entity_id="user_id",
                                     index="user_id")
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id="seller_id",
                                 index="seller_id")

    # Interesting values drive the `where` clauses of where_primitives.
    for key in interesting_value.keys():
        es["logs"][key].interesting_values = interesting_value[key]
    for sub_entity in sub_entity_list:
        es = es.normalize_entity(base_entity_id="logs",
                                 new_entity_id=sub_entity,
                                 index=sub_entity)

    print("start")
    # First pass (features_only) just previews the feature definitions.
    feature_defs = ft.dfs(entityset=es,
                          target_entity=entity,
                          agg_primitives=agg_pre,
                          max_depth=depth,
                          where_primitives=agg_pre,
                          trans_primitives=trans_pre,
                          features_only=True)
    print(feature_defs)
    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity=entity,
                                          agg_primitives=agg_pre,
                                          max_depth=depth,
                                          where_primitives=agg_pre,
                                          trans_primitives=trans_pre,
                                          n_jobs=1,
                                          verbose=True)
    print(feature_defs)
    feature_matrix_enc, features_enc = ft.encode_features(
        feature_matrix, feature_defs)
    print(features_enc)
    feature_matrix_enc.to_csv(os.path.join(get_root_path(),
                                           "feature_vectors", name_to_save),
                              float_format='%.4f',
                              index_label="index")