def test_transform_consistency():
    # Create dataframe
    df = pd.DataFrame({
        'a': [14, 12, 10],
        'b': [False, False, True],
        'b1': [True, True, False],
        'b12': [4, 5, 6],
        'P': [10, 15, 12]
    })
    es = ft.EntitySet(id='test')
    # Add dataframe to entityset
    es.entity_from_dataframe(entity_id='first', dataframe=df,
                             index='index', make_index=True)

    # Generate features
    feature_defs = ft.dfs(entityset=es, target_entity='first',
                          trans_primitives=['and', 'add_numeric', 'or'],
                          features_only=True)

    # Check for correct ordering of features
    assert feature_with_name(feature_defs, 'a')
    assert feature_with_name(feature_defs, 'b')
    assert feature_with_name(feature_defs, 'b1')
    assert feature_with_name(feature_defs, 'b12')
    assert feature_with_name(feature_defs, 'P')
    assert feature_with_name(feature_defs, 'AND(b, b1)')
    # make sure the feature doesn't exist with operands in the other order
    assert not feature_with_name(feature_defs, 'AND(b1, b)')
    assert feature_with_name(feature_defs, 'a + P')
    assert feature_with_name(feature_defs, 'b12 + P')
    assert feature_with_name(feature_defs, 'a + b12')
    assert feature_with_name(feature_defs, 'OR(b, b1)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b)')
    assert feature_with_name(feature_defs, 'OR(AND(b, b1), b1)')
def parse_data(self):
    """
    Parse data from a dataframe into an EntitySet (Featuretools format).

    Returns
    -------
    es : EntitySet
        The entity set of grouped data.
    """
    es = ft.EntitySet(id="Dados")
    columns = list(self.data.columns)
    variable_types = {}
    for indx, ftype in enumerate(self.feature_types[0].values):
        if ftype == 'Categorical':
            variable_types[columns[indx]] = vtypes.Categorical

    # Create EntitySet and load data
    es = es.entity_from_dataframe(entity_id="entity",
                                  dataframe=self.data,
                                  make_index=True,
                                  index=self.names,
                                  time_index=self.date_var,
                                  variable_types=variable_types)

    # Groups data if required
    # (note: the grouping variable may not exist on every dataset)
    es.normalize_entity(new_entity_id="normal",
                        base_entity_id="entity",
                        index=self.group_var[0])
    return es
def new_features(self):
    """
    Creates new features from the current numeric features.
    The dataframe must contain only numeric values.

    Returns
    -------
    feature_matrix : DataFrame
        Contains all of the old features plus the newly synthesized features.
    """
    self.numeric_features()
    if self.numeric_features.shape[1] == self.numeric_features.select_dtypes(
            include=np.number).shape[1]:
        # Make an entityset and add the entity
        es = ft.EntitySet(id='id_1')
        es.entity_from_dataframe(entity_id='id_2',
                                 dataframe=self.numeric_features,
                                 make_index=True,
                                 index='new_index')
        # Run deep feature synthesis
        self.feature_matrix, self.feature_defs = ft.dfs(
            entityset=es,
            target_entity='id_2',
            agg_primitives=self.list_agg_primitives,
            trans_primitives=self.list_trans_primitives,
            max_depth=self.max_depth_value)
        # Add categorical features back to the features dataframe
        if self.categorical_col_name is not None:
            for col in self.categorical_col_name:
                self.feature_matrix[col] = self.cat_features[col].values
        return self.feature_matrix
    else:
        raise ValueError("Data Frame contains non-numeric values")
def get_cross_features(df, features, key='tim'):
    """
    :type df: DataFrame
    :type features: list[str]
    :rtype: DataFrame
    """
    use_df = copy.deepcopy(df.loc[:, [key] + features])
    es = ft.EntitySet(id='temperature_predict')
    es = es.entity_from_dataframe(entity_id='temp', dataframe=use_df, index=key)
    trans_primitives = [
        'add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric'
    ]
    feature_matrix, _ = ft.dfs(
        entityset=es,
        target_entity='temp',
        max_depth=1,  # max_depth=1: only combine the original features to produce new ones
        verbose=1,
        trans_primitives=trans_primitives)
    features_df = pd.DataFrame(feature_matrix).reset_index()
    features_df.drop(columns=features, inplace=True)
    return features_df
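# A minimal usage sketch for get_cross_features above. The frame and column
# names ('tim', 'temp_out', 'humidity') are made-up illustrations; the key
# column must uniquely identify rows because it becomes the entity index.
import pandas as pd

toy_df = pd.DataFrame({
    'tim': [1, 2, 3],                # unique key -> entity index
    'temp_out': [21.0, 23.5, 19.8],
    'humidity': [0.40, 0.55, 0.62],
})
cross_df = get_cross_features(toy_df, ['temp_out', 'humidity'])
# cross_df keeps 'tim' plus the pairwise +, -, *, / combinations of the
# features; the original two columns are dropped before returning
print(cross_df.head())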
def _set_entity_set(self, data: pd.DataFrame,
                    groups: Dict[str, Sequence[str]]) -> ft.EntitySet:
    """
    Set up an entity set.

    Args:
        data: DataFrame.
        groups: Dict of feature groups.

    Returns:
        Featuretools entity set.
    """
    es = ft.EntitySet(id="main")
    index_name = self._index_name(data)
    for group, features in groups.items():
        es = es.entity_from_dataframe(
            entity_id=group,
            dataframe=data[features].reset_index(),
            index=index_name,
        )
    return es
def _create_feature(self):
    """ Generate features from the merged data. """
    raceuma_df = self.base_df.copy()[["競走コード", "馬番", "予想タイム指数", "予想展開", "クラス変動", "騎手評価",
                                      "調教師評価", "枠順評価", "脚質評価", "馬齢", "前走着順", "前走人気",
                                      "前走頭数", "騎手ランキング", "調教師ランキング"]]
    raceuma_df.loc[:, "競走馬コード"] = raceuma_df["競走コード"].astype(str).str.cat(raceuma_df["馬番"].astype(str))
    raceuma_df.drop("馬番", axis=1, inplace=True)
    # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
    es = ft.EntitySet(id="race")
    es.entity_from_dataframe(entity_id='race', dataframe=raceuma_df, index="競走馬コード")
    es.normalize_entity(base_entity_id='race', new_entity_id='raceuma', index="競走コード")

    # Aggregation functions
    aggregation_list = ['count', 'min', 'max', 'mean']
    transform_list = []

    # Run DFS
    feature_matrix, features_dfs = ft.dfs(entityset=es,
                                          target_entity='race',
                                          agg_primitives=aggregation_list,
                                          trans_primitives=transform_list,
                                          max_depth=2)
    print("_create_feature: feature_matrix", feature_matrix.shape)
    feature_matrix.head(3)

    # Get the rows for the horse predicted to be the first favorite
    ninki_df = self.base_df.query("予想人気==1")[["競走コード", "枠番", "性別コード", "予想タイム指数順位", "見習区分",
                                               "キャリア", "馬齢", "予想展開", "距離増減", "前走頭数", "前走人気",
                                               "テン乗り"]].add_prefix("人気_").rename(columns={"人気_競走コード": "競走コード"})
    # Get the rows for the horses predicted to take the lead
    nige_df = self.base_df.query("予想展開==1")[["競走コード", "先行指数", "距離増減", "前走人気", "前走頭数",
                                              "テン乗り"]].add_prefix("逃げ_").rename(columns={"逃げ_競走コード": "競走コード"})
    self.base_df = pd.merge(feature_matrix, nige_df, on="競走コード")
    self.base_df = pd.merge(self.base_df, ninki_df, on="競走コード")
    self.base_df = pd.merge(self.base_df, self.ld.race_df, on="競走コード")
def auto_adp_features(train, test, cols, entities):
    df_c = train[cols]
    es = ft.EntitySet(id='petfinder')
    es.entity_from_dataframe(entity_id="Pets", dataframe=df_c, index="PetID")
    ignored_variable = {}
    ignored_variable.update({'Pets': entities})
    for e in entities:
        print(e)
        es.normalize_entity(base_entity_id='Pets', new_entity_id=e, index=e)
        feature_matrix, feature_names = ft.dfs(entityset=es,
                                               target_entity=e,
                                               max_depth=2,
                                               verbose=1,
                                               # n_jobs=3,
                                               ignore_variables=ignored_variable)
        fm = feature_matrix.add_prefix(e + "_")
        print(feature_names)
        fm.drop([e + "_COUNT(Pets)"], axis=1, inplace=True)
        train = train.set_index(e).join(fm).reset_index()
        test = test.set_index(e).join(fm).reset_index()
    return train, test
def _make_entityset(self, input_df):
    es = ft.EntitySet()
    primary_key = find_primary_key(input_df)
    make_index = False
    if primary_key is None:
        primary_key = "D3M_INDEX"
        make_index = True

    cols_to_use = input_df.metadata.list_columns_with_semantic_types(
        [st.PRIMARY_KEY, st.ATTRIBUTE])
    input_df = input_df.select_columns(cols_to_use)

    variable_types = get_featuretools_variable_types(input_df)

    es.entity_from_dataframe(entity_id=TARGET_ENTITY,
                             dataframe=pd.DataFrame(input_df.copy()),
                             index=primary_key,
                             make_index=make_index,
                             variable_types=variable_types)
    return es
def create_entity_set(data: Dict[str, pd.DataFrame], train_table: str,
                      test_table: str) -> ft.EntitySet:
    print("\nCreating entity set based on client data")
    start = time.monotonic()
    es = ft.EntitySet(id='clients')
    es = es.entity_from_dataframe(entity_id='combined_train_test',
                                  dataframe=data['combined_train_test'],
                                  index='SK_ID_CURR')
    es = es.entity_from_dataframe(entity_id='bureau',
                                  dataframe=data['bureau'],
                                  index='SK_ID_BUREAU')
    es = es.entity_from_dataframe(entity_id='bureau_balance',
                                  dataframe=data['bureau_balance'],
                                  make_index=True,
                                  index='bureaubalance_index')
    es = es.entity_from_dataframe(entity_id='previous_application',
                                  dataframe=data['previous_application'],
                                  index='SK_ID_PREV')
    es = es.add_relationships([
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['bureau']['SK_ID_CURR']),
        ft.Relationship(es['bureau']['SK_ID_BUREAU'],
                        es['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['previous_application']['SK_ID_CURR'])
    ])
    end = time.monotonic()
    print(f"  Entity set creation completed in {round(end - start)} seconds")
    return es
def generate_entity_set(self):
    """
    05. Define the entity set along with the table relations.
    """
    import featuretools as ft

    self.es = ft.EntitySet(id='clients')
    self.es = self.es.entity_from_dataframe(
        entity_id='users',
        dataframe=self.users.reset_index(),
        index='user_id',
        time_index='created_date')

    for d in self.feature_windows:
        self.es = self.es.entity_from_dataframe(
            entity_id=f'transactions_{d}d',
            dataframe=self.transactions.query(
                f' {d} > days_before_cutoff >= 0 ').reset_index(),
            index='transaction_id',
            time_index='created_date')

        # Add the relationship between customers and transactions
        self.es = self.es.add_relationship(
            ft.Relationship(self.es['users']['user_id'],
                            self.es[f'transactions_{d}d']['user_id']))

    self.next(self.generate_features)
def create_entitysets(part_num, partition_name, dfs_params):
    es_dict = {}
    es = ft.EntitySet(id=partition_name)
    for target in dfs_params['target']:
        for t_k, t_v in target.items():
            s = data[t_k].duplicated(t_v)
            df_target = pd.read_csv(os.path.dirname(os.path.abspath(__file__))
                                    + '/partition/' + partition_name + '/'
                                    + part_num + '/' + t_k + '.csv')
            if index_needed(t_k, t_v):
                es = es.entity_from_dataframe(entity_id=t_k,
                                              dataframe=df_target,
                                              make_index=True,
                                              index=t_v)
            else:
                es = es.entity_from_dataframe(entity_id=t_k,
                                              dataframe=df_target,
                                              index=t_v)
            for frame in dfs_params['frames']:
                for f_k, f_v in frame.items():
                    df = pd.read_csv(os.path.dirname(os.path.abspath(__file__))
                                     + '/partition/' + partition_name + '/'
                                     + part_num + '/' + f_k + '.csv')
                    if index_needed(f_k, f_v):
                        es = es.entity_from_dataframe(entity_id=f_k,
                                                      dataframe=df,
                                                      make_index=True,
                                                      index=f_k + '_' + f_v)
                    else:
                        es = es.entity_from_dataframe(entity_id=f_k,
                                                      dataframe=df,
                                                      index=f_v)
                    r = ft.Relationship(es[t_k][t_v], es[f_k][f_v])
                    es = es.add_relationships([r])
            es_dict.update({t_k: es, 'num': part_num})
    return es_dict
def dd_mock_customer(pd_mock_customer):
    dataframes = {}
    for df in pd_mock_customer.dataframes:
        dd_df = dd.from_pandas(df.reset_index(drop=True), npartitions=4)
        dd_df.ww.init(schema=df.ww.schema)
        dataframes[df.ww.name] = (
            dd_df,
            df.ww.index,
            df.ww.time_index,
            df.ww.logical_types,
        )

    relationships = [
        (
            rel._parent_dataframe_name,
            rel._parent_column_name,
            rel._child_dataframe_name,
            rel._child_column_name,
        )
        for rel in pd_mock_customer.relationships
    ]

    return ft.EntitySet(
        id=pd_mock_customer.id, dataframes=dataframes, relationships=relationships
    )
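# For contrast, a minimal sketch of the same Featuretools 1.x constructor
# pattern with plain pandas dataframes (table and column names are made up;
# only the dataframe element of each tuple is required, so a shorter
# (dataframe, index) pair is assumed to be enough here):
import pandas as pd
import featuretools as ft

orders = pd.DataFrame({"order_id": [1, 2], "total": [9.99, 19.99]})
items = pd.DataFrame({"item_id": [1, 2, 3], "order_id": [1, 1, 2]})

es = ft.EntitySet(
    id="shop",
    dataframes={
        "orders": (orders, "order_id"),  # (dataframe, index)
        "items": (items, "item_id"),
    },
    relationships=[("orders", "order_id", "items", "order_id")],
)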
def diamond_es():
    regions_df = pd.DataFrame({
        'id': range(3),
        'name': ['Northeast', 'Midwest', 'South'],
    })
    stores_df = pd.DataFrame({
        'id': range(5),
        'region_id': [0, 1, 2, 2, 1],
    })
    customers_df = pd.DataFrame({
        'id': range(5),
        'region_id': [1, 0, 0, 1, 1],
        'name': ['A', 'B', 'C', 'D', 'E'],
    })
    transactions_df = pd.DataFrame({
        'id': range(8),
        'store_id': [4, 4, 2, 3, 4, 0, 1, 1],
        'customer_id': [3, 0, 2, 4, 3, 3, 2, 3],
        'amount': [100, 40, 45, 83, 13, 94, 27, 81],
    })

    entities = {
        'regions': (regions_df, 'id'),
        'stores': (stores_df, 'id'),
        'customers': (customers_df, 'id'),
        'transactions': (transactions_df, 'id'),
    }
    relationships = [
        ('regions', 'id', 'stores', 'region_id'),
        ('regions', 'id', 'customers', 'region_id'),
        ('stores', 'id', 'transactions', 'store_id'),
        ('customers', 'id', 'transactions', 'customer_id'),
    ]
    return ft.EntitySet(id='ecommerce_diamond',
                        entities=entities,
                        relationships=relationships)
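# A short sketch (not from the original test module) of consuming the fixture
# above with DFS; the primitive choices here are illustrative assumptions:
import featuretools as ft

es = diamond_es()
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='regions',
                                      agg_primitives=['sum', 'count'],
                                      max_depth=2)
# At depth 2, aggregates of transactions.amount reach regions through both
# the stores path and the customers path of the diamond schema
print(feature_matrix.columns.tolist())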
def load_retail(id='demo_retail_data', nrows=None):
    '''
    Returns the retail entityset example.

    Args:
        id (str): Id to assign to EntitySet.
        nrows (int): Number of rows to load of item_purchases
            entity. If None, load all.

    Examples:

        .. ipython::
            :verbatim:

            In [1]: import featuretools as ft

            In [2]: es = ft.demo.load_retail()

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              Entities:
                invoices (shape = [25900, 3])
                items (shape = [4070, 3])
                customers (shape = [4373, 3])
                item_purchases (shape = [541909, 6])

        Load in subset of data

        .. ipython::
            :verbatim:

            In [2]: es = ft.demo.load_retail(nrows=1000)

            In [3]: es
            Out[3]:
            Entityset: demo_retail_data
              Entities:
                invoices (shape = [66, 3])
                items (shape = [590, 3])
                customers (shape = [49, 3])
                item_purchases (shape = [1000, 6])
    '''
    demo_save_path = make_retail_pathname(nrows)

    es = ft.EntitySet(id)
    csv_s3 = "s3://featuretools-static/uk_online_retail.csv"

    if not os.path.isfile(demo_save_path):
        df = pd.read_csv(csv_s3, nrows=nrows, parse_dates=["InvoiceDate"])
        df.to_csv(demo_save_path)

    df = pd.read_csv(demo_save_path, nrows=nrows, parse_dates=["InvoiceDate"])
    df.rename(columns={"Unnamed: 0": 'item_purchase_id'}, inplace=True)

    es.entity_from_dataframe("item_purchases",
                             dataframe=df,
                             index="item_purchase_id",
                             time_index="InvoiceDate")

    es.normalize_entity(new_entity_id="items",
                        base_entity_id="item_purchases",
                        index="StockCode",
                        additional_variables=["Description"])

    es.normalize_entity(new_entity_id="invoices",
                        base_entity_id="item_purchases",
                        index="InvoiceNo",
                        additional_variables=["CustomerID", "Country"])

    es.normalize_entity(new_entity_id="customers",
                        base_entity_id="invoices",
                        index="CustomerID",
                        additional_variables=["Country"])
    es.add_last_time_indexes()

    return es
# The index for the loans dataframe is instead loan_id.

# When we create an entity in featuretools, we have to identify which column of the dataframe is
# the index. If the data does not have a unique index we can tell featuretools to make an index
# for the entity by passing in make_index = True and specifying a name for the index. If the data
# also has a uniquely identifying time index, we can pass that in as the time_index parameter.

# Featuretools will automatically infer the variable types (numeric, categorical, datetime) of
# the columns in our data, but we can also pass in specific datatypes to override this behavior.
# As an example, even though the repaid column in the loans dataframe is represented as an integer,
# we can tell featuretools that this is a categorical feature since it can only take on two
# discrete values. This is done using a dictionary with the variable names as keys and the
# variable types as values.

# Create new EntitySet
es = ft.EntitySet(id='clients')

# Create an entity from the client DataFrame
# This dataframe already has an index and a time index
es = es.entity_from_dataframe(entity_id='clients',
                              dataframe=clients,
                              index='client_id',
                              time_index='joined')

# Create an entity from the loans DataFrame
# This DataFrame already has an index and a time index
es = es.entity_from_dataframe(
    entity_id='loans',
    dataframe=loans,
    variable_types={'repaid': ft.variable_types.Categorical},
    index='loan_id',
    time_index='loan_start')
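# A hedged continuation sketch (not part of the original walkthrough): with
# both entities defined, the usual next steps are to link them on client_id
# and run deep feature synthesis. The primitive list is an illustrative choice.
r_client_loans = ft.Relationship(es['clients']['client_id'],
                                 es['loans']['client_id'])
es = es.add_relationship(r_client_loans)

feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='clients',
                                      agg_primitives=['mean', 'max', 'sum'],
                                      max_depth=2)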
("encode", OrdinalEncoder()), ]) numeric_transformer = Pipeline([ ("imputer", SimpleImputer(strategy="median")), ]) # - combine[["Sex", "Embarked", "Title"]] = categorical_transformer.fit_transform( combine[["Sex", "Embarked", "Title"]]) combine[["Age", "Fare"]] = numeric_transformer.fit_transform(combine[["Age", "Fare"]]) # + es = ft.EntitySet(id="titanic_data") es = es.entity_from_dataframe( entity_id="combine", dataframe=combine.drop(["Survived"], axis=1), variable_types={ "Embarked": ft.variable_types.Categorical, "Sex": ft.variable_types.Boolean, "Title": ft.variable_types.Categorical, }, index="PassengerId", ) es # -
def datashop_to_entityset(filename):
    # Make an EntitySet called Dataset with the following structure
    #
    #   schools     students      problems
    #       \          |           /
    #     classes   sessions   problem steps
    #         \        |        /
    #          transactions -- attempts
    #

    # Convert the csv into a dataframe using pandas
    data = pd.read_csv(filename, sep='\t', parse_dates=True)

    # Make the Transaction Id the index column of the dataframe and clean other columns
    data.index = data['Transaction Id']
    data = data.drop(['Row'], axis=1)
    data['Outcome'] = data['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})

    # Make a new 'End Time' column which is start_time + duration
    # This is /super useful/ because you shouldn't be using outcome data at
    # any point before the student has attempted the problem.
    data['End Time'] = pd.to_datetime(
        data['Time']) + pd.to_timedelta(pd.to_numeric(data['Duration (sec)']), 's')

    # Make a list of all the KC and CF columns present
    kc_and_cf_cols = [x for x in data.columns
                      if (x.startswith('KC ') or x.startswith('CF '))]

    # Now we start making an entityset. We make 'End Time' a secondary time
    # index for 'Outcome' even though our primary time index for a row is
    # 'Time', preventing label leakage.
    es = ft.EntitySet('Dataset')
    es.entity_from_dataframe(entity_id='transactions',
                             index='Transaction Id',
                             dataframe=data,
                             variable_types={'Outcome': vtypes.Boolean,
                                             'Attempt At Step': vtypes.Categorical},
                             time_index='Time',
                             secondary_time_index={'End Time': [
                                 'Outcome', 'Is Last Attempt', 'Duration (sec)']})

    # Every transaction has a `problem_step` which is associated to a problem
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='problem_steps',
                        index='Step Name',
                        additional_variables=['Problem Name'] + kc_and_cf_cols,
                        make_time_index=True)
    es.normalize_entity(base_entity_id='problem_steps',
                        new_entity_id='problems',
                        index='Problem Name',
                        make_time_index=True)

    # Every transaction has a `session` associated to a student
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='sessions',
                        index='Session Id',
                        additional_variables=['Anon Student Id'],
                        make_time_index=True)
    es.normalize_entity(base_entity_id='sessions',
                        new_entity_id='students',
                        index='Anon Student Id',
                        make_time_index=True)

    # Every transaction has a `class` associated to a school
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='classes',
                        index='Class',
                        additional_variables=['School'],
                        make_time_index=False)
    es.normalize_entity(base_entity_id='classes',
                        new_entity_id='schools',
                        index='School',
                        make_time_index=False)

    # And because we might be interested in creating features grouped
    # by attempts we could normalize by those as well:
    # es.normalize_entity(base_entity_id='transactions',
    #                     new_entity_id='attempts',
    #                     index='Attempt At Step',
    #                     additional_variables=[],
    #                     make_time_index=False)
    return es
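# A hedged usage sketch for datashop_to_entityset ('export.txt' is a
# placeholder for a real DataShop export; the primitives and the cutoff date
# are illustrative). The secondary time index set above is what makes a
# cutoff time safe here: outcome columns only become visible at 'End Time'.
import pandas as pd
import featuretools as ft

es = datashop_to_entityset('export.txt')
fm, feature_defs = ft.dfs(entityset=es,
                          target_entity='students',
                          cutoff_time=pd.Timestamp('2020-01-01'),
                          agg_primitives=['percent_true', 'mean', 'count'],
                          max_depth=2)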
def es_set(self):
    self.__es = ft.EntitySet(id="application_test")
    self.__es = self.__es.entity_from_dataframe(
        entity_id="application_test",
        dataframe=self.__application_test,
        index="SK_ID_CURR",
        variable_types=None if len(self.__application_test_categorical) == 0
        else self.__application_test_categorical)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="bureau",
        dataframe=self.__bureau,
        index="SK_ID_BUREAU",
        variable_types=None if len(self.__bureau_categorical) == 0
        else self.__bureau_categorical)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="bureau_balance",
        dataframe=self.__bureau_balance,
        make_index=True,
        index="bureau_balance_id",
        variable_types=None if len(self.__bureau_balance_categorical) == 0
        else self.__bureau_balance_categorical)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="previous_application",
        dataframe=self.__previous_application,
        index="SK_ID_PREV",
        variable_types=None if len(self.__previous_application_categorical) == 0
        else self.__previous_application_categorical)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="pos_cash_balance",
        dataframe=self.__pos_cash_balance,
        make_index=True,
        index="pos_cash_balance_id",
        variable_types=None if len(self.__pos_cash_balance_categorical) == 0
        else self.__pos_cash_balance_categorical)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="credit_card_balance",
        dataframe=self.__credit_card_balance,
        make_index=True,
        index="credit_card_balance_id",
        variable_types=None if len(self.__credit_card_balance_categorical) == 0
        else self.__credit_card_balance_categorical)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="installments_payments",
        dataframe=self.__installments_payments,
        make_index=True,
        index="installments_payments_id",
        variable_types=None if len(self.__installments_payments_categorical) == 0
        else self.__installments_payments_categorical)

    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["application_test"]["SK_ID_CURR"],
                        self.__es["bureau"]["SK_ID_CURR"]))
    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["bureau"]["SK_ID_BUREAU"],
                        self.__es["bureau_balance"]["SK_ID_BUREAU"]))
    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["application_test"]["SK_ID_CURR"],
                        self.__es["previous_application"]["SK_ID_CURR"]))
    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["previous_application"]["SK_ID_PREV"],
                        self.__es["pos_cash_balance"]["SK_ID_PREV"]))
    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["previous_application"]["SK_ID_PREV"],
                        self.__es["credit_card_balance"]["SK_ID_PREV"]))
    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["previous_application"]["SK_ID_PREV"],
                        self.__es["installments_payments"]["SK_ID_PREV"]))

    self.__es["previous_application"][
        "NAME_CONTRACT_STATUS_Refused"].interesting_values = [1]
    self.__es["previous_application"][
        "NAME_PRODUCT_TYPE_walk-in"].interesting_values = [1]
    self.__es["previous_application"][
        "CODE_REJECT_REASON_HC"].interesting_values = [1]
        if feat_name != '':
            feature_list.append(feat_name)
    f.close()
    return feature_list


# finally let's import the data
df = pd.read_csv("creditcard.csv")
df = df.drop(['Time'], axis=1)  # ,'V28','V27','V26','V25','V24','V23','V22','V20','V15','V13','V8'], axis=1)
df = df.dropna()

# ok and then we'll do all the featuretools things that need to happen
es = ft.EntitySet(id='card')  # the id is just a label for this EntitySet

# make an entity from the observations data
es = es.entity_from_dataframe(dataframe=df.drop('Class', axis=1),
                              entity_id='obs',
                              index='index')

feature_matrix, feature_names = ft.dfs(entityset=es,
                                       target_entity='obs',
                                       # agg_primitives=['min', 'max', 'mean', 'count', 'sum', 'std', 'trend'],
                                       trans_primitives=['divide_by_feature', 'add_numeric', 'less_than_equal_to',
                                                         'greater_than_equal_to_scalar', 'multiply_numeric',
                                                         'subtract_numeric_scalar', 'divide_numeric_scalar',
                                                         'add_numeric_scalar', 'subtract_numeric', 'divide_numeric',
                                                         'percentile', 'greater_than', 'less_than',
                                                         'multiply_numeric_scalar', 'greater_than_equal_to',
                                                         'modulo_by_feature', 'scalar_subtract_numeric_feature',
                                                         'absolute', 'modulo_numeric'],
                                       max_depth=1,
                                       n_jobs=1,
                                       verbose=1)

# alright here is where we're going to want to cut down all the variables
feature_list = fetch_feature_list()
stats.columns = ['loan_amount', 'payment', 'missed']
clients2 = clients.merge(stats, left_on='client_id', right_index=True, how='left')

# Then add some features manually
clients2['join_month'] = clients2['joined'].dt.month
clients2['log_income'] = np.log(clients2['income'])

# 5 new features with 7 lines of code
clients2.head()

##########################################################################
# So far so good.
# OK, let's use featuretools.
#
# An ft entity is simply a dataframe, and ft uses sets of them - an entityset!
# Basically we're creating metadata. NB payments has no payment_id, so create one.

es = ft.EntitySet(id='myentityset')
es = es.entity_from_dataframe(entity_id='clients',
                              dataframe=clients,
                              index='client_id',
                              time_index='joined')
es = es.entity_from_dataframe(entity_id='loans',
                              dataframe=loans,
                              variable_types={'repaid': ft.variable_types.Categorical},
                              index='loan_id',
                              time_index='loan_start')
es = es.entity_from_dataframe(entity_id='payments',
                              dataframe=payments,
                              variable_types={'missed': ft.variable_types.Categorical},
                              make_index=True,
                              index='payment_id')
# Let's merge the hit and miss dfs for modelling
df = pd.merge(hit_df, miss_df, how='outer')

# Get rid of duplicate values again
df = df.drop_duplicates(subset='CompoundSMILES', keep="first")

# Produce rdkit features from SMILES
df, properties = rdkit_utils.get_rdkit_properties(df)

# Get X, y and training and test data
y = df['Site_No']
X = df.drop(columns=['Site_No'])

# Let's try to add some feature engineering from featuretools
# Make an entityset and add the entity
es = ft.EntitySet(id='chem_features')
es.entity_from_dataframe(entity_id='data',
                         dataframe=X,
                         make_index=False,
                         index='CompoundSMILES')

# Run deep feature synthesis with transformation primitives
X, feature_defs = ft.dfs(entityset=es,
                         max_depth=1,
                         target_entity='data',
                         agg_primitives=["mean", "sum", "mode"],
                         trans_primitives=[
                             'add_numeric', 'multiply_numeric', 'cum_count',
                             'cum_mean', 'cum_sum', 'equal'
                         ])
def __init__(self, id_name=None):
    if id_name is None:
        id_name = 'auto_create'
    self.auto_create = ft.EntitySet(id=id_name)
def get_create_feature_race_df(self, base_df, race_df):
    """ Generate features from the merged data. """
    print("_create_feature")
    raceuma_df = base_df[[
        "RACE_KEY", "UMABAN", "脚質", "距離適性", "父馬産駒連対平均距離", "母父馬産駒連対平均距離", "IDM",
        "テン指数", "ペース指数", "上がり指数", "位置指数", "IDM結果_1", "テン指数結果_1", "上がり指数結果_1",
        "ペース指数結果_1", "レースP指数結果_1", "先行率_1", "追込率_1", "fa_1_1", "fa_2_1", "fa_3_1",
        "fa_4_1", "fa_5_1"
    ]]
    raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"] + raceuma_df["UMABAN"]
    raceuma_df.drop("UMABAN", axis=1, inplace=True)
    # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
    es = ft.EntitySet(id="race")
    es.entity_from_dataframe(entity_id='race', dataframe=race_df, index="RACE_KEY")
    es.entity_from_dataframe(entity_id='raceuma', dataframe=raceuma_df, index="RACE_UMA_KEY")
    relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
    es = es.add_relationship(relationship)
    print(es)

    # Aggregation functions
    aggregation_list = ['min', 'max', 'mean', 'skew', 'percent_true']
    transform_list = []

    # Run DFS
    print("run dfs")
    feature_matrix, features_dfs = ft.dfs(entityset=es,
                                          target_entity='race',
                                          agg_primitives=aggregation_list,
                                          trans_primitives=transform_list,
                                          max_depth=2)
    print("_create_feature: feature_matrix", feature_matrix.shape)

    # Get the rows for the horse ranked first in the baseline popularity
    ninki_df = base_df.query("基準人気順位==1")[[
        "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印",
        "IDM印", "情報印", "騎手印", "厩舎印", "調教印", "激走印", "展開記号", "輸送区分",
        "騎手期待単勝率", "騎手期待3着内率",
        # "激走タイプ",
        "休養理由分類コード", "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ",
        "乗替フラグ", "放牧先ランク", "厩舎ランク", "調教量評価", "仕上指数変化", "調教評価", "IDM",
        "騎手指数", "情報指数", "総合指数", "人気指数", "調教指数", "厩舎指数", "テン指数", "ペース指数",
        "上がり指数", "位置指数", "追切指数", "仕上指数", "IDM結果_1", "IDM結果_2"
    ]].add_prefix("人気_").rename(columns={"人気_RACE_KEY": "RACE_KEY"})

    # Get the rows for the horse predicted to take the lead (展開記号 == '1')
    nige_df = base_df.query("展開記号=='1'")[[
        "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印",
        "IDM印", "基準人気順位", "輸送区分", "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数",
        "位置指数", "追切指数", "仕上指数", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1",
        "レースP指数結果_1", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
        "先行率_1", "先行率_2", "距離", "距離_1"
    ]].add_prefix("逃げ_").rename(columns={"逃げ_RACE_KEY": "RACE_KEY"})
    # Excluded: "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ", "距離フラグ", "クラスフラグ",
    # "転厩フラグ", "去勢フラグ", "乗替フラグ", "斤量_2", "斤量_1",
    nige_df.loc[:, "逃げ_距離増減"] = nige_df["逃げ_距離"] - nige_df["逃げ_距離_1"]
    nige_df.drop(["逃げ_距離", "逃げ_距離_1"], axis=1, inplace=True)
    nige_ddf = nige_df.groupby("RACE_KEY")
    nige_df2 = nige_df.loc[nige_ddf["逃げ_テン指数"].idxmax(), :]

    # Get the rows for the horse predicted to have the fastest closing leg (展開記号 == '2')
    agari_df = base_df.query("展開記号=='2'")[[
        "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番", "総合印",
        "IDM印", "基準人気順位", "輸送区分", "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数",
        "位置指数", "追切指数", "仕上指数", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1",
        "レースP指数結果_1", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2",
        "先行率_1", "先行率_2"
    ]].add_prefix("上り_").rename(columns={"上り_RACE_KEY": "RACE_KEY"})
    # Excluded: "激走タイプ", "休養理由分類コード", "芝ダ障害フラグ", "距離フラグ", "クラスフラグ",
    # "転厩フラグ", "去勢フラグ", "乗替フラグ", "斤量_1", "斤量_2",

    base_df = pd.merge(feature_matrix, nige_df2, on="RACE_KEY", how="left")
    base_df = pd.merge(base_df, agari_df, on="RACE_KEY", how="left")
    base_df = pd.merge(base_df, ninki_df, on="RACE_KEY")
    return base_df
import featuretools as ft

if __name__ == '__main__':
    # Load the base raw data
    data = ft.load_mock_customer()

    # Pull out the different tables from the raw data
    # sessions
    sessions_df = data["sessions"]
    # products
    products_df = data["products"]
    # transactions
    transactions_df = data["transactions"]
    # customers
    customers_df = data["customers"]

    # Create the EntitySet
    es = ft.EntitySet(id="tests")

    # Add entities
    es = es.entity_from_dataframe(
        entity_id="trans_entity",
        dataframe=transactions_df,
        index="transaction_id",
    )
    es = es.entity_from_dataframe(
        entity_id="session_entity",
        dataframe=sessions_df,
        index="session_id",
    )
    # assumed continuation (the original snippet is cut off here; the
    # remaining tables would plausibly be added the same way):
    es = es.entity_from_dataframe(
        entity_id="product_entity",
        dataframe=products_df,
        index="product_id",
    )
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({
        "id": [1, 2, 3],
        "parent_id": [1, 1, 1],
        "time_index": pd.date_range(start='1/1/2018', periods=3),
        "value": [10, 5, 2]
    })

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index")
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(
        entityset=es,
        features=[count, count_where, trend, trend_where],
        cutoff_time=pd.Timestamp("12/31/2017"))
    names = [
        count.get_name(),
        count_where.get_name(),
        trend.get_name(),
        trend_where.get_name()
    ]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=[count_where, trend_where],
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
def __init__(self, sparse_feats=None, dense_feats=None):
    self.sparse_feats = sparse_feats
    self.dense_feats = dense_feats
    self.es = ft.EntitySet(id='MAIN')
def load_mock_customer(n_customers=5, n_products=5, n_sessions=35,
                       n_transactions=500, random_seed=0,
                       return_single_table=False, return_entityset=False):
    """Return dataframes of mock customer data"""
    random.seed(random_seed)

    customers_df = pd.DataFrame({"customer_id": range(1, n_customers + 1)})
    customers_df["zip_code"] = choice(["60091", "02139"], n_customers)
    customers_df["join_date"] = pd.date_range('1/1/2008',
                                              periods=n_customers,
                                              freq='50d')  # todo make these less regular

    products_df = pd.DataFrame({"product_id": range(1, n_products + 1)})
    products_df["brand"] = choice(["A", "B", "C"], n_products)

    sessions_df = pd.DataFrame({"session_id": range(1, n_sessions + 1)})
    sessions_df["customer_id"] = choice(customers_df["customer_id"], n_sessions)
    sessions_df["device"] = choice(["desktop", "mobile", "tablet"], n_sessions)

    transactions_df = pd.DataFrame({"transaction_id": range(1, n_transactions + 1)})
    transactions_df["session_id"] = choice(sessions_df["session_id"], n_transactions)
    transactions_df = transactions_df.sort_values("session_id").reset_index(drop=True)
    transactions_df["transaction_time"] = pd.date_range('1/1/2014',
                                                        periods=n_transactions,
                                                        freq='65s')  # todo make these less regular
    transactions_df["product_id"] = pd.Categorical(
        choice(products_df["product_id"], n_transactions))
    transactions_df["amount"] = random.randint(500, 15000, n_transactions) / 100.0

    # calculate and merge in session start
    # based on the times we came up with for transactions
    session_starts = transactions_df.drop_duplicates("session_id")[
        ["session_id", "transaction_time"]].rename(
            columns={"transaction_time": "session_start"})
    sessions_df = sessions_df.merge(session_starts)

    if return_single_table:
        return transactions_df.merge(sessions_df).merge(customers_df).merge(
            products_df).reset_index(drop=True)
    elif return_entityset:
        es = ft.EntitySet(id="transactions")
        es = es.entity_from_dataframe(
            entity_id="transactions",
            dataframe=transactions_df,
            index="transaction_id",
            time_index="transaction_time",
            variable_types={"product_id": ft.variable_types.Categorical})
        es = es.entity_from_dataframe(entity_id="products",
                                      dataframe=products_df,
                                      index="product_id")
        es = es.entity_from_dataframe(entity_id="sessions",
                                      dataframe=sessions_df,
                                      index="session_id",
                                      time_index="session_start")
        es = es.entity_from_dataframe(entity_id="customers",
                                      dataframe=customers_df,
                                      index="customer_id",
                                      time_index="join_date")
        rels = [
            ft.Relationship(es["products"]["product_id"],
                            es["transactions"]["product_id"]),
            ft.Relationship(es["sessions"]["session_id"],
                            es["transactions"]["session_id"]),
            ft.Relationship(es["customers"]["customer_id"],
                            es["sessions"]["customer_id"])
        ]
        es = es.add_relationships(rels)
        es.add_last_time_indexes()
        return es

    return {
        "customers": customers_df,
        "sessions": sessions_df,
        "transactions": transactions_df,
        "products": products_df
    }
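# Usage of the loader above follows directly from its three return branches:
data = load_mock_customer()                             # dict of four dataframes
flat_df = load_mock_customer(return_single_table=True)  # one merged dataframe
es = load_mock_customer(return_entityset=True)          # wired-up EntitySet
print(sorted(data.keys()))  # ['customers', 'products', 'sessions', 'transactions']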
# here put the import lib
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import featuretools as ft

if __name__ == "__main__":
    dataset = load_iris()
    X = dataset.data
    y = dataset.target
    iris_feature_names = dataset.feature_names
    df = pd.DataFrame(X, columns=iris_feature_names)

    es = ft.EntitySet(id='single_dataframe')  # the id labels the entity set
    # Add the dataframe as an entity named 'iris'
    es.entity_from_dataframe(entity_id='iris',
                             dataframe=df,
                             index='index',
                             make_index=True)
    # Generate new features by adding/subtracting/multiplying/dividing pairs of columns
    trans_primitives = ['add_numeric', 'subtract_numeric',
                        'multiply_numeric', 'divide_numeric']
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                           target_entity='iris',
                                           max_depth=1,  # max_depth=1: only combine the original features
                                           verbose=1,
                                           trans_primitives=trans_primitives)
    ft.list_primitives()  # inspect the available primitives
    # features_df = pd.DataFrame(feature_matrix, columns=feature_names)
    # print(features_df.head())
def test_empty_child_dataframe(parent_child):
    parent_df, child_df = parent_child
    if not isinstance(parent_df, pd.DataFrame):
        parent_vtypes = {'id': variable_types.Index}
        child_vtypes = {
            'id': variable_types.Index,
            'parent_id': variable_types.Numeric,
            'time_index': variable_types.Datetime,
            'value': variable_types.Numeric,
            'cat': variable_types.Categorical
        }
    else:
        parent_vtypes = None
        child_vtypes = None

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent",
                             dataframe=parent_df,
                             index="id",
                             variable_types=parent_vtypes)
    es.entity_from_dataframe(entity_id="child",
                             dataframe=child_df,
                             index="id",
                             time_index="time_index",
                             variable_types=child_vtypes)
    es.add_relationship(
        ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = ft.Feature(es["child"]['id'],
                       parent_entity=es["parent"],
                       primitive=Count)

    # create agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"],
                       primitive=Trend)

    # create multi-output agg feature
    n_most_common = ft.Feature(es["child"]['cat'],
                               parent_entity=es["parent"],
                               primitive=NMostCommon)

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"],
                             where=where,
                             primitive=Trend)
    n_most_common_where = ft.Feature(es["child"]['cat'],
                                     parent_entity=es["parent"],
                                     where=where,
                                     primitive=NMostCommon)

    if isinstance(parent_df, pd.DataFrame):
        features = [count, count_where, trend, trend_where,
                    n_most_common, n_most_common_where]
        names = [count.get_name(),
                 count_where.get_name(),
                 trend.get_name(),
                 trend_where.get_name(),
                 *n_most_common.get_feature_names(),
                 *n_most_common_where.get_feature_names()]
        values = [0, 0, np.nan, np.nan,
                  *np.full(n_most_common.number_output_features, np.nan),
                  *np.full(n_most_common_where.number_output_features, np.nan)]
    else:
        features = [count, count_where]
        names = [count.get_name(), count_where.get_name()]
        values = [0, 0]

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=features,
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    fm = to_pandas(fm)
    assert_array_equal(fm[names], [values])

    # cutoff time after all rows, but where clause filters all rows
    if isinstance(parent_df, pd.DataFrame):
        features = [count_where, trend_where, n_most_common_where]
        names = [count_where.get_name(),
                 trend_where.get_name(),
                 *n_most_common_where.get_feature_names()]
        values = [0, np.nan,
                  *np.full(n_most_common_where.number_output_features, np.nan)]
    else:
        features = [count_where]
        names = [count_where.get_name()]
        values = [0]

    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=features,
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    fm2 = to_pandas(fm2)
    assert_array_equal(fm2[names], [values])
numeric = []
for col in config.ordinal:
    num_col = f'{col}_num'
    numeric.append(num_col)
    X_joined[num_col] = X_joined[col]

# Build the variable-type mapping for the entityset
categorical_ft = dict([col, ft.variable_types.Boolean]
                      for col in config.categorical.keys())
ordinal_ft = dict([col, ft.variable_types.Ordinal] for col in config.ordinal)
numeric_ft = dict([col, ft.variable_types.Numeric] for col in numeric)
variable_dtypes = {**categorical_ft, **ordinal_ft, **numeric_ft}

# Create an entity set and add the dataframe to it
es = ft.EntitySet(id='flu')
es = es.entity_from_dataframe(
    entity_id='flu',
    dataframe=X_joined,
    index='respondent_id',
    variable_types=variable_dtypes,
)

agg_primitives = ['count', 'median', 'entropy']
trans_primitives = ['add_numeric']

# Run deep feature synthesis
dfs_feat, dfs_defs = ft.dfs(entityset=es,
                            target_entity='flu',
                            trans_primitives=trans_primitives,
                            agg_primitives=agg_primitives)