def gen_auto_feats():
    action["userid"] = action["userid"].astype("str")
    future["userid"] = future["userid"].astype("str")
    history["orderTime"] = history["orderTime"].apply(lambda x: get_date(int(x)))
    history["userid"] = history["userid"].astype("str")
    history["orderid"] = history["orderid"].astype("str")
    history["orderType"] = history["orderType"].astype("str")
    comment["userid"] = comment["userid"].astype("str")
    comment["orderid"] = comment["orderid"].astype("str")
    profile["userid"] = profile["userid"].astype("str")
    es = ft.EntitySet(id="train")
    es = es.entity_from_dataframe(entity_id="userProfile", dataframe=profile, index="userid")
    es = es.entity_from_dataframe(entity_id="userComment", dataframe=comment, index="userid")
    es = es.entity_from_dataframe(entity_id="orderHistory", dataframe=history, index="orderid",
                                  time_index="orderTime",
                                  variable_types={"orderType": ft.variable_types.Categorical})
    relationship_1 = ft.Relationship(es["userProfile"]["userid"], es["userComment"]["userid"])
    es = es.add_relationship(relationship_1)
    relationship_2 = ft.Relationship(es["orderHistory"]["orderid"], es["userComment"]["orderid"])
    es = es.add_relationship(relationship_2)
    feature, _ = ft.dfs(entityset=es, target_entity="userProfile")
    # drop duplicated feature columns via a transpose round-trip
    feature = feature.T.drop_duplicates(keep='first').T.reset_index()
    feature = pd.merge(future, feature, how='left', on="userid")
    feature = auto_feats_process(feature)
    feature = feature.fillna(0)  # fillna is not in-place; assign the result back
    return feature
def engineer_features_by_featuretools(model_params, race_master_df, race_table_result_df,
                                      race_past_x_result_df):
    es = ft.EntitySet(id='netkeiba')
    es.entity_from_dataframe(
        entity_id='race_master',
        dataframe=race_master_df[model_params['FEATURETOOLS_PARAMS']['INDEX_COL']['RACE_MASTER']
                                 + model_params['FEATURETOOLS_PARAMS']['FEATURE_COL']['RACE_MASTER']],
        index='race_id')
    es.entity_from_dataframe(
        entity_id='race_table',
        dataframe=race_table_result_df[model_params['FEATURETOOLS_PARAMS']['INDEX_COL']['RACE_TABLE_RESULT']
                                       + model_params['FEATURETOOLS_PARAMS']['FEATURE_COL']['RACE_TABLE_RESULT']],
        index='race_horse_id')
    es.entity_from_dataframe(
        entity_id='race_past_x',
        dataframe=race_past_x_result_df[model_params['FEATURETOOLS_PARAMS']['INDEX_COL']['RACE_PAST_X_RESULT']
                                        + model_params['FEATURETOOLS_PARAMS']['FEATURE_COL']['RACE_PAST_X_RESULT']],
        index='race_horse_past_x_id')
    r_master_table = ft.Relationship(es['race_master']['race_id'], es['race_table']['race_id'])
    r_table_past_x = ft.Relationship(es['race_table']['race_horse_id'],
                                     es['race_past_x']['race_horse_id'])
    es.add_relationships(relationships=[r_master_table])
    es.add_relationships(relationships=[r_table_past_x])
    feature_matrix_df, _ = ft.dfs(
        entityset=es,
        target_entity='race_table',
        agg_primitives=model_params['FEATURETOOLS_PARAMS']['PRIMITIVES']['aggregation'],
        trans_primitives=model_params['FEATURETOOLS_PARAMS']['PRIMITIVES']['transform'],
        max_depth=2
    )
    feature_matrix_df = feature_matrix_df.fillna(0)
    table_index_df = decode_race_horse_id(feature_matrix_df)
    feature_matrix_df = feature_matrix_df.reset_index(drop=True)
    return feature_matrix_df, table_index_df
def create_es(entities, relationships, target, entityset_name="Demo"):
    # Entity set with the given name
    es = ft.EntitySet(id=entityset_name)
    for entity_name, entity_values in entities.items():
        if entity_values["index_col"] is not None:
            es = es.entity_from_dataframe(entity_id=entity_name,
                                          dataframe=entity_values["df"],
                                          index=entity_values["index_col"],
                                          variable_types=entity_values["df_type"])
        else:
            es = es.entity_from_dataframe(entity_id=entity_name,
                                          dataframe=entity_values["df"],
                                          make_index=True,
                                          index=entity_name + "_id",
                                          variable_types=entity_values["df_type"])
    relationship_list = []
    for r in relationships:
        r_parent_df, r_parent_col, r_child_df, r_child_col = r
        relationship = ft.Relationship(es[r_parent_df][r_parent_col],
                                       es[r_child_df][r_child_col])
        relationship_list.append(relationship)
    es = es.add_relationships(relationship_list)
    return es
def add_relationship(entityset, parent, parent_column, child, child_column):
    parent_variable = entityset[parent][parent_column]
    child_variable = entityset[child][child_column]
    relationship = ft.Relationship(parent_variable, child_variable)
    entityset.add_relationship(relationship)
    return entityset
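# A minimal usage sketch for the add_relationship helper above (not from the
# original source). It assumes the legacy featuretools < 1.0 API used throughout
# this file; the table and column names are illustrative only.
def _example_add_relationship():
    import pandas as pd
    import featuretools as ft
    parents = pd.DataFrame({"parent_id": [1, 2]})
    children = pd.DataFrame({"child_id": [10, 11, 12], "parent_id": [1, 1, 2]})
    es = ft.EntitySet(id="example")
    es = es.entity_from_dataframe(entity_id="parents", dataframe=parents, index="parent_id")
    es = es.entity_from_dataframe(entity_id="children", dataframe=children, index="child_id")
    # the parent variable must be the parent entity's index; each parent row
    # can own many child rows
    return add_relationship(es, "parents", "parent_id", "children", "parent_id")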
def merge_featuretools(df_parent, df_related, parent_column, related_column, date_column):
    """Automated feature engineering

    More info:
        https://www.featuretools.com
        https://github.com/featuretools/featuretools
        https://docs.featuretools.com
        http://www.jmaxkanter.com/static/papers/DSAA_DSM_2015.pdf
    """
    # Create the entityset
    es = ft.EntitySet('parent')

    # Add the entities to the entityset
    es = es.entity_from_dataframe('parent', df_parent, index=parent_column)
    es = es.entity_from_dataframe('relate', df_related, make_index=True,
                                  time_index=date_column, index='related_id')

    # Define the relationship
    relationship = ft.Relationship(es['parent'][parent_column], es['relate'][related_column])

    # Add the relationship
    es = es.add_relationships([relationship])

    # Deep feature synthesis
    feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='parent')
    return feature_matrix.reset_index()
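# A hedged usage sketch for merge_featuretools (not part of the original code).
# The frames and the 'customer_id', 'amount', and 'ts' names are made up for
# illustration.
def _example_merge_featuretools():
    import pandas as pd
    df_parent = pd.DataFrame({"customer_id": [1, 2]})
    df_related = pd.DataFrame({
        "customer_id": [1, 1, 2],
        "amount": [10.0, 5.0, 2.0],
        "ts": pd.date_range("2020-01-01", periods=3),
    })
    # returns one row per parent with DFS aggregates over the related rows
    return merge_featuretools(df_parent, df_related,
                              parent_column="customer_id",
                              related_column="customer_id",
                              date_column="ts")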
def test_where_clause_empty_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 1],
                             "time_index": pd.date_range(start='1/1/2018', periods=3),
                             "value": [10, 5, 2]})
    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df,
                             index="id", time_index="time_index")
    es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    where = ft.Feature(es["child"]["value"]) == 1
    count = Count(es["child"]['id'], es["parent"], where=where)

    # cutoff time before all rows
    ft.calculate_feature_matrix(entityset=es, features=[count],
                                cutoff_time=pd.Timestamp("12/31/2017"))

    # cutoff time after all rows, but where clause filters all rows
    ft.calculate_feature_matrix(entityset=es, features=[count],
                                cutoff_time=pd.Timestamp("1/4/2018"))
def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 1],
                             "time_index": pd.date_range(start='1/1/2018', periods=3),
                             "value": [10, 5, 2]})
    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df,
                             index="id", time_index="time_index")
    es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create a regular agg
    count = ft.Feature(es["child"]['id'], parent_entity=es["parent"], primitive=Count)

    # create an agg feature that requires multiple arguments
    trend = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                       parent_entity=es["parent"], primitive=Trend)

    # create aggs with where clauses
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = ft.Feature(es["child"]['id'], parent_entity=es["parent"],
                             where=where, primitive=Count)
    trend_where = ft.Feature([es["child"]['value'], es["child"]['time_index']],
                             parent_entity=es["parent"], where=where, primitive=Trend)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=[count, count_where, trend, trend_where],
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    names = [count.get_name(), count_where.get_name(),
             trend.get_name(), trend_where.get_name()]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=[count_where, trend_where],
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])
def generate_target_label(self, es):
    """Generates target labels in the case of having a missing label in the entityset.

    Args:
        es: fhir entityset.

    Returns:
        Updated entityset with the generated label.

    Raises:
        ValueError: An error occurs if the target label cannot be generated.
    """
    generate_from = 'Period'
    start = self.cutoff_time_label
    end = 'end'
    label_name = self.target_label_column_name

    if (DataLoader().check_column_existence(es, generate_from, start)
            and DataLoader().check_column_existence(es, generate_from, end)):
        if (not DataLoader().check_for_missing_values(es, generate_from, start)
                and not DataLoader().check_for_missing_values(es, generate_from, end)):
            es[generate_from].df[start] = pd.to_datetime(es[generate_from].df[start])
            es[generate_from].df[end] = pd.to_datetime(es[generate_from].df[end])
            duration = (es[generate_from].df[end] - es[generate_from].df[start]).dt.days
            duration = duration.tolist()

            es[self.target_entity].df[label_name] = duration
            updated_target_entity = es[self.target_entity].df

            duration_df = pd.DataFrame({'object_id': duration})
            es = es.entity_from_dataframe(entity_id='Duration',
                                          dataframe=duration_df,
                                          index='object_id')
            es = es.entity_from_dataframe(entity_id=self.target_entity,
                                          dataframe=updated_target_entity,
                                          index='identifier')
            new_relationship = ft.Relationship(es['Duration']['object_id'],
                                               es[self.target_entity][label_name])
            es = es.add_relationship(new_relationship)
            return es
        else:
            raise ValueError(
                'Cannot generate target label {} in table {} because the start or end '
                'labels in table {} contain missing values.'.format(
                    label_name, self.target_entity, generate_from))
    else:
        raise ValueError('Cannot generate target label {} in table {}.'.format(
            label_name, self.target_entity))
def create_es(interactions_train, course_df):
    """Create the entityset representation for featuretools."""
    es = ft.EntitySet('user_events')
    es = es.entity_from_dataframe(entity_id="events",
                                  dataframe=interactions_train.copy(),
                                  make_index=True,
                                  index='id',
                                  time_index='date',
                                  variable_types=events_vtypes)
    es = es.entity_from_dataframe(entity_id="steps",
                                  dataframe=course_df.copy(),
                                  index='step_id',
                                  variable_types=course_vtypes)
    es.normalize_entity('events', 'users', 'user_id', make_time_index=False)
    es = es.add_relationship(ft.Relationship(es['steps']['step_id'],
                                             es['events']['step_id']))

    lesson_additional_variables = [
        'lesson_abuse_count', 'lesson_discussions_count', 'lesson_epic_count',
        'lesson_passed_by', 'lesson_time_to_complete', 'lesson_title',
        'lesson_viewed_by', 'lesson_vote_delta', 'section_id',
        'section_position', 'section_title']
    es.normalize_entity('steps', 'lessons', 'lesson_id',
                        additional_variables=lesson_additional_variables,
                        make_time_index=False)

    sections_additional_variables = ['section_position', 'section_title']
    es.normalize_entity('lessons', 'sections', 'section_id',
                        additional_variables=sections_additional_variables,
                        make_time_index=False)

    es["events"]["action"].interesting_values = interactions_train.action.unique().categories
    es["steps"]["step_block.name"].interesting_values = course_df['step_block.name'].unique()
    return es
def load_train_data():
    print('Loading CSV data...')
    applications_df = pd.read_csv(C_PATH + 'application_train.csv')
    previous_df = pd.read_csv(C_PATH + 'previous_application.csv')
    # bureau_df = pd.read_csv(C_PATH + 'bureau.csv')

    print("Creating entityset...")
    es = ft.EntitySet(id="home-credit")

    print("Loading applications entity...")
    es = es.entity_from_dataframe(entity_id="applications", dataframe=applications_df,
                                  index="SK_ID_CURR")

    print("Loading previous entity...")
    es = es.entity_from_dataframe(entity_id="previous", dataframe=previous_df,
                                  index="SK_ID_PREV")

    # print("Loading bureau data...")
    # es = es.entity_from_dataframe(entity_id="bureau", dataframe=bureau_df, index="SK_ID_BUREAU")

    print("Adding relationships...")
    applications_previous = ft.Relationship(es["applications"]["SK_ID_CURR"],
                                            es["previous"]["SK_ID_CURR"])
    es = es.add_relationship(applications_previous)
    # applications_bureau = ft.Relationship(es["applications"]["SK_ID_CURR"], es["bureau"]["SK_ID_CURR"])
    # es = es.add_relationship(applications_bureau)
    # return es

    print("Generating DFS...")
    feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="applications",
                                          verbose=True)
    fm_encoded, defs_encoded = ft.encode_features(feature_matrix, feature_defs)
    return fm_encoded, defs_encoded
def create_relations(self):
    """Add relations to the EntitySet object."""
    list_relations_in_ft_format = list()
    for rel in self.relations:
        table_0 = rel[0][0]
        col_0 = rel[0][1]
        table_1 = rel[1][0]
        col_1 = rel[1][1]
        if check_col_in_df(self.dict_dataframes[table_0], col_0) and \
                check_col_in_df(self.dict_dataframes[table_1], col_1):
            rel_ = ft.Relationship(self.entities[table_0][col_0],
                                   self.entities[table_1][col_1])
            list_relations_in_ft_format.append(rel_)
        else:
            raise ValueError(f"Check key column in {table_0} or {table_1}")
    self.entities = self.entities.add_relationships(list_relations_in_ft_format)
    logging.info(f"Entityset is created \n {self.entities}")
def get_entityset(holding_data, price_data, trans_data, company_data):
    '''Construct an entityset data model from different data frames.'''
    company_data = company_data.drop(
        ['listing_date', 'delisting_date', 'last_trade_date', 'indices'], axis=1)
    es = ft.EntitySet(id="trading")
    es = es.entity_from_dataframe(entity_id="prices",
                                  dataframe=price_data,
                                  time_index="Date",
                                  index='index',
                                  variable_types={"Tick": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(entity_id="holdings",
                                  dataframe=holding_data,
                                  index='Tick',
                                  time_index="Date",
                                  variable_types={"Tick": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(entity_id="companies",
                                  dataframe=company_data,
                                  index='index',
                                  time_index="Date",
                                  variable_types={"Tick": ft.variable_types.Categorical})
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=trans_data,
                                  index='index',
                                  time_index="Date",
                                  variable_types={"Tick": ft.variable_types.Categorical,
                                                  "Type": ft.variable_types.Categorical})
    holdings_trans = ft.Relationship(es["holdings"]["Tick"], es["transactions"]["Tick"])
    es = es.add_relationship(holdings_trans)
    holdings_companies = ft.Relationship(es["holdings"]["Tick"], es["companies"]["Tick"])
    es = es.add_relationship(holdings_companies)
    holdings_prices = ft.Relationship(es["holdings"]["Tick"], es["prices"]["Tick"])
    es = es.add_relationship(holdings_prices)
    return es
def agg(train_df, hist_df, new_trans_df, isTrain, x_save_path, y_save_path):
    train_df = train_df.copy(deep=True)
    if isTrain:
        target = train_df['target']
        del train_df['target']
    else:
        target = None
    es_train = ft.EntitySet(id='es_train')
    # 'card_id' is the key the relationships below join on; the transaction
    # tables have no unique key of their own, so let featuretools create one
    # (the 'history_id'/'new_trans_id' index names are assumptions)
    es_train = es_train.entity_from_dataframe(entity_id='train', dataframe=train_df,
                                              index='card_id',
                                              time_index='first_active_month')
    es_train = es_train.entity_from_dataframe(entity_id='history', dataframe=hist_df,
                                              make_index=True, index='history_id',
                                              time_index='purchase_date')
    es_train = es_train.entity_from_dataframe(entity_id='new_trans', dataframe=new_trans_df,
                                              make_index=True, index='new_trans_id',
                                              time_index='purchase_date')
    # Relationship between clients and historical transactions
    r_client_history = ft.Relationship(es_train['train']['card_id'],
                                       es_train['history']['card_id'])
    es_train = es_train.add_relationship(r_client_history)
    # Relationship between clients and new transactions
    r_client_new = ft.Relationship(es_train['train']['card_id'],
                                   es_train['new_trans']['card_id'])
    es_train = es_train.add_relationship(r_client_new)
    print(" dfs ing ... ")
    x_train, _ = ft.dfs(entityset=es_train, target_entity='train', max_depth=2)
    send_msg("dfs done! ")
    print("saving...")
    if target is not None:  # a Series is ambiguous in a boolean context
        target.to_csv(y_save_path)
        x_train['index'] = target.index
        x_train = x_train.set_index('index')  # set_index returns a new frame
    x_train.to_csv(x_save_path)
    return x_train, target
def test_serialization(es):
    relationship = ft.Relationship(es['sessions']['id'], es['log']['session_id'])
    dictionary = {
        'parent_entity_id': 'sessions',
        'parent_variable_id': 'id',
        'child_entity_id': 'log',
        'child_variable_id': 'session_id',
    }
    assert relationship.to_dictionary() == dictionary
    assert ft.Relationship.from_dictionary(dictionary, es) == relationship
def multitable_d3m_to_entityset(inpath):
    with open(os.path.join(inpath, 'data/dataSchema.json'), 'rb') as f:
        raw_json = json.load(f)
    es = ft.EntitySet(raw_json['dataSchema']['datasetId'])
    relationships = []
    # entity names appear as "rawData/<name>" keys; strip the 8-char prefix
    entities = [key[8:] for key in raw_json['dataSchema'] if key.startswith("rawData/")]
    d3m_var_to_ft_var = {
        'boolean': Boolean,
        'float': Numeric,
        'zeroToOneFloat': Numeric,
        'integer': Numeric,
        'text': Text,
        'categorical': Categorical,
        'ordinal': Ordinal,
        'dateTime': Datetime
    }
    for entity in entities:
        index = None
        csv_path = os.path.join(inpath, 'data/raw_data/%s.csv' % (entity))
        entityData = raw_json['dataSchema']['rawData/%s' % (entity)]['rawData/%s' % (entity)]
        var_types = {}
        for varData in entityData:
            if varData['varRole'] == 'index':
                index = varData['varName']
                var_types[varData['varName']] = Index
            elif 'varReference' in varData:
                var_types[varData['varName']] = Id
                parent_entity = varData['varReference']['references'][8:]
                parent_var_id = varData['varReference']['reference_id']
                relationships.append(
                    (parent_entity, parent_var_id, entity, varData['varName']))
            else:
                var_types[varData['varName']] = d3m_var_to_ft_var[varData['varType']]
        es.entity_from_csv(entity, csv_path, index=index, variable_types=var_types)
    for parent_entity, par_var_id, child_entity, child_var_id in relationships:
        relationship = ft.Relationship(es[parent_entity][par_var_id],
                                       es[child_entity][child_var_id])
        es.add_relationship(relationship)
    return es
def create_entity_set(entityset_name, entityset_quads, entity_relationships):
    es = ft.EntitySet(entityset_name)
    for es_quad in entityset_quads:
        es.entity_from_dataframe(entity_id=es_quad[0],
                                 dataframe=es_quad[1],
                                 index=es_quad[2],
                                 time_index=es_quad[3])
    # if the cohorts entity is included
    if len(entityset_quads) > 2:
        for rel in entity_relationships:
            es.add_relationship(ft.Relationship(es[rel[0]][rel[2]], es[rel[1]][rel[2]]))
    # if the cohorts entity is not included
    elif len(entityset_quads) == 2:
        er = entity_relationships
        es.add_relationship(ft.Relationship(es[er[0]][er[2]], es[er[1]][er[2]]))
    return es
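# A sketch of the inputs create_entity_set above expects, inferred from how the
# quads and relationship triples are unpacked; the table and column names are
# hypothetical. Note that with exactly two entities the function takes a single
# flat (parent, child, shared-column) triple rather than a list of triples.
def _example_create_entity_set():
    import pandas as pd
    patients = pd.DataFrame({"patient_id": [1, 2],
                             "admit": pd.to_datetime(["2020-01-01", "2020-01-02"])})
    visits = pd.DataFrame({"visit_id": [10, 11],
                           "patient_id": [1, 2],
                           "visit_time": pd.to_datetime(["2020-01-03", "2020-01-04"])})
    quads = [("patients", patients, "patient_id", "admit"),
             ("visits", visits, "visit_id", "visit_time")]
    return create_entity_set("demo", quads, ("patients", "visits", "patient_id"))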
def create_entity_set(dp: str, sp: list, esc: list, rls: list, od: Any,
                      mt: str, oge: bool = False) -> Any:
    """Create an entity set.

    :param dp: directory containing the data files
    :param sp: list of files to skip
    :param esc: list of custom entities
    :param rls: list of custom relationships
    :param od: post-processing applied after reading the chunked files
    :param mt: main table
    :param oge: whether to return only the entity set
    :return: the resulting entity set
    """
    if os.path.exists(os.path.join(dp, Data_Val.feature_matrix_part_file)):
        Log.debug('Skipping entity set creation for {}'.format(dp))
        return None
    data = DealDataFile.get_data_dict_by_path(dp, sp)
    es = ft.EntitySet(id='clients')
    data = od(data)
    # custom entities
    for x in esc:
        if len(x) != 2:
            return None
        if x[1] in data[x[0]]:
            es = es.entity_from_dataframe(entity_id=x[0], dataframe=data[x[0]], index=x[1])
        else:
            es = es.entity_from_dataframe(entity_id=x[0], dataframe=data[x[0]],
                                          make_index=True, index=x[1])
    # custom relationships
    r = []
    for x in rls:
        if len(x) != 4:
            return None
        r.append(ft.Relationship(es[x[0]][x[1]], es[x[2]][x[3]]))
    es = es.add_relationships(r)
    if oge:
        return es
    else:
        feature_matrix_from_entity_set(es, dp, mt)
        return None
def get_ft_features(askprice, bidprice, askvolume, bidvolume, others):
    es_train = ft.EntitySet(id='stock')
    es_train = es_train.entity_from_dataframe(entity_id='askprices', dataframe=askprice,
                                              index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='bidprices', dataframe=bidprice,
                                              index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='askvolumes', dataframe=askvolume,
                                              index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='bidvolumes', dataframe=bidvolume,
                                              index='stock_id', make_index=True)
    es_train = es_train.entity_from_dataframe(entity_id='otherprices', dataframe=others,
                                              index='stock_id', make_index=True)
    r1 = ft.Relationship(es_train['askprices']['stock_id'], es_train['askvolumes']['stock_id'])
    r2 = ft.Relationship(es_train['bidprices']['stock_id'], es_train['bidvolumes']['stock_id'])
    r3 = ft.Relationship(es_train['askprices']['stock_id'], es_train['otherprices']['stock_id'])
    es_train = es_train.add_relationship(r1)
    es_train = es_train.add_relationship(r2)
    es_train = es_train.add_relationship(r3)
    print(es_train)
    features, feature_names = ft.dfs(entityset=es_train, target_entity='askprices')
    print(features)
    return np.array(features)
def set_es(self):
    self.__es = ft.EntitySet(id="customers")
    self.__es = self.__es.entity_from_dataframe(entity_id="customers",
                                                index="customer_id",
                                                dataframe=self.__customers_df)
    self.__es = self.__es.entity_from_dataframe(
        entity_id="sessions",
        index="session_id",
        dataframe=self.__sessions_df,
        variable_types={"device": ft.variable_types.Categorical})
    self.__es = self.__es.add_relationship(
        ft.Relationship(self.__es["customers"]["customer_id"],
                        self.__es["sessions"]["customer_id"]))
def add_relationship(self, parent_variable_name, child_variable_name):
    """
    relation = ft.Relationship(es['t1']['id'], es['t2']['id'])
    es.add_relationship(relation)

    :param parent_variable_name: (entity_id, id)
    :param child_variable_name: (entity_id, id)
    :return:
    """
    relation = ft.Relationship(
        parent_variable=self.es[parent_variable_name[0]][parent_variable_name[1]],
        child_variable=self.es[child_variable_name[0]][child_variable_name[1]]
    )
    self.es.add_relationship(relation)
def _build_child_entity(self, entity_set, child_entitys_info):
    for entity_name, entity_name_info in child_entitys_info.items():
        parent_entitys = entity_name_info['parent_entity']
        for parent_entity in parent_entitys:
            parent_entity_name = parent_entity['entity_name']
            join_column = parent_entity.get('join_column', None)
            parent_entity_index = self.entity_set_info[parent_entity_name]['index'][0]
            if join_column is None:
                join_column = parent_entity_index
            entity_set.add_relationship(
                ft.Relationship(entity_set[parent_entity_name][parent_entity_index],
                                entity_set[entity_name][join_column]))
    return entity_set
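# A hedged sketch of the child_entitys_info mapping _build_child_entity consumes,
# inferred from the lookups above; the entity and column names are illustrative.
_example_child_entitys_info = {
    "orders": {                              # child entity to wire up
        "parent_entity": [
            {"entity_name": "customers",     # parent entity id
             "join_column": "customer_id"},  # column in 'orders'; defaults to
                                             # the parent's index when omitted
        ],
    },
}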
def create_entity_set(data: dict, train_table: str, test_table: str) -> ft.EntitySet:
    print("\nCreating entity set based on client data")
    start = time.monotonic()
    es = ft.EntitySet(id='clients')
    es = es.entity_from_dataframe(entity_id='combined_train_test',
                                  dataframe=data['combined_train_test'],
                                  index='SK_ID_CURR')
    es = es.entity_from_dataframe(entity_id='bureau',
                                  dataframe=data['bureau'],
                                  index='SK_ID_BUREAU')
    es = es.entity_from_dataframe(entity_id='bureau_balance',
                                  dataframe=data['bureau_balance'],
                                  make_index=True,
                                  index='bureaubalance_index')
    es = es.entity_from_dataframe(entity_id='previous_application',
                                  dataframe=data['previous_application'],
                                  index='SK_ID_PREV')
    es = es.add_relationships([
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'], es['bureau']['SK_ID_CURR']),
        ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(es['combined_train_test']['SK_ID_CURR'],
                        es['previous_application']['SK_ID_CURR'])
    ])
    end = time.monotonic()
    print(f"  Entity set creation completed in {round(end - start)} seconds")
    return es
def main():
    formatter = '%(asctime)s %(message)s'
    logging.basicConfig(filename='../logs/02_featuretools.log',
                        level=logging.INFO,
                        format=formatter)
    datas = read_all()
    app_train = datas['application_train']
    app_test = datas['application_test']
    bureau = datas['bureau']
    bureau_balance = datas['bureau_balance']
    cash = datas['POS_CASH_balance']
    previous = datas['previous_application']
    installments = datas['installments_payments']
    credit = datas['credit_card_balance']

    app_test["TARGET"] = np.nan
    app = app_train.append(app_test, ignore_index=True, sort=False)

    # Entity set with id HomeCredit
    entity_set = ft.EntitySet(id='HomeCredit')

    # Entities with a unique index
    entity_set = entity_set.entity_from_dataframe(entity_id='app', dataframe=app,
                                                  index='SK_ID_CURR')
    entity_set = entity_set.entity_from_dataframe(entity_id='bureau', dataframe=bureau,
                                                  index='SK_ID_BUREAU')
    entity_set = entity_set.entity_from_dataframe(entity_id='previous', dataframe=previous,
                                                  index='SK_ID_PREV')

    # Entities that do not have a unique index
    entity_set = entity_set.entity_from_dataframe(entity_id='bureau_balance',
                                                  dataframe=bureau_balance,
                                                  make_index=True,
                                                  index='bureaubalance_index')
    entity_set = entity_set.entity_from_dataframe(entity_id='cash',
                                                  dataframe=cash,
                                                  make_index=True,
                                                  index='cash_index')
    entity_set = entity_set.entity_from_dataframe(entity_id='installments',
                                                  dataframe=installments,
                                                  make_index=True,
                                                  index='installments_index')
    entity_set = entity_set.entity_from_dataframe(entity_id='credit',
                                                  dataframe=credit,
                                                  make_index=True,
                                                  index='credit_index')

    # Add in the defined relationships
    entity_set = entity_set.add_relationships([
        ft.Relationship(entity_set['app']['SK_ID_CURR'],
                        entity_set['bureau']['SK_ID_CURR']),
        ft.Relationship(entity_set['bureau']['SK_ID_BUREAU'],
                        entity_set['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(entity_set['app']['SK_ID_CURR'],
                        entity_set['previous']['SK_ID_CURR']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'],
                        entity_set['cash']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'],
                        entity_set['installments']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'],
                        entity_set['credit']['SK_ID_PREV'])
    ])

    agg_primitives = ['sum', 'count', 'min', 'max', 'mean', 'mode']
    feature_matrix, _ = ft.dfs(entityset=entity_set,
                               target_entity='app',
                               agg_primitives=agg_primitives,
                               max_depth=2,
                               features_only=False,
                               verbose=True)
    feature_matrix = feature_matrix.reset_index()
    dump(feature_matrix, '../data/02_featuretools/feature_matrix.joblib')
def add_relation(self, relationships: list):
    '''Add entity relations for auto_create.

    Parameters
    ----------
    relationships : list
        Parent-to-child relations, flattened as alternating parent/child
        entries in the format
        ['entity1.key1', 'entity2.key1', 'entity2.key2', 'entity3.key2'].
    '''
    relationships = [item.split('.') for item in relationships]
    trans_relationships = [
        ft.Relationship(self.auto_create[parent[0]][parent[1]],
                        self.auto_create[child[0]][child[1]])
        for parent, child in zip(relationships[::2], relationships[1::2])
    ]
    self.auto_create = self.auto_create.add_relationships(trans_relationships)
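# A hedged illustration (not from the original source) of how add_relation pairs
# the flat list into (parent, child) relations; the dotted names are made up.
_pairs = [item.split('.') for item in
          ['orders.order_id', 'items.order_id',
           'customers.customer_id', 'orders.customer_id']]
# zip(_pairs[::2], _pairs[1::2]) yields
#   (['orders', 'order_id'], ['items', 'order_id']) and
#   (['customers', 'customer_id'], ['orders', 'customer_id'])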
def dask_es(make_es):
    es = ft.EntitySet(id=make_es.id)
    for entity in make_es.entities:
        es.entity_from_dataframe(
            entity.id,
            dd.from_pandas(entity.df.reset_index(drop=True), npartitions=4),
            index=entity.index,
            time_index=entity.time_index,
            variable_types=entity.variable_types,
            secondary_time_index=entity.secondary_time_index)
    for rel in make_es.relationships:
        es.add_relationship(
            ft.Relationship(es[rel.parent_entity.id][rel.parent_variable.id],
                            es[rel.child_entity.id][rel.child_variable.id]))
    return es
def _feature_summary_data(self):
    raceuma_df = self.base_df[[
        "RACE_KEY", "UMABAN", "激走指数", "馬スタート指数", "馬出遅率", "IDM", "騎手指数",
        "テン指数", "ペース指数", "上がり指数", "位置指数", "テンF指数", "中間F指数", "終いF指数",
        "コーナー順位3_1", "コーナー順位4_1", "前3F先頭差_1", "後3F先頭差_1", "レース脚質_1",
        "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1", "レースP指数結果_1", "追込率_1",
        "コーナー順位3_2", "コーナー順位4_2", "前3F先頭差_2", "後3F先頭差_2", "レース脚質_2",
        "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2", "レースP指数結果_2", "追込率_2",
        "コーナー順位3_3", "コーナー順位4_3", "前3F先頭差_3", "後3F先頭差_3", "レース脚質_3",
        "テン指数結果_3", "上がり指数結果_3", "ペース指数結果_3", "レースP指数結果_3", "追込率_3",
        "コーナー順位3_4", "コーナー順位4_4", "前3F先頭差_4", "後3F先頭差_4", "レース脚質_4",
        "テン指数結果_4", "上がり指数結果_4", "ペース指数結果_4", "レースP指数結果_4", "追込率_4",
        "コーナー順位3_5", "コーナー順位4_5", "前3F先頭差_5", "後3F先頭差_5", "レース脚質_5",
        "テン指数結果_5", "上がり指数結果_5", "ペース指数結果_5", "レースP指数結果_5", "追込率_5"
    ]]
    raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"].astype(str).str.cat(
        raceuma_df["UMABAN"].astype(str))
    raceuma_df.drop("UMABAN", axis=1, inplace=True)
    es = ft.EntitySet(id="race")
    es.entity_from_dataframe(entity_id='race',
                             dataframe=self.ld.race_df[["RACE_KEY", "target_date"]],
                             index="RACE_KEY")
    es.entity_from_dataframe(entity_id='raceuma', dataframe=raceuma_df, index="RACE_UMA_KEY")
    relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
    es = es.add_relationship(relationship)
    print(es)
    # aggregation primitives
    aggregation_list = ['mean', 'skew']
    transform_list = []
    # run dfs
    print("run dfs")
    feature_matrix, features_dfs = ft.dfs(entityset=es,
                                          target_entity='race',
                                          agg_primitives=aggregation_list,
                                          trans_primitives=transform_list,
                                          max_depth=2)
    feature_summary_df = pd.merge(feature_matrix, self.ld.race_df,
                                  on=["RACE_KEY", "target_date"])
    print("_create_feature: feature_summary_df", feature_summary_df.shape)
    return feature_summary_df
def create_relationships(self, relationships, entity_set):
    """Binds entities in the entityset.

    Args:
        relationships: A dataframe of the relationships in fhir.
        entity_set: The global entityset that the entity will be added to.
    """
    for i, relation in relationships.iterrows():
        # parent table: 'parent_entity', field: 'parent_variable'
        # child table: 'child_entity', field: 'child_variable'
        new_relationship = ft.Relationship(
            entity_set[relation['parent_entity']][relation['parent_variable']],
            entity_set[relation['child_entity']][relation['child_variable']])
        entity_set.add_relationship(new_relationship)
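# A sketch of the relationships frame create_relationships iterates over; the
# FHIR resource names below are hypothetical, but the column names must match
# the keys used above.
def _example_relationships_frame():
    import pandas as pd
    return pd.DataFrame([{"parent_entity": "Patient",
                          "parent_variable": "identifier",
                          "child_entity": "Encounter",
                          "child_variable": "subject"}])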
def _create_feature(self):
    """Generate features from the merged data."""
    print("_create_feature")
    raceuma_df = self.base_df[["RACE_KEY", "UMABAN", "脚質", "距離適性", "父馬産駒連対平均距離",
                               "母父馬産駒連対平均距離", "IDM", "テン指数", "ペース指数", "上がり指数",
                               "位置指数", "IDM結果_1", "テン指数結果_1", "上がり指数結果_1",
                               "ペース指数結果_1", "レースP指数結果_1", "先行率_1", "追込率_1",
                               "fa_1_1", "fa_2_1", "fa_3_1", "fa_4_1", "fa_5_1"]]
    raceuma_df.loc[:, "RACE_UMA_KEY"] = raceuma_df["RACE_KEY"] + raceuma_df["UMABAN"]
    raceuma_df.drop("UMABAN", axis=1, inplace=True)
    # https://qiita.com/daigomiyoshi/items/d6799cc70b2c1d901fb5
    es = ft.EntitySet(id="race")
    es.entity_from_dataframe(entity_id='race',
                             dataframe=self.ld.race_df.drop("NENGAPPI", axis=1),
                             index="RACE_KEY")
    es.entity_from_dataframe(entity_id='raceuma', dataframe=raceuma_df, index="RACE_UMA_KEY")
    relationship = ft.Relationship(es['race']["RACE_KEY"], es['raceuma']["RACE_KEY"])
    es = es.add_relationship(relationship)
    print(es)
    # aggregation primitives
    aggregation_list = ['min', 'max', 'mean', 'skew', 'percent_true']
    transform_list = []
    # run dfs
    print("run dfs")
    feature_matrix, features_dfs = ft.dfs(entityset=es,
                                          target_entity='race',
                                          agg_primitives=aggregation_list,
                                          trans_primitives=transform_list,
                                          max_depth=2)
    print("_create_feature: feature_matrix", feature_matrix.shape)

    # get data for the predicted first favorite
    ninki_df = self.base_df.query("基準人気順位==1")[[
        "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番",
        "総合印", "IDM印", "情報印", "騎手印", "厩舎印", "調教印", "激走印", "展開記号",
        "輸送区分", "騎手期待単勝率", "騎手期待3着内率", "激走タイプ", "休養理由分類コード",
        "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
        "放牧先ランク", "厩舎ランク", "調教量評価", "仕上指数変化", "調教評価", "IDM", "騎手指数",
        "情報指数", "総合指数", "人気指数", "調教指数", "厩舎指数", "テン指数", "ペース指数",
        "上がり指数", "位置指数", "追切指数", "仕上指数", "IDM結果_1", "IDM結果_2"
    ]].add_prefix("人気_").rename(columns={"人気_RACE_KEY": "RACE_KEY"})

    # get data for the predicted front-runner
    nige_df = self.base_df.query("展開記号=='1'")[[
        "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番",
        "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード",
        "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
        "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数",
        "仕上指数", "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1",
        "レースP指数結果_1", "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2",
        "レースP指数結果_2", "先行率_1", "先行率_2"
    ]].add_prefix("逃げ_").rename(columns={"逃げ_RACE_KEY": "RACE_KEY"})

    # get data for the horse predicted to have the fastest closing leg
    agari_df = self.base_df.query("展開記号=='2'")[[
        "RACE_KEY", "脚質", "距離適性", "上昇度", "激走指数", "蹄コード", "見習い区分", "枠番",
        "総合印", "IDM印", "基準人気順位", "輸送区分", "激走タイプ", "休養理由分類コード",
        "芝ダ障害フラグ", "距離フラグ", "クラスフラグ", "転厩フラグ", "去勢フラグ", "乗替フラグ",
        "IDM", "騎手指数", "テン指数", "ペース指数", "上がり指数", "位置指数", "追切指数",
        "仕上指数", "斤量_1", "テン指数結果_1", "上がり指数結果_1", "ペース指数結果_1",
        "レースP指数結果_1", "斤量_2", "テン指数結果_2", "上がり指数結果_2", "ペース指数結果_2",
        "レースP指数結果_2", "先行率_1", "先行率_2"
    ]].add_prefix("上り_").rename(columns={"上り_RACE_KEY": "RACE_KEY"})

    self.base_df = pd.merge(feature_matrix, nige_df, on="RACE_KEY", how="left")
    self.base_df = pd.merge(self.base_df, agari_df, on="RACE_KEY", how="left")
    self.base_df = pd.merge(self.base_df, ninki_df, on="RACE_KEY")
    self.base_df = pd.merge(self.base_df, self.ld.race_df[["RACE_KEY", "NENGAPPI"]],
                            on="RACE_KEY")
def ks_es(make_es):
    ks = pytest.importorskip('databricks.koalas', reason="Koalas not installed, skipping")
    es = ft.EntitySet(id=make_es.id)
    for entity in make_es.entities:
        cleaned_df = pd_to_ks_clean(entity.df).reset_index(drop=True)
        es.entity_from_dataframe(
            entity.id,
            ks.from_pandas(cleaned_df),
            index=entity.index,
            time_index=entity.time_index,
            variable_types=entity.variable_types,
            secondary_time_index=entity.secondary_time_index)
    for rel in make_es.relationships:
        es.add_relationship(
            ft.Relationship(es[rel.parent_entity.id][rel.parent_variable.id],
                            es[rel.child_entity.id][rel.child_variable.id]))
    return es
def _to_entityset(self, dataset):
    es = ft.EntitySet()
    for table_name, df in dataset.tables.items():
        if len(df.columns) == 1:
            continue  # skipping single-column tables
        table = dataset.metadata.get_table(table_name)
        primary_key = table["primary_key"] if "primary_key" in table else None
        if isinstance(primary_key, str):
            es = es.entity_from_dataframe(entity_id=table_name,
                                          dataframe=df.copy(),
                                          index=primary_key)
        else:
            es = es.entity_from_dataframe(entity_id=table_name,
                                          dataframe=df.copy(),
                                          make_index=True,
                                          index="_ft_id")
            if not primary_key:
                logger.warning("Table %s has no primary key.", table_name)
            else:
                logger.warning(
                    "Table %s has a composite primary key, it will be ignored.",
                    table_name)
    for foreign_key in dataset.metadata.get_foreign_keys():
        if foreign_key["table"] not in es.entity_dict:
            continue
        if foreign_key["ref_table"] not in es.entity_dict:
            continue
        if not isinstance(foreign_key["ref_field"], str):
            logger.warning(
                "Tables %s and %s have a composite foreign key, it will be ignored.",
                foreign_key["ref_table"], foreign_key["table"])
            continue
        try:
            es = es.add_relationship(
                ft.Relationship(es[foreign_key["ref_table"]][foreign_key["ref_field"]],
                                es[foreign_key["table"]][foreign_key["field"]]))
        except ValueError as err:
            logger.warning(err)
    return es