def _generate_features(self, input_df):
    """Build a featuretools feature matrix from a time-indexed dataframe.

    Copies ``input_df``, adds a 1-based ``id`` column, registers the frame as
    the single entity ``"time_seq"`` (time index ``self.dt_col``), and runs
    DFS with stock datetime primitives plus two custom transform primitives.

    Returns:
        (feature_matrix, feature_defs) exactly as produced by ``ft.dfs``.
    """
    frame = input_df.copy()
    frame["id"] = frame.index + 1

    entityset = ft.EntitySet(id="data")
    entityset = entityset.entity_from_dataframe(
        entity_id="time_seq",
        dataframe=frame,
        index="id",
        time_index=self.dt_col,
    )

    def is_awake(column):
        # 1 for hours 6..23 or hour 0, else 0 (i.e. everything but 1-5 am).
        hour = column.dt.hour
        return (((hour >= 6) & (hour <= 23)) | (hour == 0)).astype(int)

    def is_busy_hours(column):
        # 1 during morning (7-9) and evening (16-19) peaks; `&` binds
        # tighter than `|`, so the expression groups as intended.
        hour = column.dt.hour
        return (((hour >= 7) & (hour <= 9)) | (hour >= 16) & (hour <= 19)).astype(int)

    IsAwake = make_trans_primitive(
        function=is_awake,
        input_types=[DatetimeTimeIndex],
        return_type=Numeric,
    )
    IsBusyHours = make_trans_primitive(
        function=is_busy_hours,
        input_types=[DatetimeTimeIndex],
        return_type=Numeric,
    )

    feature_matrix, feature_defs = ft.dfs(
        entityset=entityset,
        target_entity="time_seq",
        agg_primitives=["count"],
        trans_primitives=[
            "month",
            "weekday",
            "day",
            "hour",
            "is_weekend",
            IsAwake,
            IsBusyHours,
        ],
    )
    return feature_matrix, feature_defs
def test_make_transform_restricts_time_arg():
    """`time` is a reserved argument name (the backend injects the
    calculation time there), so make_trans_primitive must reject a function
    taking `time` unless ``uses_calc_time=True`` is passed.

    Fixes: assert on the specific error message instead of any ValueError,
    and correct the "erorr" typo in the description string.
    """
    # Declaring uses_calc_time=True makes the `time` parameter legal.
    make_trans_primitive(lambda time: time,
                         [Datetime],
                         Numeric,
                         name="AllowedPrimitive",
                         description="This primitive should be accepted",
                         uses_calc_time=True)

    # Without the flag, construction must raise with the restricted-keyword
    # message (message text taken from the library's ValueError).
    error_text = "'time' is a restricted keyword. Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_trans_primitive(lambda time: time,
                             [Datetime],
                             Numeric,
                             name="BadPrimitive",
                             description="This primitive should error")
def test_make_transform_restricts_time_arg():
    """`time` is a reserved argument name (the backend injects the
    calculation time there), so make_trans_primitive must reject a function
    taking `time` unless ``uses_calc_time=True`` is passed.

    Fix: correct the "erorr" typo in the description string.
    """
    # Declaring uses_calc_time=True makes the `time` parameter legal.
    make_trans_primitive(lambda time: time,
                         [Datetime],
                         Numeric,
                         name="AllowedPrimitive",
                         description="This primitive should be accepted",
                         uses_calc_time=True)

    # Without the flag, construction must raise with this exact message.
    error_text = "'time' is a restricted keyword. Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        make_trans_primitive(lambda time: time,
                             [Datetime],
                             Numeric,
                             name="BadPrimitive",
                             description="This primitive should error")
def test_make_transform_sets_kwargs_correctly(es):
    """Each instantiation of a custom primitive must capture its own base
    feature and its own keyword arguments, independently of other instances."""

    def pd_is_in(array, list_of_outputs=None):
        # Element-wise membership test against the supplied list.
        if list_of_outputs is None:
            list_of_outputs = []
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        return u"%s.isin(%s)" % (self.base_features[0].get_name(),
                                 str(self.kwargs['list_of_outputs']))

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    first_list = ["toothpaste", "coke_zero"]
    first_base = Feature(es['log']['product_id'])
    first_feat = IsIn(first_base, list_of_outputs=first_list)

    second_list = ["coke_zero"]
    second_base = Feature(es['log']['session_id'])
    second_feat = IsIn(second_base, list_of_outputs=second_list)

    # Each instance holds its own base feature and kwargs.
    assert first_base == first_feat.base_features[0]
    assert first_list == first_feat.kwargs['list_of_outputs']
    assert second_base == second_feat.base_features[0]
    assert second_list == second_feat.kwargs['list_of_outputs']
def test_make_transform_restricts_time_arg():
    """A primitive function taking `time` is only constructible when
    ``uses_calc_time=True`` is declared; otherwise a ValueError is raised."""
    allowed_kwargs = dict(name="AllowedPrimitive",
                          description="This primitive should be accepted",
                          uses_calc_time=True)
    # Accepted: the flag whitelists the `time` argument.
    make_trans_primitive(lambda time: time, [Datetime], Numeric,
                         **allowed_kwargs)

    # Rejected: no uses_calc_time flag.
    with pytest.raises(ValueError):
        make_trans_primitive(lambda time: time, [Datetime], Numeric,
                             name="BadPrimitive",
                             description="This primitive should erorr")
def test_warns_with_unused_custom_primitives(pd_es):
    """DFS must emit UnusedPrimitiveWarning when a supplied custom primitive
    yields no features for the target entity, and stay silent when it does.

    The warning text is compared verbatim against the library's message, so
    it must not be reworded here.
    """

    def above_ten(column):
        return column > 10

    AboveTen = make_trans_primitive(function=above_ten,
                                    input_types=[Numeric],
                                    return_type=Numeric)
    trans_primitives = [AboveTen]
    warning_text = ("Some specified primitives were not used during DFS:\n"
                    " trans_primitives: ['above_ten']\n"
                    "This may be caused by a using a value of max_depth that is too small, not setting interesting values, "
                    "or it may indicate no compatible variable types for the primitive were found in the data.")

    # No numeric variable reachable at depth 1 from 'sessions': must warn.
    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es, target_entity='sessions',
            trans_primitives=trans_primitives, max_depth=1)
    assert record[0].message.args[0] == warning_text

    # Applicable on 'customers': should not raise a warning.
    with pytest.warns(None) as record:
        dfs(entityset=pd_es, target_entity='customers',
            trans_primitives=trans_primitives, max_depth=1)

    def max_above_ten(column):
        return max(column) > 10

    MaxAboveTen = make_agg_primitive(function=max_above_ten,
                                     input_types=[Numeric],
                                     return_type=Numeric)
    agg_primitives = [MaxAboveTen]
    warning_text = ("Some specified primitives were not used during DFS:\n"
                    " agg_primitives: ['max_above_ten']\n"
                    "This may be caused by a using a value of max_depth that is too small, not setting interesting values, "
                    "or it may indicate no compatible variable types for the primitive were found in the data.")

    # Aggregation primitive unusable on 'stores': must warn.
    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es, target_entity='stores',
            agg_primitives=agg_primitives, max_depth=1)
    assert record[0].message.args[0] == warning_text

    # Applicable on 'sessions': should not raise a warning.
    with pytest.warns(None) as record:
        dfs(entityset=pd_es, target_entity='sessions',
            agg_primitives=agg_primitives, max_depth=1)
def test_isin_feat_custom(es):
    """A hand-built is_in primitive must produce the same values as the
    built-in ``isin`` feature method, on both string and numeric columns."""

    def pd_is_in(array, list_of_outputs=None):
        if list_of_outputs is None:
            list_of_outputs = []
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        return u"%s.isin(%s)" % (self.base_features[0].get_name(),
                                 str(self.kwargs['list_of_outputs']))

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    def evaluate(feature):
        # Compute a single feature over the first 8 log rows, as a list.
        backend = PandasBackend(es, [feature])
        frame = backend.calculate_all_features(range(8), None)
        return frame[feature.get_name()].values.tolist()

    expected_products = [True, True, True, False, False, True, True, True]

    # Custom primitive on a categorical column.
    custom_feat = IsIn(es['log']['product_id'],
                       list_of_outputs=["toothpaste", "coke zero"])
    assert expected_products == evaluate(custom_feat)

    # Built-in equivalent on the same column must agree.
    builtin_feat = Feature(es['log']['product_id']).isin(
        ["toothpaste", "coke zero"])
    assert expected_products == evaluate(builtin_feat)

    # Built-in isin on a numeric column.
    numeric_feat = Feature(es['log']['value']).isin([5, 10])
    assert [False, True, True, False, False, False, False, False] == \
        evaluate(numeric_feat)
''' assert string is not None, "string to count needs to be defined" counts = [element.lower().count(string) for element in column] return counts # %% def string_count_get_name(self): return u"STRING_COUNT(%s, %s)" % (self.base_features[0].get_name(), '"' + str(self.kwargs['string'] + '"')) # %% StringCount = make_trans_primitive(function=string_count, input_types=[Text], return_type=Numeric, cls_attributes={"get_name": string_count_get_name}) # %% from featuretools.tests.testing_utils import make_ecommerce_entityset es = make_ecommerce_entityset() count_the_feat = StringCount(es['log']['comments'], string="the") # 原始日志数据 # %% es['log'].df.head() # %% md # 统计日志表的评论字段出现the的求和值、平均值、标准差 # %%
# NOTE(review): flattened notebook fragment, left byte-identical. It begins
# mid-docstring (the opening quotes of `string_count(column, string=...)` lie
# outside this chunk), then defines a get_name helper, registers the
# StringCount transform primitive, and runs demo cells on the ecommerce
# entity set. The Chinese cell comments read: "raw log data" and
# "sum / mean / std of occurrences of 'the' in the log table's comments
# field". The `# %%` cell markers were collapsed onto one physical line, so
# any reformatting would have to guess the original cell boundaries —
# confirm against the original notebook before editing.
# NOTE(review): Django view flattened onto two physical lines (the second
# continues the parameter list of `time_since_last_by_hour`; the break is
# legal only because it falls inside the parentheses). Flow: read max_depth
# and primitive selections from request.POST; build the featuretools
# mock-customer EntitySet (transactions -> sessions -> customers); register
# a custom agg primitive (Chinese comment: "rewrite time_since_last —
# original returns seconds, now output hours") and a custom `log` transform
# primitive; append each when its checkbox value appears in the POSTed
# lists (Chinese comments: "save form parameters as agg_pri list", "if the
# option was ticked, add the custom Time_since_last_by_hour", "build the new
# feature matrix"); run ft.dfs and render feature names plus the first row.
# NOTE(review): the inline `#` comments produced by the flattening swallow
# whatever code follows them on the same physical line — left byte-identical
# for that reason; confirm against the original multi-line file.
def get_results(request): max_depth = request.POST['max_depth'] agg_pri = request.POST.getlist('agg_pri') agg_pri_customer = request.POST.getlist('agg_pri_customer') trans_pri_customer = request.POST.getlist('trans_pri_customer') trans_pri = request.POST.getlist('trans_pri') context = {'max_depth': max_depth, 'agg_pri': agg_pri, 'trans_pri': trans_pri} import featuretools as ft import pandas as pd import numpy as np from featuretools.primitives import make_trans_primitive, make_agg_primitive from featuretools.variable_types import DatetimeTimeIndex, Numeric pd.set_option('display.max_columns', 20) data = ft.demo.load_mock_customer() transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"]) products_df = data["products"] es = ft.EntitySet() s = es.entity_from_dataframe(entity_id="transactions", dataframe=transactions_df, index="transaction_id", time_index="transaction_time", variable_types={"product_id": ft.variable_types.Categorical, "zip_code": ft.variable_types.ZIPCode}) es = es.entity_from_dataframe(entity_id="products", dataframe=products_df, index="product_id") new_relationship = ft.Relationship(es["products"]["product_id"], es["transactions"]["product_id"]) es = es.add_relationship(new_relationship) es = es.normalize_entity(base_entity_id="transactions", new_entity_id="sessions", index="session_id", make_time_index="session_start", additional_variables=["device", "customer_id", "zip_code", "session_start", "join_date"]) es = es.normalize_entity(base_entity_id="sessions", new_entity_id="customers", index="customer_id", make_time_index="join_date", additional_variables=["zip_code", "join_date"]) # feature_matrix1, feature_defs1 = ft.dfs(entityset=es, target_entity="products") # # feature_matrix2, feature_defs2 = ft.dfs(entityset=es, target_entity="customers", agg_primitives=["count"], # trans_primitives=["month"], max_depth=1) """ 自定义agg_primitives: 改写time since last,原函数为秒,现在改为小时输出 """ def time_since_last_by_hour(values, 
time=None): time_since = time - values.iloc[-1] return time_since.total_seconds() / 3600 Time_since_last_by_hour = make_agg_primitive(function=time_since_last_by_hour, input_types=[DatetimeTimeIndex], return_type=Numeric, uses_calc_time=True) """ 自定义trans_primitives: 添加log e 的自然对数 """ import numpy as np def log(vals): return np.log(vals) # def generate_name(self, base_feature_names): # return "-(%s)" % (base_feature_names[0]) log = make_trans_primitive(function=log, input_types=[Numeric], return_type=Numeric, # uses_calc_time=True, description="Calculates the log of the value.", name="log") # 将前端页面的提交参数,保存为agg_pri列表 agg_pri = context['agg_pri'] trans_pri = context['trans_pri'] # 如果勾选了参数,加上自定义的Time_since_last_by_hour if 'Time_since_last_by_hour' in agg_pri_customer: agg_pri.append(Time_since_last_by_hour) if 'log_e' in trans_pri_customer: trans_pri.append(log) # 生成新的特征融合矩阵 feature_matrix3, feature_defs3 = ft.dfs(entityset=es, target_entity="customers", agg_primitives=agg_pri, trans_primitives=trans_pri, max_depth=int(context['max_depth'])) res = [] for i in feature_defs3: res.append(str(i)) sample_data = [i for i in feature_matrix3.iloc[0]] return render(request, 'get_results.html', {'res': res, 'sample_data': sample_data})
n_collinear, correlation_threshold)) total_removed = n_missing_cols + n_zero_variance_cols + n_collinear print('Total columns removed: ', total_removed) print('Shape after feature selection: {}.'.format(feature_matrix.shape)) return feature_matrix # before we get into things, let's do all the featuretools definitions def log_plus_one(column): return np.log(column + min(column) + 1) lpo = make_trans_primitive(function=log_plus_one, input_types=[Numeric], return_type=Numeric) def abs_log(column): return np.log(np.abs(column) + 1) al = make_trans_primitive(function=abs_log, input_types=[Numeric], return_type=Numeric) def squared(column): return np.square(column)
# NOTE(review): flattened fragment, left byte-identical. It begins inside a
# print call of a feature-selection function whose `def` (and the counters
# n_missing_cols / n_zero_variance_cols / n_collinear) lie outside this
# chunk, then defines custom transform primitives log_plus_one and abs_log;
# the trailing `def squared` has its make_trans_primitive registration in a
# later chunk.
# NOTE(review): `np.log(column + min(column) + 1)` looks suspicious — for an
# all-positive column this adds the minimum instead of removing it; possibly
# `column - min(column) + 1` was intended to guarantee a positive argument.
# Confirm against the original source before changing.
# NOTE(review): flattened demo-script chunk, left byte-identical. It prints
# a previously built `feature_enc` (defined outside this chunk), lists the
# stock featuretools primitives, then defines two custom primitives:
# Absolute (transform) and Maximum (aggregation). The flattened
# `#Multiple Input Types` comment swallows the final
# `def mean_numeric(num1, num2)` on the same physical line — confirm the
# original line breaks before editing.
print(feature_enc) print('-----------list primitives---------------------') print(ft.list_primitives().head()) print('----------custom primitives----------------------') from featuretools.primitives import make_agg_primitive, make_trans_primitive from featuretools.variable_types import Text, Numeric def absolute(column): return abs(column) Absolute = make_trans_primitive(function=absolute, input_types=[Numeric], return_type=Numeric) def maximum(columns): return max(columns) Maximum = make_agg_primitive(function=maximum, input_types=[Numeric], return_type=Numeric) #Multiple Input Types def mean_numeric(num1, num2): return (num1 + num2) / 2
# NOTE(review): large Django view flattened onto four physical lines; left
# byte-identical because the flattened inline `#` comments swallow the code
# that follows them on the same line. Flow (Chinese comments translated):
# read type/column dicts from cookies; pick the base entity as the table
# with the most Id-typed columns and sort tables by that count for merging;
# load every CSV under ./demo_data, merge them, build one EntitySet and
# normalize out the remaining entities using auto-detected Index columns;
# register custom primitives (time-since-last in hours, natural log,
# is_positive) and append those the user ticked; run ft.dfs at the POSTed
# max_depth; round floats to 2 decimals for five sample rows; write
# all_features.csv; render results (with columns2NLP Chinese headers) and
# set a target_id cookie; on any exception render the error page.
# NOTE(review): SECURITY — `eval(request.COOKIES[...])` executes
# attacker-controlled cookie text; replace with json.loads or
# ast.literal_eval plus an explicit type-name whitelist.
def get_results(request): try: import featuretools as ft import pandas as pd import numpy as np from featuretools.primitives import make_trans_primitive, make_agg_primitive # 数据源相关的参数 types_dict = eval(request.COOKIES['types_dict']) columns_dict = eval(request.COOKIES['columns_dict']) target = request.COOKIES['target'] # 如何决定 base entity? # 目前思路是由 id 类型最多的 entity 来做 base entity # 把对应的表和id个数封装成字典,然后根据个数给表名排逆序,然后按照这个顺序merge表,是为最终思路 base_entity = '' base_index = '' max_count = 0 sorted_dict = {} for k, v in types_dict.items(): count = 0 index = '' for i in v: if '.Id' in str(i): count += 1 if '.Index' in str(i): index = i sorted_dict[k] = count if count > max_count: base_entity = k base_index = index max_count = count sorted_list = sorted(sorted_dict.items(), key=lambda item: item[1], reverse=True) sorted_table_name = [i[0] for i in sorted_list] print("sorted_table_name\n", sorted_table_name) # 把columns 和对应的 类型拼接成字典,存在一个列表中,并且找到base_index types_dict_list = [] entity_name_list = [] for key, values1, values2 in zip(columns_dict.keys(), columns_dict.values(), types_dict.values()): types_dict_list.append( {k: eval(v) for k, v in zip(values1, values2)}) entity_name_list.append(key) if key == base_entity: for k, v in zip(values2, values1): if '.Index' in k: base_index = v # 自动识别标记为Index的特征,并作为抽取实体的index参数,传入模型 # 把所有的类型字典拼成一个大字典 index_list = [] total_type_dict = {} for each_dict in types_dict_list: total_type_dict.update(each_dict) for k, v in each_dict.items(): if '.Index' in str(v): index_list.append(k) print(index_list) # print(total_type_dict) # 原表全部join在一起之后再抽取实体 # 数据接口改成处理CSV结构 import os import re if not os.path.isdir(os.getcwd() + "/demo_data"): os.mkdir(os.getcwd() + "/demo_data") os.chdir(os.getcwd() + "/demo_data") regex = re.compile("csv") raw_dict = {} for file in os.listdir(os.getcwd()): if re.search(regex, file): raw_dict[file.split(".")[0]] = pd.read_csv(file) data = raw_dict os.chdir("..") # todo : merge的逻辑比较复杂,要如何执行join操作?? 
if len(data) == 0: raise Exception("数据源为空,请检查数据源文件") elif len(data) > 1: data_df = data.pop(sorted_table_name.pop(0)) # print(data_df) for i in sorted_table_name: data_df = data_df.merge(data[i]) # # for i in list(data.values()): # data_df = data_df.merge(i) elif len(data) == 1: data_df = list(data.values())[0] es = ft.EntitySet() # print("+++++++++++++++++++++++") # print("data_df\n", data_df) # print("entity_id\n", base_entity) # print("base_index\n", base_index) # print("total_type_dict\n", total_type_dict) # print("+++++++++++++++++++++++") # 构造base entity, 将第一个表名作为基础实体名称 es = es.entity_from_dataframe( entity_id=base_entity, dataframe=data_df, index=base_index, # time_index="transaction_time", variable_types=total_type_dict) # 基于base entity抽取实体,逻辑比较复杂,基本逻辑是作为base entity的字段,跳过实体抽取,其余的将index 字段单独存储,设为index参数 for k, v in columns_dict.items(): if k == base_entity: continue index = '' for i in index_list: if i in v: v.remove(i) index = i # print("=========") # print(k) # print(index) # print(v) # print("=========") es = es.normalize_entity( base_entity_id=base_entity, new_entity_id=k, index=index, # make_time_index="session_start", additional_variables=v) """ 自定义agg_primitives: 改写time since last,原函数为秒,现在改为小时输出 """ def time_since_last_by_hour(values, time=None): time_since = time - values.iloc[-1] return time_since.total_seconds() / 3600 Time_since_last_by_hour = make_agg_primitive( function=time_since_last_by_hour, input_types=[ft.variable_types.DatetimeTimeIndex], return_type=ft.variable_types.Numeric, uses_calc_time=True) """ 自定义trans_primitives: 添加log e 的自然对数 """ import numpy as np def log(vals): return np.log(vals) # def generate_name(self, base_feature_names): # return "-(%s)" % (base_feature_names[0]) log = make_trans_primitive( function=log, input_types=[ft.variable_types.Numeric], return_type=ft.variable_types.Numeric, # uses_calc_time=True, description="Calculates the log of the value.", name="log") """ 自定义trans_primitives: 判断是否为正数 """ import numpy as np 
def is_positive(vals): return vals > 0 # def generate_name(self, base_feature_names): # return "-(%s)" % (base_feature_names[0]) is_positive = make_trans_primitive( function=is_positive, input_types=[ft.variable_types.Numeric], return_type=ft.variable_types.Boolean, # uses_calc_time=True, description="Calculates if the value positive.", name="is_positive") # 模型相关的参数 max_depth = request.POST['max_depth'] agg_pri = request.POST.getlist('agg_pri') agg_pri_customer = request.POST.getlist('agg_pri_customer') trans_pri_customer = request.POST.getlist('trans_pri_customer') trans_pri = request.POST.getlist('trans_pri') context = { 'max_depth': max_depth, 'agg_pri': agg_pri, 'trans_pri': trans_pri } pd.set_option('display.max_columns', 20) # 将前端页面的提交参数,保存为agg_pri列表 agg_pri = context['agg_pri'] trans_pri = context['trans_pri'] print(trans_pri_customer) # 如果勾选了参数,加上自定义的Time_since_last_by_hour if 'Time_since_last_by_hour' in agg_pri_customer: agg_pri.append(Time_since_last_by_hour) if 'log_e' in trans_pri_customer: trans_pri.append(log) if 'is_positive' in trans_pri_customer: trans_pri.append(is_positive) print("+++++++++++++++++++++++++++++") print(trans_pri) print("+++++++++++++++++++++++++++++") # 生成新的特征融合矩阵 feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity=target, agg_primitives=agg_pri, trans_primitives=trans_pri, max_depth=int( context['max_depth'])) # 将索引作为第一列插入数据矩阵 feature_matrix = feature_matrix.reset_index() new_columns = feature_matrix.columns # 保存数据矩阵,注意在特征选择界面,没有 customer_id 作为选项,因为这只是索引 # nlp 数组是将primitives替换为中文后的表头,一并显示在第二行 import os if not os.path.isdir(os.getcwd() + "/demo_data/result"): os.mkdir(os.getcwd() + "/demo_data/result") feature_matrix.to_csv("./demo_data/result/all_features.csv", index=False) # print(feature_matrix.head(5)) from .columns2NLP import columns2NLP res = [] nlp = [] for i in new_columns: res.append(str(i)) nlp.append(columns2NLP(str(i))) # print(res[0]) # print("======================") # print(res) # print(nlp) # 
print("======================") # 将所有的浮点数精度调整到小数点后两位 sample_data1 = [ round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[0] ] sample_data2 = [ round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[1] ] sample_data3 = [ round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[2] ] sample_data4 = [ round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[3] ] sample_data5 = [ round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[4] ] response = render( request, 'get_results.html', { 'res': res, 'nlp': nlp, 'sample_data1': sample_data1, 'sample_data2': sample_data2, 'sample_data3': sample_data3, 'sample_data4': sample_data4, 'sample_data5': sample_data5 }) response.set_cookie('target_id', res[0]) return response except Exception as e: response = render(request, 'erro.html', {'erro': e}) return response
# NOTE(review): flattened script fragment, left byte-identical. It loads
# creditcard.csv, drops the 'Time' column (a flattened inline comment keeps
# the remnant of a longer drop list), then defines custom transform
# primitives abs_log and squared; it ends mid-signature of `bins_5`
# (KBinsDiscretizer call continues in a chunk outside this view), so the
# fragment cannot be reformatted safely from here.
# finally let's import the data df = pd.read_csv("creditcard.csv") df = df.drop( ['Time'], axis=1 ) #,'V28','V27','V26','V25','V24','V23','V22','V20','V15','V13','V8'], axis =1) df = df.dropna() # before we get into things, let's do all the featuretools definitions def abs_log(column): return np.log(np.abs(column) + 1) al = make_trans_primitive(function=abs_log, input_types=[Numeric], return_type=Numeric) def squared(column): return np.square(column) sq = make_trans_primitive(function=squared, input_types=[Numeric], return_type=Numeric) def bins_5(column): temp = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal',
# NOTE(review): flattened notebook cells, left byte-identical. As a single
# physical line the leading `# %%` turns the ENTIRE content into one
# comment; in the original cells it defines a make_name helper, a
# two-input CompareMove transform primitive (MOVE_QTY > WIP_QTY on the
# Machine entity), and commented-out calculate_feature_matrix experiments.
# Restore the original cell line breaks before attempting any edit here.
# %% # custom function so the name of the feature prints out correctly def make_name(self): return "%s_goal_last_%d" % (self.kwargs['Qty1'], self.kwargs['Qty2']) # %% # %% def compare_Qty(Qty1,Qty2): return Qty1>Qty2 CompareMove = make_trans_primitive(function=compare_Qty, input_types=[Numeric, Numeric], return_type=Boolean, description="compare_Qty" #cls_attributes={"generate_name": make_name, "uses_full_entity":True} ) input_vars = [es["Machine"]["MOVE_QTY"], es["Machine"]["WIP_QTY"]] # Compare_Move = CompareMove(*input_vars) # #Compare_Move = CompareMove(Qty1=es["Machine"]["MOVE_QTY"], Qty2=es["Machine"]["WIP_QTY"]) # features = [Compare_Move] # fm = ft.calculate_feature_matrix(entityset=es, features=features)
自定义trans_primitives: 添加log e 的自然对数 """ import numpy as np def log(vals): return np.log(vals) # def generate_name(self, base_feature_names): # return "-(%s)" % (base_feature_names[0]) log = make_trans_primitive( function=log, input_types=[Numeric], return_type=Numeric, # uses_calc_time=True, description="Calculates the log of the value.", name="log") # 生成新的特征融合矩阵 feature_matrix3, feature_defs3 = ft.dfs( entityset=es, target_entity="customers", agg_primitives=['count', 'mean', 'sum', 'min', 'max'], trans_primitives=['month'], max_depth=3) print(feature_matrix3) # 将索引作为第一列插入数据矩阵 feature_matrix3 = feature_matrix3.reset_index()
# NOTE(review): flattened fragment, left byte-identical; this note follows
# the code because the fragment OPENS inside a docstring (its opening quotes
# lie outside this chunk — Chinese text: "custom trans_primitives: add
# natural log (base e)"). It registers a custom `log` transform primitive
# (which shadows the local function of the same name), runs ft.dfs on the
# "customers" entity (Chinese comment: "build the new feature matrix"), and
# resets the index (Chinese comment: "insert the index as the first
# column"). Confirm original line breaks before editing.
自定义trans_primitives: 添加log e 的自然对数 """ import numpy as np def log(vals): return np.log(vals) # def generate_name(self, base_feature_names): # return "-(%s)" % (base_feature_names[0]) log = make_trans_primitive( function=log, input_types=[ft.variable_types.Numeric], return_type=ft.variable_types.Numeric, # uses_calc_time=True, description="Calculates the log of the value.", name="log") """ 自定义trans_primitives: 判断是否为正数 """ import numpy as np def is_positive(vals): return vals > 0 # def generate_name(self, base_feature_names):
# NOTE(review): flattened fragment, left byte-identical; this note follows
# the code because the fragment OPENS inside a docstring (opening quotes lie
# outside this chunk — Chinese text: "custom trans_primitives: add natural
# log (base e)", later "custom trans_primitives: test whether the value is
# positive"). It registers the custom `log` transform primitive and begins
# an `is_positive` primitive; the chunk ends mid-comment, with the
# registration of is_positive outside this view. Confirm original line
# breaks before editing.