def test_time_since_last_custom(es):
    def time_since_last(values, time=None):
        time_since = time - values.iloc[0]
        return time_since.total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       name="time_since_last",
                                       uses_calc_time=True)
    f = TimeSinceLast(es["log"]["datetime"], es["customers"])
    fm = ft.calculate_feature_matrix([f],
                                     entityset=es,
                                     instance_ids=[0, 1, 2],
                                     cutoff_time=datetime(2015, 6, 8))

    correct = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    assert all(fm[f.get_name()].round().values == correct)

    error_text = "'time' is a restricted keyword. Please use a different keyword."
    with pytest.raises(ValueError, match=error_text):
        TimeSinceLast = make_agg_primitive(time_since_last,
                                           [DatetimeTimeIndex],
                                           Numeric,
                                           uses_calc_time=False)
def test_custom_primitive_time_as_arg(es):
    def time_since_last(values, time):
        time_since = time - values.iloc[0]
        return time_since.total_seconds()

    TimeSinceLast = make_agg_primitive(time_since_last,
                                       [DatetimeTimeIndex],
                                       Numeric,
                                       uses_calc_time=True)
    assert TimeSinceLast.name == "time_since_last"

    f = TimeSinceLast(es["log"]["datetime"], es["customers"])
    fm = calculate_feature_matrix([f],
                                  entityset=es,
                                  instance_ids=[0, 1, 2],
                                  cutoff_time=datetime(2015, 6, 8))

    correct = [131376600, 131289600, 131287800]
    # note: must round to nearest second
    assert all(fm[f.get_name()].round().values == correct)

    with pytest.raises(ValueError):
        make_agg_primitive(time_since_last,
                           [DatetimeTimeIndex],
                           Numeric,
                           uses_calc_time=False)
def get_feature_matrix(df, n_jobs=1, verbose=True):
    es = ft.EntitySet('safety_data')
    es.entity_from_dataframe(entity_id='records',
                             index='id',
                             make_index=True,
                             dataframe=df,
                             variable_types={
                                 'Accuracy': vtypes.Numeric,
                                 'Bearing': vtypes.Numeric,
                                 'acceleration_x': vtypes.Numeric,
                                 'acceleration_y': vtypes.Numeric,
                                 'acceleration_z': vtypes.Numeric,
                                 'gyro_x': vtypes.Numeric,
                                 'gyro_y': vtypes.Numeric,
                                 'gyro_z': vtypes.Numeric,
                                 'second': vtypes.Numeric,
                                 'Speed': vtypes.Numeric,
                             })
    es.normalize_entity(base_entity_id='records',
                        new_entity_id='bookings',
                        index='bookingID')
    return ft.dfs(entityset=es,
                  target_entity='bookings',
                  agg_primitives=[
                      make_agg_primitive(function=mean_diff, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=max_diff, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=min_diff, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=std_diff, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=mean_diff_abs, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=max_diff_abs, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=min_diff_abs, input_types=[Numeric], return_type=Numeric),
                      make_agg_primitive(function=std_diff_abs, input_types=[Numeric], return_type=Numeric),
                      'count', 'mean', 'max', 'min', 'std',
                  ],
                  n_jobs=n_jobs,
                  verbose=verbose)
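The diff helpers (mean_diff, max_diff, ..., std_diff_abs) are referenced above but not shown; a minimal sketch of plausible implementations, assuming each one reduces the first differences of a numeric column (the `_abs` variants reduce absolute differences):

import numpy as np

# Hypothetical implementations of the diff helpers assumed by get_feature_matrix:
# each aggregates the first differences of a numeric column.
def mean_diff(column):
    return np.diff(column).mean()

def max_diff(column):
    return np.diff(column).max()

def min_diff(column):
    return np.diff(column).min()

def std_diff(column):
    return np.diff(column).std()

# The *_abs variants aggregate the magnitude of each change instead.
def mean_diff_abs(column):
    return np.abs(np.diff(column)).mean()

def max_diff_abs(column):
    return np.abs(np.diff(column)).max()

def min_diff_abs(column):
    return np.abs(np.diff(column)).min()

def std_diff_abs(column):
    return np.abs(np.diff(column)).std()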
def _make_agg_primitives(self):
    self.days_since_last = make_agg_primitive(
        function=self._days_since_last,
        name='days_since_last',
        input_types=[DatetimeTimeIndex],
        return_type=Numeric,
        description="Time since last related instance",
        uses_calc_time=True)
    self.month_of_cutoff_point = make_agg_primitive(
        function=self._month_of_cutoff_point,
        name='month_of_cutoff_point',
        input_types=[DatetimeTimeIndex],
        return_type=Numeric,
        description="month_of_cutoff_point",
        uses_calc_time=True)
    self.user_defined_agg_primitives = ['month_of_cutoff_point']
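The _days_since_last and _month_of_cutoff_point methods live elsewhere on the class; a plausible sketch of the wrapped functions, assuming the usual uses_calc_time convention where featuretools injects the cutoff time as the `time` keyword:

import pandas as pd

# Hypothetical bodies for the wrapped methods; `time` is the cutoff time that
# featuretools passes in because uses_calc_time=True.
def _days_since_last(values, time=None):
    # Days between the cutoff time and the most recent related instance.
    return (time - values.max()) / pd.Timedelta(days=1)

def _month_of_cutoff_point(values, time=None):
    # Calendar month of the cutoff point itself; the datetime values are unused.
    return time.month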
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < getsize(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_count_null_and_make_agg_primitive(es):
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        if count_null:
            values = values.fillna(0)
        return values.count()

    def count_generate_name(self):
        where_str = self._where_str()
        use_prev_str = self._use_prev_str()
        return u"COUNT(%s%s%s)" % (self.child_entity.id, where_str, use_prev_str)

    Count = make_agg_primitive(count_func,
                               [[Index], [Variable]],
                               Numeric,
                               name="count",
                               stack_on_self=False,
                               cls_attributes={"generate_name": count_generate_name})
    count_null = Count(es['log']['value'], es['sessions'], count_null=True)
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)
    values = [5, 4, 1, 2, 3, 2]
    assert (values == feature_matrix[count_null.get_name()]).all()
def test_pickle_features_with_custom_primitive(es):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")

    features_no_pickle = ft.dfs(target_entity='sessions',
                                entityset=es,
                                agg_primitives=["Last", "Mean", NewMax],
                                features_only=True)
    assert any([isinstance(feat.primitive, NewMax) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_custom_primitive_multiple_inputs(es):
    def mean_sunday(numeric, datetime):
        '''
        Finds the mean of non-null values of a feature that occurred on Sundays
        '''
        days = pd.DatetimeIndex(datetime).weekday.values
        df = pd.DataFrame({'numeric': numeric, 'time': days})
        return df[df['time'] == 6]['numeric'].mean()

    MeanSunday = make_agg_primitive(function=mean_sunday,
                                    input_types=[Numeric, Datetime],
                                    return_type=Numeric)

    fm, features = dfs(entityset=es,
                       target_entity="sessions",
                       agg_primitives=[MeanSunday],
                       trans_primitives=[])
    mean_sunday_value = pd.Series([None, None, None, 2.5, 7, None])
    iterator = zip(fm["MEAN_SUNDAY(log.value, datetime)"], mean_sunday_value)
    for x, y in iterator:
        assert (pd.isnull(x) and pd.isnull(y)) or (x == y)

    es.add_interesting_values()
    mean_sunday_value_priority_0 = pd.Series([None, None, None, 2.5, 0, None])
    fm, features = dfs(entityset=es,
                       target_entity="sessions",
                       agg_primitives=[MeanSunday],
                       trans_primitives=[],
                       where_primitives=[MeanSunday])
    where_feat = "MEAN_SUNDAY(log.value, datetime WHERE priority_level = 0)"
    for x, y in zip(fm[where_feat], mean_sunday_value_priority_0):
        assert (pd.isnull(x) and pd.isnull(y)) or (x == y)
def custom_aggregation(func, *args):
    """Takes a custom aggregation function and returns it in a format usable by featuretools."""
    return make_agg_primitive(
        lambda x: func(x, *args),
        [Numeric],
        Numeric,
        func.__name__ + "_".join(str(i) for i in args),
    )
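A usage sketch for the factory above (the wrapped scipy function is illustrative, not from the source):

from scipy import stats

# Hypothetical usage: bind trim_mean's proportiontocut argument to 0.1.
TrimMean = custom_aggregation(stats.trim_mean, 0.1)
# The fourth positional argument to make_agg_primitive is the primitive name,
# built here from func.__name__ plus the stringified bound args: "trim_mean0.1".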
def test_warns_with_unused_custom_primitives(pd_es):
    def above_ten(column):
        return column > 10

    AboveTen = make_trans_primitive(function=above_ten,
                                    input_types=[Numeric],
                                    return_type=Numeric)
    trans_primitives = [AboveTen]
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  trans_primitives: ['above_ten']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            trans_primitives=trans_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='customers',
            trans_primitives=trans_primitives,
            max_depth=1)

    def max_above_ten(column):
        return max(column) > 10

    MaxAboveTen = make_agg_primitive(function=max_above_ten,
                                     input_types=[Numeric],
                                     return_type=Numeric)
    agg_primitives = [MaxAboveTen]
    warning_text = "Some specified primitives were not used during DFS:\n" + \
        "  agg_primitives: ['max_above_ten']\n" + \
        "This may be caused by a using a value of max_depth that is too small, not setting interesting values, " + \
        "or it may indicate no compatible variable types for the primitive were found in the data."

    with pytest.warns(UnusedPrimitiveWarning) as record:
        dfs(entityset=pd_es,
            target_entity='stores',
            agg_primitives=agg_primitives,
            max_depth=1)

    assert record[0].message.args[0] == warning_text

    # Should not raise a warning
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='sessions',
            agg_primitives=agg_primitives,
            max_depth=1)
def test_agg_same_method_name(es):
    """
    Pandas relies on the function name when calculating aggregations. This means
    that if two primitives with the same function name are applied to the same
    column, pandas can't differentiate them. We have a workaround based on the
    name property that we test here.
    """
    # test with normally defined functions
    def custom_primitive(x):
        return x.sum()

    Sum = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")

    def custom_primitive(x):
        return x.max()

    Max = make_agg_primitive(custom_primitive,
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")

    f_sum = Sum(es["log"]["value"], es["customers"])
    f_max = Max(es["log"]["value"], es["customers"])
    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]

    # test with lambdas
    Sum = make_agg_primitive(lambda x: x.sum(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="sum")
    Max = make_agg_primitive(lambda x: x.max(),
                             input_types=[Numeric],
                             return_type=Numeric,
                             name="max")
    f_sum = Sum(es["log"]["value"], es["customers"])
    f_max = Max(es["log"]["value"], es["customers"])
    fm = ft.calculate_feature_matrix([f_sum, f_max], entityset=es)
    assert fm.columns.tolist() == [f_sum.get_name(), f_max.get_name()]
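A pandas-only sketch of the collision the docstring describes, independent of featuretools (assumes a recent pandas, which rejects duplicate function names outright):

import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "value": [1.0, 2.0, 3.0]})

def custom_primitive(x):
    return x.sum()
f_sum = custom_primitive

def custom_primitive(x):  # same __name__ as f_sum
    return x.max()
f_max = custom_primitive

# pandas keys aggregation output by each callable's __name__, so two distinct
# functions sharing a name are indistinguishable; recent pandas versions raise
# SpecificationError rather than produce ambiguous columns.
try:
    df.groupby("key")["value"].agg([f_sum, f_max])
except Exception as err:
    print(type(err).__name__)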
def test_pickle_features_with_custom_primitive(es):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")

    features_original = ft.dfs(target_entity='sessions',
                               entityset=es,
                               agg_primitives=["Last", "Mean", NewMax],
                               features_only=True)
    assert any([isinstance(feat.primitive, NewMax) for feat in features_original])
    pickle_features_test_helper(asizeof(es), features_original)
def test_pickle_features_with_custom_primitive(pd_es, tmpdir):
    NewMax = make_agg_primitive(
        lambda x: max(x),
        name="NewMax",
        input_types=[ColumnSchema(semantic_tags={'numeric'})],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        description="Calculate means ignoring nan values")

    features_original = ft.dfs(target_dataframe_name='sessions',
                               entityset=pd_es,
                               agg_primitives=["Last", "Mean", NewMax],
                               features_only=True)
    assert any([isinstance(feat.primitive, NewMax) for feat in features_original])
    pickle_features_test_helper(asizeof(pd_es), features_original, str(tmpdir))
def test_custom_primitive_default_kwargs(es):
    def sum_n_times(numeric, n=1):
        return np.nan_to_num(numeric).sum(dtype=np.float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)
    sum_n_1_n = 1
    sum_n_1_base_f = Feature(es['log']['value'])
    sum_n_1 = SumNTimes([sum_n_1_base_f], es['sessions'], n=sum_n_1_n)
    sum_n_2_n = 2
    sum_n_2_base_f = Feature(es['log']['value_2'])
    sum_n_2 = SumNTimes([sum_n_2_base_f], es['sessions'], n=sum_n_2_n)
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.kwargs['n']
def test_count_null_and_make_agg_primitive(es):
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        if count_null:
            values = values.fillna(0)
        return values.count()

    def count_generate_name(self):
        where_str = self._where_str()
        use_prev_str = self._use_prev_str()
        return u"COUNT(%s%s%s)" % (self.child_entity.name, where_str, use_prev_str)

    Count = make_agg_primitive(count_func,
                               [[Index], [Variable]],
                               Numeric,
                               name="count",
                               stack_on_self=False,
                               cls_attributes={"generate_name": count_generate_name})
    count_null = Count(es['log']['value'], es['sessions'], count_null=True)
    feature_matrix = calculate_feature_matrix([count_null], entityset=es)
    values = [5, 4, 1, 2, 3, 2]
    assert (values == feature_matrix[count_null.get_name()]).all()
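With generate_name overridden through cls_attributes, the feature's column label names the child entity rather than the input variable; continuing from the test above as a sketch:

# The cls_attributes override routes naming through count_generate_name, so the
# label reflects the child entity instead of the aggregated column.
print(count_null.get_name())  # e.g. "COUNT(log)"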
# feature_matrix2, feature_defs2 = ft.dfs(entityset=es, target_entity="customers", agg_primitives=["count"],
#                                         trans_primitives=["month"], max_depth=1)

"""
Custom agg_primitives: a rewrite of time_since_last that reports hours
instead of the original seconds.
"""
def time_since_last_by_hour(values, time=None):
    time_since = time - values.iloc[-1]
    return time_since.total_seconds() / 3600

Time_since_last_by_hour = make_agg_primitive(function=time_since_last_by_hour,
                                             input_types=[DatetimeTimeIndex],
                                             return_type=Numeric,
                                             uses_calc_time=True)

"""
Custom trans_primitives: add the natural logarithm (base e).
"""
import numpy as np

def log(vals):
    return np.log(vals)

# def generate_name(self, base_feature_names):
#     return "-(%s)" % (base_feature_names[0])

log = make_trans_primitive(
        end_flag = length // n * end
        # print(start_flag, end_flag)
        piece = new_s.iloc[start_flag:end_flag]
        # print(sum(piece))
        # print()
        if sum(piece) > 0:
            count += 1
        start += 1
        end += 1
    return count

rise_count = make_agg_primitive(
    function=rise_count,
    input_types=[Numeric],
    return_type=Numeric,
    # uses_calc_time=True,
    description="Calculates the rise_count max of the value.",
    name="rise_count")

# %%
"""
Generate the new fused feature matrix; different target_entity values
yield different feature matrices.
"""
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    # agg_primitives=["median", "count", "num_unique", "max", "avg_time_between", "n_most_common", max2nd, max3rd],
    agg_primitives=[rise_count],
    trans_primitives=["month"],
from woodwork.column_schema import ColumnSchema

from featuretools.primitives import make_agg_primitive

CustomMax = make_agg_primitive(
    lambda x: max(x),
    name="CustomMax",
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}))

CustomSum = make_agg_primitive(
    lambda x: sum(x),
    name="CustomSum",
    input_types=[ColumnSchema(semantic_tags={'numeric'})],
    return_type=ColumnSchema(semantic_tags={'numeric'}))
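A minimal usage sketch for these Woodwork-style primitives, assuming a featuretools 1.x install and its bundled mock-customer demo data:

import featuretools as ft

# Demo EntitySet shipped with featuretools; any EntitySet with numeric columns works.
es = ft.demo.load_mock_customer(return_entityset=True)

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",  # featuretools 1.x keyword
    agg_primitives=[CustomMax, CustomSum],
    trans_primitives=[],
    max_depth=1,
)
print(feature_matrix.columns.tolist())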
def get_results(request):
    try:
        import featuretools as ft
        import pandas as pd
        import numpy as np
        from featuretools.primitives import make_trans_primitive, make_agg_primitive

        # Parameters describing the data source
        types_dict = eval(request.COOKIES['types_dict'])
        columns_dict = eval(request.COOKIES['columns_dict'])
        target = request.COOKIES['target']

        # How should the base entity be chosen? Current approach: the entity with
        # the most Id-typed columns becomes the base entity. Final plan: map each
        # table to its Id count, sort the table names by that count in descending
        # order, then merge the tables in that order.
        base_entity = ''
        base_index = ''
        max_count = 0
        sorted_dict = {}
        for k, v in types_dict.items():
            count = 0
            index = ''
            for i in v:
                if '.Id' in str(i):
                    count += 1
                if '.Index' in str(i):
                    index = i
            sorted_dict[k] = count
            if count > max_count:
                base_entity = k
                base_index = index
                max_count = count
        sorted_list = sorted(sorted_dict.items(), key=lambda item: item[1], reverse=True)
        sorted_table_name = [i[0] for i in sorted_list]
        print("sorted_table_name\n", sorted_table_name)

        # Pair each table's columns with their types in a per-table dict stored
        # in a list, and locate base_index along the way.
        types_dict_list = []
        entity_name_list = []
        for key, values1, values2 in zip(columns_dict.keys(), columns_dict.values(), types_dict.values()):
            types_dict_list.append({k: eval(v) for k, v in zip(values1, values2)})
            entity_name_list.append(key)
            if key == base_entity:
                for k, v in zip(values2, values1):
                    if '.Index' in k:
                        base_index = v

        # Automatically detect features marked as Index and pass them to the model
        # as the index parameter when extracting entities.
        # Merge all the per-table type dicts into one big dict.
        index_list = []
        total_type_dict = {}
        for each_dict in types_dict_list:
            total_type_dict.update(each_dict)
            for k, v in each_dict.items():
                if '.Index' in str(v):
                    index_list.append(k)
        print(index_list)
        # print(total_type_dict)

        # Join all the raw tables together before extracting entities.
        # The data interface now reads CSV files.
        import os
        import re
        if not os.path.isdir(os.getcwd() + "/demo_data"):
            os.mkdir(os.getcwd() + "/demo_data")
        os.chdir(os.getcwd() + "/demo_data")
        regex = re.compile("csv")
        raw_dict = {}
        for file in os.listdir(os.getcwd()):
            if re.search(regex, file):
                raw_dict[file.split(".")[0]] = pd.read_csv(file)
        data = raw_dict
        os.chdir("..")

        # TODO: the merge logic is tricky; how should the joins be performed?
        if len(data) == 0:
            raise Exception("Data source is empty; please check the data source files")
        elif len(data) > 1:
            data_df = data.pop(sorted_table_name.pop(0))
            # print(data_df)
            for i in sorted_table_name:
                data_df = data_df.merge(data[i])
            # for i in list(data.values()):
            #     data_df = data_df.merge(i)
        elif len(data) == 1:
            data_df = list(data.values())[0]

        es = ft.EntitySet()
        # print("+++++++++++++++++++++++")
        # print("data_df\n", data_df)
        # print("entity_id\n", base_entity)
        # print("base_index\n", base_index)
        # print("total_type_dict\n", total_type_dict)
        # print("+++++++++++++++++++++++")

        # Build the base entity, using the first table's name as the base entity id
        es = es.entity_from_dataframe(
            entity_id=base_entity,
            dataframe=data_df,
            index=base_index,
            # time_index="transaction_time",
            variable_types=total_type_dict)

        # Extract entities from the base entity. The logic is a bit involved:
        # columns belonging to the base entity skip extraction; for every other
        # table, the index field is pulled out separately and passed as the
        # index parameter.
        for k, v in columns_dict.items():
            if k == base_entity:
                continue
            index = ''
            for i in index_list:
                if i in v:
                    v.remove(i)
                    index = i
            # print("=========")
            # print(k)
            # print(index)
            # print(v)
            # print("=========")
            es = es.normalize_entity(
                base_entity_id=base_entity,
                new_entity_id=k,
                index=index,
                # make_time_index="session_start",
                additional_variables=v)

        """
        Custom agg_primitives: a rewrite of time_since_last that reports hours
        instead of the original seconds.
        """
        def time_since_last_by_hour(values, time=None):
            time_since = time - values.iloc[-1]
            return time_since.total_seconds() / 3600

        Time_since_last_by_hour = make_agg_primitive(
            function=time_since_last_by_hour,
            input_types=[ft.variable_types.DatetimeTimeIndex],
            return_type=ft.variable_types.Numeric,
            uses_calc_time=True)

        """
        Custom trans_primitives: add the natural logarithm (base e).
        """
        def log(vals):
            return np.log(vals)

        # def generate_name(self, base_feature_names):
        #     return "-(%s)" % (base_feature_names[0])

        log = make_trans_primitive(
            function=log,
            input_types=[ft.variable_types.Numeric],
            return_type=ft.variable_types.Numeric,
            # uses_calc_time=True,
            description="Calculates the log of the value.",
            name="log")

        """
        Custom trans_primitives: check whether the value is positive.
        """
        def is_positive(vals):
            return vals > 0

        # def generate_name(self, base_feature_names):
        #     return "-(%s)" % (base_feature_names[0])

        is_positive = make_trans_primitive(
            function=is_positive,
            input_types=[ft.variable_types.Numeric],
            return_type=ft.variable_types.Boolean,
            # uses_calc_time=True,
            description="Calculates if the value is positive.",
            name="is_positive")

        # Model-related parameters
        max_depth = request.POST['max_depth']
        agg_pri = request.POST.getlist('agg_pri')
        agg_pri_customer = request.POST.getlist('agg_pri_customer')
        trans_pri_customer = request.POST.getlist('trans_pri_customer')
        trans_pri = request.POST.getlist('trans_pri')
        context = {
            'max_depth': max_depth,
            'agg_pri': agg_pri,
            'trans_pri': trans_pri
        }
        pd.set_option('display.max_columns', 20)

        # Save the parameters submitted from the front-end page as the agg_pri list
        agg_pri = context['agg_pri']
        trans_pri = context['trans_pri']
        print(trans_pri_customer)

        # If the corresponding options were checked, append the custom primitives
        if 'Time_since_last_by_hour' in agg_pri_customer:
            agg_pri.append(Time_since_last_by_hour)
        if 'log_e' in trans_pri_customer:
            trans_pri.append(log)
        if 'is_positive' in trans_pri_customer:
            trans_pri.append(is_positive)
        print("+++++++++++++++++++++++++++++")
        print(trans_pri)
        print("+++++++++++++++++++++++++++++")

        # Generate the new feature matrix
        feature_matrix, feature_defs = ft.dfs(entityset=es,
                                              target_entity=target,
                                              agg_primitives=agg_pri,
                                              trans_primitives=trans_pri,
                                              max_depth=int(context['max_depth']))

        # Insert the index back as the first column of the matrix
        feature_matrix = feature_matrix.reset_index()
        new_columns = feature_matrix.columns

        # Save the matrix. Note that the feature-selection page does not offer
        # customer_id as an option, because it is only the index.
        # The nlp list is the header row with the primitive names rendered in
        # Chinese, displayed together as a second row.
        import os
        if not os.path.isdir(os.getcwd() + "/demo_data/result"):
            os.mkdir(os.getcwd() + "/demo_data/result")
        feature_matrix.to_csv("./demo_data/result/all_features.csv", index=False)
        # print(feature_matrix.head(5))

        from .columns2NLP import columns2NLP
        res = []
        nlp = []
        for i in new_columns:
            res.append(str(i))
            nlp.append(columns2NLP(str(i)))
        # print(res[0])
        # print("======================")
        # print(res)
        # print(nlp)
        # print("======================")

        # Round every float to two decimal places
        sample_data1 = [round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[0]]
        sample_data2 = [round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[1]]
        sample_data3 = [round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[2]]
        sample_data4 = [round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[3]]
        sample_data5 = [round(i, 2) if isinstance(i, float) else i for i in feature_matrix.iloc[4]]

        response = render(
            request, 'get_results.html', {
                'res': res,
                'nlp': nlp,
                'sample_data1': sample_data1,
                'sample_data2': sample_data2,
                'sample_data3': sample_data3,
                'sample_data4': sample_data4,
                'sample_data5': sample_data5
            })
        response.set_cookie('target_id', res[0])
        return response
    except Exception as e:
        response = render(request, 'erro.html', {'erro': e})
        return response
def get_results(request):
    max_depth = request.POST['max_depth']
    agg_pri = request.POST.getlist('agg_pri')
    agg_pri_customer = request.POST.getlist('agg_pri_customer')
    trans_pri_customer = request.POST.getlist('trans_pri_customer')
    trans_pri = request.POST.getlist('trans_pri')
    context = {'max_depth': max_depth, 'agg_pri': agg_pri, 'trans_pri': trans_pri}

    import featuretools as ft
    import pandas as pd
    import numpy as np
    from featuretools.primitives import make_trans_primitive, make_agg_primitive
    from featuretools.variable_types import DatetimeTimeIndex, Numeric

    pd.set_option('display.max_columns', 20)
    data = ft.demo.load_mock_customer()
    transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"])
    products_df = data["products"]

    es = ft.EntitySet()
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="transaction_id",
                                  time_index="transaction_time",
                                  variable_types={"product_id": ft.variable_types.Categorical,
                                                  "zip_code": ft.variable_types.ZIPCode})
    es = es.entity_from_dataframe(entity_id="products",
                                  dataframe=products_df,
                                  index="product_id")
    new_relationship = ft.Relationship(es["products"]["product_id"],
                                       es["transactions"]["product_id"])
    es = es.add_relationship(new_relationship)
    es = es.normalize_entity(base_entity_id="transactions",
                             new_entity_id="sessions",
                             index="session_id",
                             make_time_index="session_start",
                             additional_variables=["device", "customer_id", "zip_code",
                                                   "session_start", "join_date"])
    es = es.normalize_entity(base_entity_id="sessions",
                             new_entity_id="customers",
                             index="customer_id",
                             make_time_index="join_date",
                             additional_variables=["zip_code", "join_date"])
    # feature_matrix1, feature_defs1 = ft.dfs(entityset=es, target_entity="products")
    # feature_matrix2, feature_defs2 = ft.dfs(entityset=es, target_entity="customers", agg_primitives=["count"],
    #                                         trans_primitives=["month"], max_depth=1)

    """
    Custom agg_primitives: a rewrite of time_since_last that reports hours
    instead of the original seconds.
    """
    def time_since_last_by_hour(values, time=None):
        time_since = time - values.iloc[-1]
        return time_since.total_seconds() / 3600

    Time_since_last_by_hour = make_agg_primitive(function=time_since_last_by_hour,
                                                 input_types=[DatetimeTimeIndex],
                                                 return_type=Numeric,
                                                 uses_calc_time=True)

    """
    Custom trans_primitives: add the natural logarithm (base e).
    """
    def log(vals):
        return np.log(vals)

    # def generate_name(self, base_feature_names):
    #     return "-(%s)" % (base_feature_names[0])

    log = make_trans_primitive(function=log,
                               input_types=[Numeric],
                               return_type=Numeric,
                               # uses_calc_time=True,
                               description="Calculates the log of the value.",
                               name="log")

    # Save the parameters submitted from the front-end page as the agg_pri list
    agg_pri = context['agg_pri']
    trans_pri = context['trans_pri']

    # If the corresponding options were checked, append the custom Time_since_last_by_hour
    if 'Time_since_last_by_hour' in agg_pri_customer:
        agg_pri.append(Time_since_last_by_hour)
    if 'log_e' in trans_pri_customer:
        trans_pri.append(log)

    # Generate the new feature matrix
    feature_matrix3, feature_defs3 = ft.dfs(entityset=es,
                                            target_entity="customers",
                                            agg_primitives=agg_pri,
                                            trans_primitives=trans_pri,
                                            max_depth=int(context['max_depth']))
    res = []
    for i in feature_defs3:
        res.append(str(i))
    sample_data = [i for i in feature_matrix3.iloc[0]]
    return render(request, 'get_results.html', {'res': res, 'sample_data': sample_data})
def absolute(column):
    return abs(column)

Absolute = make_trans_primitive(function=absolute,
                                input_types=[Numeric],
                                return_type=Numeric)

def maximum(columns):
    return max(columns)

Maximum = make_agg_primitive(function=maximum,
                             input_types=[Numeric],
                             return_type=Numeric)

# Multiple input types
def mean_numeric(num1, num2):
    return (num1 + num2) / 2

Meanval = make_trans_primitive(function=mean_numeric,
                               input_types=[Numeric, Numeric],
                               return_type=Numeric)

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='transactions',
    trans_primitives=[
            sec.add(i)
            full.add(i)
    return repeat / len(set(column))

def repeat_percent(column):
    a = set(column)
    return len(a) / len(column)

def count_set_length(column):
    a = set(column)
    return len(a)

cunt_rpt = make_agg_primitive(function=count_repeat, input_types=[Categorical], return_type=Numeric)
CountDay = make_agg_primitive(function=count_set_length, input_types=[ft.variable_types.Datetime], return_type=Numeric)
RepeatPercent = make_agg_primitive(function=repeat_percent, input_types=[Categorical], return_type=Numeric)

log_df = get_train_log(None)
log_df = log_df.loc[log_df['action_type'] == 2]
log_df["user_seller"] = np.add(np.array(log_df["user_id"].map(lambda x: str(x) + "_")),
                               np.array(log_df["seller_id"].map(lambda x: str(x))))
# time_stamp is in mmdd form: month is time_stamp // 100 and day is time_stamp % 100
log_df['data'] = log_df["time_stamp"].map(lambda x: '2016-' + str(int(x / 100)) + '-' + str(int(x % 100)))
log_df["month"] = log_df["time_stamp"].map(lambda x: int(x / 100))
user_df = get_user_info()
log_df = log_df.merge(user_df, on="user_id", how="inner")
log_df.drop(labels=['user_id', 'seller_id', 'action_type', 'age_range', 'gender'],
            axis=1, inplace=True)
log_df["index"] = log_df.index

es = ft.EntitySet(id="logs")
es = es.entity_from_dataframe(entity_id="logs",
import numpy as np
import pandas as pd
from scipy import stats
from scipy.signal import hann, hilbert
from sklearn.linear_model import LinearRegression


def my_primitives():
    def gmean(x): return stats.gmean(np.absolute(list(filter(lambda a: a != 0, x))))
    def hmean(x): return stats.hmean(np.absolute(list(filter(lambda a: a != 0, x))))
    def kstatvar1(x): return stats.kstatvar(x, 1)
    def kstat2(x): return stats.kstat(x, 2)
    def kstatvar2(x): return stats.kstatvar(x, 2)
    def kstat3(x): return stats.kstat(x, 3)
    def kstat4(x): return stats.kstat(x, 4)
    def avg_change(x): return np.mean(np.diff(x))
    def avg_change_rate(x): return np.mean(np.nonzero((np.diff(x) / x[:-1]))[0])
    def range_func(x): return max(x) - min(x)
    def std_first_50000(x): return x[:50000].std()
    def std_last_50000(x): return x[-50000:].std()
    def std_first_10000(x): return x[:10000].std()
    def std_last_10000(x): return x[-10000:].std()
    def avg_first_50000(x): return x[:50000].mean()
    def avg_last_50000(x): return x[-50000:].mean()
    def avg_first_10000(x): return x[:10000].mean()
    def avg_last_10000(x): return x[-10000:].mean()
    def min_first_50000(x): return x[:50000].min()
    def min_last_50000(x): return x[-50000:].min()
    def min_first_10000(x): return x[:10000].min()
    def min_last_10000(x): return x[-10000:].min()
    def max_first_50000(x): return x[:50000].max()
    def max_last_50000(x): return x[-50000:].max()
    def max_first_10000(x): return x[:10000].max()
    def max_last_10000(x): return x[-10000:].max()
    def max_to_min(x): return x.max() / np.abs(x.min())
    def count_big(x): return len(x[np.abs(x) > 500])
    def sum_func(x): return x.sum()
    def avg_change_rate_first_50000(x): return np.mean(np.nonzero((np.diff(x[:50000]) / x[:50000][:-1]))[0])
    def avg_change_rate_last_50000(x): return np.mean(np.nonzero((np.diff(x[-50000:]) / x[-50000:][:-1]))[0])
    def avg_change_rate_first_10000(x): return np.mean(np.nonzero((np.diff(x[:10000]) / x[:10000][:-1]))[0])
    def avg_change_rate_last_10000(x): return np.mean(np.nonzero((np.diff(x[-10000:]) / x[-10000:][:-1]))[0])
    def q95(x): return np.quantile(x, 0.95)
    def q99(x): return np.quantile(x, 0.99)
    def q05(x): return np.quantile(x, 0.05)
    def q01(x): return np.quantile(x, 0.01)
    def abs_q95(x): return np.quantile(np.abs(x), 0.95)
    def abs_q99(x): return np.quantile(np.abs(x), 0.99)

    def add_trend_feature(arr, abs_values=False):
        idx = np.array(range(len(arr)))
        lr = LinearRegression()
        lr.fit(idx.reshape(-1, 1), arr)
        return lr.coef_[0]

    def add_trend_feature_abs(arr):
        idx = np.array(range(len(arr)))
        lr = LinearRegression()
        lr.fit(idx.reshape(-1, 1), np.abs(arr))
        return lr.coef_[0]

    def abs_mean(x): return np.abs(x).mean()
    def abs_std(x): return np.abs(x).std()
    def mad(x): return x.mad()
    def kurt(x): return x.kurtosis()
    def skew(x): return x.skew()
    def med(x): return x.median()
    def Hilbert_mean(x): return np.abs(hilbert(x)).mean()
    def Hann_window_mean(x): return (np.convolve(x, hann(150), mode='same') / sum(hann(150))).mean()

    def classic_sta_lta(x, length_sta, length_lta):
        sta = np.cumsum(x ** 2)
        # Convert to float
        sta = np.require(sta, dtype=np.float)
        # Copy for LTA
        lta = sta.copy()
        # Compute the STA and the LTA
        sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
        sta /= length_sta
        lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
        lta /= length_lta
        # Pad zeros
        sta[:length_lta - 1] = 0
        # Avoid division by zero by setting zero values to tiny float
        dtiny = np.finfo(0.0).tiny
        idx = lta < dtiny
        lta[idx] = dtiny
        return sta / lta

    def classic_sta_lta1_mean(x): return classic_sta_lta(x, 500, 10000).mean()
    def classic_sta_lta2_mean(x): return classic_sta_lta(x, 5000, 100000).mean()
    def classic_sta_lta3_mean(x): return classic_sta_lta(x, 3333, 6666).mean()
    def classic_sta_lta4_mean(x): return classic_sta_lta(x, 10000, 25000).mean()
    def Moving_average_700_mean(x): return x.rolling(window=700).mean().mean(skipna=True)
    def Moving_average_1500_mean(x): return x.rolling(window=1500).mean().mean(skipna=True)
    def Moving_average_3000_mean(x): return x.rolling(window=3000).mean().mean(skipna=True)
    def Moving_average_6000_mean(x): return x.rolling(window=6000).mean().mean(skipna=True)
    def exp_Moving_average_300_mean(x): return (pd.Series.ewm(x, span=300).mean()).mean(skipna=True)
    def exp_Moving_average_3000_mean(x): return (pd.Series.ewm(x, span=3000).mean()).mean(skipna=True)
    def exp_Moving_average_30000_mean(x): return (pd.Series.ewm(x, span=30000).mean()).mean(skipna=True)
    def iqr(x): return np.subtract(*np.percentile(x, [75, 25]))
    def q999(x): return np.quantile(x, 0.999)
    def q001(x): return np.quantile(x, 0.001)
    def ave10(x): return stats.trim_mean(x, 0.1)

    def ave_roll_std_10(x): return x.rolling(10).std().dropna().values.mean()
    def std_roll_std_10(x): return x.rolling(10).std().dropna().values.std()
    def max_roll_std_10(x): return x.rolling(10).std().dropna().values.max()
    def min_roll_std_10(x): return x.rolling(10).std().dropna().values.min()
    def q01_roll_std_10(x): return np.quantile(x.rolling(10).std().dropna().values, 0.01)
    def q05_roll_std_10(x): return np.quantile(x.rolling(10).std().dropna().values, 0.05)
    def q95_roll_std_10(x): return np.quantile(x.rolling(10).std().dropna().values, 0.95)
    def q99_roll_std_10(x): return np.quantile(x.rolling(10).std().dropna().values, 0.99)
    def av_change_abs_roll_std_10(x): return np.mean(np.diff(x.rolling(10).std().dropna().values))
    def av_change_rate_roll_std_10(x): v = x.rolling(10).std().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_std_10(x): return np.abs(x.rolling(10).std().dropna().values).max()
    def std_roll_mean_10(x): return x.rolling(10).mean().dropna().values.std()
    def max_roll_mean_10(x): return x.rolling(10).mean().dropna().values.max()
    def min_roll_mean_10(x): return x.rolling(10).mean().dropna().values.min()
    def q01_roll_mean_10(x): return np.quantile(x.rolling(10).mean().dropna().values, 0.01)
    def q05_roll_mean_10(x): return np.quantile(x.rolling(10).mean().dropna().values, 0.05)
    def q95_roll_mean_10(x): return np.quantile(x.rolling(10).mean().dropna().values, 0.95)
    def q99_roll_mean_10(x): return np.quantile(x.rolling(10).mean().dropna().values, 0.99)
    def av_change_abs_roll_mean_10(x): return np.mean(np.diff(x.rolling(10).mean().dropna().values))
    def av_change_rate_roll_mean_10(x): v = x.rolling(10).mean().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_mean_10(x): return np.abs(x.rolling(10).mean().dropna().values).max()
    def ave_roll_std_100(x): return x.rolling(100).std().dropna().values.mean()
    def std_roll_std_100(x): return x.rolling(100).std().dropna().values.std()
    def max_roll_std_100(x): return x.rolling(100).std().dropna().values.max()
    def min_roll_std_100(x): return x.rolling(100).std().dropna().values.min()
    def q01_roll_std_100(x): return np.quantile(x.rolling(100).std().dropna().values, 0.01)
    def q05_roll_std_100(x): return np.quantile(x.rolling(100).std().dropna().values, 0.05)
    def q95_roll_std_100(x): return np.quantile(x.rolling(100).std().dropna().values, 0.95)
    def q99_roll_std_100(x): return np.quantile(x.rolling(100).std().dropna().values, 0.99)
    def av_change_abs_roll_std_100(x): return np.mean(np.diff(x.rolling(100).std().dropna().values))
    def av_change_rate_roll_std_100(x): v = x.rolling(100).std().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_std_100(x): return np.abs(x.rolling(100).std().dropna().values).max()
    def std_roll_mean_100(x): return x.rolling(100).mean().dropna().values.std()
    def max_roll_mean_100(x): return x.rolling(100).mean().dropna().values.max()
    def min_roll_mean_100(x): return x.rolling(100).mean().dropna().values.min()
    def q01_roll_mean_100(x): return np.quantile(x.rolling(100).mean().dropna().values, 0.01)
    def q05_roll_mean_100(x): return np.quantile(x.rolling(100).mean().dropna().values, 0.05)
    def q95_roll_mean_100(x): return np.quantile(x.rolling(100).mean().dropna().values, 0.95)
    def q99_roll_mean_100(x): return np.quantile(x.rolling(100).mean().dropna().values, 0.99)
    def av_change_abs_roll_mean_100(x): return np.mean(np.diff(x.rolling(100).mean().dropna().values))
    def av_change_rate_roll_mean_100(x): v = x.rolling(100).mean().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_mean_100(x): return np.abs(x.rolling(100).mean().dropna().values).max()
    def ave_roll_std_1000(x): return x.rolling(1000).std().dropna().values.mean()
    def std_roll_std_1000(x): return x.rolling(1000).std().dropna().values.std()
    def max_roll_std_1000(x): return x.rolling(1000).std().dropna().values.max()
    def min_roll_std_1000(x): return x.rolling(1000).std().dropna().values.min()
    def q01_roll_std_1000(x): return np.quantile(x.rolling(1000).std().dropna().values, 0.01)
    def q05_roll_std_1000(x): return np.quantile(x.rolling(1000).std().dropna().values, 0.05)
    def q95_roll_std_1000(x): return np.quantile(x.rolling(1000).std().dropna().values, 0.95)
    def q99_roll_std_1000(x): return np.quantile(x.rolling(1000).std().dropna().values, 0.99)
    def av_change_abs_roll_std_1000(x): return np.mean(np.diff(x.rolling(1000).std().dropna().values))
    def av_change_rate_roll_std_1000(x): v = x.rolling(1000).std().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_std_1000(x): return np.abs(x.rolling(1000).std().dropna().values).max()
    def std_roll_mean_1000(x): return x.rolling(1000).mean().dropna().values.std()
    def max_roll_mean_1000(x): return x.rolling(1000).mean().dropna().values.max()
    def min_roll_mean_1000(x): return x.rolling(1000).mean().dropna().values.min()
    def q01_roll_mean_1000(x): return np.quantile(x.rolling(1000).mean().dropna().values, 0.01)
    def q05_roll_mean_1000(x): return np.quantile(x.rolling(1000).mean().dropna().values, 0.05)
    def q95_roll_mean_1000(x): return np.quantile(x.rolling(1000).mean().dropna().values, 0.95)
    def q99_roll_mean_1000(x): return np.quantile(x.rolling(1000).mean().dropna().values, 0.99)
    def av_change_abs_roll_mean_1000(x): return np.mean(np.diff(x.rolling(1000).mean().dropna().values))
    def av_change_rate_roll_mean_1000(x): v = x.rolling(1000).mean().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_mean_1000(x): return np.abs(x.rolling(1000).mean().dropna().values).max()
    def ave_roll_std_10000(x): return x.rolling(10000).std().dropna().values.mean()
    def std_roll_std_10000(x): return x.rolling(10000).std().dropna().values.std()
    def max_roll_std_10000(x): return x.rolling(10000).std().dropna().values.max()
    def min_roll_std_10000(x): return x.rolling(10000).std().dropna().values.min()
    def q01_roll_std_10000(x): return np.quantile(x.rolling(10000).std().dropna().values, 0.01)
    def q05_roll_std_10000(x): return np.quantile(x.rolling(10000).std().dropna().values, 0.05)
    def q95_roll_std_10000(x): return np.quantile(x.rolling(10000).std().dropna().values, 0.95)
    def q99_roll_std_10000(x): return np.quantile(x.rolling(10000).std().dropna().values, 0.99)
    def av_change_abs_roll_std_10000(x): return np.mean(np.diff(x.rolling(10000).std().dropna().values))
    def av_change_rate_roll_std_10000(x): v = x.rolling(10000).std().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_std_10000(x): return np.abs(x.rolling(10000).std().dropna().values).max()
    def std_roll_mean_10000(x): return x.rolling(10000).mean().dropna().values.std()
    def max_roll_mean_10000(x): return x.rolling(10000).mean().dropna().values.max()
    def min_roll_mean_10000(x): return x.rolling(10000).mean().dropna().values.min()
    def q01_roll_mean_10000(x): return np.quantile(x.rolling(10000).mean().dropna().values, 0.01)
    def q05_roll_mean_10000(x): return np.quantile(x.rolling(10000).mean().dropna().values, 0.05)
    def q95_roll_mean_10000(x): return np.quantile(x.rolling(10000).mean().dropna().values, 0.95)
    def q99_roll_mean_10000(x): return np.quantile(x.rolling(10000).mean().dropna().values, 0.99)
    def av_change_abs_roll_mean_10000(x): return np.mean(np.diff(x.rolling(10000).mean().dropna().values))
    def av_change_rate_roll_mean_10000(x): v = x.rolling(10000).mean().dropna().values; return np.mean(np.nonzero((np.diff(v) / v[:-1]))[0])
    def abs_max_roll_mean_10000(x): return np.abs(x.rolling(10000).mean().dropna().values).max()
    kstat2_pr = make_agg_primitive(function=kstat2, input_types=[Numeric], return_type=Numeric)
    kstatvar1_pr = make_agg_primitive(function=kstatvar1, input_types=[Numeric], return_type=Numeric)
    kstatvar2_pr = make_agg_primitive(function=kstatvar2, input_types=[Numeric], return_type=Numeric)
    kstat3_pr = make_agg_primitive(function=kstat3, input_types=[Numeric], return_type=Numeric)
    kstat4_pr = make_agg_primitive(function=kstat4, input_types=[Numeric], return_type=Numeric)
    gmean_pr = make_agg_primitive(function=gmean, input_types=[Numeric], return_type=Numeric)
    hmean_pr = make_agg_primitive(function=hmean, input_types=[Numeric], return_type=Numeric)
    avg_change_pr = make_agg_primitive(function=avg_change, input_types=[Numeric], return_type=Numeric)
    avg_change_rate_pr = make_agg_primitive(function=avg_change_rate, input_types=[Numeric], return_type=Numeric)
    range_pr = make_agg_primitive(function=range_func, input_types=[Numeric], return_type=Numeric)
    std_first_50000_pr = make_agg_primitive(function=std_first_50000, input_types=[Numeric], return_type=Numeric)
    std_last_50000_pr = make_agg_primitive(function=std_last_50000, input_types=[Numeric], return_type=Numeric)
    std_first_10000_pr = make_agg_primitive(function=std_first_10000, input_types=[Numeric], return_type=Numeric)
    std_last_10000_pr = make_agg_primitive(function=std_last_10000, input_types=[Numeric], return_type=Numeric)
    avg_first_50000_pr = make_agg_primitive(function=avg_first_50000, input_types=[Numeric], return_type=Numeric)
    avg_last_50000_pr = make_agg_primitive(function=avg_last_50000, input_types=[Numeric], return_type=Numeric)
    avg_first_10000_pr = make_agg_primitive(function=avg_first_10000, input_types=[Numeric], return_type=Numeric)
    avg_last_10000_pr = make_agg_primitive(function=avg_last_10000, input_types=[Numeric], return_type=Numeric)
    min_first_50000_pr = make_agg_primitive(function=min_first_50000, input_types=[Numeric], return_type=Numeric)
    min_last_50000_pr = make_agg_primitive(function=min_last_50000, input_types=[Numeric], return_type=Numeric)
    min_first_10000_pr = make_agg_primitive(function=min_first_10000, input_types=[Numeric], return_type=Numeric)
    min_last_10000_pr = make_agg_primitive(function=min_last_10000, input_types=[Numeric], return_type=Numeric)
    max_first_50000_pr = make_agg_primitive(function=max_first_50000, input_types=[Numeric], return_type=Numeric)
    max_last_50000_pr = make_agg_primitive(function=max_last_50000, input_types=[Numeric], return_type=Numeric)
    max_first_10000_pr = make_agg_primitive(function=max_first_10000, input_types=[Numeric], return_type=Numeric)
    max_last_10000_pr = make_agg_primitive(function=max_last_10000, input_types=[Numeric], return_type=Numeric)
    max_to_min_pr = make_agg_primitive(function=max_to_min, input_types=[Numeric], return_type=Numeric)
    count_big_pr = make_agg_primitive(function=count_big, input_types=[Numeric], return_type=Numeric)
    sum_func_pr = make_agg_primitive(function=sum_func, input_types=[Numeric], return_type=Numeric)
    avg_change_rate_first_50000_pr = make_agg_primitive(function=avg_change_rate_first_50000, input_types=[Numeric], return_type=Numeric)
    avg_change_rate_last_50000_pr = make_agg_primitive(function=avg_change_rate_last_50000, input_types=[Numeric], return_type=Numeric)
    avg_change_rate_first_10000_pr = make_agg_primitive(function=avg_change_rate_first_10000, input_types=[Numeric], return_type=Numeric)
    avg_change_rate_last_10000_pr = make_agg_primitive(function=avg_change_rate_last_10000, input_types=[Numeric], return_type=Numeric)
    q95_pr = make_agg_primitive(function=q95, input_types=[Numeric], return_type=Numeric)
    q99_pr = make_agg_primitive(function=q99, input_types=[Numeric], return_type=Numeric)
    q05_pr = make_agg_primitive(function=q05, input_types=[Numeric], return_type=Numeric)
    q01_pr = make_agg_primitive(function=q01, input_types=[Numeric], return_type=Numeric)
    abs_q95_pr = make_agg_primitive(function=abs_q95, input_types=[Numeric], return_type=Numeric)
    abs_q99_pr = make_agg_primitive(function=abs_q99, input_types=[Numeric], return_type=Numeric)
    trend_pr = make_agg_primitive(function=add_trend_feature, input_types=[Numeric], return_type=Numeric)
    abs_trend_pr = make_agg_primitive(function=add_trend_feature_abs, input_types=[Numeric], return_type=Numeric)
    abs_mean_pr = make_agg_primitive(function=abs_mean, input_types=[Numeric], return_type=Numeric)
    abs_std_pr = make_agg_primitive(function=abs_std, input_types=[Numeric], return_type=Numeric)
    mad_pr = make_agg_primitive(function=mad, input_types=[Numeric], return_type=Numeric)
    kurt_pr = make_agg_primitive(function=kurt, input_types=[Numeric], return_type=Numeric)
    skew_pr = make_agg_primitive(function=skew, input_types=[Numeric], return_type=Numeric)
    med_pr = make_agg_primitive(function=med, input_types=[Numeric], return_type=Numeric)
    Hilbert_mean_pr = make_agg_primitive(function=Hilbert_mean, input_types=[Numeric], return_type=Numeric)
    Hann_window_mean_pr = make_agg_primitive(function=Hann_window_mean, input_types=[Numeric], return_type=Numeric)
    classic_sta_lta1_mean_pr = make_agg_primitive(function=classic_sta_lta1_mean, input_types=[Numeric], return_type=Numeric)
    classic_sta_lta2_mean_pr = make_agg_primitive(function=classic_sta_lta2_mean, input_types=[Numeric], return_type=Numeric)
    classic_sta_lta3_mean_pr = make_agg_primitive(function=classic_sta_lta3_mean, input_types=[Numeric], return_type=Numeric)
    classic_sta_lta4_mean_pr = make_agg_primitive(function=classic_sta_lta4_mean, input_types=[Numeric], return_type=Numeric)
    Moving_average_700_mean_pr = make_agg_primitive(function=Moving_average_700_mean, input_types=[Numeric], return_type=Numeric)
    Moving_average_1500_mean_pr = make_agg_primitive(function=Moving_average_1500_mean, input_types=[Numeric], return_type=Numeric)
    Moving_average_3000_mean_pr = make_agg_primitive(function=Moving_average_3000_mean, input_types=[Numeric], return_type=Numeric)
    Moving_average_6000_mean_pr = make_agg_primitive(function=Moving_average_6000_mean, input_types=[Numeric], return_type=Numeric)
    exp_Moving_average_300_mean_pr = make_agg_primitive(function=exp_Moving_average_300_mean, input_types=[Numeric], return_type=Numeric)
    exp_Moving_average_3000_mean_pr = make_agg_primitive(function=exp_Moving_average_3000_mean, input_types=[Numeric], return_type=Numeric)
    exp_Moving_average_30000_mean_pr = make_agg_primitive(function=exp_Moving_average_30000_mean, input_types=[Numeric], return_type=Numeric)
    iqr_pr = make_agg_primitive(function=iqr, input_types=[Numeric], return_type=Numeric)
    q999_pr = make_agg_primitive(function=q999, input_types=[Numeric], return_type=Numeric)
    q001 = make_agg_primitive(function=q001, input_types=[Numeric], return_type=Numeric)  # note: rebinds the helper's name
    ave10_pr = make_agg_primitive(function=ave10, input_types=[Numeric], return_type=Numeric)
    ave_roll_std_10_pr = make_agg_primitive(function=ave_roll_std_10, input_types=[Numeric], return_type=Numeric)
    std_roll_std_10_pr = make_agg_primitive(function=std_roll_std_10, input_types=[Numeric], return_type=Numeric)
    max_roll_std_10_pr = make_agg_primitive(function=max_roll_std_10, input_types=[Numeric], return_type=Numeric)
    min_roll_std_10_pr = make_agg_primitive(function=min_roll_std_10, input_types=[Numeric], return_type=Numeric)
    q01_roll_std_10_pr = make_agg_primitive(function=q01_roll_std_10, input_types=[Numeric], return_type=Numeric)
    q05_roll_std_10_pr = make_agg_primitive(function=q05_roll_std_10, input_types=[Numeric], return_type=Numeric)
    q95_roll_std_10_pr = make_agg_primitive(function=q95_roll_std_10, input_types=[Numeric], return_type=Numeric)
    q99_roll_std_10_pr = make_agg_primitive(function=q99_roll_std_10, input_types=[Numeric], return_type=Numeric)
    av_change_abs_roll_std_10_pr = make_agg_primitive(function=av_change_abs_roll_std_10, input_types=[Numeric], return_type=Numeric)
    av_change_rate_roll_std_10_pr = make_agg_primitive(function=av_change_rate_roll_std_10, input_types=[Numeric], return_type=Numeric)
    abs_max_roll_std_10_pr = make_agg_primitive(function=abs_max_roll_std_10, input_types=[Numeric], return_type=Numeric)
    std_roll_mean_10_pr = make_agg_primitive(function=std_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    max_roll_mean_10_pr = make_agg_primitive(function=max_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    min_roll_mean_10_pr = make_agg_primitive(function=min_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    q01_roll_mean_10_pr = make_agg_primitive(function=q01_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    q05_roll_mean_10_pr = make_agg_primitive(function=q05_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    q95_roll_mean_10_pr = make_agg_primitive(function=q95_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    q99_roll_mean_10_pr = make_agg_primitive(function=q99_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    av_change_abs_roll_mean_10_pr = make_agg_primitive(function=av_change_abs_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    av_change_rate_roll_mean_10_pr = make_agg_primitive(function=av_change_rate_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    abs_max_roll_mean_10_pr = make_agg_primitive(function=abs_max_roll_mean_10, input_types=[Numeric], return_type=Numeric)
    ave_roll_std_100_pr = make_agg_primitive(function=ave_roll_std_100, input_types=[Numeric], return_type=Numeric)
    std_roll_std_100_pr = make_agg_primitive(function=std_roll_std_100, input_types=[Numeric], return_type=Numeric)
    max_roll_std_100_pr = make_agg_primitive(function=max_roll_std_100, input_types=[Numeric], return_type=Numeric)
    min_roll_std_100_pr = make_agg_primitive(function=min_roll_std_100, input_types=[Numeric], return_type=Numeric)
    q01_roll_std_100_pr = make_agg_primitive(function=q01_roll_std_100, input_types=[Numeric], return_type=Numeric)
    q05_roll_std_100_pr = make_agg_primitive(function=q05_roll_std_100, input_types=[Numeric], return_type=Numeric)
    q95_roll_std_100_pr = make_agg_primitive(function=q95_roll_std_100, input_types=[Numeric], return_type=Numeric)
    q99_roll_std_100_pr = make_agg_primitive(function=q99_roll_std_100, input_types=[Numeric], return_type=Numeric)
    av_change_abs_roll_std_100_pr = make_agg_primitive(function=av_change_abs_roll_std_100, input_types=[Numeric], return_type=Numeric)
    av_change_rate_roll_std_100_pr = make_agg_primitive(function=av_change_rate_roll_std_100, input_types=[Numeric], return_type=Numeric)
    abs_max_roll_std_100_pr = make_agg_primitive(function=abs_max_roll_std_100, input_types=[Numeric], return_type=Numeric)
    std_roll_mean_100_pr = make_agg_primitive(function=std_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    max_roll_mean_100_pr = make_agg_primitive(function=max_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    min_roll_mean_100_pr = make_agg_primitive(function=min_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    q01_roll_mean_100_pr = make_agg_primitive(function=q01_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    q05_roll_mean_100_pr = make_agg_primitive(function=q05_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    q95_roll_mean_100_pr = make_agg_primitive(function=q95_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    q99_roll_mean_100_pr = make_agg_primitive(function=q99_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    av_change_abs_roll_mean_100_pr = make_agg_primitive(function=av_change_abs_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    av_change_rate_roll_mean_100_pr = make_agg_primitive(function=av_change_rate_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    abs_max_roll_mean_100_pr = make_agg_primitive(function=abs_max_roll_mean_100, input_types=[Numeric], return_type=Numeric)
    ave_roll_std_1000_pr = make_agg_primitive(function=ave_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    std_roll_std_1000_pr = make_agg_primitive(function=std_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    max_roll_std_1000_pr = make_agg_primitive(function=max_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    min_roll_std_1000_pr = make_agg_primitive(function=min_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    q01_roll_std_1000_pr = make_agg_primitive(function=q01_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    q05_roll_std_1000_pr = make_agg_primitive(function=q05_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    q95_roll_std_1000_pr = make_agg_primitive(function=q95_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    q99_roll_std_1000_pr = make_agg_primitive(function=q99_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    av_change_abs_roll_std_1000_pr = make_agg_primitive(function=av_change_abs_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    av_change_rate_roll_std_1000_pr = make_agg_primitive(function=av_change_rate_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    abs_max_roll_std_1000_pr = make_agg_primitive(function=abs_max_roll_std_1000, input_types=[Numeric], return_type=Numeric)
    std_roll_mean_1000_pr = make_agg_primitive(function=std_roll_mean_1000, input_types=[Numeric], return_type=Numeric)
    max_roll_mean_1000_pr = make_agg_primitive(function=max_roll_mean_1000, input_types=[Numeric], return_type=Numeric)
    min_roll_mean_1000_pr = make_agg_primitive(function=min_roll_mean_1000, input_types=[Numeric], return_type=Numeric)
    q01_roll_mean_1000_pr = make_agg_primitive(function=q01_roll_mean_1000, input_types=[Numeric], return_type=Numeric)
    q05_roll_mean_1000_pr = make_agg_primitive(function=q05_roll_mean_1000, input_types=[Numeric], return_type=Numeric)
make_agg_primitive(function = q05_roll_mean_1000, input_types = [Numeric], return_type = Numeric) q95_roll_mean_1000_pr = make_agg_primitive(function = q95_roll_mean_1000, input_types = [Numeric], return_type = Numeric) q99_roll_mean_1000_pr = make_agg_primitive(function = q99_roll_mean_1000, input_types = [Numeric], return_type = Numeric) av_change_abs_roll_mean_1000_pr = make_agg_primitive(function = av_change_abs_roll_mean_1000, input_types = [Numeric], return_type = Numeric) av_change_rate_roll_mean_1000_pr = make_agg_primitive(function = av_change_rate_roll_mean_1000, input_types = [Numeric], return_type = Numeric) abs_max_roll_mean_1000_pr = make_agg_primitive(function = abs_max_roll_mean_1000, input_types = [Numeric], return_type = Numeric) ave_roll_std_10000_pr = make_agg_primitive(function = ave_roll_std_10000, input_types = [Numeric], return_type = Numeric) std_roll_std_10000_pr = make_agg_primitive(function = std_roll_std_10000, input_types = [Numeric], return_type = Numeric) max_roll_std_10000_pr = make_agg_primitive(function = max_roll_std_10000, input_types = [Numeric], return_type = Numeric) min_roll_std_10000_pr = make_agg_primitive(function = min_roll_std_10000, input_types = [Numeric], return_type = Numeric) q01_roll_std_10000_pr = make_agg_primitive(function = q01_roll_std_10000, input_types = [Numeric], return_type = Numeric) q05_roll_std_10000_pr = make_agg_primitive(function = q05_roll_std_10000, input_types = [Numeric], return_type = Numeric) q95_roll_std_10000_pr = make_agg_primitive(function = q95_roll_std_10000, input_types = [Numeric], return_type = Numeric) q99_roll_std_10000_pr = make_agg_primitive(function = q99_roll_std_10000, input_types = [Numeric], return_type = Numeric) av_change_abs_roll_std_10000_pr = make_agg_primitive(function = av_change_abs_roll_std_10000, input_types = [Numeric], return_type = Numeric) av_change_rate_roll_std_10000_pr = make_agg_primitive(function = av_change_rate_roll_std_10000, input_types = [Numeric], return_type = Numeric) abs_max_roll_std_10000_pr = make_agg_primitive(function = abs_max_roll_std_10000, input_types = [Numeric], return_type = Numeric) std_roll_mean_10000_pr = make_agg_primitive(function = std_roll_mean_10000, input_types = [Numeric], return_type = Numeric) max_roll_mean_10000_pr = make_agg_primitive(function = max_roll_mean_10000, input_types = [Numeric], return_type = Numeric) min_roll_mean_10000_pr = make_agg_primitive(function = min_roll_mean_10000, input_types = [Numeric], return_type = Numeric) q01_roll_mean_10000_pr = make_agg_primitive(function = q01_roll_mean_10000, input_types = [Numeric], return_type = Numeric) q05_roll_mean_10000_pr = make_agg_primitive(function = q05_roll_mean_10000, input_types = [Numeric], return_type = Numeric) q95_roll_mean_10000_pr = make_agg_primitive(function = q95_roll_mean_10000, input_types = [Numeric], return_type = Numeric) q99_roll_mean_10000_pr = make_agg_primitive(function = q99_roll_mean_10000, input_types = [Numeric], return_type = Numeric) av_change_abs_roll_mean_10000_pr = make_agg_primitive(function = av_change_abs_roll_mean_10000, input_types = [Numeric], return_type = Numeric) av_change_rate_roll_mean_10000_pr = make_agg_primitive(function = av_change_rate_roll_mean_10000, input_types = [Numeric], return_type = Numeric) abs_max_roll_mean_10000_pr = make_agg_primitive(function = abs_max_roll_mean_10000, input_types = [Numeric], return_type = Numeric) return gmean_pr, hmean_pr, kstatvar1_pr, kstat2_pr, kstatvar2_pr, kstat3_pr, kstat4_pr, \ avg_change_pr, 
avg_change_rate_pr, range_pr, std_first_50000_pr, \ std_last_50000_pr, std_first_10000_pr, std_last_10000_pr, avg_first_50000_pr, \ avg_last_50000_pr, avg_first_10000_pr, avg_last_10000_pr, min_first_50000_pr, \ min_last_50000_pr, min_first_10000_pr, min_last_10000_pr, max_first_50000_pr, \ max_last_50000_pr, max_first_10000_pr, max_last_10000_pr, max_to_min_pr, \ count_big_pr, sum_func_pr, avg_change_rate_first_50000_pr, avg_change_rate_last_50000_pr, \ avg_change_rate_first_10000_pr, avg_change_rate_last_10000_pr, q95_pr, \ q99_pr, q05_pr, q01_pr, abs_q95_pr, abs_q99_pr, trend_pr, abs_trend_pr, \ abs_mean_pr, abs_std_pr, mad_pr, kurt_pr, skew_pr, med_pr, Hilbert_mean_pr, \ Hann_window_mean_pr, classic_sta_lta1_mean_pr, classic_sta_lta2_mean_pr, \ classic_sta_lta3_mean_pr, classic_sta_lta4_mean_pr, Moving_average_700_mean_pr, \ Moving_average_1500_mean_pr, Moving_average_3000_mean_pr, Moving_average_6000_mean_pr, \ exp_Moving_average_300_mean_pr, exp_Moving_average_3000_mean_pr, \ exp_Moving_average_30000_mean_pr, iqr_pr, q999_pr, q001, ave10_pr, \ ave_roll_std_10_pr, std_roll_std_10_pr, max_roll_std_10_pr, min_roll_std_10_pr, \ q01_roll_std_10_pr, q05_roll_std_10_pr, q95_roll_std_10_pr, q99_roll_std_10_pr, \ av_change_abs_roll_std_10_pr, av_change_rate_roll_std_10_pr, abs_max_roll_std_10_pr, \ std_roll_mean_10_pr, max_roll_mean_10_pr, min_roll_mean_10_pr, q01_roll_mean_10_pr, \ q05_roll_mean_10_pr, q95_roll_mean_10_pr, q99_roll_mean_10_pr, \ av_change_abs_roll_mean_10_pr, av_change_rate_roll_mean_10_pr, \ abs_max_roll_mean_10_pr, ave_roll_std_100_pr, std_roll_std_100_pr, \ max_roll_std_100_pr, min_roll_std_100_pr, q01_roll_std_100_pr, \ q05_roll_std_100_pr, q95_roll_std_100_pr, q99_roll_std_100_pr, \ av_change_abs_roll_std_100_pr, av_change_rate_roll_std_100_pr, \ abs_max_roll_std_100_pr, std_roll_mean_100_pr, max_roll_mean_100_pr, \ min_roll_mean_100_pr, q01_roll_mean_100_pr, q05_roll_mean_100_pr, \ q95_roll_mean_100_pr, q99_roll_mean_100_pr, av_change_abs_roll_mean_100_pr, \ av_change_rate_roll_mean_100_pr, abs_max_roll_mean_100_pr, ave_roll_std_1000_pr, \ std_roll_std_1000_pr, max_roll_std_1000_pr, min_roll_std_1000_pr, \ q01_roll_std_1000_pr, q05_roll_std_1000_pr, q95_roll_std_1000_pr, \ q99_roll_std_1000_pr, av_change_abs_roll_std_1000_pr, \ av_change_rate_roll_std_1000_pr, abs_max_roll_std_1000_pr, \ std_roll_mean_1000_pr, max_roll_mean_1000_pr, min_roll_mean_1000_pr, \ q01_roll_mean_1000_pr, q05_roll_mean_1000_pr, q95_roll_mean_1000_pr, \ q99_roll_mean_1000_pr, av_change_abs_roll_mean_1000_pr, \ av_change_rate_roll_mean_1000_pr, abs_max_roll_mean_1000_pr, \ ave_roll_std_10000_pr, std_roll_std_10000_pr, max_roll_std_10000_pr, \ min_roll_std_10000_pr, q01_roll_std_10000_pr, q05_roll_std_10000_pr, \ q95_roll_std_10000_pr, q99_roll_std_10000_pr, av_change_abs_roll_std_10000_pr, \ av_change_rate_roll_std_10000_pr, abs_max_roll_std_10000_pr, \ std_roll_mean_10000_pr, max_roll_mean_10000_pr, min_roll_mean_10000_pr, \ q01_roll_mean_10000_pr, q05_roll_mean_10000_pr, q95_roll_mean_10000_pr, \ q99_roll_mean_10000_pr, av_change_abs_roll_mean_10000_pr, \ av_change_rate_roll_mean_10000_pr, abs_max_roll_mean_10000_pr
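
# Usage sketch (not from the original source): the tuple returned above can be
# handed straight to ft.dfs. `build_primitives` stands in for the enclosing
# function, whose real name is not shown in this excerpt, and the EntitySet
# `es` with target entity 'segments' is hypothetical.
import featuretools as ft

primitives = build_primitives()  # hypothetical name for the function above
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='segments',
                                      agg_primitives=list(primitives),
                                      trans_primitives=[],
                                      max_depth=1)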
        end_flag = length // n * end
        piece = new_s.iloc[start_flag:end_flag]
        # a chunk counts as a "rise" when its values sum to a positive number
        if sum(piece) > 0:
            count += 1
        start += 1
        end += 1
    return count


rise_count = make_agg_primitive(
    function=rise_count,
    input_types=[Numeric],
    return_type=Numeric,
    # uses_calc_time=True,
    description="Counts the chunks of the series whose values sum to a positive number.",
    name="rise_count")

# %%
"""
Build a new feature matrix; choosing a different target_entity produces a
different aggregated feature matrix.
"""
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity="customers",
    # agg_primitives=["median", "count", "num_unique", "max",
    #                 "avg_time_between", "n_most_common", max2nd, max3rd],
    agg_primitives=[rise_count],
    trans_primitives=["month"],
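
# Because only the tail of rise_count is visible above, here is a hedged,
# self-contained toy analogue of the same idea: split the series into n
# equal chunks and count the chunks whose values sum to a positive number.
# The chunking details of the original function may differ.
import pandas as pd

def rise_count_sketch(values, n=5):
    s = pd.Series(values).reset_index(drop=True)
    size = len(s) // n
    return sum(1 for i in range(n)
               if s.iloc[i * size:(i + 1) * size].sum() > 0)

print(rise_count_sketch([1, 2, -5, 3, 4, -1, 2, 2, -9, 1]))  # -> 3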
es.normalize_entity(base_entity_id='data',
                    new_entity_id='target',
                    index='RECHCT_USE_ITEPD_ID',
                    make_time_index=False,
                    additional_variables=target_col + ['target'])

import numpy as np
from scipy.stats import spearmanr

from featuretools.variable_types import Numeric, PandasTypes
from featuretools.primitives import make_agg_primitive


def range_calc(numeric):
    # spread of the values seen within each group
    return np.max(numeric) - np.min(numeric)


range_ = make_agg_primitive(function=range_calc,
                            input_types=[PandasTypes],
                            return_type=PandasTypes)


def p_corr_calc(numeric1, numeric2):
    # Pearson correlation between two numeric columns of the same entity
    return np.corrcoef(numeric1, numeric2)[0, 1]


pcorr_ = make_agg_primitive(function=p_corr_calc,
                            input_types=[PandasTypes, PandasTypes],
                            return_type=PandasTypes)


def s_corr_calc(numeric1, numeric2):
    # Spearman (rank) correlation between two numeric columns
    return spearmanr(numeric1, numeric2)[0]
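
# Quick sanity check of the two correlation functions on made-up data before
# registering them as primitives (this check is not part of the original):
import numpy as np
from scipy.stats import spearmanr

a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([2.0, 4.0, 5.0, 9.0])
print(p_corr_calc(a, b))  # Pearson r ~ 0.96 for this near-linear pair
print(s_corr_calc(a, b))  # Spearman rho = 1.0: b is monotone in a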
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import Numeric

CustomMax = make_agg_primitive(lambda x: max(x),
                               name="CustomMax",
                               input_types=[Numeric],
                               return_type=Numeric)
CustomSum = make_agg_primitive(lambda x: sum(x),
                               name="CustomSum",
                               input_types=[Numeric],
                               return_type=Numeric)
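
# Hedged usage sketch (not from the original): where the two primitives plug
# into DFS. The EntitySet `es` and the 'customers' entity are hypothetical.
import featuretools as ft

feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='customers',
                                      agg_primitives=[CustomMax, CustomSum],
                                      trans_primitives=[])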
import numpy as np
import pandas as pd


def GM_fit(x):
    # `clf` is assumed to be a two-component sklearn GaussianMixture created
    # earlier in the module (the snippet's opening lines are not shown).
    x = x.to_frame()
    for i in range(10):
        # refit from a fresh random initialisation on each pass
        clf.fit(x)
        frst = np.argmin(clf.covariances_, axis=0)  # tighter component first
        scnd = abs(frst - 1)
        est = pd.DataFrame(
            data={
                'aic': [clf.aic(x)],
                'b': [clf.means_[frst][0]],
                'c': [clf.means_[scnd][0]],
                'd': [clf.covariances_[frst]],
                'e': [clf.covariances_[scnd]],
                'f': [clf.weights_[frst]],
                'g': [clf.weights_[scnd]]
            })
        if i == 0:
            features = est
        else:
            features = pd.concat([features, est])
    # keep the fit with the lowest AIC out of the ten restarts
    features = features.reset_index(drop=True)
    min_index = features['aic'].idxmin()
    features = features.iloc[min_index]
    return tuple(features)  # the seven values: aic, b, c, d, e, f, g


GM_pr = make_agg_primitive(function=GM_fit,
                           input_types=[Numeric],
                           return_type=Numeric,
                           number_output_features=7)
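
# Hedged sketch (not from the original): with number_output_features=7, DFS
# expands GM_pr into seven feature columns, suffixed [0]..[6]. The EntitySet
# and entity names below are made up.
import featuretools as ft

fm, defs = ft.dfs(entityset=es,
                  target_entity='segments',
                  agg_primitives=[GM_pr],
                  trans_primitives=[])
# fm now holds columns named like "GM_FIT(readings.value)[0]" ... "[6]"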
from collections import Counter

import numpy as np
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import Discrete, Numeric


def normalized_mode_count(x):
    """Return the fraction of total observations that the most common
    observation occupies. For example, in an array of
    ['A', 'A', 'A', 'B', 'B'], the function will return 0.6."""
    if x.mode().shape[0] == 0:
        return np.nan
    # Count occurrences of each value
    counts = dict(Counter(x.values))
    # Find the mode
    mode = x.mode().iloc[0]
    # Divide the occurrences of the mode by the total occurrences
    return counts[mode] / np.sum(list(counts.values()))


NormalizedModeCount = make_agg_primitive(function=normalized_mode_count,
                                         input_types=[Discrete],
                                         return_type=Numeric)


# Function from https://codereview.stackexchange.com/a/15095
def longest_repetition(x):
    """Returns the item with the most consecutive occurrences in `x`.
    If there are multiple items with the same number of consecutive
    occurrences, it returns the first one. If `x` is empty, returns None.
    """
    x = x.dropna()
    if x.shape[0] < 1:
        return None
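
# Quick check of the docstring example (this call is not in the original):
import pandas as pd

s = pd.Series(['A', 'A', 'A', 'B', 'B'])
assert normalized_mode_count(s) == 0.6  # mode 'A' covers 3 of 5 values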
name = "mode" input_types = [Discrete] return_type = None def get_function(self): def pd_mode(x): if x.mode().shape[0] == 0: return np.nan return x.mode().iloc[0] return pd_mode Min = make_agg_primitive( np.min, [Numeric], None, name="min", stack_on_self=False, description="Finds the minimum non-null value of a numeric feature.") # class Min(AggregationPrimitive): # """Finds the minimum non-null value of a numeric feature.""" # name = "min" # input_types = [Numeric] # return_type = None # # max_stack_depth = 1 # stack_on_self = False # def get_function(self): # return np.min
import featuretools as ft
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import Categorical, Numeric
# CONSTANT comes from the project's local constants module


def FT_process(tables, config):
    es = ft.EntitySet()
    entity_config = config['tables']
    relation_config = config['relations']
    flag = 0
    for table in tables:
        id = f'{table}_id'  # primary key
        make_id = True
        if len(table.split("_")) > 2:  # intermediate (junction) table
            id = table[6:]
            make_id = False
        if table == CONSTANT.MAIN_TABLE_NAME:  # "main"
            tables[table][id] = tables[table].index
            cat_cols = [
                col for col in tables[table].columns
                if col.startswith("c_") and not col.startswith("c_0")
            ]
            if len(cat_cols) > 10:
                flag = 1
            make_id = False
        variable_Types = {}
        for col in tables[table].columns:
            if col.startswith(CONSTANT.MULTI_CAT_PREFIX):
                variable_Types[col] = ft.variable_types.Categorical
            if col.startswith(CONSTANT.CATEGORY_PREFIX):
                variable_Types[col] = ft.variable_types.Categorical
        '''
        if config['time_col'] in tables[table] and table == "main":
            # modified 4.22: set the time_index on the main table
            es = es.entity_from_dataframe(entity_id=table,
                                          dataframe=tables[table],
                                          make_index=make_id,
                                          index=id,
                                          time_index=config['time_col'],
                                          variable_types=variable_Types)
            # print(table, "using time_index")
        else:
            es = es.entity_from_dataframe(entity_id=table,
                                          dataframe=tables[table],
                                          make_index=make_id,
                                          index=id,
                                          variable_types=variable_Types)
        '''
        es = es.entity_from_dataframe(entity_id=table,
                                      dataframe=tables[table],
                                      make_index=make_id,
                                      index=id,
                                      variable_types=variable_Types)
        # print(es[table].variables)

    for relation in relation_config:
        tableA = relation['table_A']
        tableB = relation['table_B']
        key = relation['key'][0]
        new_relationship = ft.Relationship(es[tableB][key], es[tableA][key])
        es = es.add_relationship(new_relationship)

    '''
    ct = pd.DataFrame()
    c_id = f'{CONSTANT.MAIN_TABLE_NAME}_id'
    ct[c_id] = tables[CONSTANT.MAIN_TABLE_NAME].index
    ct["time"] = tables[CONSTANT.MAIN_TABLE_NAME][config['time_col']].values
    time0 = ct["time"].min()
    time1 = ct["time"].max()
    timeBucket = (time1 - time0) / 20
    if "timeBucket" not in config:
        config["timeBucket"] = timeBucket.total_seconds()
        config["window_number"] = 5
    '''
    # print(config["timeBucket"])
    # cluster = LocalCluster()
    '''
    if mark == 1:  # modified 4.23
        feature_matrix, feature_defs = ft.dfs(
            entityset=es,
            target_entity="main",
            agg_primitives=["mean", "sum", "count"],
            trans_primitives=["hour", "weekday"],
            max_depth=2,
            cutoff_time=ct,
            # tunable parameters
            training_window=ft.Timedelta(
                config["window_number"] * config["timeBucket"], "s"),
            approximate=ft.Timedelta(config["timeBucket"], "s"),
            # n_jobs=3,
            cutoff_time_in_index=True)
        # print(feature_defs)
        feature_matrix.reset_index(1, drop=False, inplace=True)
        feature_matrix.rename(columns={'time': 't_01'}, inplace=True)
        print("Using Cutting off Time")
    else:
    '''

    def n_unique(column):
        return len(set(column))

    def nunique2(column):
        l1 = len(column)
        return l1 * 1.0 / len(set(column))

    def n_time(column):
        return (column.max() - column.min()).total_seconds()

    def n_time2(column):
        return (column - column.min()).apply(lambda s: s.total_seconds())

    nunique = make_agg_primitive(function=n_unique,
                                 input_types=[Categorical],
                                 return_type=Numeric)
    # ntime = make_agg_primitive(function=n_time, input_types=[Datetime],
    #                            return_type=Numeric)
    # ntime2 = make_trans_primitive(function=n_time2, input_types=[Datetime],
    #                               return_type=Numeric)

    if flag == 0:
        agg_trans = ["mean", "sum", "count", nunique]
    else:
        agg_trans = ["mean", "sum", "count"]

    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity="main",
        agg_primitives=agg_trans,  # "num_unique" is too time-consuming
        trans_primitives=[],  # ["hour", "weekday"]
        max_depth=2)
    print(feature_defs)
    # feature_matrix.columns = [
    #     "m_" + c if ((".c_" in c) or (".m_" in c)) and ("MEAN" not in c)
    #     and ("SUM" not in c) and ("COUNT" not in c) else c
    #     for c in feature_matrix.columns]
    return feature_matrix