def join(u, v, v_name, key, type_):
    def func_rename(x):
        if 'mv' in x[1]:
            return f'{CONSTANT.MULTI_CAT_PREFIX}{x[1]}({x[0]})'
        elif 'cat_union' in x[1]:
            return f'{CONSTANT.MULTI_CAT_PREFIX}{x[1]}({x[0]})'
        elif 'cat_last' in x[1]:
            return f'{CONSTANT.CATEGORY_PREFIX}{x[1]}({x[0]})'
        elif CONSTANT.NUMERICAL_PREFIX in x[0]:
            return f"{CONSTANT.NUMERICAL_PREFIX}{x[1]}({x[0]})"
        elif CONSTANT.CATEGORY_PREFIX in x[0]:
            return f"{CONSTANT.CATEGORY_PREFIX}{x[1]}({x[0]})"
        # Fall back to the numerical prefix so unmatched columns do not map to None.
        return f"{CONSTANT.NUMERICAL_PREFIX}{x[1]}({x[0]})"

    if type_.split("_")[2] == 'many':
        # Aggregate every non-key, non-time column of v per join key.
        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key[0] and not col.startswith(CONSTANT.TIME_PREFIX)
        }
        v = v.groupby(key).agg(agg_funcs)
        v.columns = v.columns.map(func_rename)
    else:
        # One-to-one: just prefix v's columns with the table name.
        v = v.set_index(key)
        v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")

    return u.join(v, on=key)
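# --- Hypothetical usage sketch (not part of the original repo) ---
# Reproduces the many-branch on toy data: 'n_' stands in for
# CONSTANT.NUMERICAL_PREFIX and the op list ['mean', 'max'] stands in for
# whatever Config.aggregate_op returns. groupby().agg() with a
# {column: [ops]} dict yields MultiIndex columns, which the rename map
# flattens to 'prefix_OP(col)' names exactly as func_rename does above.
import pandas as pd

NUMERICAL_PREFIX = 'n_'  # assumed stand-in

v_demo = pd.DataFrame({'c_key': [1, 1, 2], 'n_val': [1.0, 3.0, 5.0]})
agg_demo = v_demo.groupby('c_key').agg({'n_val': ['mean', 'max']})
# columns are now tuples like ('n_val', 'mean'); flatten them
agg_demo.columns = agg_demo.columns.map(
    lambda x: f"{NUMERICAL_PREFIX}{x[1].upper()}({x[0]})")
print(agg_demo)
#        n_MEAN(n_val)  n_MAX(n_val)
# c_key
# 1                2.0           3.0
# 2                5.0           5.0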
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    # Scale the rolling window and the hash-bucket count with the size of u:
    # window_size = max(WINDOW_SIZE, len(u) * WINDOW_RATIO),
    # hash_max    = min(HASH_MAX, len(u) / HASH_BIN).
    window_size = CONSTANT.WINDOW_SIZE if len(u) * CONSTANT.WINDOW_RATIO < CONSTANT.WINDOW_SIZE \
        else int(len(u) * CONSTANT.WINDOW_RATIO)
    hash_max = CONSTANT.HASH_MAX if len(u) / CONSTANT.HASH_MAX > CONSTANT.HASH_BIN \
        else int(len(u) / CONSTANT.HASH_BIN)
    # hash_max is retained from the rehash variant of this function; the plain
    # rolling below does not use it.

    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v
        if col != key
        and not col.startswith(CONSTANT.TIME_PREFIX)
        and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
    }

    tmp_u = tmp_u.rolling(window=window_size).agg(agg_funcs)
    timer.check("rolling & agg")

    # NOTE: the column tag says ROLLING5, but the actual window is window_size.
    tmp_u.columns = tmp_u.columns.map(
        lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    ret = u.merge(tmp_u.loc['u'], right_index=True, left_index=True, how="outer")
    timer.check("final merge")

    del tmp_u
    return ret
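# --- Equivalence sketch for the sizing heuristics above (assumed constants) ---
# The two conditionals reduce to a max/min pair. The constant values below are
# illustrative only, not the repo's real CONSTANT.* settings.
WINDOW_SIZE, WINDOW_RATIO = 5, 0.001
HASH_MAX, HASH_BIN = 200, 20

def adaptive_sizes(n_rows):
    # the window grows with the table once n_rows * ratio exceeds the floor
    window_size = max(WINDOW_SIZE, int(n_rows * WINDOW_RATIO))
    # the bucket count is capped at HASH_MAX and shrinks for small tables
    hash_max = min(HASH_MAX, int(n_rows / HASH_BIN))
    return window_size, hash_max

print(adaptive_sizes(1_000))    # (5, 50)
print(adaptive_sizes(100_000))  # (100, 200)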
def join(u, v, v_name, key, type_):
    if type_.split("_")[2] == 'many':
        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key
            and not col.startswith(CONSTANT.TIME_PREFIX)
            and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
        }
        v = v.groupby(key).agg(agg_funcs)
        # join_name is assumed to be a module-level column renamer
        # (cf. func_rename in the variant above).
        v.columns = v.columns.map(join_name)
    else:
        v = v.set_index(key)
        v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
    return u.join(v, on=key)
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    if isinstance(key, list):
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    # Bucket the key so the groupby below sees at most HASH_MAX groups.
    rehash_key = f'rehash_{key}'
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v
        if col != key
        and not col.startswith(CONSTANT.TIME_PREFIX)
        and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
    }
    tmp_u = tmp_u.groupby(rehash_key).rolling(5).agg(agg_funcs)
    timer.check("group & rolling & agg")

    tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
    timer.check("reset_index")

    def rename_func(x):
        # MODE aggregations yield categorical features; everything else is numerical.
        dtype = CONSTANT.CATEGORY_PREFIX if x[1].upper() in [
            'MODE'
        ] else CONSTANT.NUMERICAL_PREFIX
        return f"{dtype}{x[1].upper()}_ROLLING5({v_name}.{x[0]})"  # window above is 5

    tmp_u.columns = tmp_u.columns.map(rename_func)

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
    timer.check("final concat")

    del tmp_u
    return ret
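# --- Minimal sketch of the rehash + grouped rolling trick (toy data) ---
# Hashing the key into a bounded number of buckets (HASH_MAX = 4 here is an
# assumed toy value) caps how many groups the rolling aggregation must track,
# trading per-key exactness for bounded group cardinality.
import pandas as pd

HASH_MAX = 4  # assumed stand-in for CONSTANT.HASH_MAX

df = pd.DataFrame({
    'key': ['a', 'b', 'a', 'c', 'b', 'a'],
    'n_val': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
})
df['rehash'] = df['key'].apply(lambda x: hash(x) % HASH_MAX)

rolled = df.groupby('rehash').rolling(2, min_periods=1).agg({'n_val': ['mean']})
rolled.reset_index(0, drop=True, inplace=True)  # drop the rehash level
print(rolled.sort_index())  # one rolling mean per original row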
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    rehash_key = f'rehash_{key}'
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v
        if col != key
        and not col.startswith(CONSTANT.TIME_PREFIX)
        and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
    }

    # min_periods=1 keeps rows whose window is not yet full instead of NaN.
    tmp_u = tmp_u.groupby(rehash_key).rolling(5, min_periods=1).agg(agg_funcs)
    timer.check("group & rolling & agg")

    tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
    timer.check("reset_index")

    tmp_u.columns = tmp_u.columns.map(
        lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
    timer.check("final concat")

    del tmp_u
    return ret
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    # Concatenating u's (time, key) rows guarantees every key of u appears in
    # the grouped result below, so the inner merge keeps all rows of u.
    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    rehash_key = f'rehash_{key}'
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v
        if col != key
        and not col.startswith(CONSTANT.TIME_PREFIX)
        and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
        and 'mul_feature_' not in col
    }

    # This variant replaces the rolling window with a plain per-key aggregation;
    # rehash_key is left over from the rolling version and is not grouped on.
    tmp_u = tmp_u.groupby(key).agg(agg_funcs)
    timer.check("group & agg")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    # Columns are (column, op) tuples when aggregate_op returns several ops.
    tmp_u.columns = tmp_u.columns.map(lambda a: f"{v_name}.{a}")

    # Merge on the key while preserving u's original row order and index.
    tmpindex = u.index
    u["index"] = list(range(0, len(tmpindex)))
    ret = pd.merge(u, tmp_u, left_on=key, right_index=True)
    ret.sort_values("index", inplace=True)
    ret.index = tmpindex
    ret.drop("index", axis=1, inplace=True)
    u.drop("index", axis=1, inplace=True)  # undo the helper column on u
    timer.check("final merge")

    del tmp_u
    return ret
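# --- Sketch of the order-preserving merge used above (toy data, assumed names) ---
# Record u's row order in a helper column, merge, sort back, then restore the
# original index so downstream code sees u's row order unchanged.
import pandas as pd

u_demo = pd.DataFrame({'key': [2, 1, 2]}, index=['r0', 'r1', 'r2'])
stats = pd.DataFrame({'mean_val': [10.0, 20.0]}, index=[1, 2])  # indexed by key

orig_index = u_demo.index
u_demo['_order'] = range(len(u_demo))
out = u_demo.merge(stats, left_on='key', right_index=True, how='left')
out.sort_values('_order', inplace=True)
out.index = orig_index
out.drop('_order', axis=1, inplace=True)
u_demo.drop('_order', axis=1, inplace=True)
print(out)  # rows r0, r1, r2 in their original order, with mean_val attached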
def temporal_join_jinnian(u, v, v_name, key, time_col, type_):
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    if type_.split("_")[2] == 'many':
        timer = Timer()

        tmp_u = u[[time_col, key]]
        timer.check("select")

        # Cast categorical columns to int so they aggregate cleanly.
        for c in v.columns:
            if c != key and c.startswith(CONSTANT.CATEGORY_PREFIX):
                v[c] = v[c].apply(lambda x: int(x))

        # Concatenating u's (time, key) rows guarantees every key of u appears
        # in the grouped result, so the inner merge below keeps all rows of u.
        tmp_u = pd.concat([tmp_u, v], sort=False)
        timer.check("concat")

        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key
            and not col.startswith(CONSTANT.TIME_PREFIX)
            and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
            and 'mul_feature_' not in col
        }

        tmp_u = tmp_u.fillna(0).groupby(key).agg(agg_funcs)
        timer.check("group & agg")

        tmp_u.columns = tmp_u.columns.map(
            lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

        # Remove duplicated COUNT columns: every COUNT_ROLLING5 column holds
        # the same per-key row count, so keep only the first one.
        c_tmp = None
        count = 0
        for c in tmp_u.columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = tmp_u[c]
                else:
                    tmp_u.drop(c, axis=1, inplace=True)
                    count += 1
        print("There are %d duplicated columns in temporal join!" % count)

        tmp_u.reset_index(0, inplace=True)

        if tmp_u.empty:
            log("empty tmp_u, return u")
            return u

        # Merge on the key while preserving u's original row order and index.
        index_tmp = u.index
        u["index"] = list(range(0, len(index_tmp)))
        ret = pd.merge(u, tmp_u, on=[key])
        ret.sort_values("index", inplace=True)
        ret.index = index_tmp
        ret.drop("index", axis=1, inplace=True)
        u.drop("index", axis=1, inplace=True)  # undo the helper column on u
        timer.check("final merge")

        del tmp_u
        return ret
    else:
        # Multi-cat features are split here; further processing happens in the
        # main function. Each comma-joined string becomes a list column, and
        # the original column keeps only its first category as an int.
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0", inplace=True)
            v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
            v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))

        v = v.set_index(key)
        v.columns = v.columns.map(
            lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
        return u.join(v, on=key)
def join(u, v, v_name, key, type_):
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    if type_.split("_")[2] == 'many':
        # Cast categorical columns to int so they aggregate cleanly.
        for c in v.columns:
            if c != key and c.startswith(CONSTANT.CATEGORY_PREFIX):
                v[c] = v[c].apply(lambda x: int(x))

        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key
            and not col.startswith(CONSTANT.TIME_PREFIX)
            and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
            and 'mul_feature_' not in col
        }

        v = v.fillna(0).groupby(key).agg(agg_funcs)
        # The ROLLING5 tag is inherited from the temporal variant's naming even
        # though no rolling window is applied here.
        v.columns = v.columns.map(
            lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

        # Remove duplicated COUNT columns: every COUNT_ROLLING5 column holds
        # the same per-key row count, so keep only the first one.
        c_tmp = None
        count = 0
        for c in v.columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = v[c]
                else:
                    v.drop(c, axis=1, inplace=True)
                    count += 1
        print("There are %d duplicated columns in join!" % count)

        v.reset_index(0, inplace=True)
        v = v.set_index(key)
    else:
        # Multi-cat features are split here; further processing happens in the
        # main function.
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0", inplace=True)
            v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
            v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))

        v = v.set_index(key)
        v.columns = v.columns.map(
            lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")

    return u.join(v, on=key)
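# --- Sketch of the multi-cat split on toy data ('m_' is an assumed prefix) ---
# The comma-joined string becomes a list column for later processing, and the
# original column is reduced to its first category as an int. Plain column
# assignment is used instead of inplace fillna to avoid the chained-assignment
# pitfall in recent pandas.
import pandas as pd

c = 'm_tags'
v_demo = pd.DataFrame({c: ['3,7,7', '5', None]})
v_demo[c] = v_demo[c].fillna("0")
v_demo["mul_feature_" + c] = v_demo[c].apply(lambda x: str(x).split(","))
v_demo[c] = v_demo["mul_feature_" + c].apply(lambda x: int(x[0]))
print(v_demo)
# m_tags holds 3, 5, 0; mul_feature_m_tags holds ['3', '7', '7'], ['5'], ['0']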