def extract_feature_hour(data, feature, ans):
    """Per-uid mean/std of invite hour, accepted-invite hour, and answer hour."""
    target = data.copy()
    # hour stats over all invites sent to the user
    t = feature.groupby(['uid'])['hour'].agg(['mean', 'std']).reset_index().rename(
        columns={'mean': 'uid_invite_hour_mean', 'std': 'uid_invite_hour_std'})
    target = pd.merge(target, t, how='left', on='uid')
    # hour stats over invites the user accepted (label == 1)
    s_feature = feature[feature['label'] == 1]
    t = s_feature.groupby(['uid'])['hour'].agg(['mean', 'std']).reset_index().rename(
        columns={'mean': 'uid_invite_ans_hour_mean', 'std': 'uid_invite_ans_hour_std'})
    target = pd.merge(target, t, how='left', on='uid')
    # hour stats over the user's answers
    t = ans.groupby(['uid'])['hour'].agg(['mean', 'std']).reset_index().rename(
        columns={'mean': 'uid_ans_hour_mean', 'std': 'uid_ans_hour_std'})
    target = pd.merge(target, t, how='left', on='uid')
    return target
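# A minimal smoke test for extract_feature_hour, assuming the schema used above:
# invites carry (uid, hour, label) and answers carry (uid, hour). The frame
# names below are hypothetical, not from the original pipeline.
import pandas as pd

invites = pd.DataFrame({'uid': ['u1', 'u1', 'u2'],
                        'hour': [9, 21, 14],
                        'label': [1, 0, 1]})
answers = pd.DataFrame({'uid': ['u1'], 'hour': [20]})

out = extract_feature_hour(invites[['uid']].drop_duplicates(), invites, answers)
# out gains uid_invite_hour_mean/std, uid_invite_ans_hour_mean/std and
# uid_ans_hour_mean/std; std is NaN for groups with a single row.
print(out)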
def extract_uid_seq_more(target):
    target = target.reset_index(drop=True)
    target['ii'] = target.index
    # 0.04166 ~= 1/24: fold the hour into a fractional day
    target['i_time'] = target['day'] + 0.04166 * target['hour']
    ds = target.sort_values(by=['uid', 'i_time'], axis=0,
                            ascending=True).reset_index(drop=True)
    ds['slabel'] = ds['m_label'].astype(str)
    # full per-uid label string, e.g. '-0110'
    t = ds.groupby('uid')['slabel'].apply(
        lambda x: '-' + ''.join(x.tolist())).reset_index().rename(
            columns={'slabel': 'uid_m_seq'})
    # 0-based rank of each invite within its uid group
    t1 = ds.groupby('uid')['label'].apply(
        lambda x: [i for i in range(len(x.tolist()))]).reset_index().rename(
            columns={'label': 'uid_rank'})
    t_rank = pd.DataFrame({
        'uid': t1.uid.repeat(t1.uid_rank.str.len()),
        'uid_rank': np.concatenate(t1.uid_rank.values)
    }).reset_index(drop=True)
    ds = pd.merge(ds, t, how='left', on='uid').reset_index(drop=True)
    ds = pd.concat([ds, t_rank], axis=1)
    # build each user's visit-sequence prefix: only labels seen before this invite
    ds['uid_mm_seq'] = ds.apply(
        lambda x: '-' + (x['uid_m_seq'])[:x['uid_rank']], axis=1)
    target = pd.merge(target, ds[['ii', 'uid_mm_seq']], how='left', on='ii')
    return target
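# The t1/t_rank detour above just numbers rows within each uid. A leaner
# equivalent (a sketch, not the author's code) is pandas' built-in cumcount,
# which avoids materialising the list column and the repeat/concatenate step:
#     ds['uid_rank'] = ds.groupby('uid').cumcount()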
def extract_feature_ans_dif(data, feature, ans_feature):
    """Time gap between each invite and the user's most recent answer per topic."""
    target = data.copy()
    print_time('extract feature ans dif')
    que = pd.read_csv('../datasets/question_info.csv', usecols=['qid', 'topic_id'])
    # missing topics become the scalar 0, which never matches a real topic
    # string in the merges below
    ans_feature = pd.merge(ans_feature, que, how='left', on='qid').fillna(0)
    target = pd.merge(target, que, how='left', on='qid').fillna(0)
    # 0.04166 ~= 1/24
    ans_feature['a_time'] = ans_feature['a_day'] + 0.04166 * ans_feature['a_hour']
    target['i_time'] = target['day'] + 0.04166 * target['hour']
    # explode the comma-separated topic list into one row per (answer, topic)
    total_extend = ans_feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(ans_feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)
    t = total_extend.groupby(['uid', 'topic'])['a_time'].agg(['max']) \
        .reset_index().rename(columns={'max': 'uid_topic_ans_recent_time'})
    topic_df = target['topic_id'].str.split(',', expand=True)
    topic_df = topic_df.fillna(0)
    target = pd.concat([target, topic_df], axis=1)
    fea_name = 'uid_topic_ans_recent_time'
    tmp_name = []
    result_list = []
    # at most six topics per question: join the per-topic recency onto each slot
    for field in [0, 1, 2, 3, 4, 5]:
        target = pd.merge(target, t, how='left',
                          left_on=['uid', field],
                          right_on=['uid', 'topic']).rename(
                              columns={fea_name: fea_name + str(field)}).fillna(1000)
        target['s' + str(field)] = target['i_time'] - target[fea_name + str(field)]
        tmp_name.append('s' + str(field))
    target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
    target[fea_name + '_min'] = target[tmp_name].min(axis=1)
    target[fea_name + '_max'] = target[tmp_name].max(axis=1)
    result_list.append(fea_name + '_min')
    result_list.append(fea_name + '_mean')
    result_list.append(fea_name + '_max')
    return target[result_list]
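# The split/stack idiom above turns 'a,b' topic strings into one row per topic.
# A small illustration on toy data (on pandas >= 0.25 the same transform can be
# written with assign + explode):
import pandas as pd

demo = pd.DataFrame({'qid': ['q1', 'q2'], 'topic_id': ['t1,t2', 't3']})
exploded = demo['topic_id'].str.split(',', expand=True).stack() \
    .reset_index(level=0).set_index('level_0') \
    .rename(columns={0: 'topic'}).join(demo.drop('topic_id', axis=1)) \
    .reset_index(drop=True)
print(exploded)  # rows: (t1, q1), (t2, q1), (t3, q2)
# equivalent: demo.assign(topic=demo['topic_id'].str.split(',')).explode('topic')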
def createAuthorsTable(noDuplicates, stars_list, avatars, bios, nFollowers,
                       authors_loves, authors_views, authors_ranking,
                       finalAuthorsName):
    import modin.pandas as pd
    data = {
        'Authors': noDuplicates,
        'Stars': stars_list,
        'Has Avatar': avatars,
        'Has Bio': bios,
        'Followers': nFollowers,
        'Tot loves': authors_loves,
        'Tot views': authors_views,
        'author_ranking': authors_ranking
    }
    dataAuthors = {'Authors': finalAuthorsName}
    dfAuthors = pd.DataFrame(dataAuthors, columns=['Authors'])
    df = pd.DataFrame(data, columns=[
        'Authors', 'Stars', 'Has Avatar', 'Has Bio', 'Followers',
        'Tot loves', 'Tot views', 'author_ranking'
    ])
    # inner join keeps only the authors listed in finalAuthorsName
    mergedAuthors = pd.merge(dfAuthors, df, on='Authors')
    return mergedAuthors
def createTableWithAuthorsPanels(authorsTable, mergedTable):
    import modin.pandas as pd
    tableWithAuthorsPanels = pd.merge(mergedTable, authorsTable,
                                      on='panel_author')
    tableWithAuthorsPanels = tableWithAuthorsPanels.drop_duplicates(
        subset="id_panel")
    return tableWithAuthorsPanels
def mergeTime(idProjects, mergedTable, time):
    import modin.pandas as pd
    print('mergeTime')
    data = {'id_prog': idProjects, 'time': time}
    df = pd.DataFrame(data, columns=['id_prog', 'time'])
    mergedTable = pd.merge(df, mergedTable, on='id_prog')
    return mergedTable
def merge_func(df1):
    # t, fea and fea_name are free variables: this helper is meant to be
    # defined in (or closed over) a scope where they already exist.
    for field in [0, 1, 2, 3, 4, 5, 6, 7, 8]:
        df1 = pd.merge(df1, t, how='left',
                       left_on=[fea, field],
                       right_on=[fea, 'word']).rename(
                           columns={fea_name: fea_name + str(field)})
    return df1
def mergePanelsFeature(tableWithAuthorsPanels):
    import modin.pandas as pd
    projectTable = pd.read_excel("..\\data\\TabellaProgettiPanelJam.xlsx")
    Table = pd.merge(tableWithAuthorsPanels, projectTable,
                     left_on='id_prog', right_on='project')
    Table = Table.drop(columns=['project', 'Remixed', 'Time', 'Project depth'])
    return Table
def extract_usr_unique(data, feature, ans_feature):
    target = data.copy()
    que = pd.read_csv('../datasets/question_info.csv', usecols=['qid', 'topic_id'])
    ans = ans_feature[['uid', 'qid']]
    ans = pd.merge(ans, que, how='left', on='qid')
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    # number of distinct topics the user was invited to
    t = feature.groupby(['uid'])['topic_id'].apply(lambda x: len(
        set(','.join(x.tolist()).split(',')))).reset_index().rename(
            columns={'topic_id': 'sw_uid_invite_topic_unique'})
    target = pd.merge(target, t, how='left', on='uid')
    # number of distinct topics the user answered
    t = ans.groupby(['uid'])['topic_id'].apply(lambda x: len(
        set(','.join(x.tolist()).split(',')))).reset_index().rename(
            columns={'topic_id': 'sw_uid_ans_topic_unique'})
    target = pd.merge(target, t, how='left', on='uid')
    fealist = ['sw_uid_invite_topic_unique', 'sw_uid_ans_topic_unique']
    return target[fealist]
def extract_feature_smilar(data, feature, ans):
    target = data.copy()
    print_time('extract feature smilar')
    # comma-joined list of the questions each user has answered
    uid_qid_list = ans.groupby(['uid'])['qid'] \
        .apply(lambda x: ','.join(x.tolist())).reset_index() \
        .rename(columns={'qid': 'uid_qid_list'})
    target = pd.merge(target, uid_qid_list, how='left', on='uid').fillna(0)
    print_time('start')
    # tmp_func is applied partition-wise across 10 worker processes
    target = multiprocessing_apply_data_frame(tmp_func, target, 10)
    print_time('end')
    return target
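# multiprocessing_apply_data_frame is not defined in this excerpt. A minimal
# sketch of what such a helper usually looks like; the name mirrors the call
# above, the body is an assumption, not the author's code:
import numpy as np
import pandas as pd
from multiprocessing import Pool

def multiprocessing_apply_data_frame(func, df, n_workers):
    """Split df into n_workers chunks, apply func to each chunk in a process
    pool, and concatenate the results back in order. func must be a
    module-level function so it can be pickled."""
    chunks = np.array_split(df, n_workers)
    with Pool(n_workers) as pool:
        parts = pool.map(func, chunks)
    return pd.concat(parts, ignore_index=True)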
def extract_topic_whole_count(data, feature):
    print('extract_feature')
    target = data.copy()
    que = pd.read_csv('../datasets/question_info.csv', usecols=['qid', 'topic_id'])
    target = pd.merge(target, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    print_time('extend')
    # one row per (invite, topic)
    total_extend = feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)
    topic_df = target['topic_id'].str.split(',', expand=True)
    target = pd.concat([target, topic_df], axis=1)
    print_time('extend_finish')
    # (group keys, value column, aggregations)
    stat_feat = [
        (['topic'], ['label'], ['count']),
        (['topic'], ['qid'], ['nunique']),
        (['topic'], ['uid'], ['nunique']),
        (['topic', 'day', 'hour'], ['uid'], ['nunique']),
    ]
    final_list = []
    for stat in stat_feat:
        fea_name = '_'.join(stat[0]) + '_' + '_'.join(stat[1]) + '_' + '_'.join(stat[2])
        print('extract', fea_name)
        t = total_extend.groupby(stat[0])[stat[1][0]].agg(stat[2]).reset_index() \
            .rename(columns={stat[2][0]: fea_name})
        # the '0' placeholder topic carries no signal
        t.loc[t['topic'] == '0', fea_name] = 0
        tmp_name = []
        # join the per-topic stat onto each of the (at most five) topic slots
        for field in [0, 1, 2, 3, 4]:
            lefton = [field if i == 'topic' else i for i in stat[0]]
            target = pd.merge(target, t, how='left',
                              left_on=lefton, right_on=stat[0]).rename(
                                  columns={fea_name: fea_name + str(field)})
            tmp_name.append(fea_name + str(field))
        target[fea_name + '_max'] = target[tmp_name].max(axis=1)
        target[fea_name + '_sum'] = target[tmp_name].sum(axis=1)
        final_list.append(fea_name + '_max')
        final_list.append(fea_name + '_sum')
        for field in [0, 1, 2, 3, 4]:
            target = target.drop([fea_name + str(field)], axis=1)
    return target[final_list], final_list
def clampF(x):
    # collapse all nodes of a junction onto their centroid
    return pd.Series({"x_cross": np.mean(x['x']), 'y_cross': np.mean(x['y'])})

junT1 = junG.groupby("id_jun").apply(clampF).reset_index()
junT = gpd.GeoDataFrame.from_file(baseDir + "gis/motorway/de_junct_unique.shp")
junT.to_file(baseDir + "gis/motorway/de_junct_unique.shp")
if False:
    nodB = gpd.GeoDataFrame.from_file(baseDir + "gis/destatis/junct_bundesland.shp")
    nodc = nodB[['id_jun', 'GEN']].groupby('GEN').agg(len).reset_index()
if False:
    junT = pd.read_csv(baseDir + "gis/motorway/de_junct_unique.csv")
    nodJ = gpd.GeoDataFrame.from_file(baseDir + "gis/motorway/motorway_link_nodes.shp")
    nodJ.columns = ['id_jun', 'y', 'node_id', 'x', 'geometry']
    nodJ = pd.merge(nodJ, junT, left_on=["id_jun"], right_on=["id_jun"], how="left")
    nodJ.to_file(baseDir + "gis/motorway/motorway_link_nodes.shp")
if False:
    motL = motG.geometry.unary_union
if True:
    nodS = nodJ.copy()
else:
    nodS = nodJ.loc[nodJ['id_jun'] == "A 4-53"]
print('--------------------projection-on-the-motorway----------------------')
# project every node onto the motorway geometry and measure the squared offset
neip = nodS.apply(lambda x: motL.interpolate(motL.project(x['geometry'])), axis=1)
nodS.loc[:, "x_mot"] = [x.xy[0][0] for x in neip]
nodS.loc[:, "y_mot"] = [x.xy[1][0] for x in neip]
nodS.loc[:, "m_dist"] = nodS.apply(
    lambda x: (x['x'] - x['x_mot'])**2 + (x['y'] - x['y_mot'])**2, axis=1)

def chirality(x1, y1, x2, y2, xo, yo):
    vp = [x1 - xo, y1 - yo]
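# chirality is cut off above. A plausible completion, assuming it returns the
# sign of the 2D cross product, i.e. whether point 2 lies counter-clockwise or
# clockwise of point 1 around the reference point o (name and body are guesses):
def chirality_sketch(x1, y1, x2, y2, xo, yo):
    vp = [x1 - xo, y1 - yo]
    vm = [x2 - xo, y2 - yo]
    cross = vp[0] * vm[1] - vp[1] * vm[0]
    return 1 if cross > 0 else (-1 if cross < 0 else 0)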
def extruct_cross_feature_topic(data, feature):
    target = data.copy()
    print_time('extract cross feature topic')
    que = pd.read_csv('../datasets/question_info.csv', usecols=['qid', 'topic_id'])
    member_info = pd.read_csv(
        '../datasets/member_info.csv',
        usecols=['uid', 'sex', 'visit', 'CA', 'CB', 'CC', 'CD', 'CE'])
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, member_info, how='left', on='uid').fillna('0')
    target = pd.merge(target, que, how='left', on='qid').fillna('0')
    target = pd.merge(target, member_info, how='left', on='uid').fillna('0')
    target['flag'] = 1
    feature['flag'] = 1
    total_extend = feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)
    topic_df = target['topic_id'].str.split(',', expand=True)
    target = pd.concat([target, topic_df], axis=1)
    fea_list = ['flag', 'uid', 'sex', 'visit', 'CA', 'CB', 'CC', 'CD', 'CE']
    result_list = []
    for fea in fea_list:
        fea_name = 'topic_' + fea + '_rate'
        print(fea_name)
        t = total_extend.groupby(['topic', fea])['label'].agg(['count', 'sum']) \
            .reset_index().rename(columns={'count': 'count_s', 'sum': 'sum_s'})
        # Bayesian-smoothed acceptance rate; Beta prior fitted by moment estimation
        HP = HyperParam(1, 1)
        HP.update_from_data_by_moment(t['count_s'].values, t['sum_s'].values)
        t[fea_name] = np.divide(t['sum_s'] + HP.alpha,
                                t['count_s'] + HP.alpha + HP.beta)
        t = t.drop(['count_s', 'sum_s'], axis=1)
        tmp_name = []
        for field in [0, 1, 2, 3, 4, 5]:
            target = pd.merge(target, t, how='left',
                              left_on=[fea, field],
                              right_on=[fea, 'topic']).rename(
                                  columns={fea_name: fea_name + str(field)})
            tmp_name.append(fea_name + str(field))
        target[fea_name + '_max'] = target[tmp_name].max(axis=1)
        target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
        result_list.append(fea_name + '_max')
        result_list.append(fea_name + '_mean')
        for field in [0, 1, 2, 3, 4, 5]:
            target = target.drop([fea_name + str(field)], axis=1)
    return target[result_list]
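# HyperParam is referenced throughout but not defined in this excerpt. A
# minimal sketch of the usual CTR-smoothing helper (Beta prior fitted by the
# method of moments); the class and method names mirror the calls above, the
# body is an assumption, not the original implementation:
import numpy as np

class HyperParam:
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update_from_data_by_moment(self, tries, success):
        """Fit Beta(alpha, beta) to the per-group rates success/tries by
        matching the sample mean and variance of those rates."""
        rates = success / (tries + 1e-12)
        mean, var = rates.mean(), rates.var()
        if var < 1e-12:
            return  # degenerate sample: keep the current prior
        # method-of-moments estimates for a Beta distribution
        common = mean * (1 - mean) / var - 1
        self.alpha = mean * common
        self.beta = (1 - mean) * common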
def extruct_feature(data, feature, ans_feature):
    target = data.copy()
    print_time('extract feature ')
    # per-uid label statistics
    t = feature.groupby('uid')['label'].agg(['count', 'sum', 'mean', 'std']).reset_index() \
        .rename(columns={'count': 'sw_ulc', 'sum': 'sw_uls',
                         'mean': 'sw_ulm', 'std': 'sw_uld'})
    print_time('smoothing')
    # Bayesian smoothing of the per-uid acceptance rate (moment estimation)
    HP = HyperParam(1, 1)
    HP.update_from_data_by_moment(t['sw_ulc'].values, t['sw_uls'].values)
    t['sw_uid_rate_hp'] = np.divide(t['sw_uls'] + HP.alpha,
                                    t['sw_ulc'] + HP.alpha + HP.beta)
    print('smoothing', HP.alpha, HP.beta)
    target = pd.merge(target, t, how='left', on='uid')
    # per-qid label statistics
    t = feature.groupby('qid')['label'].agg(['count', 'sum', 'mean', 'std']).reset_index() \
        .rename(columns={'count': 'sw_qlc', 'sum': 'sw_qls',
                         'mean': 'sw_qlm', 'std': 'sw_qld'})
    HP = HyperParam(1, 1)
    HP.update_from_data_by_moment(t['sw_qlc'].values, t['sw_qls'].values)
    t['sw_qid_rate_hp'] = np.divide(t['sw_qls'] + HP.alpha,
                                    t['sw_qlc'] + HP.alpha + HP.beta)
    print('smoothing', HP.alpha, HP.beta)
    target = pd.merge(target, t, how='left', on='qid')
    # answer-info statistics
    gu = ans_feature.groupby('uid')
    t = gu['qid'].agg(['count']).reset_index().rename(
        columns={'count': 'sw_u_ans_q_num'})
    target = pd.merge(target, t, how='left', on='uid')
    for feat in ['bit7', 'bit8', 'bit10', 'bit11', 'bit12', 'bit13',
                 'bit15', 'bit16', 'bit17']:
        t = gu[feat].agg(['sum', 'mean']).reset_index().rename(
            columns={'sum': 'sw_uc_' + feat, 'mean': 'sw_um_' + feat})
        target = pd.merge(target, t, how='left', on='uid')
    # label sequences per uid / qid, ordered by invite time
    feature['i_time'] = feature['day'] * 24 + feature['hour']
    feature_sorted = feature.sort_values(by=['i_time'], axis=0,
                                         ascending=True).reset_index(drop=True)
    feature_sorted['slabel'] = feature_sorted['label'].astype(str)
    t = feature_sorted.groupby('uid')['slabel'].apply(
        lambda x: '-' + ''.join(x.tolist())).reset_index().rename(
            columns={'slabel': 'sw_uid_seq'})
    t['sw_uid_seq_5'] = t['sw_uid_seq'].apply(lambda x: x[-5:])
    # distance (in invites) since the user's most recent acceptance
    t['sw_uid_recent_uclick'] = t['sw_uid_seq'].apply(
        lambda x: len(x) - 1 - x.rfind('1') if x.rfind('1') != -1 else len(x))
    target = pd.merge(target, t, how='left', on='uid')
    t = feature_sorted.groupby('qid')['slabel'].apply(
        lambda x: '-' + ''.join(x.tolist())).reset_index().rename(
            columns={'slabel': 'sw_qid_seq'})
    t['sw_qid_seq_5'] = t['sw_qid_seq'].apply(lambda x: x[-5:])
    t['sw_qid_recent_uclick'] = t['sw_qid_seq'].apply(
        lambda x: len(x) - 1 - x.rfind('1') if x.rfind('1') != -1 else len(x))
    target = pd.merge(target, t, how='left', on='qid')
    # answer delay: answer time minus question creation time (0.04166 ~= 1/24)
    ans_feature['ans_time'] = ans_feature['a_day'] + 0.04166 * ans_feature['a_hour']
    qustion_info = pd.read_csv('../datasets2/question_info.csv',
                               usecols=['qid', 'q_day', 'q_hour'])
    qustion_info['qus_time'] = qustion_info['q_day'] + 0.04166 * qustion_info['q_hour']
    ans_feature = pd.merge(ans_feature, qustion_info, how='left', on='qid')
    print(ans_feature['qus_time'].isnull().sum(), ans_feature.shape)
    ans_feature['qus_time'] = ans_feature['qus_time'].fillna(3000)
    ans_feature['ans_dif_time'] = ans_feature['ans_time'] - ans_feature['qus_time']
    gu = ans_feature.groupby('uid')['ans_dif_time'].agg(['mean', 'max', 'min', 'std']) \
        .reset_index().rename(columns={'mean': 'at_mean', 'max': 'at_max',
                                       'min': 'at_min', 'std': 'at_std'})
    target = pd.merge(target, gu, how='left', on='uid')
    # answering habits by weekday and 6-hour bucket
    ans_feature['week'] = ans_feature['a_day'] % 7
    ans_feature['new_hour'] = ans_feature['a_hour'].apply(lambda x: int(x / 6))
    target['week'] = target['day'] % 7
    target['new_hour'] = target['hour'].apply(lambda x: int(x / 6))
    t = ans_feature.groupby(['uid'])['qid'].agg(['count']).reset_index() \
        .rename(columns={'count': 'u_t_count'})
    t1 = ans_feature.groupby(['uid', 'week'])['qid'].agg(['count']).reset_index() \
        .rename(columns={'count': 'uid_week_count'})
    target = pd.merge(target, t, how='left', on='uid').fillna(0)
    target = pd.merge(target, t1, how='left', on=['uid', 'week']).fillna(0)
    print(target.columns.tolist())
    # share of the user's answers that fall on this weekday
    target['uid_week_ans_rate'] = np.divide(target['uid_week_count'],
                                            target['u_t_count'] + 0.001)
    t1 = ans_feature.groupby(['uid', 'new_hour'])['qid'].agg(['count']).reset_index() \
        .rename(columns={'count': 'uid_nhour_count'})
    target = pd.merge(target, t1, how='left', on=['uid', 'new_hour'])
    # share of the user's answers that fall in this 6-hour bucket
    target['uid_nhour_ans_rate'] = np.divide(target['uid_nhour_count'],
                                             target['u_t_count'] + 0.001)
    return target
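# Quick check of the recent-click distance used above, on hypothetical toy
# sequences (the leading '-' is the sequence sentinel):
seq = '-01001'
assert len(seq) - 1 - seq.rfind('1') == 0  # last invite was accepted
seq = '-0100'
assert len(seq) - 1 - seq.rfind('1') == 2  # acceptance two invites ago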
def extract_id_whole_count(data, feature):
    target = data.copy()
    fea_list = []
    # ---- uid-side counts ----
    feaname = 'uid_min_day'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid'])['day'].agg(['min']).reset_index() \
        .rename(columns={'min': feaname})
    target = pd.merge(target, t, how='left', on='uid')

    feaname = 'uid_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on='uid')

    feaname = 'uid_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid', 'day'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['uid', 'day'])

    feaname = 'uid_day_hour_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid', 'day', 'hour'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['uid', 'day', 'hour'])

    # invite count on the user's first active day: renaming 'day' to
    # 'uid_min_day' aligns the join key with the feature computed above
    feaname = 'uid_min_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid', 'day'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname, 'day': 'uid_min_day'})
    target = pd.merge(target, t, how='left', on=['uid', 'uid_min_day'])

    feaname = 'uid_day_nuinque'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid'])['day'].agg(['nunique']).reset_index() \
        .rename(columns={'nunique': feaname})
    target = pd.merge(target, t, how='left', on=['uid'])

    feaname = 'qid_day_nuinque'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid'])['day'].agg(['nunique']).reset_index() \
        .rename(columns={'nunique': feaname})
    target = pd.merge(target, t, how='left', on=['qid'])
    print_time(target.columns.tolist())

    # ---- qid-side counts, mirroring the uid features ----
    feaname = 'qid_min_day'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid'])['day'].agg(['min']).reset_index() \
        .rename(columns={'min': feaname})
    target = pd.merge(target, t, how='left', on='qid')
    print_time(target.columns.tolist())

    feaname = 'qid_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on='qid')

    feaname = 'qid_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid', 'day'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['qid', 'day'])

    feaname = 'qid_day_hour_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid', 'day', 'hour'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['qid', 'day', 'hour'])

    feaname = 'qid_min_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid', 'day'])['label'].agg(['count']).reset_index() \
        .rename(columns={'count': feaname, 'day': 'qid_min_day'})
    target = pd.merge(target, t, how='left', on=['qid', 'qid_min_day'])
    return target[fea_list], fea_list
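# Typical call pattern (hypothetical frame names): the function returns both the
# row-aligned feature block and its column names, ready to concatenate onto the
# design matrix:
#     feats, names = extract_id_whole_count(train_invites, all_invites)
#     train = pd.concat([train, feats], axis=1)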
def test_merge():
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }
    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["outer", "inner"]
    for how in join_types:
        # Defaults; only outer merges are expected to fall back to pandas
        with warns_that_defaulting_to_pandas() if how == "outer" else contextlib.nullcontext():
            modin_result = pd.merge(modin_df, modin_df2, how=how)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how)
        df_equals(modin_result, pandas_result)

        # left_on and right_index
        with warns_that_defaulting_to_pandas():
            modin_result = pd.merge(modin_df, modin_df2, how=how,
                                    left_on="col1", right_index=True)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how,
                                     left_on="col1", right_index=True)
        df_equals(modin_result, pandas_result)

        # left_index and right_on
        with warns_that_defaulting_to_pandas():
            modin_result = pd.merge(modin_df, modin_df2, how=how,
                                    left_index=True, right_on="col1")
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how,
                                     left_index=True, right_on="col1")
        df_equals(modin_result, pandas_result)

        # left_on and right_on col1
        if how == "outer":
            warning_catcher = warns_that_defaulting_to_pandas()
        else:
            warning_catcher = contextlib.nullcontext()
        with warning_catcher:
            modin_result = pd.merge(modin_df, modin_df2, how=how,
                                    left_on="col1", right_on="col1")
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how,
                                     left_on="col1", right_on="col1")
        df_equals(modin_result, pandas_result)

        # left_on and right_on col2
        if how == "outer":
            warning_catcher = warns_that_defaulting_to_pandas()
        else:
            warning_catcher = contextlib.nullcontext()
        with warning_catcher:
            modin_result = pd.merge(modin_df, modin_df2, how=how,
                                    left_on="col2", right_on="col2")
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how,
                                     left_on="col2", right_on="col2")
        df_equals(modin_result, pandas_result)

        # left_index and right_index
        modin_result = pd.merge(modin_df, modin_df2, how=how,
                                left_index=True, right_index=True)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how,
                                     left_index=True, right_index=True)
        df_equals(modin_result, pandas_result)

    # merging a Series or a non-frame object should raise
    s = pd.Series(frame_data.get("col1"))
    with pytest.raises(ValueError):
        pd.merge(s, modin_df2)
    with pytest.raises(TypeError):
        pd.merge("Non-valid type", modin_df2)
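# warns_that_defaulting_to_pandas comes from Modin's test utilities. A rough
# sketch of what it checks (the real helper lives in Modin's test utils; this
# body is an approximation, not the actual implementation):
import pytest

def warns_that_defaulting_to_pandas():
    """Context manager asserting that a UserWarning about defaulting to
    pandas is emitted inside the block."""
    return pytest.warns(UserWarning, match="defaulting to pandas")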
def clean_row(df_row):  # hypothetical name: the original def line is missing
    # `field` is expected from the enclosing scope
    if not isVaildDate(str(df_row[field])):
        df_row[field] = np.nan
    return df_row

purge_pat_files('../data', r'^[^_]+_log.csv$')
logs1 = pd.read_csv('../data/log_reduced.csv', encoding='utf-8', chunksize=c_sz)
for df, _ in zip(logs1, trange(1000)):
    df = pd.merge(df, ad_static, left_on='曝光广告id', right_on='广告id', how='inner')
    # 3. drop rows with invalid times
    df['广告请求时间'] = pd.to_datetime(df['广告请求时间'], unit='s')  # epoch seconds -> datetime
    df['广告请求时间_date'] = df['广告请求时间'].apply(lambda x: x.date())
    df = df[col_names1]
    # 1. drop rows with any nulls
    df.dropna(axis=0, how='any', inplace=True)
    # split and save the data row by row
    _ = df.apply(save_csv, axis=1)
print('done')
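# isVaildDate and purge_pat_files are not defined in this excerpt. Plausible
# minimal versions matching how they are called above (assumptions, including
# the timestamp format, not the original implementations):
import os
import re
from datetime import datetime

def isVaildDate(s, fmt='%Y-%m-%d %H:%M:%S'):
    """True if s parses as a timestamp in the expected format."""
    try:
        datetime.strptime(s, fmt)
        return True
    except ValueError:
        return False

def purge_pat_files(directory, pattern):
    """Delete every file in directory whose name matches the regex pattern."""
    for name in os.listdir(directory):
        if re.match(pattern, name):
            os.remove(os.path.join(directory, name))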
def extract_topic_score(data, ttt, feature):
    print('extract_topic count feature')
    target = data.copy()
    feature['label'] = 1
    que = pd.read_csv('../datasets2/question_info.csv', usecols=['qid', 'topic_id'])
    m_list = ['uid', 'SCORE']
    meb = pd.read_csv('../datasets2/member_info.csv', usecols=m_list)
    target = pd.merge(target, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    target = pd.merge(target, meb, how='left', on='uid').fillna('0')
    feature = pd.merge(feature, meb, how='left', on='uid').fillna('0')
    total_extend = feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)
    topic_df = target['topic_id'].str.split(',', expand=True)
    target = pd.concat([target, topic_df], axis=1)
    final_list = []
    # total volume per topic, used as the denominator below
    t1 = total_extend.groupby('topic')['label'].agg(['count']).reset_index() \
        .rename(columns={'count': 'topic_count'})
    # member attributes to cross with topics ('uid' above is only a join key)
    fealist = ['SCORE']
    for stat in fealist:
        fea_name = stat + '_ans_topic_count_ratio'
        print('extract', fea_name)
        # cross counts of topic x member attribute
        t = total_extend.groupby(['topic', stat])['label'].agg(['count']) \
            .reset_index().rename(columns={'count': 'sum_count'})
        t.loc[t['topic'] == '0', 'sum_count'] = 0
        t = pd.merge(t, t1, how='left', on='topic')
        # smoothed ratio (Beta prior fitted by moment estimation)
        HP = HyperParam(1, 1)
        HP.update_from_data_by_moment(t['topic_count'].values, t['sum_count'].values)
        t[fea_name] = np.divide(t['sum_count'] + HP.alpha,
                                t['topic_count'] + HP.alpha + HP.beta)
        t = t.drop(['topic_count', 'sum_count'], axis=1)
        stat = ['topic', stat]
        tmp_name = []
        for field in [0, 1, 2, 3, 4]:
            lefton = [field if i == 'topic' else i for i in stat]
            target = pd.merge(target, t, how='left',
                              left_on=lefton, right_on=stat).rename(
                                  columns={fea_name: fea_name + str(field)})
            tmp_name.append(fea_name + str(field))
        target[fea_name + '_max'] = target[tmp_name].max(axis=1)
        target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
        final_list.append(fea_name + '_max')
        final_list.append(fea_name + '_mean')
        for field in [0, 1, 2, 3, 4]:
            target = target.drop([fea_name + str(field)], axis=1)
    return target[final_list]
def test_merge():
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }
    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["outer", "inner"]
    for how in join_types:
        # Defaults
        modin_result = pd.merge(modin_df, modin_df2, how=how)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how)
        df_equals(modin_result, pandas_result)

        # left_on and right_index
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_on="col1", right_index=True
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_on="col1", right_index=True
        )
        df_equals(modin_result, pandas_result)

        # left_index and right_on
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_index=True, right_on="col1"
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_index=True, right_on="col1"
        )
        df_equals(modin_result, pandas_result)

        # left_on and right_on col1
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_on="col1", right_on="col1"
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_on="col1", right_on="col1"
        )
        df_equals(modin_result, pandas_result)

        # left_on and right_on col2
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_on="col2", right_on="col2"
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_on="col2", right_on="col2"
        )
        df_equals(modin_result, pandas_result)

        # left_index and right_index
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_index=True, right_index=True
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_index=True, right_index=True
        )
        df_equals(modin_result, pandas_result)

    # merging a Series or a non-frame object should raise
    s = pd.Series(frame_data.get("col1"))
    with pytest.raises(ValueError):
        pd.merge(s, modin_df2)
    with pytest.raises(TypeError):
        pd.merge("Non-valid type", modin_df2)
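# df_equals is Modin's test helper for comparing a Modin result with a pandas
# result. A rough stand-in (an assumption, not the real utility, which handles
# more object types):
import pandas

def df_equals(modin_obj, pandas_obj):
    pandas.testing.assert_frame_equal(modin_obj._to_pandas(), pandas_obj)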
from functools import reduce  # used to fold the per-chromosome merges below

df_list = []
# Go through each chromosome and create its own dataframe, with properly labeled columns
for chrom in range(1, 23):
    # Filter for the current chromosome's dataframe
    tmp_df = abs_anttd_seg_df[abs_anttd_seg_df.index.get_level_values('Chromosome') == chrom]
    # Change the column names to identify the chromosome
    tmp_df.columns = [f'{col}_chromosome_{chrom}' for col in tmp_df.columns]
    # Remove the now redundant `Chromosome` index level
    tmp_df = tmp_df.reset_index().drop(columns='Chromosome')
    # Add to the dataframes list
    df_list.append(tmp_df)
df_list[3]
# Join all chromosomes back into one wide frame, one row per sample
abs_anttd_seg_df = reduce(lambda x, y: pd.merge(x, y, on='Sample'), df_list)
abs_anttd_seg_df.head()
abs_anttd_seg_df.Sample.nunique()
len(abs_anttd_seg_df)
# Remove duplicate columns (redundant features that are independent of the chromosome):
unique_features = set(
    [col.split('_chromosome')[0] for col in abs_anttd_seg_df.columns])
unique_features
[col for col in abs_anttd_seg_df.columns if 'Cancer_cell_frac_a1' in col]
# Save the feature names that are redundant (i.e. no difference between chromosomes)
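# A sketch of the step the last comment announces (assumed logic, not the
# original code): a feature is chromosome-independent if, for every sample,
# all of its per-chromosome copies hold the same value.
redundant = []
for feat in unique_features:
    cols = [c for c in abs_anttd_seg_df.columns
            if c.startswith(f'{feat}_chromosome')]
    # nunique(axis=1) <= 1 on every row means the copies never differ
    if len(cols) > 1 and abs_anttd_seg_df[cols].nunique(axis=1).le(1).all():
        redundant.append(feat)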