def _ans_topic_feat(train, feature, mode, ans):
    # Convert (day, hour) into a single fractional-day timestamp (0.04166 ~= 1/24).
    feature['i_time'] = feature['day'] + 0.04166 * feature['hour']
    train['i_time'] = train['day'] + 0.04166 * train['hour']
    feature = pd.merge(feature, qus_info, how='left', on='qid').fillna('0')
    train = pd.merge(train, qus_info, how='left', on='qid').fillna('0')
    if mode == 'test_B':
        s = pd.read_csv('../datasets/invite_test.csv')
        s_score = pd.read_csv('../datasets/feature/test/p_score')
        s['i_time'] = s['day'] + 0.04166 * s['hour']
        s = pd.concat([s, s_score], axis=1)
        s = s[s['p_score'] >= 0.42]
        s = pd.merge(s, qus_info, how='left', on='qid').fillna('0')
    train_score = pd.read_csv('../datasets/feature/' + mode + '/p_score')
    print(train_score.shape)
    # Answer history = true positives from the labelled window plus
    # high-confidence pseudo-labels (p_score >= 0.42) from the current window.
    s1 = feature[feature['label'] == 1]
    train = pd.concat([train, train_score], axis=1)
    s2 = train[train['p_score'] >= 0.42]
    s1 = s1[['uid', 'qid', 'i_time']]
    s2 = s2[['uid', 'qid', 'i_time']]
    if mode == 'test_B':
        s = s[['uid', 'qid', 'i_time']]
        s = pd.concat([s1, s2, s], axis=0)
    else:
        s = pd.concat([s1, s2], axis=0)

    # Get each user's list of answers from the previous 7 days, sorted by time.
    def get_time_list(df):
        qid_list = df['qid'].tolist()
        time_list = df['i_time'].tolist()
        tmp = []
        for i in range(len(qid_list)):
            tmp.append((qid_list[i], time_list[i]))
        return sorted(tmp, key=lambda x: x[1])

    t = s.groupby(['uid']).apply(get_time_list).reset_index().rename(columns={0: 'answer_time_list'})
    train = pd.merge(train, t, on='uid', how='left')
    train['answer_time_list'] = train['answer_time_list'].fillna(0)
    train = multiprocessing_apply_data_frame(tmp_func3, train, 10)
    # Difference between the invite time and the topic-level last-answer-time
    # feature produced by tmp_func3.
    train['merge_ans_dif_topic_1'] = train['i_time'] - train['merge_last_ans_time_topic_1b']
    train = train[['merge_ans_dif_topic_1']]
    return train
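# The helpers `multiprocessing_apply_data_frame`, `tmp_func*`, and `qus_info` are
# defined elsewhere in the repo. As a reading aid only, `_parallel_apply_sketch`
# below is a hypothetical sketch (not the project's actual implementation) of what
# a chunked parallel apply like `multiprocessing_apply_data_frame(func, df, workers)`
# is assumed to do: split the frame into chunks, apply `func` to each chunk in a
# process pool, and concatenate the transformed chunks.
def _parallel_apply_sketch(func, df, workers):
    import numpy as np
    from multiprocessing import Pool
    chunks = np.array_split(df, workers)   # split rows into roughly equal chunks
    with Pool(workers) as pool:
        parts = pool.map(func, chunks)     # each worker transforms one chunk
    return pd.concat(parts, axis=0)        # reassemble the transformed chunks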
def ans_time_feat(train, feature, mode, ans):
    # Convert (day, hour) into a single fractional-day timestamp (0.04166 ~= 1/24).
    feature['i_time'] = feature['day'] + 0.04166 * feature['hour']
    train['i_time'] = train['day'] + 0.04166 * train['hour']
    if mode == 'test_B':
        s = pd.read_csv('../datasets1/invite_test.csv')
        s_score = pd.read_csv('../datasets3/feature/test/p_score')
        s['i_time'] = s['day'] + 0.04166 * s['hour']
        s = pd.concat([s, s_score], axis=1)
        s = s[s['p_score'] >= 0.42]
    train_score = pd.read_csv('../datasets3/feature/' + mode + '/p_score')
    print(train_score.shape)
    # Answer history = true positives from the labelled window plus
    # high-confidence pseudo-labels (p_score >= 0.42) from the current window.
    s1 = feature[feature['label'] == 1]
    train = pd.concat([train, train_score], axis=1)
    s2 = train[train['p_score'] >= 0.42]
    s1 = s1[['uid', 'i_time']]
    s2 = s2[['uid', 'i_time']]
    if mode == 'test_B':
        s = s[['uid', 'i_time']]
        s = pd.concat([s1, s2, s], axis=0)
    else:
        s = pd.concat([s1, s2], axis=0)

    # Get each user's list of answer times from the previous 7 days, sorted ascending.
    def get_time_list(df):
        return sorted(df['i_time'].tolist())

    t = s.groupby(['uid']).apply(get_time_list).reset_index().rename(columns={0: 'answer_time_list'})
    train = pd.merge(train, t, on='uid', how='left')
    train['answer_time_list'] = train['answer_time_list'].fillna(0)
    train = multiprocessing_apply_data_frame(tmp_func1, train, 10)
    train = multiprocessing_apply_data_frame(tmp_func7, train, 10)
    # Differences between the invite time and the last-answer-time features
    # produced by tmp_func1 / tmp_func7.
    train['merge_ans_dif_1'] = train['i_time'] - train['merge_last_ans_time_1b']
    train['merge_ans_dif_7'] = train['i_time'] - train['merge_last_ans_time_7b']
    train = train[['merge_ans_dif_1', 'merge_ans_dif_7']]
    return train
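# `tmp_func1` / `tmp_func7` are defined elsewhere in the repo. The hypothetical
# `_last_ans_before_sketch` below is an assumption, not the repo's code: it
# illustrates the kind of row-wise transform the column names suggest, namely
# taking the most recent answer time that is at least `gap` days before the
# current invite time and writing it to `out_col`.
def _last_ans_before_sketch(df, gap, out_col):
    def last_before(row):
        times = row['answer_time_list']
        if not isinstance(times, list):        # users with no history were filled with 0
            return -1
        cand = [t for t in times if t <= row['i_time'] - gap]
        return max(cand) if cand else -1
    df[out_col] = df.apply(last_before, axis=1)
    return df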
def uid_seq_mlabel(train, feature, mode, ans):
    # Convert (day, hour) into a single fractional-day timestamp (0.04166 ~= 1/24).
    feature['i_time'] = feature['day'] + 0.04166 * feature['hour']
    train['i_time'] = train['day'] + 0.04166 * train['hour']
    if mode == 'test_B':
        s = pd.read_csv('../datasets/invite_test.csv')
        s_score = pd.read_csv('../datasets/feature/test/p_score')
        s['i_time'] = s['day'] + 0.04166 * s['hour']
        s = pd.concat([s, s_score], axis=1)
        s['m_label'] = s['p_score'].apply(lambda x: 1 if x >= 0.42 else 0)
    train_score = pd.read_csv('../datasets/feature/' + mode + '/p_score')
    print(train_score.shape)
    # m_label: real labels in the labelled window, thresholded p_score elsewhere.
    feature['m_label'] = feature['label']
    train = pd.concat([train, train_score], axis=1)
    train['m_label'] = train['p_score'].apply(lambda x: 1 if x >= 0.42 else 0)
    train = train[['uid', 'qid', 'i_time', 'm_label']]
    feature = feature[['uid', 'qid', 'i_time', 'm_label']]
    if mode == 'test_B':
        s = s[['uid', 'qid', 'i_time', 'm_label']]
        s = pd.concat([train, feature, s], axis=0)
    else:
        s = pd.concat([train, feature], axis=0)

    # Get each user's (m_label, time) list from the previous 7 days, sorted by time.
    def get_time_list(df):
        label_list = df['m_label'].tolist()
        time_list = df['i_time'].tolist()
        tmp = []
        for i in range(len(label_list)):
            tmp.append((label_list[i], time_list[i]))
        return sorted(tmp, key=lambda x: x[1])

    t = s.groupby(['uid']).apply(get_time_list).reset_index().rename(columns={0: 'answer_time_list'})
    train = pd.merge(train, t, on='uid', how='left')
    train['answer_time_list'] = train['answer_time_list'].fillna(0)
    train = multiprocessing_apply_data_frame(tmp_func4, train, 10)
    train = train[['merge_recent_7_day_click_rate']]
    return train
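# `tmp_func4` is defined elsewhere in the repo. The hypothetical `_recent_rate_sketch`
# below is an assumption, not the repo's code: it shows the statistic suggested by
# the output column name, i.e. the mean m_label over the user's records in the
# `window` days preceding the current invite time.
def _recent_rate_sketch(df, window=7):
    def recent_rate(row):
        pairs = row['answer_time_list']
        if not isinstance(pairs, list):        # users with no history were filled with 0
            return -1
        labels = [lab for lab, t in pairs if row['i_time'] - window <= t < row['i_time']]
        return sum(labels) / len(labels) if labels else -1
    df['merge_recent_7_day_click_rate'] = df.apply(recent_rate, axis=1)
    return df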
def extract_feature_smilar(data, feature, ans):
    target = data.copy()
    print_time('extract feature smilar')
    # Each user's comma-separated list of previously answered question ids.
    uid_qid_list = ans.groupby(['uid'])['qid'] \
        .apply(lambda x: ','.join(x.tolist())).reset_index().rename(columns={'qid': 'uid_qid_list'})
    target = pd.merge(target, uid_qid_list, how='left', on='uid').fillna(0)
    print_time('start')
    target = multiprocessing_apply_data_frame(tmp_func, target, 10)
    print_time('end')
    return target
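# A hedged usage sketch. The CSV paths and the `day`/`hour`/`label` columns used
# here are assumptions for illustration (only `invite_test.csv` appears above);
# adjust them to the rest of the pipeline. Each extractor returns a feature frame
# aligned row-by-row with `train`, so the outputs can be concatenated column-wise.
if __name__ == '__main__':
    train = pd.read_csv('../datasets/invite_test.csv')       # frame to build features for
    feature = pd.read_csv('../datasets/invite_train.csv')    # labelled history window (assumed path)
    ans = pd.read_csv('../datasets/answer_info.csv')         # raw answer records (assumed path)
    feats = pd.concat([
        ans_time_feat(train.copy(), feature.copy(), 'test_B', ans),
        uid_seq_mlabel(train.copy(), feature.copy(), 'test_B', ans),
    ], axis=1)
    print(feats.head())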