def main():
    feature_name = 'other_features1'

    train_features = pd.read_csv('other_feature1/feature_train.csv')
    test_features = pd.read_csv('other_feature1/feature_test.csv')

    feature_score = pd.read_csv('other_feature1/action_process_features_importances.csv')
    feature_score = feature_score.sort_values(by='importance', ascending=False).reset_index(drop=True)
    used_features = feature_score['feature'].values.tolist()[:80]
    if 'userid' not in used_features:
        used_features.append('userid')

    train = train_features[used_features]
    test = test_features[used_features]

    train_features = pd.read_csv('other_feature2/type_typevalue_train.csv')
    test_features = pd.read_csv('other_feature2/type_typevalue_test.csv')
    train = pd.merge(train, train_features, on='userid', how='left')
    test = pd.merge(test, test_features, on='userid', how='left')

    # train_features = pd.read_csv('other_feature2/type_type_train.csv')
    # test_features = pd.read_csv('other_feature2/type_type_test.csv')
    # train = pd.merge(train, train_features, on='userid', how='left')
    # test = pd.merge(test, test_features, on='userid', how='left')

    # train_features = pd.read_csv('other_feature2/actiontype_cloest_train.csv')
    # test_features = pd.read_csv('other_feature2/actiontype_cloest_test.csv')
    # train = pd.merge(train, train_features, on='userid', how='left')
    # test = pd.merge(test, test_features, on='userid', how='left')

    data_utils.save_features(train, test, feature_name)
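# Every main() in this module relies on a shared `data_utils` helper that is not shown
# here. The following is only a hedged sketch of what it is assumed to provide, so the
# scripts read end to end; the CSV storage format and FEATURES_PATH are assumptions,
# not the project's actual implementation.
import os

FEATURES_PATH = 'features/'  # assumed output directory


def is_feature_created(features_name):
    """Return True if both the train and test files for this feature group already exist."""
    train_path = os.path.join(FEATURES_PATH, 'train_{}.csv'.format(features_name))
    test_path = os.path.join(FEATURES_PATH, 'test_{}.csv'.format(features_name))
    return os.path.exists(train_path) and os.path.exists(test_path)


def save_features(train_features, test_features, features_name):
    """Persist one feature group as a paired train/test CSV."""
    train_features.to_csv(os.path.join(FEATURES_PATH, 'train_{}.csv'.format(features_name)), index=False)
    test_features.to_csv(os.path.join(FEATURES_PATH, 'test_{}.csv'.format(features_name)), index=False)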
def main():
    feature_name = 'basic_user_action_features'
    if data_utils.is_feature_created(feature_name):
        return

    train_action = pd.read_csv(Configure.base_path + 'train/action_train.csv')
    test_action = pd.read_csv(Configure.base_path + 'test/action_test.csv')

    train_action = build_time_features(train_action)
    test_action = build_time_features(test_action)

    print('save cleaned datasets')
    train_action.to_csv(Configure.cleaned_path + 'cleaned_action_train.csv',
                        index=False, columns=train_action.columns)
    test_action.to_csv(Configure.cleaned_path + 'cleaned_action_test.csv',
                       index=False, columns=test_action.columns)

    train_action_features = basic_action_info(train_action)
    test_action_features = basic_action_info(test_action)

    print('save ', feature_name)
    data_utils.save_features(train_action_features, test_action_features, features_name=feature_name)
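# build_time_features() is defined elsewhere in the repo. Purely as an illustration of
# the kind of columns it presumably derives, here is a minimal sketch assuming
# `actionTime` is a unix timestamp; the output column names are assumptions.
import pandas as pd


def build_time_features_sketch(action):
    """Derive calendar fields from the raw action timestamp (illustrative only)."""
    action = action.copy()
    times = pd.to_datetime(action['actionTime'], unit='s')
    action['action_year'] = times.dt.year
    action['action_month'] = times.dt.month
    action['action_day'] = times.dt.day
    action['action_weekday'] = times.dt.weekday
    action['action_hour'] = times.dt.hour
    return action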
def main():
    # Orders to be predicted (original train and test sets)
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')

    orderHistory_train = pd.read_csv(Configure.base_path + 'train/orderHistory_train.csv', encoding='utf8')
    orderHistory_test = pd.read_csv(Configure.base_path + 'test/orderHistory_test.csv', encoding='utf8')

    orderHistory_train = build_time_category_encode(orderHistory_train)
    orderHistory_test = build_time_category_encode(orderHistory_test)

    orderHistory_train.to_csv(Configure.cleaned_path + 'cleaned_orderHistory_train.csv',
                              index=False, columns=orderHistory_train.columns)
    orderHistory_test.to_csv(Configure.cleaned_path + 'cleaned_orderHistory_test.csv',
                             index=False, columns=orderHistory_test.columns)

    feature_name = 'user_order_history_features'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_history_features')
        train_features = build_order_history_features(train, orderHistory_train)
        print('build test user_order_history_features')
        test_features = build_order_history_features(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features2'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_history_features2')
        train_features = build_order_history_features2(train, orderHistory_train)
        print('build test user_order_history_features2')
        test_features = build_order_history_features2(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features3'
    if not data_utils.is_feature_created(feature_name):
        orderHistory = pd.concat([orderHistory_train, orderHistory_test])
        print('build train user_order_history_features3')
        train_features = build_order_history_features3(train, orderHistory, orderHistory_train)
        print('build test user_order_history_features3')
        test_features = build_order_history_features3(test, orderHistory, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features4'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_history_features4')
        train_features = build_order_history_features4(train, orderHistory_train)
        print('build test user_order_history_features4')
        test_features = build_order_history_features4(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'user_order_history_features_wxr'
    if not data_utils.is_feature_created(feature_name):
        orderHistory = pd.concat([orderHistory_train, orderHistory_test])
        print('build train user_order_history_features_wxr')
        train_features = build_order_history_features_wxr(train, orderHistory, orderHistory_train)
        print('build test user_order_history_features_wxr')
        test_features = build_order_history_features_wxr(test, orderHistory, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)
def main():
    feature_name = 'basic_user_info'
    if data_utils.is_feature_created(feature_name):
        return

    # Basic user profile information
    train_user = pd.read_csv(Configure.base_path + 'train/userProfile_train.csv', encoding='utf8')
    test_user = pd.read_csv(Configure.base_path + 'test/userProfile_test.csv', encoding='utf8')

    # 1. dummy-code gender
    train_user['gender'] = train_user['gender'].map(gender_convert)
    test_user['gender'] = test_user['gender'].map(gender_convert)
    dummies = pd.get_dummies(train_user['gender'], prefix='gender')
    train_user[dummies.columns] = dummies
    dummies = pd.get_dummies(test_user['gender'], prefix='gender')
    test_user[dummies.columns] = dummies

    # province = pd.read_csv('province_economic.csv', encoding='utf8')
    # train_user = train_user.merge(province, on='province', how='left')
    # test_user = test_user.merge(province, on='province', how='left')

    # 2. LabelEncoder the province
    train_user['province'] = train_user['province'].map(province_convert)
    test_user['province'] = test_user['province'].map(province_convert)
    le = LabelEncoder()
    le.fit(train_user['province'].values)
    train_user['province_code'] = le.transform(train_user['province'])
    test_user['province_code'] = le.transform(test_user['province'])

    # 3. dummy-code the age bracket ("age == age" is a NaN check)
    train_user['age'] = train_user['age'].map(lambda age: 'lg' + age[:2] if age == age else 'None')
    test_user['age'] = test_user['age'].map(lambda age: 'lg' + age[:2] if age == age else 'None')

    print('save cleaned datasets')
    train_user.to_csv(Configure.cleaned_path + 'cleaned_userProfile_train.csv',
                      index=False, columns=train_user.columns)
    test_user.to_csv(Configure.cleaned_path + 'cleaned_userProfile_test.csv',
                     index=False, columns=test_user.columns)

    dummies = pd.get_dummies(train_user['age'], prefix='age')
    train_user[dummies.columns] = dummies
    dummies = pd.get_dummies(test_user['age'], prefix='age')
    test_user[dummies.columns] = dummies

    print('save ', feature_name)
    data_utils.save_features(train_user, test_user, features_name=feature_name)
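# pd.get_dummies() is applied to train and test separately above, so a gender or age
# bracket present in only one split yields mismatched dummy columns. A small hedged
# sketch of one way to reconcile them afterwards (not part of the original pipeline;
# align_dummy_columns is a hypothetical helper):
def align_dummy_columns(train_df, test_df, fill_value=0):
    """Add any dummy column that exists in only one frame, filled with fill_value."""
    for col in train_df.columns.difference(test_df.columns):
        test_df[col] = fill_value
    for col in test_df.columns.difference(train_df.columns):
        train_df[col] = fill_value
    return train_df, test_df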
def main():
    # Orders to be predicted (original train and test sets)
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')

    orderHistory_train = pd.read_csv(Configure.cleaned_path + 'cleaned_orderHistory_train.csv', encoding='utf8')
    orderHistory_test = pd.read_csv(Configure.cleaned_path + 'cleaned_orderHistory_test.csv', encoding='utf8')

    action_train = pd.read_csv(Configure.base_path + 'train/action_train.csv')
    action_test = pd.read_csv(Configure.base_path + 'test/action_test.csv')
    action_train = build_time_features(action_train)
    action_test = build_time_features(action_test)

    orderHistory_train['city'] = orderHistory_train['city'].astype(str)
    orderHistory_test['city'] = orderHistory_test['city'].astype(str)
    orderHistory_train['orderTime'] = pd.to_datetime(orderHistory_train['orderTime'])
    orderHistory_test['orderTime'] = pd.to_datetime(orderHistory_test['orderTime'])

    feature_name = 'advance_order_history_features'
    if not data_utils.is_feature_created(feature_name):
        print('build train advance_order_history_features')
        train_features = gen_history_features(train, orderHistory_train)
        print('build test advance_order_history_features')
        test_features = gen_history_features(test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'advance_action_features'
    if not data_utils.is_feature_created(feature_name):
        print('build train advance_action_features')
        train_features = gen_action_features(train, action_train)
        print('build test advance_action_features')
        test_features = gen_action_features(test, action_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'advance_action_features1'
    if not data_utils.is_feature_created(feature_name):
        print('build train advance_action_features1')
        train_features = gen_action_features1(train, action_train)
        print('build test advance_action_features1')
        test_features = gen_action_features1(test, action_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'advance_action_features2'
    if not data_utils.is_feature_created(feature_name):
        print('build train advance_action_features2')
        train_features = gen_action_features2(train, action_train)
        print('build test advance_action_features2')
        test_features = gen_action_features2(test, action_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)
def save_all_features(train, test, s):
    funcs = {
        'speed_variance_mean': speed_variance_mean,
        'trip_id_count': trip_id_count,
        'trip_id_interval_mean': trip_id_interval_mean,
        'speed_final_mean': speed_final_mean,
        'time_gap_direction_change_feat': time_gap_direction_change_feat,
        # call time (as a fraction of the total trip time)
        'calling_time': calling_time,
        'callstate_feat': callstate_feat,
        'build_time_features': build_time_features,
        'height': height_feet,
    }

    for feat_name in Configure.features:
        save_features(funcs[feat_name](train), 'train', feat_name, s)
        save_features(funcs[feat_name](test), 'test', feat_name, s)
def main():
    feature_name = 'user_order_comment_features'
    if data_utils.is_feature_created(feature_name):
        return

    # Orders to be predicted (original train and test sets)
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')

    userComment_train = pd.read_csv(Configure.base_path + 'train/userComment_train.csv', encoding='utf8')
    userComment_test = pd.read_csv(Configure.base_path + 'test/userComment_test.csv', encoding='utf8')

    # round the few fractional ratings to the nearest integer star
    userComment_train.loc[userComment_train['rating'] == 4.33, 'rating'] = 4
    userComment_train.loc[userComment_train['rating'] == 3.67, 'rating'] = 4
    userComment_test.loc[userComment_test['rating'] == 2.33, 'rating'] = 2
    userComment_train['rating'] = userComment_train['rating'].astype(int)
    userComment_test['rating'] = userComment_test['rating'].astype(int)

    print('save cleaned datasets')
    userComment_train.to_csv(Configure.cleaned_path + 'cleaned_userComment_train.csv',
                             index=False, columns=userComment_train.columns, encoding='utf8')
    userComment_test.to_csv(Configure.cleaned_path + 'cleaned_userComment_test.csv',
                            index=False, columns=userComment_test.columns, encoding='utf8')

    print('build train features')
    train_features = built_comment_features(train, userComment_train)
    print('build test features')
    test_features = built_comment_features(test, userComment_test)

    print('save ', feature_name)
    data_utils.save_features(train_features, test_features, feature_name)
def main():
    feature_name = 'sqg_features'
    # if data_utils.is_feature_created(feature_name):
    #     return

    print('add stage_one_features')
    # train_features = pd.read_csv('train_sqg_stage_one_features.csv')
    # test_features = pd.read_csv('test_sqg_stage_one_features.csv')
    #
    # used_features = ['userid', 'rate_all_good',
    #                  'big_than_mean', 'rate_user_click',
    #                  'click_1_rate', 'click_2_rate', 'click_3_rate', 'click_4_rate',
    #                  'click_5_rate', 'click_6_rate', 'click_7_rate', 'click_8_rate',
    #                  'click_9_rate', 'less_than_4_rate', 'more_than_5_rate',
    #                  'click_1_num', 'click_2_num', 'click_3_num', 'click_4_num',
    #                  'click_5_num', 'click_6_num', 'click_7_num', 'click_8_num',
    #                  'click_9_num', 'less_than_4_num', 'more_than_6_num',
    #                  'action_time_min', 'action_time_max', 'diff_time_num_click',
    #                  'max_diff_days', 'rate_diff_num_time_in_max',
    #                  'diff_max_x', 'diff_median_x', 'diff_min_x', 'diff_max_y', 'diff_median_y', 'diff_min_y',
    #                  'rate_orderNum_in_clickNum', 'rate_goodNum_in_clickNum', 'lessthan4_Num_minus_more_than_6_num',
    #                  'order_time_max', 'order_time_min', 'order_time_median']
    #
    # train_features = train_features[used_features]
    # test_features = test_features[used_features]

    print('add stage_two_features')
    # u'userid': user ID
    # u'action_last_week_count': number of clicks in the user's last week
    # u'action_last_month_count': total number of clicks in the user's last month
    # u'rate_last_weekcount_vs_last_monthcount': last-week clicks divided by the last-month total
    # u'rate_last_week_count_in_all_usercount': user's last-week clicks as a share of all users'
    #     last-week clicks (may be better left out; kept for testing)
    # u'rate_last_month_count_in_all_usercount': user's last-month clicks as a share of all users'
    #     last-month clicks (may be better left out; kept for testing)
    # u'action_last_momnth_to_now_days': last-month click count divided by the time gap to now
    #     (the gap is currently measured in days)
    train_features = pd.read_csv('train_sqg_stage_two_features.csv')[[
        'userid', 'action_last_week_count', 'action_last_month_count'
    ]]
    test_features = pd.read_csv('test_sqg_stage_two_features.csv')[[
        'userid', 'action_last_week_count', 'action_last_month_count'
    ]]

    print('save ', feature_name, train_features.shape, test_features.shape)
    data_utils.save_features(train_features, test_features, feature_name)
def main():
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')

    orderHistory_train = pd.read_csv(Configure.base_path + 'train/orderHistory_train.csv', encoding='utf8')
    orderHistory_test = pd.read_csv(Configure.base_path + 'test/orderHistory_test.csv', encoding='utf8')

    action_train = pd.read_csv(Configure.base_path + 'train/action_train.csv')
    action_test = pd.read_csv(Configure.base_path + 'test/action_test.csv')

    action_train = generate_new_action(action_train, orderHistory_train)
    action_test = generate_new_action(action_test, orderHistory_test)

    # pre-group the action logs by user so each feature builder can look up a user's
    # actions with a dict access instead of re-grouping
    train_action_grouped = dict(list(action_train.groupby('userid')))
    test_action_grouped = dict(list(action_test.groupby('userid')))

    feature_name = 'action_order_features1'
    if not data_utils.is_feature_created(feature_name):
        print('build train action_order_features1')
        train_features = build_action_order_features1(train, train_action_grouped)
        print('build test action_order_features1')
        test_features = build_action_order_features1(test, test_action_grouped)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'action_order_features2'
    if not data_utils.is_feature_created(feature_name):
        print('build train action_order_features2')
        train_features = build_action_order_features2(train, train_action_grouped)
        print('build test action_order_features2')
        test_features = build_action_order_features2(test, test_action_grouped)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'action_order_features3'
    if not data_utils.is_feature_created(feature_name):
        print('build train action_order_features3')
        train_features = build_action_order_features3(train, train_action_grouped)
        print('build test action_order_features3')
        test_features = build_action_order_features3(test, test_action_grouped)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)
def main():
    feature_name = 'wxr_features'
    if data_utils.is_feature_created(feature_name):
        return

    print('add comment score features')
    with open('wxr_train_comment_features.pkl', "rb") as f:
        user_comment_train = cPickle.load(f)
    with open('wxr_test_comment_features.pkl', "rb") as f:
        user_comment_test = cPickle.load(f)
    user_comment_train.fillna(-1, inplace=True)
    user_comment_test.fillna(-1, inplace=True)

    train_features = user_comment_train
    test_features = user_comment_test

    # print('add user_info features')
    # with open('wxr_train_user_info_features.pkl', "rb") as f:
    #     train_user_info = cPickle.load(f)
    # with open('wxr_test_user_info_features.pkl', "rb") as f:
    #     test_user_info = cPickle.load(f)
    # train_user_info.drop(['gender', 'province', 'age'], axis=1, inplace=True)
    # test_user_info.drop(['gender', 'province', 'age'], axis=1, inplace=True)
    #
    # train_features = train_features.merge(train_user_info, on='userid', how='left')
    # test_features = test_features.merge(test_user_info, on='userid', how='left')

    print('add history features')
    with open('wxr_operate_4_train_order_history_features.pkl', "rb") as f:
        history_features_train = cPickle.load(f)
    with open('wxr_operate_4_test_order_history_features.pkl', "rb") as f:
        history_features_test = cPickle.load(f)

    use_features = [
        'userid', 'avg_days_between_order', 'days_ratio_since_last_order',
        'city_num', 'country_num', 'continent_num', 'city_rich', 'city_avg_rich',
        'country_rich', 'country_avg_rich', 'histord_time_last_1_year',
        'histord_time_last_1_month', 'histord_sum_cont1', 'histord_sum_cont2',
        'histord_sum_cont3', 'histord_sum_cont4', 'histord_sum_cont5',
        'timespan_lastord_1_2', 'timespan_lastord_2_3'
    ]
    history_features_train = history_features_train[use_features]
    history_features_test = history_features_test[use_features]

    train_features = train_features.merge(history_features_train, on='userid', how='left')
    test_features = test_features.merge(history_features_test, on='userid', how='left')

    print('add action features')
    with open('wxr_operate_3_train_action_features.pkl', "rb") as f:
        action_features_train = cPickle.load(f)
    with open('wxr_operate_3_test_action_features.pkl', "rb") as f:
        action_features_test = cPickle.load(f)

    use_features = [
        'userid', 'avg_browse_num_after_last_order', 'browse_num_after_last_order',
        'operate_num_after_last_order', 'avg_operate_num_after_last_order',
        'open_num_after_last_order', 'action_1_num_after_last_order',
        'action_2_num_after_last_order', 'action_3_num_after_last_order',
        'action_4_num_after_last_order', 'action_5_num_after_last_order',
        'action_6_num_after_last_order', 'action_7_num_after_last_order',
        'action_8_num_after_last_order', 'action_9_num_after_last_order'
    ]
    action_features_train = action_features_train[use_features]
    action_features_test = action_features_test[use_features]

    train_features = train_features.merge(action_features_train, on='userid', how='left')
    test_features = test_features.merge(action_features_test, on='userid', how='left')

    print('add some other features')
    some_other_train = pd.read_csv('some_other_train_features.csv')
    some_other_test = pd.read_csv('some_other_test_features.csv')
    train_features = train_features.merge(some_other_train, on='userid', how='left')
    test_features = test_features.merge(some_other_test, on='userid', how='left')

    print('save ', feature_name)
    data_utils.save_features(train_features, test_features, feature_name)
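# cPickle only exists under Python 2. If this script is run under Python 3, a common
# compatibility import (an assumption, not part of the original file) is:
try:
    import cPickle            # Python 2
except ImportError:
    import pickle as cPickle  # Python 3: pickle exposes the same load/dump interface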
def save_all_features(train, test):
    funcs = {
        # feature builders are registered here by name; each must return a
        # (train_features, test_features) tuple
    }

    for name in Configure.features:
        save_features(*funcs[name](train, test), name)
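# The dispatch dict above is still empty. A hedged sketch of how one entry could be
# registered, assuming the question-pair frames used elsewhere in this repo (the
# function name and the q1_words/q2_words columns are illustrative assumptions):
import pandas as pd


def question_length_features(train, test):
    """Toy feature builder returning a (train_features, test_features) pair."""
    def build(df):
        feats = pd.DataFrame()
        feats['q1_word_len'] = df['q1_words'].map(lambda x: len(x.split(' ')))
        feats['q2_word_len'] = df['q2_words'].map(lambda x: len(x.split(' ')))
        return feats
    return build(train), build(test)

# funcs = {'question_length_features': question_length_features}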
def main():
    print('load datasets')
    questions = pd.read_csv(Configure.question_file)
    train = pd.read_csv(Configure.train_data_file).sample(n=1000)
    test = pd.read_csv(Configure.test_data_file).sample(n=1000)

    train['id'] = np.arange(train.shape[0])
    train = pd.merge(train, questions, left_on=['q1'], right_on=['qid'], how='left')
    train = train.rename(columns={'words': 'q1_words', 'chars': 'q1_chars'})
    del train['qid']
    train = pd.merge(train, questions, left_on=['q2'], right_on=['qid'], how='left')
    train = train.rename(columns={'words': 'q2_words', 'chars': 'q2_chars'})
    train.drop(['q1', 'q2', 'qid'], axis=1, inplace=True)

    test['id'] = np.arange(test.shape[0])
    test = pd.merge(test, questions, left_on=['q1'], right_on=['qid'], how='left')
    test = test.rename(columns={'words': 'q1_words', 'chars': 'q1_chars'})
    del test['qid']
    test = pd.merge(test, questions, left_on=['q2'], right_on=['qid'], how='left')
    test = test.rename(columns={'words': 'q2_words', 'chars': 'q2_chars'})
    test.drop(['q1', 'q2', 'qid'], axis=1, inplace=True)

    feature_name = 'basic_features'
    if not data_utils.is_feature_created(feature_name):
        train_words = pd.Series(train['q1_words'].map(lambda x: x.split(' ')).tolist() +
                                train['q2_words'].map(lambda x: x.split(' ')).tolist())
        words = [x for y in train_words for x in y]
        counts = Counter(words)
        words_weights = {word: get_weight(count) for word, count in counts.items()}

        train_chars = pd.Series(train['q1_chars'].map(lambda x: x.split(' ')).tolist() +
                                train['q2_chars'].map(lambda x: x.split(' ')).tolist())
        chars = [x for y in train_chars for x in y]
        counts = Counter(chars)
        chars_weights = {word: get_weight(count) for word, count in counts.items()}

        ques = pd.concat([train[['q1_words', 'q2_words']], test[['q1_words', 'q2_words']]],
                         axis=0).reset_index(drop=True)
        q_dict = defaultdict(set)
        for i in range(ques.shape[0]):
            q_dict[ques.q1_words[i]].add(ques.q2_words[i])
            q_dict[ques.q2_words[i]].add(ques.q1_words[i])

        print('train build_basic_features')
        train_features = build_features1(train, words_weights, chars_weights, q_dict)
        print('test build_basic_features')
        test_features = build_features1(test, words_weights, chars_weights, q_dict)
        data_utils.save_features(train_features, test_features, feature_name)

    feature_name = 'basic_features2'
    if not data_utils.is_feature_created(feature_name):
        print('create gensim model')
        word_model = gensim.models.KeyedVectors.load_word2vec_format(Configure.word_embed_path, binary=False)
        char_model = gensim.models.KeyedVectors.load_word2vec_format(Configure.char_embed_path, binary=False)

        norm_word_model = gensim.models.KeyedVectors.load_word2vec_format(Configure.word_embed_path, binary=False)
        norm_word_model.init_sims(replace=True)
        norm_char_model = gensim.models.KeyedVectors.load_word2vec_format(Configure.char_embed_path, binary=False)
        norm_char_model.init_sims(replace=True)

        print('train build_features2')
        train_features = build_features2(train, word_model, char_model)
        print('test build_features2')
        test_features = build_features2(test, word_model, char_model)
        data_utils.save_features(train_features, test_features, feature_name)
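# get_weight() is not defined in this file. The word/char weights above follow the usual
# inverse-frequency scheme for shared-token features; a hedged sketch of what such a
# function typically looks like (the eps and min_count defaults are assumptions):
def get_weight(count, eps=10000, min_count=2):
    """Down-weight very frequent tokens; ignore tokens seen fewer than min_count times."""
    return 0.0 if count < min_count else 1.0 / (count + eps)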
def main():
    feature_name = 'user_order_comment_features'
    if data_utils.is_feature_created(feature_name):
        return

    # Orders to be predicted (original train and test sets)
    train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
    test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')

    userComment_train = pd.read_csv(Configure.base_path + 'train/userComment_train.csv', encoding='utf8')
    userComment_test = pd.read_csv(Configure.base_path + 'test/userComment_test.csv', encoding='utf8')

    # round the few fractional ratings to the nearest integer star
    userComment_train.loc[userComment_train['rating'] == 4.33, 'rating'] = 4
    userComment_train.loc[userComment_train['rating'] == 3.67, 'rating'] = 4
    userComment_test.loc[userComment_test['rating'] == 2.33, 'rating'] = 2
    userComment_train['rating'] = userComment_train['rating'].astype(int)
    userComment_test['rating'] = userComment_test['rating'].astype(int)

    orderHistory_train = pd.read_csv(Configure.cleaned_path + 'cleaned_orderHistory_train.csv', encoding='utf8')
    orderHistory_test = pd.read_csv(Configure.cleaned_path + 'cleaned_orderHistory_test.csv', encoding='utf8')

    userComment_train = pd.merge(userComment_train, orderHistory_train[['orderid', 'orderType']],
                                 on='orderid', how='left')
    userComment_test = pd.merge(userComment_test, orderHistory_test[['orderid', 'orderType']],
                                on='orderid', how='left')

    userComment_train = commentKey_score(userComment_train)
    userComment_test = commentKey_score(userComment_test)
    userComment_train = tag_score(userComment_train)
    userComment_test = tag_score(userComment_test)

    print('save cleaned datasets')
    userComment_train.to_csv(Configure.cleaned_path + 'cleaned_userComment_train.csv',
                             index=False, columns=userComment_train.columns, encoding='utf8')
    userComment_test.to_csv(Configure.cleaned_path + 'cleaned_userComment_test.csv',
                            index=False, columns=userComment_test.columns, encoding='utf8')

    print('build train features')
    train_features = built_comment_features(train, userComment_train)
    print('build test features')
    test_features = built_comment_features(test, userComment_test)

    print('save ', feature_name)
    data_utils.save_features(train_features, test_features, feature_name)

    print('build wxr features')
    feature_name = 'user_order_comment_features_wxr'
    if not data_utils.is_feature_created(feature_name):
        print('build train user_order_comment_features_wxr')
        train_features = built_comment_features_wxr(train, userComment_train, orderHistory_train)
        print('build test user_order_comment_features_wxr')
        test_features = built_comment_features_wxr(test, userComment_test, orderHistory_test)
        print('save ', feature_name)
        data_utils.save_features(train_features, test_features, feature_name)