def load_test_data():
    """Read the test CSV selected by FLAGS and return it as a DataFrame.

    The file name is built from the module-level ``path`` plus a prefix that
    depends on ``FLAGS.test_for_train``. With ``FLAGS.debug`` set, the
    ``_Top.ss.csv`` sample is read with pandas' default header handling and
    restricted to ``click_id`` plus ``keras_train.USED_FEATURE_LIST``;
    otherwise the full tab-separated, header-less file is read with explicit
    column names and every column is kept.

    Returns:
        The loaded test DataFrame.
    """
    with timer("loading test data"):
        print('loading test data...')
        path_prefix = "test_Cnt_ForTrain" if FLAGS.test_for_train else "test_Cnt"

        if FLAGS.debug:
            test_data_path = path + path_prefix + "_Top.ss.csv"
            wanted_columns = ['click_id'] + keras_train.USED_FEATURE_LIST
            test_df = pd.read_csv(
                test_data_path, dtype=dtypes, usecols=wanted_columns)
        else:
            test_data_path = path + path_prefix + ".csv"
            column_names = ['id', 'click_id'] + keras_train.DATA_HEADER
            test_df = pd.read_csv(
                test_data_path,
                dtype=dtypes,
                header=None,
                sep='\t',
                names=column_names,
            )

        if FLAGS.test_for_train:
            # NOTE(review): train_df is assigned in this function, so it is a
            # local name; reading it here before any assignment raises
            # UnboundLocalError. The intended target (a module-level frame?)
            # is not visible from this function -- confirm before relying on
            # the test_for_train mode.
            train_df = train_df.append(
                test_df[['is_attributed'] + keras_train.USED_FEATURE_LIST])
            # Only the first 100000 rows are kept as test data in this mode.
            test_df = test_df[:100000]

        print(test_df.info())
        gc.collect()
    return test_df
def find_best_iteration_search(bst):
    """Print validation AUC and log-loss of ``bst`` for a range of iterations.

    The validation set comes from ``load_valide_data()``. The iteration grid
    is parsed from ``FLAGS.search_iterations``, a comma-separated string whose
    first three integers are used as ``range(start, stop, step)``.

    Args:
        bst: model object exposing ``predict(data, num_iteration=...)``
            (presumably a trained booster -- confirm against the caller).

    Returns:
        None. The metrics are only printed, one line per iteration count.
    """
    frame = load_valide_data()
    features = frame[keras_train.USED_FEATURE_LIST].values.astype(
        DENSE_FEATURE_TYPE)
    targets = frame['is_attributed'].values.astype(np.uint8)
    # The DataFrame is no longer needed once the arrays are extracted.
    del frame
    gc.collect()

    if FLAGS.stacking:
        features = gen_stacking_data(features)

    n_pos = targets.sum()
    n_neg = len(targets) - n_pos
    summary = "valide type: {0} valide size: {1} valide data pos: {2} neg: {3}"
    print(summary.format(features.dtype, len(features), n_pos, n_neg))

    with timer("finding best iteration..."):
        bounds = [int(tok.strip()) for tok in FLAGS.search_iterations.split(',')]
        start, stop, step = bounds[0], bounds[1], bounds[2]
        for n_iter in range(start, stop, step):
            predicted = bst.predict(features, num_iteration=n_iter)
            auc = metrics.roc_auc_score(targets, predicted)
            logloss = metrics.log_loss(targets, predicted)
            print("Iteration: {0} AUC: {1} Logloss: {2}".format(
                n_iter, auc, logloss))
def neg_sample(input_data, labels, C=1):
    """Keep every positive row and draw ``C`` negatives per positive.

    Args:
        input_data: 2-D array indexed as ``input_data[rows, :]``, one row per
            sample (n_sample, feature_dim).
        labels: 1-D array aligned with ``input_data`` (n_sample,); entries
            equal to 1 are positives, entries equal to 0 are negatives.
        C: negatives drawn per positive (``neg_number = C * pos_number``).

    Returns:
        Tuple ``(sample_data, sample_labels, weight)``: the shuffled sampled
        rows, their labels, and a per-row weight that is 1 for positives and
        ``1 / accept_rate`` for negatives, where
        ``accept_rate = C * n_pos / n_neg``. When there are no negatives to
        draw from, or no negatives are requested (no positives, or ``C == 0``),
        no negatives are sampled and every weight is 1.
    """
    with timer("Negative sampling"):
        print('Negative sampling...')
        pos_ind = np.where(labels == 1)[0]
        neg_ind = np.where(labels == 0)[0]
        neg_wanted = len(pos_ind) * C

        if len(neg_ind) == 0 or neg_wanted == 0:
            # Nothing to draw (or nothing to draw from): skip the sampling and
            # the accept-rate division, both of which would divide by zero.
            neg_select_ind = neg_ind[:0]
            neg_weight = 1.0
        else:
            accept_rate = float(C * len(pos_ind)) / float(len(neg_ind))
            # Negatives are drawn with replacement, so the request may exceed
            # the number of available negatives.
            neg_select_ind = nrs.choice(neg_ind, neg_wanted, replace=True)
            # Up-weight the kept negatives to compensate for the down-sampling.
            neg_weight = 1.0 / accept_rate

        select_ind = np.append(pos_ind, neg_select_ind)
        nrs.shuffle(select_ind)
        sample_data = input_data[select_ind, :]
        sample_labels = labels[select_ind]

        weight = np.ones(len(sample_labels))
        weight[sample_labels == 0] = neg_weight

        total = len(labels)
        sampling_rate = float(len(sample_labels)) / float(total) if total else 0.0
        print('-----Neg Sampling Before All: {} Pos: {} Neg: {}'.format(
            total, np.sum(labels == 1), np.sum(labels == 0)))
        print('-----Neg Sampling After All: {} Pos: {} Neg: {}'.format(
            len(sample_labels), np.sum(sample_labels == 1),
            np.sum(sample_labels == 0)))
        print('-----Neg Sampling Rate: {}'.format(sampling_rate))
    return sample_data, sample_labels, weight
def load_train_data():
    """Read the training CSV selected by FLAGS and return it as a DataFrame.

    File choice and read options:
      * ``FLAGS.split_train_val`` picks the ``train_Cnt_Id`` file, otherwise
        ``train_part_Cnt_Neg20`` is used.
      * ``FLAGS.debug`` reads the ``_Top.ss.csv`` sample with pandas' default
        header handling, keeping ``is_attributed`` plus the used features.
      * Otherwise the tab-separated, header-less ``.csv`` is read with
        explicit column names. In split mode the leading rows are skipped so
        that ``FLAGS.train_eval_len`` rows remain out of 184903890
        (presumably the file's total row count -- confirm). Split and
        stacking modes keep only ``is_attributed`` plus the used features;
        the remaining mode loads every column.

    Returns:
        The loaded training DataFrame.
    """
    with timer("loading train data"):
        print('loading train data...')
        path_prefix = (
            "train_Cnt_Id" if FLAGS.split_train_val else "train_part_Cnt_Neg20")

        if FLAGS.debug:
            train_data_path = path + path_prefix + "_Top.ss.csv"
            train_df = pd.read_csv(
                train_data_path,
                dtype=dtypes,
                usecols=['is_attributed'] + keras_train.USED_FEATURE_LIST)
        else:
            train_data_path = path + path_prefix + ".csv"
            # Options shared by every non-debug read.
            read_opts = dict(
                dtype=dtypes,
                header=None,
                sep='\t',
                names=['is_attributed'] + keras_train.DATA_HEADER,
            )
            if FLAGS.split_train_val:
                # Keep only the tail of the file for the train/eval split.
                read_opts['skiprows'] = range(
                    0, 184903890 - FLAGS.train_eval_len)
                read_opts['usecols'] = (
                    ['is_attributed'] + keras_train.USED_FEATURE_LIST)
            elif FLAGS.stacking:
                read_opts['usecols'] = (
                    ['is_attributed'] + keras_train.USED_FEATURE_LIST)
            # Otherwise no usecols restriction: all named columns are loaded.
            train_df = pd.read_csv(train_data_path, **read_opts)

        print(train_df.info())
    return train_df
def predict_test(bst):
    """Predict on the test set and write a timestamped submission CSV.

    The test frame comes from ``load_test_data()``. Predictions use
    ``FLAGS.best_iteration`` and are written, together with ``click_id``, to
    ``FLAGS.output_model_path + "sub" + <UTC timestamp> + ".csv"``.

    Args:
        bst: model object exposing ``predict(data, num_iteration=...)``.

    Returns:
        None. The only output is the CSV file.
    """
    frame = load_test_data()
    features = frame[keras_train.USED_FEATURE_LIST].values.astype(
        DENSE_FEATURE_TYPE)
    click_ids = frame['click_id'].values
    print("test type {0}".format(features.dtype))
    # The DataFrame is no longer needed once the arrays are extracted.
    del frame
    gc.collect()

    if FLAGS.stacking:
        features = gen_stacking_data(features)

    with timer("predicting test data"):
        print('predicting test data...')
        submission = pd.DataFrame(click_ids, columns=['click_id'])
        submission['is_attributed'] = bst.predict(
            features, num_iteration=FLAGS.best_iteration)
        # The timestamp is taken in UTC (time.gmtime), not local time.
        stamp = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
        out_path = FLAGS.output_model_path + "sub" + stamp + ".csv"
        submission.to_csv(out_path, index=False)
def load_valide_data():
    """Read the validation CSV and return it as a DataFrame.

    A file is only read when ``FLAGS.split_train_val`` is not set. With
    ``FLAGS.debug`` the ``valide_Cnt_Top.ss.csv`` sample is read with pandas'
    default header handling, keeping ``is_attributed`` plus
    ``keras_train.USED_FEATURE_LIST``; otherwise the tab-separated,
    header-less ``valide_Cnt.csv`` is read with explicit column names and
    every column is kept.

    Returns:
        The loaded validation DataFrame.
    """
    with timer("loading valide data"):
        print('loading valide data...')
        if not FLAGS.split_train_val:
            path_prefix = "valide_Cnt"
            if FLAGS.debug:
                valide_data_path = path + path_prefix + "_Top.ss.csv"
                wanted_columns = (
                    ['is_attributed'] + keras_train.USED_FEATURE_LIST)
                valide_df = pd.read_csv(
                    valide_data_path, dtype=dtypes, usecols=wanted_columns)
            else:
                valide_data_path = path + path_prefix + ".csv"
                column_names = (
                    ['id', 'is_attributed'] + keras_train.DATA_HEADER)
                valide_df = pd.read_csv(
                    valide_data_path,
                    dtype=dtypes,
                    header=None,
                    sep='\t',
                    names=column_names,
                )
        # NOTE(review): when FLAGS.split_train_val is set nothing assigns
        # valide_df, so the line below raises UnboundLocalError -- confirm
        # this function is never called in that mode.
        print(valide_df.info())
    return valide_df