def sub(models, stacking_data=None, stacking_label=None, stacking_test_data=None, test=None,
        scores_text=None, tid=None, sub_re=None, col=None, leak_target=None,
        aug_data_target=None):
    """Write this run's artifacts to a scratch directory and move them to the output path.

    Depending on FLAGS, one of three things is produced in ``./model_dir/``:

    * ``FLAGS.stacking``: the three stacking arrays as ``.npy`` files;
    * ``FLAGS.model_type == 'v'``: ``stacking_data`` as ``vae_data.npy``;
    * otherwise: a submission CSV, one file per trained model and the score
      log, after which per-epoch score statistics are printed.

    Finally every file in the scratch directory is moved into
    ``FLAGS.output_model_path``, replacing files of the same name.

    Args:
        models: sequence of entries where ``entry[0]`` is the model object and
            ``entry[1]`` a type tag ('l' -> ``save_model``; 'k'/'r' ->
            ``.model.save``). Entries with any other tag are not saved.
        stacking_data, stacking_label, stacking_test_data: arrays handed to
            ``np.save`` in the stacking / VAE modes.
        test: test features passed to ``models_eval``.
        scores_text: rows for the "score_text" column; rows are expected to
            start with ``epoch:<n>`` followed by a whitespace-separated score.
        tid: index used for the submission frame.
        sub_re: ignored on input; rebound to the submission frame.
        col: label fragment for file names when ``FLAGS.predict_feature``.
        leak_target: unused here.
        aug_data_target: label fragment for file names when ``FLAGS.aug_data``.
    """
    tmp_model_dir = "./model_dir/"
    os.makedirs(tmp_model_dir, exist_ok=True)

    if FLAGS.stacking:
        stacking_dumps = (
            ("stacking_train_data.npy", stacking_data),
            ("stacking_train_label.npy", stacking_label),
            ("stacking_test_data.npy", stacking_test_data),
        )
        for file_name, payload in stacking_dumps:
            np.save(os.path.join(tmp_model_dir, file_name), payload)
    elif FLAGS.model_type == 'v':
        np.save(os.path.join(tmp_model_dir, "vae_data.npy"), stacking_data)
    else:
        predictions = models_eval(models, test)
        sub_re = pd.DataFrame(predictions, columns=["target"], index=tid)
        # Predictions are mapped back through expm1 (presumably the models
        # were fitted on a log1p-transformed target -- confirm with training).
        sub_re["target"] = np.expm1(sub_re["target"].values)

        # The file label depends on the run mode; only the default mode uses
        # the "sub" stem and a second-resolution timestamp.
        if FLAGS.predict_feature:
            tag, stamp_format, stem = "_" + col, '_%Y_%m_%d_%H', ""
        elif FLAGS.aug_data:
            tag, stamp_format, stem = "_" + aug_data_target, '_%Y_%m_%d_%H', ""
        else:
            tag, stamp_format, stem = "", '_%Y_%m_%d_%H_%M_%S', "sub"
        time_label = tag + time.strftime(stamp_format, time.gmtime())
        sub_name = tmp_model_dir + stem + time_label + ".csv"
        sub_re.to_csv(sub_name)

        # Persist every model next to the submission.
        for idx, entry in enumerate(models):
            kind = entry[1]
            model_stem = tmp_model_dir + "model_" + str(idx) + time_label
            if kind == 'l':
                entry[0].save_model(model_stem + ".txt")
            elif kind in ('k', 'r'):
                entry[0].model.save(model_stem + ".h5")

        scores_text_frame = pd.DataFrame(scores_text, columns=["score_text"])
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_frame.to_csv(score_text_file, index=False)

        scores = scores_text_frame["score_text"]
        for epoch in range(1, FLAGS.epochs + 1):
            is_epoch_row = scores.str.startswith('epoch:{0}'.format(epoch))
            scores_epoch = scores.loc[is_epoch_row].map(lambda s: float(s.split()[1]))
            print("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(
                epoch, scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(),
                scores_epoch.max(), scores_epoch.median()))

    # Publish: move everything from the scratch directory to the output path.
    os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for entry_name in os.listdir(tmp_model_dir):
        target = os.path.join(FLAGS.output_model_path, entry_name)
        if os.path.exists(target):
            os.remove(target)
        shutil.move(os.path.join(tmp_model_dir, entry_name), FLAGS.output_model_path)
def sub(models, stacking_data=None, stacking_label=None, stacking_test_data=None, test=None,
        scores_text=None, tid=None, sub_re=None, col=None, leak_target=None,
        aug_data_target=None):
    """Dump run artifacts into ``./model_dir/`` and move them to FLAGS.output_model_path.

    * ``FLAGS.stacking``: saves the three stacking arrays as ``.npy`` files.
    * ``FLAGS.model_type == 'v'``: saves ``stacking_data`` as ``vae_data.npy``.
    * otherwise: writes the prediction CSV, each trained model, and a CSV of
      aggregated scores (max/min/mean/median/std grouped by index label).

    Args:
        models: sequence of entries; ``entry[0]`` is the model object and
            ``entry[1]`` its type tag ('l' -> ``save_model``, 'k'/'r' -> ``save``).
        stacking_data, stacking_label, stacking_test_data: arrays for ``np.save``.
        test: test features passed to ``models_eval``.
        scores_text: iterable of pandas objects accepted by ``pd.concat``.
        tid: index for the prediction frame.
        sub_re: ignored on input; rebound to the prediction frame.
        col, leak_target, aug_data_target: accepted for call compatibility,
            unused in this variant.
    """
    tmp_model_dir = "./model_dir/"
    os.makedirs(tmp_model_dir, exist_ok=True)

    if FLAGS.stacking:
        stacking_dumps = (
            ("stacking_train_data.npy", stacking_data),
            ("stacking_train_label.npy", stacking_label),
            ("stacking_test_data.npy", stacking_test_data),
        )
        for file_name, payload in stacking_dumps:
            np.save(os.path.join(tmp_model_dir, file_name), payload)
    elif FLAGS.model_type == 'v':
        np.save(os.path.join(tmp_model_dir, "vae_data.npy"), stacking_data)
    else:
        sub_re = pd.DataFrame(models_eval(models, test), index=tid)
        time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
        sub_re.to_csv(tmp_model_dir + "sub" + time_label + ".csv")

        # Persist every model next to the predictions.
        for idx, entry in enumerate(models):
            kind = entry[1]
            model_stem = tmp_model_dir + "model_" + str(idx) + time_label
            if kind == 'l':
                entry[0].save_model(model_stem + ".txt")
            elif kind in ('k', 'r'):
                entry[0].save(model_stem + ".h5")

        # One row per statistic/column pair, one column per index label.
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_df = pd.concat(scores_text)
        grouped = scores_text_df.groupby(scores_text_df.index)
        summary = grouped.agg(['max', 'min', 'mean', 'median', 'std'])
        summary.T.to_csv(score_text_file, index=True)

    # Publish: move everything from the scratch directory to the output path.
    os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for entry_name in os.listdir(tmp_model_dir):
        target = os.path.join(FLAGS.output_model_path, entry_name)
        if os.path.exists(target):
            os.remove(target)
        shutil.move(os.path.join(tmp_model_dir, entry_name), FLAGS.output_model_path)
def sub(mdoels, stacking_data=None, stacking_label=None, stacking_test_data=None, test=None,
        scores_text=None, tid=None, sub_re=None):
    """Write the click-attribution submission, first model and score log, then publish them.

    Files are first written to ``./model_dir/`` and then moved into
    ``FLAGS.output_model_path``, replacing files of the same name.

    Args:
        mdoels: sequence of ``(model, type_tag)`` entries. The misspelled
            parameter name is kept so existing callers keep working. Only the
            first entry is saved ('l' -> ``save_model``, 'k' -> ``.model.save``).
        stacking_data, stacking_label, stacking_test_data: unused; the
            stacking dump is disabled in this variant.
        test: test features passed to ``models_eval``.
        scores_text: rows for the "score_text" column; rows are expected to
            start with ``epoch:<n>`` followed by a whitespace-separated score.
        tid: values for the ``click_id`` column.
        sub_re: ignored on input; rebound to the submission frame.
    """
    # The body used to read the name `models`, which is not the parameter
    # (`mdoels`); it only worked when a module-level global of that name
    # happened to exist. Bind the argument explicitly.
    models = mdoels

    tmp_model_dir = "./model_dir/"
    os.makedirs(tmp_model_dir, exist_ok=True)

    sub_re = pd.DataFrame(tid, columns=['click_id'])
    sub_re['is_attributed'] = models_eval(models, test)
    time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
    sub_name = tmp_model_dir + "sub" + time_label + ".csv"
    sub_re.to_csv(sub_name, index=False)

    # Save the first model only; the file name carries no timestamp.
    first_model, first_kind = models[0][0], models[0][1]
    if first_kind == 'l':
        first_model.save_model(tmp_model_dir + "model" + ".txt")
    elif first_kind == 'k':
        first_model.model.save(tmp_model_dir + "model" + ".h5")

    scores_text_frame = pd.DataFrame(scores_text, columns=["score_text"])
    score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
    scores_text_frame.to_csv(score_text_file, index=False)

    scores = scores_text_frame["score_text"]
    for epoch in range(1, FLAGS.epochs + 1):
        # NOTE(review): a plain prefix match means 'epoch:1' also selects
        # 'epoch:10'..'epoch:19' rows -- confirm the score text format before
        # tightening this.
        is_epoch_row = scores.str.startswith('epoch:{0}'.format(epoch))
        scores_epoch = scores.loc[is_epoch_row].map(lambda s: float(s.split()[1]))
        print("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(
            epoch, scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(),
            scores_epoch.max(), scores_epoch.median()))

    # Publish: move everything from the scratch directory to the output path.
    os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for entry_name in os.listdir(tmp_model_dir):
        target = os.path.join(FLAGS.output_model_path, entry_name)
        if os.path.exists(target):
            os.remove(target)
        shutil.move(os.path.join(tmp_model_dir, entry_name), FLAGS.output_model_path)
def sub(mdoels, stacking_data=None, stacking_label=None, stacking_test_data=None, test=None,
        scores_text=None, coly=None, tid=None, sub_re=None):
    """Write stacking arrays or the multi-label submission plus score log, then publish them.

    Files are first written to ``./model_dir/`` and then moved into
    ``FLAGS.output_model_path``, replacing files of the same name.

    Args:
        mdoels: trained models passed to ``models_eval``. The misspelled
            parameter name is kept so existing callers keep working.
        stacking_data, stacking_label, stacking_test_data: arrays saved with
            ``np.save`` when ``FLAGS.stacking`` is set.
        test: test features; ``test.shape[0]`` sizes the submission frame.
        scores_text: rows for the "score_text" column; rows are expected to
            start with ``epoch:<n>`` followed by a whitespace-separated score.
        coly: list of label column names of the submission.
        tid: values for the ``id`` column.
        sub_re: precomputed predictions, used instead of ``models_eval`` when
            ``FLAGS.load_stacking_data`` is set.
    """
    # The body used to read the name `models`, which is not the parameter
    # (`mdoels`); bind the argument explicitly instead of relying on a global.
    models = mdoels

    tmp_model_dir = "./model_dir/"
    os.makedirs(tmp_model_dir, exist_ok=True)

    if FLAGS.stacking:
        stacking_dumps = (
            ("stacking_train_data.npy", stacking_data),
            ("stacking_train_label.npy", stacking_label),
            ("stacking_test_data.npy", stacking_test_data),
        )
        for file_name, payload in stacking_dumps:
            np.save(os.path.join(tmp_model_dir, file_name), payload)
    else:
        sub2 = pd.DataFrame(np.zeros((test.shape[0], len(coly))), columns=coly)
        if FLAGS.load_stacking_data:
            sub2[coly] = sub_re
        else:
            sub2[coly] = models_eval(models, test)
        sub2['id'] = tid

        # Keep predictions strictly inside (0, 1). The bounds were written as
        # `0 + 1e12, 1 - 1e12` (missing minus sign in the exponent), which
        # never constrained a probability.
        eps = 1e-12
        sub2[coly] = sub2[coly].clip(eps, 1 - eps)

        time_label = strftime('_%Y_%m_%d_%H_%M_%S', gmtime())
        sub_name = tmp_model_dir + "sub" + time_label + ".csv"
        sub2.to_csv(sub_name, index=False)

        scores_text_frame = pd.DataFrame(scores_text, columns=["score_text"])
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_frame.to_csv(score_text_file, index=False)

        scores = scores_text_frame["score_text"]
        for epoch in range(1, FLAGS.epochs + 1):
            is_epoch_row = scores.str.startswith('epoch:{0}'.format(epoch))
            scores_epoch = scores.loc[is_epoch_row].map(lambda s: float(s.split()[1]))
            print("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(
                epoch, scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(),
                scores_epoch.max(), scores_epoch.median()))

    # Publish: move everything from the scratch directory to the output path.
    os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for entry_name in os.listdir(tmp_model_dir):
        target = os.path.join(FLAGS.output_model_path, entry_name)
        if os.path.exists(target):
            os.remove(target)
        shutil.move(os.path.join(tmp_model_dir, entry_name), FLAGS.output_model_path)
def train_sub(col):
    """Load the data for ``col``, train with n-fold CV and hand the results to ``sub``.

    With ``FLAGS.aug_data`` the data comes from ``LoadAugDdata('pred_nz_min')``,
    otherwise from ``load_data(col)``. With ``FLAGS.load_stacking_data`` one
    set of models is trained per label column; otherwise a single
    ``nfold_train`` run over the full label is made.

    Args:
        col: forwarded to ``load_data`` and to ``sub``.
    """
    scores_text = []

    if FLAGS.aug_data:
        aug_data_target = 'pred_nz_min'
        loaded = LoadAugDdata(aug_data_target)
    else:
        aug_data_target = None
        loaded = load_data(col)
    (train_data, train_label, test_data, tid,
     valide_data, valide_label, weight, leak_target) = loaded

    if FLAGS.load_stacking_data:
        # NOTE(review): `emb_weight` and `sub_re` are not bound anywhere in
        # this function; they must exist at module scope or this branch
        # raises NameError -- confirm.
        for i in range(train_label.shape[1]):
            models, stacking_data, stacking_label, stacking_test_data = nfold_train(
                train_data, train_label[:, i], flags=FLAGS,
                model_types=[FLAGS.model_type], scores=scores_text,
                emb_weight=emb_weight, test_data=test_data)
            sub_re[:, i] = models_eval(models, test_data)
    else:
        models, stacking_data, stacking_label, stacking_test_data = nfold_train(
            train_data, train_label, flags=FLAGS,
            model_types=[FLAGS.model_type], scores=scores_text,
            test_data=test_data, valide_data=valide_data,
            valide_label=valide_label, cat_max=None, emb_weight=None,
            leak_target=leak_target)

    sub(models,
        stacking_data=stacking_data,
        stacking_label=stacking_label,
        stacking_test_data=stacking_test_data,
        test=test_data,
        scores_text=scores_text,
        tid=tid,
        col=col,
        leak_target=leak_target,
        aug_data_target=aug_data_target)
if not os.path.isdir(FLAGS.output_model_path): os.makedirs(FLAGS.output_model_path, exist_ok=True) for fileName in os.listdir(tmp_model_dir): dst_file = os.path.join(FLAGS.output_model_path, fileName) if os.path.exists(dst_file): os.remove(dst_file) shutil.move(os.path.join(tmp_model_dir, fileName), FLAGS.output_model_path) if __name__ == "__main__": scores_text = [] train_data, train_label, test_data, tid, valide_data, valide_label, weight = load_data( ) if not FLAGS.load_only_singleCnt and FLAGS.model_type == 'k': test_data = list(test_data.transpose()) if not FLAGS.load_stacking_data: models, stacking_data, stacking_label, stacking_test_data = nfold_train(train_data, train_label, flags = FLAGS, \ model_types = [FLAGS.model_type], scores = scores_text, test_data = test_data, \ valide_data = valide_data, valide_label = valide_label) else: for i in range(train_label.shape[1]): models, stacking_data, stacking_label, stacking_test_data = nfold_train(train_data, train_label[:, i], flags = FLAGS, \ model_types = [FLAGS.model_type], scores = scores_text, emb_weight = emb_weight, test_data = test_data \ # , valide_data = train_data[:100], valide_label = train_label[:100, i] ) sub_re[:, i] = models_eval(models, test_data) sub(models, stacking_data = stacking_data, stacking_label = stacking_label, stacking_test_data = stacking_test_data, \ test = test_data, scores_text = scores_text, tid = tid)
train_label, flags=FLAGS, model_types=['cnn'], tokenizer=tokenizer ) #, valide_data = train_data, valide_label = train_label) # exit(0) # for c in coly: # print("------Label: {0}".format(c)) # label = train_label[c].values # models, _, _, _ = nfold_train(train_data, label, fold = 5, model_types = ['k']) #, valide_label = train_label) # multi_label_models.append(models) # sub2[c] = models_eval(models, test_data) #model = ensemble.ExtraTreesClassifier(n_jobs=-1, random_state=3) #model.fit(data[:nrow], y[:nrow]) # print(1- model.score(data[:nrow], y[:nrow])) sub2[coly] = models_eval(models, test_data) # sub2 = pd.DataFrame([[c[1] for c in sub2[row]] for row in range(len(sub2))]).T # sub2.columns = coly sub2['id'] = tid for c in coly: sub2[c] = sub2[c].clip(0 + 1e12, 1 - 1e12) ''' #blend 1 sub2.columns = [x+'_' if x not in ['id'] else x for x in sub2.columns] blend = pd.merge(sub1, sub2, how='left', on='id') for c in coly: blend[c] = blend[c] * 0.8 + blend[c+'_'] * 0.2 blend[c] = blend[c].clip(0+1e12, 1-1e12) blend = blend[sub1.columns] #blend 2 sub2 = blend[:]
def sub(models, stacking_data=None, stacking_label=None, stacking_test_data=None,
        test_data=None, scores_text=None, tid=None, sub_re=None, col=None,
        leak_target=None, aug_data_target=None, train_part_img_id=None,
        validate_part_img_id=None, train_data=None):
    """Pickle run artifacts into ``./model_dir/`` and move them to FLAGS.output_model_path.

    * ``FLAGS.stacking``: pickles ``stacking_data`` and ``stacking_label``.
    * ``FLAGS.predict``: pickles ``stacking_data`` / ``stacking_test_data`` as
      train / test data.
    * otherwise: evaluates each model truncated at its ``avg_pool`` layer on
      the train and test images, pickles both results, saves every model with
      its train/validation image-id split, and writes an aggregated score CSV.

    Args:
        models: sequence of entries; ``entry[0]`` wraps the model (its
            ``.model`` attribute is used for truncation and saving) and
            ``entry[1]`` is the type tag ('l', 'k' or 'r').
        stacking_data, stacking_label, stacking_test_data: objects to pickle.
        test_data, train_data: indexable by ``'img'``; the images are passed
            through ``preprocess_img`` before evaluation.
        scores_text: iterable of pandas objects accepted by ``pd.concat``.
        train_part_img_id, validate_part_img_id: indexable by model position;
            each element must provide ``to_csv``.
        tid, sub_re, col, leak_target, aug_data_target: unused in this variant.
    """
    tmp_model_dir = "./model_dir/"
    time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
    os.makedirs(tmp_model_dir, exist_ok=True)

    def dump_pickle(stem, payload):
        # Resulting path is "./model_dir//<stem><time_label>.pickle".
        with open(tmp_model_dir + '/' + stem + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(payload, handle)

    if FLAGS.stacking:
        dump_pickle('stacking_train_data', stacking_data)
        dump_pickle('stacking_train_label', stacking_label)
    elif FLAGS.predict:
        dump_pickle('train_data', stacking_data)
        dump_pickle('test_data', stacking_test_data)
    else:
        # Re-wire each network so that it outputs the 'avg_pool' activations.
        flat_models = []
        for entry in models:
            net = entry[0].model
            truncated = Model(inputs=net.inputs,
                              outputs=net.get_layer(name='avg_pool').output)
            flat_models.append((truncated, 'k'))

        flat_train_re = models_eval(flat_models, preprocess_img(train_data['img']))
        flat_test_re = models_eval(flat_models, preprocess_img(test_data['img']))
        dump_pickle('flat_train_re', flat_train_re)
        dump_pickle('flat_test_re', flat_test_re)

        # Persist every model together with the image ids of its fold split.
        for idx, entry in enumerate(models):
            kind = entry[1]
            model_stem = tmp_model_dir + "model_" + str(idx) + time_label
            if kind == 'l':
                entry[0].save_model(model_stem + ".txt")
            elif kind in ('k', 'r'):
                entry[0].model.save(model_stem + ".h5")
            train_part_img_id[idx].to_csv(
                tmp_model_dir + 'train_part_img_id_' + str(idx) + '.csv', index=False)
            validate_part_img_id[idx].to_csv(
                tmp_model_dir + 'validate_part_img_id_' + str(idx) + '.csv', index=False)

        # One row per statistic/column pair, one column per index label.
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_df = pd.concat(scores_text)
        grouped = scores_text_df.groupby(scores_text_df.index)
        summary = grouped.agg(['max', 'min', 'mean', 'median', 'std'])
        summary.T.to_csv(score_text_file, index=True)

    # Publish: move everything from the scratch directory to the output path.
    os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for entry_name in os.listdir(tmp_model_dir):
        target = os.path.join(FLAGS.output_model_path, entry_name)
        if os.path.exists(target):
            os.remove(target)
        shutil.move(os.path.join(tmp_model_dir, entry_name), FLAGS.output_model_path)
def load_data():
    """Load train/test CSVs, engineer features and return model-ready frames.

    Reads ``train.csv`` and ``test.csv`` from the module-level ``path``
    (first 10000 rows only when ``FLAGS.debug``), log-transforms the price,
    derives calendar features, label-encodes the categorical columns,
    optionally adds predictions of previously trained LightGBM models as the
    ``lgb_pred`` feature and, for ``FLAGS.model_type == 'k'``, tokenizes and
    pads the text columns and prepares an embedding matrix.

    Returns:
        tuple: ``(train_data, train_label, test_data, test_id, valide_data,
        valide_label, weight, cat_max, emb_weight)`` where ``valide_data``,
        ``valide_label`` and ``weight`` are always ``None``, ``cat_max`` holds
        the per-column maximum of ``keras_train.USED_CATEGORY_FEATURES`` and
        ``emb_weight`` is ``None`` unless the model type is 'k'.
    """
    print("\nData Load Stage")
    nrow = 10000 if FLAGS.debug else None
    training = pd.read_csv(path + '/train.csv', index_col="item_id",
                           parse_dates=["activation_date"], nrows=nrow)
    traindex = training.index
    testing = pd.read_csv(path + '/test.csv', index_col="item_id",
                          parse_dates=["activation_date"], nrows=nrow)
    testdex = testing.index

    y = training.deal_probability.copy()
    training.drop("deal_probability", axis=1, inplace=True)
    print('Train shape: {} Rows, {} Columns'.format(*training.shape))
    print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

    print("Combine Train and Test")
    df = pd.concat([training, testing], axis=0)
    del training, testing
    gc.collect()
    print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

    print("Feature Engineering")
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the `df[...]` selection: the in-place form on a selection is not
    # guaranteed to write through to `df` under pandas copy-on-write.
    df["price"] = np.log(df["price"] + 0.001)
    df["price"] = df["price"].fillna(-999)
    df["image_top_1"] = df["image_top_1"].fillna(-999)

    print("\nCreate Time Variables")
    df["Weekday"] = df['activation_date'].dt.weekday
    # NOTE(review): `Series.dt.week` was removed in pandas 2.0; use
    # `.dt.isocalendar().week` when the pandas pin is raised.
    df["WeekdOfYear"] = df['activation_date'].dt.week
    df["DayOfMonth"] = df['activation_date'].dt.day
    df.drop(["activation_date", "image"], axis=1, inplace=True)

    print("\nEncode Variables")
    categorical = [
        "user_id", "region", "city", "parent_category_name", "category_name",
        "user_type", "image_top_1", "param_1", "param_2", "param_3"
    ]
    print("Encoding :", categorical)
    lbl = preprocessing.LabelEncoder()
    for col in categorical:
        # Missing values are encoded through their string form ('nan'). The
        # former `df[col].fillna('Unknown')` statement discarded its result
        # and is dropped; actually applying it would shift the label codes
        # that previously trained models were fitted on.
        df[col] = lbl.fit_transform(df[col].astype(str))
    cat_max = df[keras_train.USED_CATEGORY_FEATURES].max().astype('int64')
    print(cat_max)

    textfeats = ["description", "title"]

    if FLAGS.lgb_boost_dnn:
        # Load one booster per fold. The file index used to be hard-coded to
        # 0, which loaded the same model `lgb_ensemble_nfold` times.
        models = [
            (lgb.Booster(model_file=FLAGS.input_previous_model_path + '/model_'
                         + str(fold) + '_2018_05_31_04_04_47.txt'), 'l')
            for fold in range(FLAGS.lgb_ensemble_nfold)
        ]
        df['lgb_pred'] = models_eval(models, df)
        # Register the feature once, even if load_data() is called again.
        if 'lgb_pred' not in keras_train.USED_FEATURE_LIST:
            keras_train.USED_FEATURE_LIST += ['lgb_pred']

    emb_weight = None
    if FLAGS.model_type == 'k':
        print('Tokenizer...')
        for cols in textfeats:
            # Fill before casting: astype(str) first would turn NaN into the
            # literal text 'nan' and make the fillna a no-op.
            df[cols] = df[cols].fillna('missing').astype(str)
        data = df[textfeats].apply(lambda x: ' '.join(x), axis=1).values
        tokenizer = Tokenizer(num_words=FLAGS.vocab_size)
        tokenizer.fit_on_texts(data)
        for i, cols in enumerate(textfeats):
            data = pad_sequences(tokenizer.texts_to_sequences(df[cols]),
                                 maxlen=FLAGS.max_len[i])
            df[cols] = data.tolist()

        if FLAGS.load_wv_model:
            emb_weight = get_word2vec_embedding(
                location=FLAGS.input_training_data_path + FLAGS.wv_model_file,
                tokenizer=tokenizer, nb_words=FLAGS.vocab_size,
                embed_size=FLAGS.gram_embedding_dim,
                model_type=FLAGS.wv_model_type,
                uniform_init_emb=FLAGS.uniform_init_emb)
        elif FLAGS.uniform_init_emb:
            emb_weight = np.random.uniform(0, 1, (FLAGS.vocab_size, FLAGS.emb_dim))
        else:
            emb_weight = np.zeros((FLAGS.vocab_size, FLAGS.emb_dim))

    # DataFrame.info() prints by itself and returns None.
    df.info()

    train_data = df.loc[traindex, keras_train.USED_FEATURE_LIST]
    train_label = y.values
    test_data = df.loc[testdex, keras_train.USED_FEATURE_LIST]
    test_id = testdex
    valide_data = None
    valide_label = None
    weight = None
    return (train_data, train_label, test_data, test_id, valide_data,
            valide_label, weight, cat_max, emb_weight)