def train():
    """Train the CNN siamese matcher end to end and emit a submission.

    Splits the training data 75/25 into fit/holdout sets, builds bucketed
    data generators, fits a ``Siamese_CNN`` (frozen embeddings) under
    BCE-with-logits for 25 epochs, and writes test-set predictions to
    ``submission.csv``.
    """
    # NOTE(review): another ``train()`` is defined later in this module and
    # shadows this one at import time — confirm which version is intended.
    space = "words"

    full_train = DataSet.load_train()
    fit_df, holdout_df = train_test_split(full_train, test_size=0.25)
    test_df = DataSet.load_test()

    # One generator per split; only the fit generator shuffles.
    shared = dict(space=space, bucket_num=5, batch_size=5000, is_prefix_pad=True)
    fit_gen = DataGenerator(data_df=fit_df, is_shuffle=True, is_test=False, **shared)
    holdout_gen = DataGenerator(data_df=holdout_df, is_shuffle=False, is_test=False, **shared)
    test_gen = DataGenerator(data_df=test_df, is_shuffle=False, is_test=True, **shared)

    # Generators must be prepared before use.
    for gen in (fit_gen, holdout_gen, test_gen):
        gen.prepare()

    # Pre-trained item embeddings; any prepared generator can supply them.
    item_embed = fit_gen.get_item_embed_tensor(space)

    # NOTE(review): ``Model`` is constructed with one argument here but with
    # (name, network) elsewhere in this file — verify the intended signature.
    network = Siamese_CNN(item_embed, is_freeze=True)
    model = Model(network)

    loss_fn = nn.BCEWithLogitsLoss()
    # Only optimize parameters left trainable (embedding weights are frozen).
    trainable = ifilter(lambda p: p.requires_grad, network.parameters())
    optimizer = optim.Adam(trainable, lr=8e-4)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    model.train(fit_gen, holdout_gen, loss_fn, optimizer, scheduler, 25)

    scores = model.predict(test_gen).numpy()
    pd.DataFrame({"y_pre": scores}).to_csv("submission.csv", index=False)
def cv_main():
    """Run K-fold cross-validated training of the siamese LSTM.

    For each fold a fresh ``Siamese_LSTM`` is trained; out-of-fold
    predictions are accumulated for the training rows and one prediction
    column per fold is collected for the test set. Afterwards the OOF and
    per-fold test predictions are written to CSV, and a submission file is
    produced from the fold-mean test score.

    Relies on module-level configuration: ``version``, ``folder``,
    ``space``, ``batch_size``, the LSTM hyper-parameters, ``LR``,
    ``Gamma``, ``num_epochs`` and ``early_stop``.
    """
    splitter = KFold(n_splits=folder, shuffle=True, random_state=19920618)
    all_train_df = DataSet.load_train()
    test_df = DataSet.load_test()

    # The test generator is shared across folds; build and prepare it once.
    test_dg = DataGenerator(data_df=test_df, space=space, bucket_num=5,
                            batch_size=256, is_prefix_pad=False,
                            is_shuffle=False, is_test=True)
    print("prepare test data generator")
    test_dg.prepare()
    item_embed = test_dg.get_item_embed_tensor(space)

    # OOF prediction per training row, one test-prediction column per fold.
    train_eval = np.zeros(len(all_train_df))
    test_eval = np.zeros((len(test_df), folder))

    for fold, (tr_idx, va_idx) in enumerate(splitter.split(all_train_df)):
        print()
        run_name = version + "_cv_%s" % (fold)
        fold_train = all_train_df.iloc[tr_idx]
        fold_valid = all_train_df.iloc[va_idx]

        train_dg = DataGenerator(data_df=fold_train, space=space, bucket_num=5,
                                 batch_size=batch_size, is_prefix_pad=False,
                                 is_shuffle=True, is_test=False)
        val_dg = DataGenerator(data_df=fold_valid, space=space, bucket_num=5,
                               batch_size=256, is_prefix_pad=False,
                               is_shuffle=False, is_test=False)
        print("prepare train data generator, cv_%s" % fold)
        train_dg.prepare()
        print("prepare val data generator, cv_%s" % fold)
        val_dg.prepare()

        net = Siamese_LSTM(pre_trained_embedding=item_embed,
                           is_freeze=is_freeze,
                           hidden_size=hidden_size,
                           number_layers=num_layers,
                           lstm_dropout_p=lstm_drop_p,
                           bidirectional=bidirectional,
                           linear_hid_size=linear_hidden_size,
                           linear_hid_drop_p=linear_hid_drop_p,
                           input_drop_p=lstm_input_drop_p)
        net.init_weights()  ##TODO Whether to initialize customised weights as Keras

        fold_model = Model(run_name, net)
        loss_fn = nn.BCEWithLogitsLoss()
        # Skip frozen parameters when building the optimizer.
        optimizer = optim.Adam(ifilter(lambda p: p.requires_grad, net.parameters()),
                               lr=LR)
        scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=Gamma)

        fold_model.train(train_dg=train_dg, valid_dg=val_dg, criterion=loss_fn,
                         optimizer=optimizer, scheduler=scheduler,
                         num_epochs=num_epochs, early_stop_rounds=early_stop)
        fold_model.save_plot_()

        # Out-of-fold predictions for this fold's validation rows, plus this
        # fold's predictions on the shared test set.
        train_eval[va_idx] = fold_model.predict(val_dg).numpy()
        test_eval[:, fold] = fold_model.predict(test_dg).numpy()

    # Persist OOF predictions, per-fold test predictions, and the
    # fold-averaged submission score.
    train_pred_df = pd.DataFrame({version + "_train_pred_cv": train_eval})
    train_pred_df.to_csv(version + "_train_pred_cv.csv", index=False)

    fold_cols = [version + "_test_pred_cv_%s" % (k) for k in xrange(folder)]
    test_pred_df = pd.DataFrame(test_eval, columns=fold_cols)
    test_pred_df["y_pre"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(version + "_test_pred_cv.csv", index=False)
    test_pred_df[["y_pre"]].to_csv(version + "_submission_cv.csv", index=False)
def train_main():
    """Train a single siamese LSTM on an 80/20 split and emit a submission.

    Builds bucketed generators for the fit/validation/test splits, trains a
    ``Siamese_LSTM`` under BCE-with-logits with exponential LR decay and
    early stopping, saves the training plot, and writes test predictions to
    ``<version>_submission_sm.csv``.

    Relies on module-level configuration: ``version``, ``space``,
    ``batch_size``, the LSTM hyper-parameters, ``LR``, ``Gamma``,
    ``num_epochs`` and ``early_stop``.
    """
    run_name = version + "_sm"

    ##--------------parameters-------------------##
    full_train = DataSet.load_train()
    fit_df, valid_df = train_test_split(full_train, test_size=0.20)
    test_df = DataSet.load_test()

    # One generator per split; only the fit generator shuffles.
    fit_gen = DataGenerator(data_df=fit_df, space=space, bucket_num=5,
                            batch_size=batch_size, is_prefix_pad=False,
                            is_shuffle=True, is_test=False)
    valid_gen = DataGenerator(data_df=valid_df, space=space, bucket_num=5,
                              batch_size=512, is_prefix_pad=False,
                              is_shuffle=False, is_test=False)
    test_gen = DataGenerator(data_df=test_df, space=space, bucket_num=5,
                             batch_size=512, is_prefix_pad=False,
                             is_shuffle=False, is_test=True)

    # Generators must be prepared before use.
    for gen in (fit_gen, valid_gen, test_gen):
        gen.prepare()

    # Pre-trained item embeddings; any prepared generator can supply them.
    item_embed = fit_gen.get_item_embed_tensor(space)

    net = Siamese_LSTM(pre_trained_embedding=item_embed,
                       is_freeze=is_freeze,
                       hidden_size=hidden_size,
                       number_layers=num_layers,
                       lstm_dropout_p=lstm_drop_p,
                       bidirectional=bidirectional,
                       linear_hid_size=linear_hidden_size,
                       linear_hid_drop_p=linear_hid_drop_p,
                       input_drop_p=lstm_input_drop_p)
    net.init_weights()  ##TODO Whether initialize customised weights as Keras

    model = Model(run_name, net)
    loss_fn = nn.BCEWithLogitsLoss()
    # Skip frozen parameters when building the optimizer.
    optimizer = optim.Adam(ifilter(lambda p: p.requires_grad, net.parameters()),
                           lr=LR)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=Gamma)

    model.train(train_dg=fit_gen, valid_dg=valid_gen, criterion=loss_fn,
                optimizer=optimizer, scheduler=scheduler,
                num_epochs=num_epochs, early_stop_rounds=early_stop)
    model.save_plot_()

    scores = model.predict(test_gen).numpy()
    pd.DataFrame({"y_pre": scores}).to_csv(version + "_submission_sm.csv",
                                           index=False)
def train():
    """Train the LSTM siamese matcher with inline hyper-parameters.

    Splits the training data 75/25, builds bucketed generators, trains a
    bidirectional ``Siamese_LSTM`` for up to 150 epochs under
    BCE-with-logits, and writes test predictions to ``submission.csv``.
    """
    # NOTE(review): this redefines ``train()`` and shadows the earlier CNN
    # variant in this module — confirm which version is intended.
    space = "words"

    # --- hyper-parameters -------------------------------------------------
    is_freeze = True
    hidden_size = 100
    num_layers = 2
    lstm_dropout_p = 0.6  ##TODO 0.4->0.5->0.6
    lstm_input_dropout = 0.6
    bidirectional = True
    linear_hidden_size = 200
    linear_hid_drop_p = 0.3
    run_name = "v0.2"

    full_train = DataSet.load_train()
    fit_df, valid_df = train_test_split(full_train, test_size=0.25)
    test_df = DataSet.load_test()

    # One generator per split; only the fit generator shuffles.
    shared = dict(space=space, bucket_num=5, batch_size=512, is_prefix_pad=True)
    fit_gen = DataGenerator(data_df=fit_df, is_shuffle=True, is_test=False, **shared)
    valid_gen = DataGenerator(data_df=valid_df, is_shuffle=False, is_test=False, **shared)
    test_gen = DataGenerator(data_df=test_df, is_shuffle=False, is_test=True, **shared)

    # Generators must be prepared before use.
    for gen in (fit_gen, valid_gen, test_gen):
        gen.prepare()

    # Pre-trained item embeddings; any prepared generator can supply them.
    item_embed = fit_gen.get_item_embed_tensor(space)

    net = Siamese_LSTM(pre_trained_embedding=item_embed,
                       is_freeze=is_freeze,
                       hidden_size=hidden_size,
                       number_layers=num_layers,
                       lstm_dropout_p=lstm_dropout_p,
                       bidirectional=bidirectional,
                       linear_hid_size=linear_hidden_size,
                       linear_hid_drop_p=linear_hid_drop_p,
                       input_drop_p=lstm_input_dropout)

    # NOTE(review): ``Model``/``Model.train`` signatures differ from the other
    # training entry points in this file — verify against the Model class.
    model = Model(net)
    loss_fn = nn.BCEWithLogitsLoss()
    # Skip frozen parameters when building the optimizer.
    optimizer = optim.Adam(ifilter(lambda p: p.requires_grad, net.parameters()),
                           lr=0.001)  ##TODO 0.001
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.99)  ##TODO 0.99

    model.train(run_name, fit_gen, valid_gen, loss_fn, optimizer,
                scheduler, 150)  ##TODO 150

    scores = model.predict(test_gen).numpy()
    pd.DataFrame({"y_pre": scores}).to_csv("submission.csv", index=False)