def train_on_all_set():
    train_config = get_config()
    bert_config = get_bert_config(train_config)

    import pickle
    with open('tok_text_uncased.pkl', 'rb') as h:
        text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)

    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    # Rescale the main loss so the weighted mean stays ~1.
    lw = 1 / np.mean(weights)

    # Single-segment inputs for BERT (all segment ids are 0).
    train_seg = [[0 for _ in t] for t in text]
    train_gen = GeneralDataGenerator(
        inputs=[text, train_seg],
        outputs=[label, aux],
        sample_weights=[weights, np.ones_like(weights)],
        batch_size=32,
        pad_fn=[
            lambda x: seq_padding(x, truncate=False),
            lambda x: seq_padding(x, truncate=False)
        ])
    # train_gen = AllDataGenerator(text, label, aux, sample_weight)

    with tf.device('/cpu:0'):
        model = get_bert_multi_model(bert_config)
    # model.load_weights('save_models/bert.weights.h5')

    # OPTIMIZER PARAMS
    lr = 2e-5
    weight_decay = 0.01
    bsz = 32  # unused; the generator above fixes the batch size
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           loss_weights=[lw, 1.])
    parallel_model.fit_generator(
        train_gen.__iter__(),
        steps_per_epoch=len(train_gen),
        epochs=1,
        max_queue_size=100,
    )
    model.save('save_models/bert.weights-uncased-new_weight_all.h5')
    print("DONE")
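# The generators above rely on a `seq_padding` helper defined elsewhere in
# this repo. As a reference, here is a minimal sketch (under a hypothetical
# name, so it does not shadow the real one) of what it is assumed to do:
# zero-pad a batch of id sequences to the longest sequence in the batch;
# `truncate=False` leaves the full lengths intact.
def _seq_padding_sketch(batch, truncate=False, max_len=512):
    if truncate:
        batch = [x[:max_len] for x in batch]
    longest = max(len(x) for x in batch)
    return np.array([list(x) + [0] * (longest - len(x)) for x in batch])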
def get_weight():
    # Compute both weighting schemes and persist them alongside the raw
    # training data for later runs.
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    w0 = get_weights2(iden_df)
    del iden_df

    df = pd.read_csv('new_processed_data/train.csv')
    df['weight'] = weights
    df['weight0'] = w0
    df.to_csv('new_processed_data/train_weight.csv', index=False)
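# `get_weights_new` / `get_weights2` are defined elsewhere in the repo. As an
# illustration of the general idea only, here is the widely used scheme from
# public Jigsaw Unintended Bias kernels (a hypothetical stand-in; the repo's
# actual weighting may differ): comments that mention an identity, and
# comments whose toxicity label disagrees with the identity mention, are
# up-weighted so the bias metrics get more gradient signal.
def _example_sample_weights(df):
    iden = df[IDENTITY_COLUMNS].fillna(0).values >= 0.5
    toxic = df[TARGET_COLUMN].values >= 0.5
    weights = np.ones(len(df)) / 4
    weights += iden.any(axis=1) / 4            # mentions any identity
    weights += (toxic & ~iden.any(axis=1)) / 4  # toxic without identity mention
    weights += (~toxic & iden.any(axis=1)) / 4  # non-toxic with identity mention
    return weights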
def train_ml_all_set():
    train_config = get_config()
    bert_config = get_bert_config(train_config)

    import pickle
    with open('tok_text_uncased.pkl', 'rb') as h:
        text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)

    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    # Hold out 5.5% as validation, with the same seed used elsewhere.
    train_text, _, train_label, _, train_aux, _, train_weights, _ = train_test_split(
        text, label, aux, weights, test_size=0.055, random_state=59)

    train_seg = [[0 for _ in t] for t in train_text]
    train_gen = GeneralDataGenerator(
        inputs=[train_text, train_seg],
        outputs=[train_label, train_aux],
        sample_weights=[train_weights, np.ones_like(train_weights)],
        pad_fn=[
            lambda x: seq_padding(x, truncate=False),
            lambda x: seq_padding(x, truncate=False)
        ],
        batch_size=64)

    with tf.device('/cpu:0'):
        model = get_bert_multi_model(bert_config)

    # optimizer = Adam(lr=2e-5)
    # OPTIMIZER PARAMS
    lr = 2e-5
    weight_decay = 0.01
    bsz = 32  # unused; the generator above is built with batch_size=64
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
        # Apply weight decay only to matrices and embeddings.
        weight_decay_pattern=[
            'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
        ],
    )

    parallel_model = multi_gpu_model(model, gpus=4)
    # parallel_model.compile(loss=[focal_loss(gamma=2., alpha=.25), 'binary_crossentropy'],
    #                        optimizer=optimizer)
    parallel_model.compile(loss='binary_crossentropy', optimizer=optimizer)
    parallel_model.fit_generator(
        train_gen.__iter__(),
        steps_per_epoch=len(train_gen),
        epochs=1,
        max_queue_size=20,
    )
    model.save('save_models/bert.weights-large-nw.h5')
    # print('SAVED')
    # parallel_model.fit_generator(train_gen.__iter__(),
    #                              steps_per_epoch=len(train_gen),
    #                              epochs=1,
    #                              max_queue_size=20,
    #                              )
    # model.save('save_models/bert.weights-uncased-ml2-e2.h5')
    print("DONE")
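# The `weight_decay_pattern` above follows BERT's convention of applying
# decoupled weight decay only to weight matrices and embeddings, never to
# biases or layer-norm parameters. A sketch of the matching rule (the real
# logic lives inside AdamWarmup; this helper name is hypothetical):
def _should_decay(var_name,
                  patterns=('embeddings', 'kernel', 'W1', 'W2',
                            'Wk', 'Wq', 'Wv', 'Wo')):
    # Decay applies iff the variable name contains one of the substrings.
    return any(p in var_name for p in patterns)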
def train_elmo():
    # Vocab notes: 489603 tokens; <S> 489604, </S> 489605
    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')

    datadir = os.path.join('elmo_model')
    options_file = os.path.join(
        datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
    weight_file = os.path.join(
        datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

    texts = df['comment_text'].values
    labels_aux = df[AUX_COLUMNS].values
    identities = df[IDENTITY_COLUMNS].fillna(0).values
    labels = df[TARGET_COLUMN].values
    weights = get_weights_new(iden_df)

    # One-off vocab export, kept for reference:
    # import pickle
    # word_index = pickle.load(open('new_processed_data/word_index.pkl', 'rb'))
    # with open('new_processed_data/vocab.txt', 'w', encoding='utf8') as f:
    #     for k in word_index.keys():
    #         f.write(f'{k}\n')
    #     f.write('</S>')
    #     f.write('<S>')

    import pickle
    embedding_matrix = pickle.load(open('new_processed_data/emb.pkl', 'rb'))
    texts_ids = pickle.load(open('new_processed_data/texts.pkl', 'rb'))
    batcher = BT('new_processed_data/vocab.txt', 50)
    del iden_df
    del df

    train_ind, val_ind = train_test_split(range(len(texts)),
                                          random_state=59,
                                          test_size=0.055)

    # FOR ELMO: whitespace tokens, truncated to 512.
    train_texts, val_texts = texts[train_ind], texts[val_ind]
    train_texts = [s.split(' ')[:512] for s in train_texts]
    val_texts = [s.split(' ')[:512] for s in val_texts]

    # FOR W2V: pre-computed token ids, same truncation.
    train_texts_ids = [texts_ids[i] for i in train_ind]
    val_texts_ids = [texts_ids[i] for i in val_ind]
    train_texts_ids = [ti[:512] for ti in train_texts_ids]
    val_texts_ids = [ti[:512] for ti in val_texts_ids]

    train_labels, val_labels = labels[train_ind], labels[val_ind]
    train_weight = weights[train_ind]
    lw = 1 / np.mean(train_weight)
    train_aux_labels = labels_aux[train_ind]
    train_iden, val_iden = identities[train_ind], identities[val_ind]

    pad_fn = [
        lambda x: seq_padding(x, truncate=False),
        batcher.batch_sentences
    ]
    train_gen = GeneralDataGenerator(
        inputs=[train_texts_ids, train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=32,
        pad_fn=pad_fn)
    val_gen = ELMoPredictGenerator(text_ids=val_texts_ids,
                                   text=val_texts,
                                   pad_fn=pad_fn,
                                   batch_size=32)

    model = get_lstm_elmo_model(embedding_matrix, weight_file, options_file,
                                1024, len(AUX_COLUMNS))

    # lr = 1e-3
    # weight_decay = 0.01
    # bsz = 32
    # decay_steps = 3 * len(train_gen)
    # warmup_steps = int(0.05 * decay_steps)
    #
    # optimizer = AdamWarmup(
    #     decay_steps=decay_steps,
    #     warmup_steps=warmup_steps,
    #     lr=lr,
    #     weight_decay=weight_decay
    # )
    optimizer = Adam(1e-3)

    # Resume from epoch 3 of a previous run.
    load_model_weights(model,
                       'save_models/weights.3-93.597.elmo_w2v_lstm2_dp05.pkl')
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  loss_weights=[lw, 1.])
    model.summary()

    logger = KFoldLogger('elmo_w2v_lstm2_dp05',
                         val_gen,
                         val_true=val_labels,
                         val_iden=val_iden)
    model.fit_generator(train_gen.__iter__(),
                        len(train_gen),
                        epochs=5,
                        callbacks=[logger],
                        initial_epoch=3)
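# Assuming `BT` is bilm-tf's Batcher, `batcher.batch_sentences` turns a list
# of tokenized sentences into ELMo character ids, adding <S>/</S> boundary
# markers and padding to the longest sentence. Rough usage (shapes are an
# assumption based on bilm-tf's documented behavior):
#
#     char_ids = batcher.batch_sentences([['hello', 'world']])
#     # -> int array of shape (1, 2 + 2, 50): two tokens plus boundary
#     #    markers, 50 characters per word as configured above.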
def train_split():
    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')

    labels_aux = df[AUX_COLUMNS].values
    identities = iden_df[IDENTITY_COLUMNS].fillna(0).values
    labels = df[TARGET_COLUMN].values
    weights = get_weights_new(iden_df)
    # labels = (labels >= 0.5).astype(np.float)
    # labels_aux = (labels_aux >= 0.5).astype(np.float)

    import pickle
    embedding_matrix = pickle.load(open('new_processed_data/emb.pkl', 'rb'))
    # bpe_embedding_matrix = pickle.load(open('new_processed_data/bpe_embedding_matrix.pkl', 'rb'))
    texts = pickle.load(open('new_processed_data/texts.pkl', 'rb'))
    # bpe_embedding_matrix = np.concatenate([bpe_embedding_matrix, bpe_embedding_matrix], axis=1)
    # embedding_matrix += bpe_embedding_matrix
    del iden_df
    del df

    train_ind, val_ind = train_test_split(range(len(texts)),
                                          random_state=59,
                                          test_size=0.055)
    train_texts = [texts[i][:1024] for i in train_ind]
    val_texts = [texts[i][:1024] for i in val_ind]
    train_labels, val_labels = labels[train_ind], labels[val_ind]
    train_weight = weights[train_ind]
    # train_weight = train_weight / np.mean(train_weight)
    lw = 1 / np.mean(train_weight)
    train_aux_labels = labels_aux[train_ind]
    train_iden, val_iden = identities[train_ind], identities[val_ind]

    train_gen = GeneralDataGenerator(
        inputs=[train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=512)
    val_gen = GeneralPredictGenerator(text=val_texts, batch_size=512)

    # model = get_dcnn_model(embedding_matrix, len(AUX_COLUMNS))
    model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))

    # model.compile(loss=[binary_crossentropy_with_ranking, 'binary_crossentropy'], optimizer='adam')
    # opt = RMSprop(lr=1e-3)
    opt = Adam(1e-3)
    # lr = 1e-3
    # weight_decay = 0.01
    # bsz = 32
    # decay_steps = 10 * len(train_gen)
    # warmup_steps = int(0.1 * decay_steps)
    #
    # opt = AdamWarmup(
    #     decay_steps=decay_steps,
    #     warmup_steps=warmup_steps,
    #     lr=lr,
    #     weight_decay=weight_decay
    # )
    # load_model_weights(model, 'save_models/weights.0.9380885160416297.dcnn_dp0.5_n_deep.pkl')
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  loss_weights=[lw, 1.])
    model.summary()

    # Track an exponential moving average of the weights during training.
    EMAer = ExponentialMovingAverage(model)
    EMAer.inject()

    logger = KFoldLogger('lstm_w1_final',
                         val_gen,
                         val_true=val_labels,
                         val_iden=val_iden,
                         patience=10,
                         lr_patience=5)
    model.fit_generator(train_gen.__iter__(),
                        len(train_gen),
                        epochs=15,
                        callbacks=[logger],
                        verbose=1)
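# `ExponentialMovingAverage` comes from this repo. The sketch below shows the
# technique it is assumed to implement (Polyak averaging of model weights),
# with a hypothetical, simplified interface rather than the class's real API.
class _SimpleEMASketch:
    def __init__(self, model, momentum=0.999):
        self.model = model
        self.momentum = momentum
        self.shadow = [w.copy() for w in model.get_weights()]

    def update(self):
        # Called after each batch: blend current weights into the shadow copy.
        for s, w in zip(self.shadow, self.model.get_weights()):
            s *= self.momentum
            s += (1.0 - self.momentum) * w

    def apply(self):
        # Swap the averaged weights in before validation or saving.
        self._backup = self.model.get_weights()
        self.model.set_weights(self.shadow)

    def restore(self):
        self.model.set_weights(self._backup)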
def train_gpt():
    bpe = get_bpe_from_files(encoder_path, vocab_path)

    import pickle
    # with open('tok_text_uncased.pkl', 'rb') as h:
    #     text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)

    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    df = pd.read_csv('new_processed_data/train.csv')
    text = df['comment_text'].values
    del df

    train_text, _, train_label, _, train_aux, _, train_weights, _ = train_test_split(
        text, label, aux, weights, test_size=0.055, random_state=59)

    def pad_fn(ts):
        # BPE-encode on the fly and truncate to 512 tokens.
        ts = [bpe.encode(t)[:512] for t in ts]
        return seq_padding(ts, truncate=False)

    train_gen = GeneralDataGenerator(
        inputs=[train_text],
        outputs=[train_label, train_aux],
        sample_weights=[train_weights, np.ones_like(train_weights)],
        batch_size=16,
        pad_fn=[pad_fn])

    with tf.device('/cpu:0'):
        model = get_gpt_model(config_path, checkpoint_path)

    lr = 2e-5
    weight_decay = 0.01
    bsz = 32  # unused; the generator above is built with batch_size=16
    decay_steps = 2 * len(train_gen)
    warmup_steps = int(0.05 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    lw = 1 / np.mean(train_weights)

    # Resume from the first epoch's checkpoint and train one more epoch.
    model.load_weights('save_models/gpt.weights-new_weight.h5')
    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           loss_weights=[lw, 1.])
    parallel_model.fit_generator(train_gen.__iter__(),
                                 steps_per_epoch=len(train_gen),
                                 epochs=2,
                                 max_queue_size=100,
                                 initial_epoch=1)
    model.save('save_models/gpt.weights-new_weight-2.h5')
    print("DONE")
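# Quick sanity check for the GPT BPE pipeline used above. The paths shown are
# the standard GPT-2 release files and are an assumption; adjust to wherever
# `encoder_path` / `vocab_path` point in this repo:
#
#     bpe = get_bpe_from_files('models/117M/encoder.json', 'models/117M/vocab.bpe')
#     ids = bpe.encode('This comment is fine.')[:512]  # token ids, truncated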