def __init__(self, train, test, dir):
    """Create the DyNet parameters and remember the data and output directory.

    Args:
        train: training data, stored unchanged on ``self.trainData``.
        test: test data, stored unchanged on ``self.testData``.
        dir: directory path stored on ``self.dir``; if it already exists it
            is removed recursively.
    """
    collection = dy.Model()
    self.model = collection
    self.trainer = dy.AdamTrainer(collection)

    # Parameter shapes are read from the module-level ``args`` and ``vocab``.
    self.pW = collection.add_parameters((args.m, args.s))
    self.pU = collection.add_parameters((vocab.size(), args.m))

    self.trainData, self.testData = train, test
    self.dir = dir

    # Wipe any directory left at that path.
    if os.path.exists(self.dir):
        shutil.rmtree(self.dir)
def do_cpu():
    """Time a forward/backward pass over a chain of 1000x1000 products.

    Imports ``_dynet``, initialises it, multiplies one parameter matrix by
    itself six times, takes the squared distance of the result to itself and
    prints the elapsed wall-clock time prefixed with ``CPU time:``.
    """
    import _dynet as C

    C.init()
    collection = C.Model()
    weights = collection.add_parameters((1000, 1000))

    started = time.time()
    C.renew_cg()
    base = C.parameter(weights)
    # Left-associated product of seven factors, i.e. six multiplications.
    product = base
    for _ in range(6):
        product = product * base
    distance = C.squared_distance(product, product)
    distance.value()
    distance.backward()
    print("CPU time:", time.time() - started)
def do_gpu():
    """Time the same matrix-product benchmark with a GPU device requested.

    Appends ``--dynet-devices GPU:0`` to ``sys.argv`` before calling
    ``init()``, then runs the forward/backward pass and prints the elapsed
    wall-clock time prefixed with ``GPU time:``.
    """
    import _dynet as G
    import sys

    # The device request is passed through the process arguments.
    sys.argv.extend(['--dynet-devices', 'GPU:0'])
    G.init()
    collection = G.Model()
    weights = collection.add_parameters((1000, 1000))

    started = time.time()
    G.renew_cg()
    base = G.parameter(weights)
    # Left-associated product of seven factors, i.e. six multiplications.
    product = base
    for _ in range(6):
        product = product * base
    distance = G.squared_distance(product, product)
    distance.value()
    distance.backward()
    print("GPU time:", time.time() - started)
def __init__(self, input_vector_size, *argc):
    """Create the LSTM builders and linear parameters of the network.

    Args:
        input_vector_size: dimensionality of the input vectors. When it is
            ``0`` nothing is built and neither ``self.params`` nor
            ``self.model`` is set.
        *argc: accepted but not used here.
    """
    if input_vector_size == 0:
        return

    model = dy.Model()

    # Two single-layer builders read the raw input ...
    input_builders = [
        dy.LSTMBuilder(1, input_vector_size, LSTM_HIDDEN_DIM, model)
        for _ in range(2)
    ]
    # ... and two more read vectors of twice the hidden size.
    stacked_builders = [
        dy.LSTMBuilder(1, LSTM_HIDDEN_DIM * 2, LSTM_HIDDEN_DIM, model)
        for _ in range(2)
    ]
    weight = model.add_parameters((LINEAR_DIM, LSTM_HIDDEN_DIM * 2))
    vector = model.add_parameters(LINEAR_DIM)

    self.params = {
        "builders": input_builders + stacked_builders,
        "W": weight,
        "v": vector,
    }
    self.model = model
def main():
    """Train the CNN sentence classifier configured by the command line.

    Parses the options, loads the pretrained word2vec vectors, builds the
    train/validation sets, and trains for ``--n_epochs`` epochs. After each
    epoch it prints loss, F1 and accuracy for both splits and saves the
    embeddings and layers under ``RESULTS_DIR``. The vocabulary mappings are
    pickled to ``RESULTS_DIR`` before training starts.
    """
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--train_x_path', type=str, default='./data/train_x.txt',
                        help='File path of train x data [default: `./data/train_x.txt`]')
    # The help text used to advertise train_x.txt as the default here.
    parser.add_argument('--train_y_path', type=str, default='./data/train_y.txt',
                        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument('--valid_x_path', type=str, default='./data/valid_x.txt',
                        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument('--valid_y_path', type=str, default='./data/valid_y.txt',
                        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes', type=int, nargs='*', default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument('--num_fil', type=int, default=100,
                        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s', type=float, default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob', type=float, default=0.5,
                        help='Dropout probability [default: 0.5]')
    parser.add_argument(
        '--v_strategy', type=str, default='static',
        help='Embedding strategy. rand: Random initialization. '
             'static: Load pretrained embeddings and do not update during the training. '
             'non-static: Load pretrained embeddings and update during the training. '
             '[default: static]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    # Any strategy other than these three is treated as 'multichannel'.
    SINGLE_CHANNEL = V_STRATEGY in ['rand', 'static', 'non-static']
    NUM_CHA = 1 if SINGLE_CHANNEL else 2

    # File paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(
        W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH, TRAIN_Y_PATH, w2i, unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH, VALID_Y_PATH, w2i, unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'), 'wb') as f_w2i, \
            open(os.path.join(RESULTS_DIR, './i2w.dump'), 'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    def zero_padding(table):
        """Reset the '<s>' and '</s>' rows of ``table`` via make_emb_zero."""
        make_emb_zero(table, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    # V1
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    # V1 stays fixed for 'static' and 'multichannel'; 'rand' and 'non-static'
    # train it.
    V1_UPDATE = V_STRATEGY not in ['static', 'multichannel']
    zero_padding(V1)

    # V2 (second, trainable channel)
    if V_STRATEGY == 'multichannel':
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        zero_padding(V2)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh, DROPOUT_PROB),
        # Was hard-coded to 3 * NUM_FIL, which only matches the default three
        # window sizes. Assumes CNNText emits NUM_FIL features per window
        # size -- TODO confirm against CNNText.
        Dense(model, len(WIN_SIZES) * NUM_FIL, OUT_DIM, dy.logistic),
    ]

    def embed(x):
        """Return the embedded input expression for one transposed batch."""
        if SINGLE_CHANNEL:
            x_embs = dy.concatenate_cols(
                [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
            x_embs = dy.transpose(x_embs)
            return dy.reshape(x_embs, (x.shape[0], EMB_DIM, 1))
        # multichannel: stack both embedding tables along a third dimension
        x_embs1 = dy.concatenate_cols(
            [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
        x_embs2 = dy.concatenate_cols(
            [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
        x_embs1 = dy.transpose(x_embs1)
        x_embs2 = dy.transpose(x_embs2)
        return dy.concatenate([x_embs1, x_embs2], d=2)

    def forward_batch(data_x, data_y, i, test):
        """Run mini batch ``i`` of a split; return (mean loss, predictions)."""
        # Create a new computation graph
        dy.renew_cg()
        associate_parameters(layers)

        # Create a mini batch
        start = i * BATCH_SIZE
        end = start + BATCH_SIZE
        x = build_batch(data_x[start:end], w2i, max(WIN_SIZES)).T
        t = np.array(data_y[start:end])

        x_embs = embed(x)
        t = dy.inputTensor(t, batched=True)
        y = forwards(layers, x_embs, test=test)
        return dy.mean_batches(dy.binary_log_loss(y, t)), y

    # Train model
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            mb_loss, y = forward_batch(train_x, train_y, i, test=False)

            # Forward prop
            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Make padding embs zero again for whichever table was updated
            if V_STRATEGY in ['rand', 'non-static']:
                zero_padding(V1)
            elif V_STRATEGY in ['multichannel']:
                zero_padding(V2)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            mb_loss, y = forward_batch(valid_x, valid_y, i, test=True)

            # Forward prop
            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print(
            'EPOCH: %d, Train Loss:: %.3f (F1:: %.3f, Acc:: %.3f), Valid Loss:: %.3f (F1:: %.3f, Acc:: %.3f), Time:: %.3f[s]'
            % (
                epoch + 1,
                np.mean(loss_all_train),
                f1_score(train_y, pred_all_train),
                accuracy_score(train_y, pred_all_train),
                np.mean(loss_all_valid),
                f1_score(valid_y, pred_all_valid),
                accuracy_score(valid_y, pred_all_valid),
                time.time() - start_time,
            ))

        # Save model (one file per epoch)
        save_path = os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1))
        if SINGLE_CHANNEL:
            dy.save(save_path, [V1] + layers)
        else:
            dy.save(save_path, [V1, V2] + layers)
def main():
    """Train the selective-encoding summarizer configured by the command line.

    Parses the options, builds the dataset, then trains the encoder/decoder
    for ``--n_epochs`` epochs. After each epoch it prints the mean train and
    validation losses, saves the model to ``./model_e<epoch>`` and pickles
    the vocabulary mappings to ``./w2i.dump`` and ``./i2w.dump``.
    """
    parser = argparse.ArgumentParser(
        description='Selective Encoding for Abstractive Sentence Summarization in DyNet')

    # Help text now states the real default ('0'); it used to say -1.
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    # Removed a stray ')' from the end of this help text.
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404,
                        help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2,
                        help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000,
                        help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    # The requested size is replaced by the size of the vocabulary built.
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    def minibatch_loss(x_mb, y_mb):
        """Build a fresh graph and return the average loss of one mini batch."""
        # Create a new computation graph
        dy.renew_cg()
        associate_parameters([encoder, decoder])
        losses = []
        for x, t in zip(x_mb, y_mb):
            # Decoder input drops the last token, the target drops the first.
            t_in, t_out = t[:-1], t[1:]

            # Encoder
            x_embs = [dy.lookup(V, x_t) for x_t in x]
            hp, hb_1 = encoder(x_embs)

            # Decoder
            decoder.set_initial_states(hp, hb_1)
            t_embs = [dy.lookup(V, t_t) for t_t in t_in]
            y = decoder(t_embs)

            # Loss
            losses.append(dy.esum(
                [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
            ))
        return dy.average(losses)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            mb_loss = minibatch_loss(train_x_mb, train_y_mb)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            mb_loss = minibatch_loss(valid_x_mb, valid_y_mb)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time()-start_time
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
def __init__(self, num_layers, input_dim, hidden_dim, tasks, src_domain,
             main_task, adversarial_task, vocabularies, update_embs, exp_path,
             prediction_layer, additional_params):
    """Store the configuration and create every layer of the model.

    Layers are created in this order: the shared rnn, one embedding layer
    per vocabulary, one output layer per trainable task, the binary
    adversarial output layer, and the gradient reversal layer. Finally
    ``_set_model_params`` is called to collect the parameters.

    Args:
        num_layers: number of layers of the rnn.
        input_dim: dimension of the word embeddings.
        hidden_dim: dimension of the rnn hidden states.
        tasks: mapping from task id to task object; each task exposes
            ``trainable``, ``vocab_name``, ``num_classes`` and ``task_name``.
        src_domain: the source domain.
        main_task: id of the task that is the target of optimization.
        adversarial_task: task whose ``vocab_name`` selects the embedding
            layer for the adversarial data.
        vocabularies: mapping from vocabulary name to a builder exposing
            ``vocab_size`` and ``embeds``.
        update_embs: forwarded to ``setup_embedding_layer``.
        exp_path: stored on ``self.exp_path``.
        prediction_layer: task-specific layer used to predict the main task
            when the main task itself is not trainable.
        additional_params: extra parameters (e.g. dynet parameters) that are
            not applied to the model but kept for reporting results.
    """
    # parameter collection
    self.model = dn.Model()

    # plain configuration values
    self.exp_path = exp_path
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.tasks = tasks
    self.src_domain = src_domain

    # a trainable main task predicts with its own layer; otherwise fall back
    # to the layer named by the caller
    self.prediction_layer = (
        main_task if tasks[main_task].trainable else prediction_layer
    )

    self.additional_params = additional_params
    self.main_task = main_task
    self.vocabularies = vocabularies
    self.update_embs = update_embs

    # the shared rnn
    self.rnn = self.setup_rnn(
        model=self.model,
        num_layers=self.num_layers,
        input_dim=self.input_dim,
        hidden_dim=self.hidden_dim,
    )

    # one embedding layer per vocabulary, created in sorted name order;
    # several (or all) tasks may share the same embedding layer
    self.embedding_layers = {}
    for voc_name, vocab_builder in sorted(self.vocabularies.items()):
        self.embedding_layers[voc_name] = self.setup_embedding_layer(
            model=self.model,
            emb_dim=self.input_dim,
            vocab_size=vocab_builder.vocab_size,
            layername='{}#emb'.format(voc_name),
            update_embs=self.update_embs,
            embs=vocab_builder.embeds,
        )

    # each task points at the embedding layer of its vocabulary
    self.task2embedding_layers = {
        tid: task.vocab_name for tid, task in sorted(self.tasks.items())
    }

    # task-specific output layers; tasks without training data get none
    self.output_layers = {}
    for tid, task in sorted(self.tasks.items()):
        if not task.trainable:
            continue
        self.output_layers[tid] = self.setup_output_layer(
            model=self.model,
            input_dim=self.hidden_dim,
            output_dim=task.num_classes,
            layername='{}#out'.format(task.task_name),
        )

    # the adversarial data gets its own embedding-layer entry ...
    self.task2embedding_layers['adversarial'] = adversarial_task.vocab_name

    # ... and a binary output layer predicting the domain
    self.output_layers['adversarial'] = self.setup_output_layer(
        model=self.model,
        input_dim=self.hidden_dim,
        output_dim=2,
        layername='adversarialout',
    )

    self.gradient_reversal_layer = self.setup_gradient_reversal_layer(
        model=self.model,
        input_dim=self.hidden_dim,
        output_dim=self.hidden_dim,
        layername='gr',
    )

    # store all the model parameters in the model_params dict
    self._set_model_params()
# Micro-benchmark: one forward/backward pass over a chain of 1000x1000
# matrix products, defined for both the _dynet (C) and _gdynet (G) modules.
# Only the CPU variant is timed below.
import _gdynet as G
print()
import _dynet as C

cm = C.Model()
gm = G.Model()
cpW = cm.add_parameters((1000, 1000))
gpW = gm.add_parameters((1000, 1000))


def do_cpu():
    """Run the benchmark graph once with the ``_dynet`` module."""
    C.renew_cg()
    base = C.parameter(cpW)
    # Left-associated product of seven factors, i.e. six multiplications.
    product = base
    for _ in range(6):
        product = product * base
    distance = C.squared_distance(product, product)
    distance.value()
    distance.backward()


def do_gpu():
    """Run the benchmark graph once with the ``_gdynet`` module."""
    G.renew_cg()
    base = G.parameter(gpW)
    # Left-associated product of seven factors, i.e. six multiplications.
    product = base
    for _ in range(6):
        product = product * base
    distance = G.squared_distance(product, product)
    distance.value()
    distance.backward()


import time

s = time.time()
do_cpu()
print("CPU time:", time.time() - s)
def main():
    """Decode summaries with beam search and write them to the output file.

    Loads the vocabulary mappings and the trained ``V``/encoder/decoder from
    the paths given on the command line, runs beam search of width
    ``--beam_size`` for at most ``--max_len`` steps per input sentence, and
    writes one space-joined prediction per line to ``--output_file``.
    """
    parser = argparse.ArgumentParser(
        description='Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet')

    # Help text now states the real default ('0'); it used to read "-".
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_test', type=int, default=189651,
                        help='Number of test examples [default: 189651]')
    parser.add_argument('--beam_size', type=int, default=5,
                        help='Beam size [default: 5]')
    parser.add_argument('--max_len', type=int, default=100,
                        help='Maximum length of decoding [default: 100]')
    parser.add_argument('--model_file', type=str, default='./model_e1',
                        help='Trained model file path [default: ./model_e1]')
    parser.add_argument('--input_file', type=str, default='./data/valid.article.filter.txt',
                        help='Test file path [default: ./data/valid.article.filter.txt]')
    parser.add_argument('--output_file', type=str, default='./pred_y.txt',
                        help='Output file path [default: ./pred_y.txt]')
    parser.add_argument('--w2i_file', type=str, default='./w2i.dump',
                        help='Word2Index file path [default: ./w2i.dump]')
    parser.add_argument('--i2w_file', type=str, default='./i2w.dump',
                        help='Index2Word file path [default: ./i2w.dump]')
    parser.add_argument('--alloc_mem', type=int, default=1024,
                        help='Amount of memory to allocate [mb] [default: 1024]')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_TEST = args.n_test
    K = args.beam_size
    MAX_LEN = args.max_len
    ALLOC_MEM = args.alloc_mem

    # File paths
    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load trained model
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # point --w2i_file / --i2w_file at dumps you produced yourself.
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    test_X, _, _ = build_dataset(INPUT_FILE, w2i=w2i, n_data=N_TEST, target=False)

    model = dy.Model()
    V, encoder, decoder = dy.load(MODEL_FILE, model)

    bos_id = w2i['<s>']
    eos_id = w2i['</s>']

    # Decode
    pred_y = []
    for x in tqdm(test_X):
        dy.renew_cg()
        associate_parameters([encoder, decoder])

        # Initial states
        x_embs = [dy.lookup(V, x_t) for x_t in x]
        hp, hb_1 = encoder(x_embs)
        decoder.set_initial_states(hp, hb_1)
        s_0, c_0 = decoder.s_0, decoder.c_0

        # Each candidate: [accum log prob, last token id, s, c, decoded ids]
        candidates = [[0, bos_id, s_0, c_0, []]]

        for _ in range(MAX_LEN):
            tmp_candidates = []
            end_flag = True
            for score_tm1, y_tm1, s_tm1, c_tm1, y_02tm1 in candidates:
                if y_tm1 == eos_id:
                    # Finished hypotheses are carried over unchanged.
                    tmp_candidates.append(
                        [score_tm1, y_tm1, s_tm1, c_tm1, y_02tm1])
                    continue

                end_flag = False
                y_tm1_emb = dy.lookup(V, y_tm1)
                s_t, c_t, _q_t = decoder(y_tm1_emb, tm1s=[s_tm1, c_tm1], test=True)
                _q_t = np.log(_q_t.npvalue())  # Calculate log probs

                # Pick the K highest log probs and their ids with one sort.
                y_t = np.argsort(_q_t)[::-1][:K]
                q_t = _q_t[y_t]
                score_t = score_tm1 + q_t  # Accumulate log probs
                tmp_candidates.extend(
                    [[score_tk, y_tk, s_t, c_t, y_02tm1 + [y_tk]]
                     for score_tk, y_tk in zip(score_t, y_t)])

            # Stop once every hypothesis in the beam has emitted '</s>'.
            if end_flag:
                break

            # Sort by length-normalized log prob and keep the K best.
            candidates = sorted(
                tmp_candidates, key=lambda cand: -cand[0] / len(cand[-1])
            )[:K]

        # Pick the candidate with the highest score
        pred = candidates[0][-1]
        if eos_id in pred:
            pred.remove(eos_id)
        pred_y.append(pred)

    # Join once instead of growing the output string with += per sentence.
    pred_y_txt = ''.join(
        ' '.join([i2w[com] for com in pred]) + '\n' for pred in pred_y
    )

    with open(OUTPUT_FILE, 'w') as f:
        f.write(pred_y_txt)
#me = - e #last = dy.cmult(layers[-1], me) + e #print("gradient", last.value()) #log_loss = dy.log(last + epsilon) #print(log_loss.value()) ys = dy.vecInput(self.dim_out) ys.set([1 if i in targets else 0 for i in range(self.dim_out)]) loss = dy.binary_log_loss(layers[-1], ys) return dy.sum_elems(loss) if __name__ == "__main__": import dynet model = dy.Model() trainer = dy.SimpleSGDTrainer(model) classifier = MLP_sigmoid(2, 2, 2, 10, dy.rectify, model) dataset = [([-1, -1], {0}), ([-1, 1], {1}), ([1, -1], {1}), ([1, 1], {0})] for e in range(10040): for xs, y in dataset: dy.renew_cg() x = dy.vecInput(2) x.set(xs) l = classifier.get_loss(x, y) l.backward() trainer.update()
def main():
    """Decode summaries with the trained ABS model using beam search.

    Loads the vocabulary mappings and the trained model from the paths given
    on the command line, runs beam search of width ``--beam_size`` for at
    most ``--max_len`` steps per input sentence, and writes one space-joined
    prediction per line to ``--output_file``.
    """
    parser = argparse.ArgumentParser(
        description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')

    # Help text now states the real default ('0'); it used to read "-".
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: `0`]')
    parser.add_argument('--n_test', type=int, default=189651,
                        help='Number of test examples [default: `189651`]')
    parser.add_argument('--beam_size', type=int, default=5,
                        help='Beam size [default: `5`]')
    parser.add_argument('--max_len', type=int, default=100,
                        help='Maximum length of decoding [default: `100`]')
    parser.add_argument('--model_file', type=str, default='./model_e1',
                        help='Trained model file path [default: `./model_e1`]')
    parser.add_argument('--input_file', type=str, default='./data/valid.article.filter.txt',
                        help='Test file path [default: `./data/valid.article.filter.txt`]')
    parser.add_argument('--output_file', type=str, default='./pred_y.txt',
                        help='Output file path [default: `./pred_y.txt`]')
    parser.add_argument('--w2i_file', type=str, default='./w2i.dump',
                        help='Word2Index file path [default: `./w2i.dump`]')
    parser.add_argument('--i2w_file', type=str, default='./i2w.dump',
                        help='Index2Word file path [default: `./i2w.dump`]')
    parser.add_argument('--alloc_mem', type=int, default=1024,
                        help='Amount of memory to allocate [mb] [default: `1024`]')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_TEST = args.n_test
    K = args.beam_size
    MAX_LEN = args.max_len
    ALLOC_MEM = args.alloc_mem

    # File paths
    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load trained model
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # point --w2i_file / --i2w_file at dumps you produced yourself.
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    test_X, _, _ = build_dataset(INPUT_FILE, w2i=w2i, n_data=N_TEST)

    model = dy.Model()
    rush_abs = dy.load(MODEL_FILE, model)[0]

    # Length of the token context passed to the model at each step.
    C = rush_abs.c

    bos_id = w2i['<s>']
    eos_id = w2i['</s>']

    # Decode
    pred_y = []
    for x in tqdm(test_X):
        dy.renew_cg()
        rush_abs.associate_parameters()

        # Initial states
        rush_abs.set_initial_states(x)

        # Each candidate: [accum log prob, last token id, context, decoded ids]
        candidates = [[0, bos_id, [bos_id] * C, []]]

        for _ in range(MAX_LEN):
            tmp_candidates = []
            end_flag = True
            for score_tm1, y_tm1, y_c, y_02tm1 in candidates:
                if y_tm1 == eos_id:
                    # Finished hypotheses are carried over unchanged.
                    tmp_candidates.append([score_tm1, y_tm1, y_c, y_02tm1])
                    continue

                end_flag = False
                _q_t = rush_abs(t=y_c, test=True)
                _q_t = np.log(_q_t.npvalue())  # Log probs

                # Pick the K highest log probs and their ids with one sort.
                y_t = np.argsort(_q_t)[::-1][:K]
                q_t = _q_t[y_t]
                score_t = score_tm1 + q_t  # Accum log probs
                # Slide the context window: drop the oldest id, append the new.
                tmp_candidates.extend(
                    [[score_tk, y_tk, y_c[1:] + [y_tk], y_02tm1 + [y_tk]]
                     for score_tk, y_tk in zip(score_t, y_t)])

            # Stop once every hypothesis in the beam has emitted '</s>'.
            if end_flag:
                break

            # Sort by length-normalized score and keep the K best.
            candidates = sorted(
                tmp_candidates, key=lambda cand: -cand[0] / len(cand[-1])
            )[:K]

        # Pick the highest-scored candidate
        pred_y.append(candidates[0][-1])

    # Join once instead of growing the output string with += per sentence.
    pred_y_txt = ''.join(
        ' '.join([i2w[com] for com in pred]) + '\n' for pred in pred_y
    )

    with open(OUTPUT_FILE, 'w') as f:
        f.write(pred_y_txt)
def main():
    """Predict a binary label for every input sentence and write them out.

    Loads a saved model (embedding table(s) followed by the layers) and the
    pickled vocabulary mappings, pads every sentence with zeros on both
    sides, runs the layers in inference mode and writes one label per line
    to ``--output_file``.
    """
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    option_specs = [
        ('--gpu', int, -1, 'GPU ID to use. For cpu, set -1 [default: -1]'),
        ('--model_file', str, './model', 'Model to use for prediction [default: ./model]'),
        ('--input_file', str, './data/valid_x.txt', 'Input file path [default: ./data/valid_x.txt]'),
        ('--output_file', str, './pred_y.txt', 'Output file path [default: ./pred_y.txt]'),
        ('--w2i_file', str, './w2i.dump', 'Word2Index file path [default: ./w2i.dump]'),
        ('--i2w_file', str, './i2w.dump', 'Index2Word file path [default: ./i2w.dump]'),
        ('--alloc_mem', int, 1024, 'Amount of memory to allocate [mb] [default: 1024]'),
    ]
    for flag, kind, default, text in option_specs:
        parser.add_argument(flag, type=kind, default=default, help=text)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file
    ALLOC_MEM = args.alloc_mem

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load model
    model = dy.Model()
    pretrained_model = dy.load(MODEL_FILE, model)
    # Three saved objects means one embedding table plus the layers;
    # anything else is read as two tables plus the layers.
    MULTICHANNEL = len(pretrained_model) != 3
    if MULTICHANNEL:
        V1, V2 = pretrained_model[0], pretrained_model[1]
        layers = pretrained_model[2:]
    else:
        V1 = pretrained_model[0]
        layers = pretrained_model[1:]

    EMB_DIM = V1.shape()[0]
    WIN_SIZES = layers[0].win_sizes

    # Load test data
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    max_win = max(WIN_SIZES)
    test_X, _, _ = build_dataset(INPUT_FILE, w2i=w2i, unksym='unk')
    padding = [0]*max_win
    test_X = [padding + instance_x + padding for instance_x in test_X]

    def embed(table, ids):
        """Look up ``ids`` in ``table`` (no updates) and transpose the result."""
        columns = dy.concatenate(
            [dy.lookup(table, x_t, update=False) for x_t in ids], d=1)
        return dy.transpose(columns)

    # Pred
    pred_y = []
    for instance_x in tqdm(test_X):
        # Create a new computation graph
        dy.renew_cg()
        associate_parameters(layers)

        sen_len = len(instance_x)

        if MULTICHANNEL:
            x_embs = dy.concatenate(
                [embed(V1, instance_x), embed(V2, instance_x)], d=2)
        else:
            x_embs = dy.reshape(embed(V1, instance_x), (sen_len, EMB_DIM, 1))

        y = f_props(layers, x_embs, train=False)
        label = int(binary_pred(y.value()))
        pred_y.append(str(label))

    with open(OUTPUT_FILE, 'w') as f:
        f.write('\n'.join(pred_y))
def main():
    """Train the ABS summarization model configured by the command line.

    Parses the options, builds the vocabulary and the train/validation sets,
    then trains for ``--n_epochs`` epochs, reshuffling the training data each
    epoch. After each epoch it prints the mean train and validation losses,
    saves the model to ``./model_e<epoch>`` and pickles the vocabulary
    mappings to ``./w2i.dump`` and ``./i2w.dump``.
    """
    parser = argparse.ArgumentParser(
        description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=60000,
                        help='Vocabulary size [default: 60000]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--encoder_type', type=str, default='attention',
                        help='Encoder type. bow: Bag-of-words encoder. '
                             'attention: Attention-based encoder [default: attention]')
    parser.add_argument('--c', type=int, default=5,
                        help='Window size in neural language model [default: 5]')
    parser.add_argument('--q', type=int, default=2,
                        help='Window size in attention-based encoder [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    ENCODER_TYPE = args.encoder_type
    C = args.c
    Q = args.q
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    # Word counts are accumulated over both the articles and the titles.
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False, eos=True,
                                      unksym='<unk>', target=False, n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    # The requested size is replaced by the size of the vocabulary built.
    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    rush_abs = ABS(model, EMB_DIM, HID_DIM, VOCAB_SIZE, Q, C, encoder_type=ENCODER_TYPE)

    # Padding: prepend C-1 '<s>' ids to every target sequence.
    train_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in train_y]
    valid_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in valid_y]

    def minibatch_loss(data_X, data_y, i):
        """Build a fresh graph and return the average loss of mini batch ``i``."""
        # Create a new computation graph
        dy.renew_cg()
        rush_abs.associate_parameters()

        # Create a mini batch
        start = i*BATCH_SIZE
        end = start + BATCH_SIZE

        losses = []
        for x, t in zip(data_X[start:end], data_y[start:end]):
            # Inputs drop the last token; targets start at position C.
            t_in, t_out = t[:-1], t[C:]

            y = rush_abs(x, t_in)
            losses.append(dy.esum(
                [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
            ))
        return dy.average(losses)

    n_batches_train = math.ceil(len(train_X)/BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X)/BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            mb_loss = minibatch_loss(train_X, train_y, i)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            mb_loss = minibatch_loss(valid_X, valid_y, i)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [rush_abs])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
def main(args):
    """Run the main-task / adversary experiment described by ``args``.

    Loads the dataset selected by ``args.dataset``, builds the encoder and
    classifiers on a single DyNet model, trains the main task, then trains
    and evaluates the adversary on the hidden representations and prints a
    tab-separated summary of all collected results.

    When ``args.baseline`` is set, only ``train_baseline`` is run, its test
    score is printed, and the function returns early.
    """
    import dynet as dy

    # Dataset readers are wrapped in lambdas so only the selected one runs.
    loaders = {
        "ag": lambda: ag_data_reader.get_dataset(args.num_NE),
        "dw": lambda: dw_data_reader.get_dataset(args.num_NE),
        "bl": lambda: blog_data_reader.get_dataset(),
    }
    for country in ("fr", "de", "dk", "us", "uk"):
        # Bind `country` as a default to avoid the late-binding closure trap.
        loaders["tp_" + country] = (
            lambda code=country: trustpilot_data_reader.get_dataset(code)
        )

    train, dev, test = loaders[args.dataset]()

    # Main-task labels must form the contiguous range 0..K-1 (0 always included).
    labels_main_task = {example.get_label() for example in train}
    labels_main_task.add(0)
    assert sorted(labels_main_task) == list(range(len(labels_main_task)))

    labels_adve_task = get_aux_labels(train)

    for split_name, split in (("Train", train), ("Dev", dev), ("Test", test)):
        print("{} size: {}".format(split_name, len(split)))

    print("Train data distribution")
    mfb_train = print_data_distributions(train)
    print("Dev data distribution")
    mfb_dev = print_data_distributions(dev)
    print("Test data distribution")
    mfb_test = print_data_distributions(test)

    results = {}

    model = dy.Model()

    # Extra vocabulary symbols: two "g" markers followed by two "a" markers.
    gender_symbols = ["<g={}>".format(tag) for tag in ["F", "M"]]
    age_symbols = ["<a={}>".format(tag) for tag in ["U", "O"]]
    vocabulary = extract_vocabulary(train, add_symbols=gender_symbols + age_symbols)

    bilstm = HierarchicalBiLSTM(args, vocabulary, model)
    input_size = bilstm.size()
    main_classifier = MLP(
        input_size,
        len(labels_main_task),
        args.hidden_layers,
        args.dim_hidden,
        dy.rectify,
        model,
    )

    trainer = dy.AdamTrainer(model)
    trainer.set_clip_threshold(5)
    # Record the trainer's learning rate on args for downstream consumers.
    args.learning_rate = trainer.learning_rate

    if args.subset:
        train = train[:args.subset]
        dev = dev[:args.subset]

    output_size = len(labels_adve_task)
    adversary_classifier = MLP_sigmoid(
        input_size,
        output_size,
        args.hidden_layers,
        args.dim_hidden,
        dy.rectify,
        model,
    )

    # Optional components are only instantiated when requested.
    discriminator = None
    if args.atraining:
        discriminator = Discriminator(
            input_size,
            output_size,
            args.hidden_layers,
            args.dim_hidden,
            dy.rectify,
            model,
            trainer,
        )

    generator = None
    if args.generator:
        generator = Generator(args, vocabulary, model, trainer)

    mod = PrModel(
        args,
        model,
        trainer,
        bilstm,
        main_classifier,
        adversary_classifier,
        discriminator,
        generator,
        vocabulary,
    )

    if args.baseline:
        _, ftest = mod.train_baseline(train, dev, test, args.iterations)
        print(ftest)
        return

    # ---- Main task -------------------------------------------------------
    print("Train main task")
    results["000_main_dev_acc"] = mod.train_main(train, dev)

    targets_test = [example.get_label() for example in test]
    loss_test, acc_test, _ = mod.evaluate_main(test, targets_test)
    print("\t Test results : l={} acc={}".format(loss_test, acc_test))
    results["001_main_test_acc"] = acc_test

    # ---- Adversary training / privacy evaluation -------------------------
    train_hidden = mod.get_adversary_dataset(train)
    dev_hidden = mod.get_adversary_dataset(dev)
    test_hidden = mod.get_adversary_dataset(test)

    trainer.restart()
    print("Train adversary")
    results["002_adv_dev_F"] = mod.train_adversary(train_hidden, dev_hidden)

    targets_test = [example.get_aux_labels() for example in test]
    loss_test, acc_test, predictions_test = mod.evaluate_adversary(test_hidden)
    print("\t Adversary Test results : l={} acc={}".format(loss_test, acc_test))

    outsize = mod.adversary_classifier.output_size()
    adv_metrics = compute_eval_metrics(outsize, targets_test, predictions_test)
    print("\tF = {} ".format(adv_metrics))

    # Index layout as used here: [0]=precision, [1]=recall, [2]=F, [3]=per-task acc.
    results["003_adv_test_fscore"] = adv_metrics[2]
    results["004_adv_test_precision"] = adv_metrics[0]
    results["005_adv_test_recall"] = adv_metrics[1]
    for task_index, task_acc in enumerate(adv_metrics[3]):
        key = "{}_adv_test_acc_task_{}".format(str(task_index + 6).zfill(3), task_index)
        results[key] = task_acc

    # Trivial baseline: predict every auxiliary label for every test example.
    all_label_preds = [set(range(outsize)) for _ in targets_test]
    base_metrics = compute_eval_metrics(outsize, targets_test, all_label_preds)
    baseline_str = [base_metrics[2], base_metrics[0], base_metrics[1]]
    baseline_str += [x if x > 50.0 else 100 - x for x in base_metrics[3]]

    line = ["Baseline"] + ["NA"] * 7
    line += [str(round(mfb_train * 100, 2)), str(round(mfb_test * 100, 2)), "0"]
    baseline_cells = [str(value) for value in baseline_str]
    print("\t".join(line) + "\t" + "\t".join(baseline_cells))

    # Round plain Python floats only (exact type match, as before).
    for key, value in results.items():
        if type(value) is float:
            results[key] = round(value, 2)

    # Hyper-parameter columns appended to the report.
    results["#H"] = args.dim_hidden
    results["#h"] = args.hidden_layers
    results["#w"] = args.dim_word
    results["#W"] = args.dim_wrnn
    results["#Zatr"] = int(args.atraining)
    results["#Zptr"] = int(args.ptraining)
    results["#Zalpha"] = args.alpha

    keys = sorted(results)
    print("Model\t" + "\t".join(keys))
    print("\t".join(str(results[k]) for k in keys))
def _drgd_minibatch_loss(V, encoder, decoder, X_mb, y_mb):
    """Build a new computation graph and return the mean loss of one mini batch.

    Args:
        V: lookup parameters used to embed both source and target token ids.
        encoder: callable taking a list of embeddings; must expose
            ``associate_parameters()``.
        decoder: callable returning ``(y, KL)`` sequences; must expose
            ``associate_parameters()`` and ``set_initial_states()``.
        X_mb: sequence of source token-id lists.
        y_mb: sequence of target token-id lists, aligned with ``X_mb``.

    Returns:
        The DyNet expression ``dy.average`` of the per-example losses. The
        caller decides whether to only read ``.value()`` or also backprop.
    """
    # Create a new computation graph
    dy.renew_cg()
    encoder.associate_parameters()
    decoder.associate_parameters()

    losses = []
    for x, t in zip(X_mb, y_mb):
        # Decoder input drops the last token; the target is shifted by one.
        t_in, t_out = t[:-1], t[1:]

        # Encoder
        x_embs = [dy.lookup(V, x_t) for x_t in x]
        he = encoder(x_embs)

        # Decoder
        t_embs = [dy.lookup(V, t_t) for t_t in t_in]
        decoder.set_initial_states(he)
        y, KL = decoder(t_embs)

        # Per-step negative log-likelihood plus the decoder's per-step KL term
        # (presumably the variational KL divergence -- confirm against the
        # decoder implementation).
        loss = dy.esum([
            dy.pickneglogsoftmax(y_t, t_t) + KL_t
            for y_t, t_t, KL_t in zip(y, t_out, KL)
        ])
        losses.append(loss)

    return dy.average(losses)


def main():
    """Train the Deep Recurrent Generative Decoder summarizer with DyNet.

    Parses command-line options, initialises DyNet, builds the vocabulary and
    datasets from the fixed ``./data`` files, then trains for ``--n_epochs``
    epochs. After each epoch it prints the mean train/validation loss and
    writes ``./model_e<epoch>`` together with ``./w2i.dump`` / ``./i2w.dump``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet'
    )
    # Help text now reports the real default ('0'); it previously said -1.
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument(
        '--n_train', type=int, default=3803957,
        help=
        'Number of training examples (up to 3803957 in gigaword) [default: 3803957]'
    )
    parser.add_argument(
        '--n_valid', type=int, default=189651,
        help=
        'Number of validation examples (up to 189651 in gigaword) [default: 189651]'
    )
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim', type=int, default=256,
                        help='Latent size [default: 256]')
    parser.add_argument(
        '--alloc_mem', type=int, default=8192,
        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000  # upper bound passed to build_dataset; replaced by len(w2i) below
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    # Word counts are accumulated over both articles and titles.
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False,
                                      eos=True, unksym='<unk>', target=False,
                                      n_data=N_TRAIN, vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True,
                                  n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False,
                                  n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True,
                                  n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM, LAT_DIM,
                                         OUT_DIM)

    # Train model =======================================================================================
    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = _drgd_minibatch_loss(V, encoder, decoder,
                                           train_X[start:end],
                                           train_y[start:end])

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid (forward pass only; no parameter update)
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = _drgd_minibatch_loss(V, encoder, decoder,
                                           valid_X[start:end],
                                           valid_y[start:end])

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' %
              (epoch + 1, np.mean(loss_all_train), np.mean(loss_all_valid)))

        # Save model ======================================================================================
        # NOTE(review): the original formatting was collapsed; per-epoch
        # checkpointing is inferred from the epoch number in the file name.
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)