def __init__(self, src_vocab_size, input_size, output_size, bidirectional=False,
             with_ln=False, prefix='Encoder', **kwargs):
    """Create the source embedding table and the recurrent encoder layers.

    The recurrent part is selected by ``wargs.enc_rnn_type``: 'gru' builds a
    forward GRU followed by a backward GRU, 'sru' builds a single SRU.  Any
    other value creates no recurrent layer in this constructor.

    Args:
        src_vocab_size: number of entries in the source embedding table.
        input_size: input feature size of the first recurrent layer.
        output_size: hidden size of the recurrent layers.
        bidirectional: forwarded to the SRU only; the GRU branch ignores it.
        with_ln: forwarded to both GRU layers.
        prefix: name prefix combined with the layer name through ``str_cat``.
        **kwargs: accepted but not used here.
    """
    super(Encoder, self).__init__()
    self.output_size = output_size

    def scoped(name):
        # Original note: yields 'Encoder_' + parameter name.
        return str_cat(prefix, name)

    self.src_lookup_table = nn.Embedding(
        src_vocab_size, wargs.src_wemb_size, padding_idx=PAD)

    rnn_kind = wargs.enc_rnn_type
    if rnn_kind == 'gru':
        # The backward GRU takes output_size inputs, not input_size.
        self.forw_gru = GRU(input_size, output_size,
                            with_ln=with_ln, prefix=scoped('Forw'))
        self.back_gru = GRU(output_size, output_size,
                            with_ln=with_ln, prefix=scoped('Back'))
    elif rnn_kind == 'sru':
        self.rnn = SRU(
            input_size=input_size,
            hidden_size=output_size,
            num_layers=wargs.enc_layer_cnt,
            dropout=wargs.drop_rate,
            bidirectional=bidirectional,
        )
def __init__(self, src_vocab_size, input_size, output_size, with_ln=False,
             prefix='Encoder', **kwargs):
    """Create the source embedding table and a forward/backward GRU pair.

    Args:
        src_vocab_size: number of entries in the source embedding table.
        input_size: input feature size of the forward GRU.
        output_size: hidden size of both GRUs (and input size of the backward one).
        with_ln: forwarded to both GRU layers.
        prefix: name prefix combined with 'Forw' / 'Back' through ``str_cat``.
        **kwargs: accepted but not used here.
    """
    super(Encoder, self).__init__()
    self.output_size = output_size

    self.src_lookup_table = nn.Embedding(
        src_vocab_size, wargs.src_wemb_size, padding_idx=PAD)

    # Original note: str_cat yields 'Encoder_' + parameter name.
    forward_name = str_cat(prefix, 'Forw')
    self.forw_gru = GRU(input_size, output_size, with_ln=with_ln, prefix=forward_name)

    backward_name = str_cat(prefix, 'Back')
    self.back_gru = GRU(output_size, output_size, with_ln=with_ln, prefix=backward_name)
def __init__(self, src_vocab_size, input_size, output_size, with_ln=False,
             prefix='Encoder', **kwargs):
    """Create the embedding table, a forward/backward GRU pair and a relation layer.

    Args:
        src_vocab_size: number of entries in the source embedding table.
        input_size: input feature size of the forward GRU.
        output_size: hidden size of both GRUs; also both size arguments of
            the relation layer.
        with_ln: forwarded to both GRU layers.
        prefix: name prefix combined with 'Forw' / 'Back' through ``str_cat``.
        **kwargs: accepted but not used here.
    """
    super(Encoder, self).__init__()
    self.output_size = output_size

    self.src_lookup_table = nn.Embedding(
        src_vocab_size, wargs.src_wemb_size, padding_idx=PAD)

    # Original note: str_cat yields 'Encoder_' + parameter name.
    self.forw_gru = GRU(input_size, output_size,
                        with_ln=with_ln, prefix=str_cat(prefix, 'Forw'))
    # Disabled experiment kept from the original:
    #self.relay0 = RelationLayer(output_size, output_size, wargs.filter_window_size,
    #                            wargs.filter_feats_size, wargs.mlp_size)
    #self.laynorm0 = LayerNormalization(wargs.enc_hid_size)
    self.back_gru = GRU(output_size, output_size,
                        with_ln=with_ln, prefix=str_cat(prefix, 'Back'))

    self.rn = RelationLayer(
        output_size,
        output_size,
        wargs.filter_window_size,
        wargs.filter_feats_size,
        wargs.mlp_size,
    )
def __init__(self, trg_vocab_size, trg_lookup_table, max_out=True, com=True):
    """Build the decoder layers from either the common or the private sizes.

    Args:
        trg_vocab_size: accepted for interface compatibility; not used here.
        trg_lookup_table: target embedding table stored on the decoder as-is.
        max_out: when true the three output projections are twice
            ``wargs.out_size`` wide.
        com: when true the plain ``wargs`` sizes are used, otherwise their
            ``*_pri`` counterparts.
    """
    super(Decoder, self).__init__()
    self.max_out = max_out

    # Resolve every size once; only the selected family of attributes is read.
    if com:
        dec_hid = wargs.dec_hid_size
        align = wargs.align_size
        trg_wemb = wargs.trg_wemb_size
        enc_hid = wargs.enc_hid_size
    else:
        dec_hid = wargs.dec_hid_size_pri
        align = wargs.align_size_pri
        trg_wemb = wargs.trg_wemb_size_pri
        enc_hid = wargs.enc_hid_size_pri

    self.attention = Attention(dec_hid, align)
    self.trg_lookup_table = trg_lookup_table
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()

    self.gru1 = GRU(trg_wemb, dec_hid)
    self.gru2 = GRU(enc_hid, dec_hid)

    if max_out:
        out_size = 2 * wargs.out_size
    else:
        out_size = wargs.out_size
    self.ls = nn.Linear(dec_hid, out_size)
    self.ly = nn.Linear(trg_wemb, out_size)
    self.lc = nn.Linear(enc_hid, out_size)
def __init__(self, trg_vocab_size, with_ln=False, max_out=True):
    """Build attention, embedding, two recurrent layers, projections and classifier.

    ``wargs.dec_rnn_type`` selects the recurrent layers: 'gru' builds two GRUs,
    'sru' builds two unidirectional SRUs.  Any other value leaves ``gru1`` and
    ``gru2`` undefined.

    Args:
        trg_vocab_size: size of the target embedding table and classifier output.
        with_ln: forwarded to the GRU layers (GRU branch only).
        max_out: when true the three output projections are twice
            ``wargs.out_size`` wide.
    """
    super(Decoder, self).__init__()
    self.max_out = max_out

    dec_hid = wargs.dec_hid_size
    trg_wemb = wargs.trg_wemb_size
    enc_hid = wargs.enc_hid_size

    self.attention = Attention(dec_hid, wargs.align_size)
    self.trg_lookup_table = nn.Embedding(trg_vocab_size, trg_wemb, padding_idx=PAD)
    self.tanh = nn.Tanh()

    rnn_kind = wargs.dec_rnn_type
    if rnn_kind == 'gru':
        self.gru1 = GRU(trg_wemb, dec_hid, with_ln=with_ln)
        self.gru2 = GRU(enc_hid, dec_hid, with_ln=with_ln)
    elif rnn_kind == 'sru':
        self.gru1 = SRU(input_size=trg_wemb, hidden_size=dec_hid,
                        num_layers=wargs.dec_layer_cnt, dropout=0.,
                        bidirectional=False)
        # NOTE(review): the SRU context input is 2 * enc_hid_size while the GRU
        # branch above uses enc_hid_size -- confirm which width the caller feeds.
        self.gru2 = SRU(input_size=2 * enc_hid, hidden_size=dec_hid,
                        num_layers=wargs.dec_layer_cnt, dropout=0.,
                        bidirectional=False)

    out_size = 2 * wargs.out_size if max_out else wargs.out_size
    self.ls = nn.Linear(dec_hid, out_size)
    self.ly = nn.Linear(trg_wemb, out_size)
    self.lc = nn.Linear(2 * enc_hid, out_size)

    # The embedding table is handed to the classifier only when sharing is on.
    if wargs.proj_share_weight is True:
        shared_table = self.trg_lookup_table
    else:
        shared_table = None
    self.classifier = Classifier(wargs.out_size, trg_vocab_size, shared_table)
def __init__(self, src_vocab_size, d_in, d_out, with_ln=False, prefix='Encoder', **kwargs):
    """Create the embedding, two GRU + relation-layer stages and four down-projections.

    Args:
        src_vocab_size: number of entries in the source embedding table.
        d_in: embedding width and input size of the forward GRU.
        d_out: hidden size of the GRUs and output width of every projection.
        with_ln: forwarded to both GRU layers.
        prefix: name prefix combined with 'Forw' / 'Back' through ``str_cat``.
        **kwargs: accepted but not used here.
    """
    super(Encoder, self).__init__()
    self.d_out = d_out

    self.src_lookup_table = nn.Embedding(src_vocab_size, d_in, padding_idx=PAD)

    # Original note: str_cat yields 'Encoder_' + parameter name.
    self.forw_gru = GRU(d_in, d_out, with_ln=with_ln, prefix=str_cat(prefix, 'Forw'))
    self.rnlay0 = RelationLayer(
        d_out, d_out, wargs.fltr_windows, wargs.d_fltr_feats, wargs.d_mlp)
    # Disabled experiments kept from the original:
    #self.map_in_out = nn.Linear(d_in, d_out)
    #self.laynorm0 = Layer_Norm(wargs.enc_hid_size)

    self.back_gru = GRU(d_out, d_out, with_ln=with_ln, prefix=str_cat(prefix, 'Back'))
    self.rnlay1 = RelationLayer(
        d_out, d_out, wargs.fltr_windows, wargs.d_fltr_feats, wargs.d_mlp)
    #self.laynorm1 = LayerNormalization(wargs.enc_hid_size)
    #self.dropout = nn.Dropout(0.1)

    # down0..down3 take d_in + k * d_out inputs for k = 1..4 and emit d_out.
    for level in range(4):
        in_width = d_in + (level + 1) * d_out
        setattr(self, 'down%d' % level, nn.Linear(in_width, d_out))
class CHAR_RNN(nn.Module):
    """Character-level recurrent model bundled with its optimizer, loss and sampler."""

    def __init__(self, vocab_size, hidden_size=256, lr=2e-3, rnn='gru', sampling='sample'):
        """Create the recurrent core, an Adam optimizer and a cross-entropy loss.

        Args:
            vocab_size: passed as the first argument of the recurrent core.
            hidden_size: passed as the second argument of the recurrent core.
            lr: learning rate of the Adam optimizer.
            rnn: 'rnn' or 'gru'; anything else raises NotImplementedError.
            sampling: strategy used by ``sample`` ('sample' or 'max').
        """
        super(CHAR_RNN, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.sampling = sampling

        if rnn == 'gru':
            self.rnn = GRU(self.vocab_size, self.hidden_size)
        elif rnn == 'rnn':
            self.rnn = RNN(self.vocab_size, self.hidden_size)
        else:
            raise NotImplementedError()

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, idxs):
        """Return the recurrent core's output for ``idxs``."""
        return self.rnn(idxs)

    def lossFn(self, inputs, targets):
        """Sum the cross-entropy over slices taken along dim 1 of both tensors."""
        total = torch.tensor(0.)
        step_pairs = zip(inputs.transpose(0, 1), targets.transpose(0, 1))
        for in_idxs, trg_idxs in step_pairs:
            step_preds = self(in_idxs)
            total += self.criterion(input=step_preds, target=trg_idxs)
        return total

    def sample(self, seed_ix, n, ix_to_char):
        """Generate ``n`` characters, feeding each prediction back as the next input.

        The hidden state is reset first.  With ``sampling == 'sample'`` the next
        index is drawn from the softmax distribution; with 'max' it is the
        argmax.  Any other setting raises NotImplementedError.
        """
        self.init_hidden()
        chosen = []
        current_ix = seed_ix
        for _ in range(n):
            mode = self.sampling
            if mode == 'sample':
                probs = torch.softmax(self(current_ix), dim=-1)
                pred_ix = np.random.choice(len(probs), p=probs.detach().numpy())
            elif mode == 'max':
                pred_ix = torch.argmax(self(current_ix))
            else:
                raise NotImplementedError()
            chosen.append(pred_ix)
            current_ix = torch.tensor(pred_ix)
        return ''.join(ix_to_char[int(ix)] for ix in chosen)

    def init_hidden(self):
        """Reset the recurrent core's hidden state."""
        self.rnn.init_hidden()
def __init__(self, trg_vocab_size, max_out=True):
    """Build the decoder layers, plus extra layers when dynamic CYK decoding is on.

    Args:
        trg_vocab_size: size of the target embedding table and classifier output.
        max_out: when true the three output projections are twice
            ``wargs.out_size`` wide.
    """
    super(Decoder, self).__init__()
    self.max_out = max_out
    self.attention = Attention(wargs.dec_hid_size, wargs.align_size)
    self.trg_lookup_table = nn.Embedding(trg_vocab_size, wargs.trg_wemb_size, padding_idx=PAD)
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()
    self.gru1 = GRU(wargs.trg_wemb_size, wargs.dec_hid_size)
    #self.gru1 = GRU(wargs.trg_wemb_size, wargs.dec_hid_size, enc_hid_size=wargs.trg_wemb_size)
    self.gru2 = GRU(wargs.enc_hid_size, wargs.dec_hid_size)
    out_size = 2 * wargs.out_size if max_out else wargs.out_size
    self.ls = nn.Linear(wargs.dec_hid_size, out_size)
    self.ly = nn.Linear(wargs.trg_wemb_size, out_size)
    self.lc = nn.Linear(wargs.enc_hid_size, out_size)
    #self.map_vocab = nn.Linear(wargs.out_size, trg_vocab_size)
    # The embedding table is passed to the classifier only when weight sharing is on.
    self.classifier = Classifier(
        wargs.out_size, trg_vocab_size,
        self.trg_lookup_table if wargs.proj_share_weight is True else None)
    # NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
    # source; confirm which statements belong under this `if`.
    if wargs.dynamic_cyk_decoding is True:
        # Replaces the gru2 created above with one fed by target embeddings.
        self.gru2 = GRU(wargs.trg_wemb_size, wargs.dec_hid_size, enc_hid_size=wargs.dec_hid_size)
        self.fwz = wargs.filter_window_size
        self.ffs = wargs.filter_feats_size
        #self.ha = nn.Linear(wargs.enc_hid_size, wargs.align_size)
        self.ha_btg = nn.Linear(wargs.enc_hid_size, wargs.align_size)
        self.U_att1 = nn.Linear(wargs.enc_hid_size, wargs.enc_hid_size)
        self.U_att2 = nn.Linear(wargs.enc_hid_size, wargs.enc_hid_size)
        #self.ha = nn.Sequential(
        #    nn.Linear(wargs.enc_hid_size, wargs.mlp_size),
        #nn.LeakyReLU(0.1),
        #nn.Linear(wargs.mlp_size, wargs.mlp_size),
        #nn.LeakyReLU(0.1),
        #    nn.Linear(wargs.mlp_size, wargs.align_size)
        #nn.LeakyReLU(0.1)
        #)
        # NOTE(review): every iteration assigns the same four attribute names, so
        # only the layers built for the last window size stay registered --
        # confirm this is intended.
        for i in range(len(self.fwz)):
            self.l_f1_0 = nn.Linear(wargs.enc_hid_size, wargs.enc_hid_size)
            self.l_f1_1 = nn.Linear(wargs.enc_hid_size, wargs.enc_hid_size)
            self.l_conv = nn.Sequential(
                nn.Conv1d(wargs.enc_hid_size, self.ffs[i], kernel_size=self.fwz[i], stride=1),
                nn.ReLU()
                #nn.BatchNorm2d(self.ffs[i])
            )
            self.l_f2 = nn.Linear(self.ffs[i], wargs.enc_hid_size)
def __init__(self, input_size, output_size, with_ln=False, prefix='Encoder', **kwargs):
    """Create a forward GRU followed by a backward GRU.

    Args:
        input_size: input feature size of the forward GRU.
        output_size: hidden size of both GRUs (and input size of the backward one).
        with_ln: forwarded to both GRU layers.
        prefix: name prefix combined with 'Forw' / 'Back' through ``str_cat``.
        **kwargs: accepted but not used here.
    """
    super(Encoder, self).__init__()
    self.output_size = output_size

    # Original note: str_cat yields 'Encoder_' + parameter name.
    forward_name = str_cat(prefix, 'Forw')
    self.forw_gru = GRU(input_size, output_size, with_ln=with_ln, prefix=forward_name)

    backward_name = str_cat(prefix, 'Back')
    self.back_gru = GRU(output_size, output_size, with_ln=with_ln, prefix=backward_name)
def build(self): print '\t building rnn cell...' if self.cell=='gru': hidden_layer=GRU(self.rng, self.n_input,self.n_hidden,self.n_batch, self.x,self.E,self.x_mask, self.is_train,self.p) else: hidden_layer=LSTM(self.rng, self.n_input,self.n_hidden,self.n_batch, self.x,self.E,self.x_mask, self.is_train,self.p) print '\t building softmax output layer...' softmax_shape=(self.n_hidden,self.n_output) output_layer=H_Softmax(softmax_shape, hidden_layer.activation, self.y_node,self.y_choice,self.y_bit_mask,self.y_mask) self.params=[self.E,] self.params+=hidden_layer.params self.params+=output_layer.params cost=output_layer.activation lr=T.scalar("lr") gparams=[T.clip(T.grad(cost,p),-10,10) for p in self.params] updates=sgd(self.params,gparams,lr) self.train=theano.function(inputs=[self.x,self.x_mask,self.y_node,self.y_choice,self.y_bit_mask,self.y_mask,self.n_batch,lr], outputs=cost, updates=updates, givens={self.is_train:np.cast['int32'](1)}) self.test=theano.function(inputs=[self.x,self.x_mask,self.y_node,self.y_choice,self.y_bit_mask,self.y_mask,self.n_batch], outputs=cost, givens={self.is_train:np.cast['int32'](0)}) '''
def __init__(self, trg_vocab_size, max_out=True):
    """Build attention, target embedding, two GRUs and three output projections.

    Args:
        trg_vocab_size: number of entries in the target embedding table.
        max_out: when true the three output projections are twice
            ``wargs.out_size`` wide.
    """
    super(Decoder, self).__init__()
    self.max_out = max_out

    dec_hid = wargs.dec_hid_size
    trg_wemb = wargs.trg_wemb_size
    enc_hid = wargs.enc_hid_size

    self.attention = Attention(dec_hid, wargs.align_size)
    self.trg_lookup_table = nn.Embedding(trg_vocab_size, trg_wemb, padding_idx=PAD)
    self.tanh = nn.Tanh()

    self.gru1 = GRU(trg_wemb, dec_hid)
    # Disabled variant kept from the original:
    #self.gru2 = GRU(wargs.enc_hid_size, wargs.dec_hid_size, with_two_attents=True)
    self.gru2 = GRU(enc_hid, dec_hid)

    if max_out:
        out_size = 2 * wargs.out_size
    else:
        out_size = wargs.out_size
    self.ls = nn.Linear(dec_hid, out_size)
    self.ly = nn.Linear(trg_wemb, out_size)
    self.lc = nn.Linear(enc_hid, out_size)
def create_model():
    """Instantiate the recurrent model selected by ``args.model_type``.

    Every supported model is built with the same keyword arguments, taken from
    the module-level ``dset`` and ``args`` objects.  For 'rnn+' a single-layer
    request is bumped to two layers, and ``args.layers`` is updated in place.

    Returns:
        The constructed model instance.

    Raises:
        ValueError: if ``args.model_type`` is not a supported name.  ValueError
            is a subclass of Exception, so existing ``except Exception``
            handlers still catch it.
    """
    constructors = {
        'lstm': LSTM,
        'rnn': RNN,
        'irnn': IRNN,
        'gru': GRU,
        'rnn+': IntersectionRNN,
        'peephole': Peephole,
        'ugrnn': UGRNN,
    }

    model_cls = constructors.get(args.model_type)
    if model_cls is None:
        raise ValueError('unknown model_type %r; expected one of %s'
                         % (args.model_type, sorted(constructors)))

    # The 'rnn+' branch forces at least two layers when one was requested.
    if args.model_type == 'rnn+' and args.layers == 1:
        args.layers = 2

    return model_cls(input_size=dset.input_dimension,
                     hidden_size=args.hx,
                     output_size=dset.output_dimension,
                     layers=args.layers,
                     drop=args.drop,
                     rec_drop=args.rec_drop)
def __init__(self, trg_vocab_size, max_out=True):
    """Build attention, target embedding, two GRUs, projections and classifier.

    Args:
        trg_vocab_size: size of the target embedding table and classifier output.
        max_out: when true the three output projections are twice
            ``wargs.out_size`` wide.
    """
    super(Decoder, self).__init__()
    self.max_out = max_out

    dec_hid = wargs.dec_hid_size
    trg_wemb = wargs.trg_wemb_size
    enc_hid = wargs.enc_hid_size

    self.attention = Attention(dec_hid, wargs.align_size)
    self.trg_lookup_table = nn.Embedding(trg_vocab_size, trg_wemb, padding_idx=PAD)
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()

    self.gru1 = GRU(trg_wemb, dec_hid)
    # Disabled variant kept from the original:
    #self.gru1 = GRU(wargs.trg_wemb_size, wargs.dec_hid_size, enc_hid_size=wargs.trg_wemb_size)
    self.gru2 = GRU(enc_hid, dec_hid)

    if max_out:
        out_size = 2 * wargs.out_size
    else:
        out_size = wargs.out_size
    self.ls = nn.Linear(dec_hid, out_size)
    self.ly = nn.Linear(trg_wemb, out_size)
    self.lc = nn.Linear(enc_hid, out_size)

    # The embedding table is handed to the classifier only when sharing is on.
    if wargs.proj_share_weight is True:
        shared_table = self.trg_lookup_table
    else:
        shared_table = None
    self.classifier = Classifier(wargs.out_size, trg_vocab_size, shared_table)
def __init__(self, vocab_size, hidden_size=256, lr=2e-3, rnn='gru', sampling='sample'):
    """Create the recurrent core, an Adam optimizer and a cross-entropy loss.

    Args:
        vocab_size: passed as the first argument of the recurrent core.
        hidden_size: passed as the second argument of the recurrent core.
        lr: learning rate of the Adam optimizer.
        rnn: 'rnn' or 'gru'; anything else raises NotImplementedError.
        sampling: sampling strategy name stored on the instance.
    """
    super(CHAR_RNN, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.sampling = sampling

    if rnn == 'gru':
        self.rnn = GRU(self.vocab_size, self.hidden_size)
    elif rnn == 'rnn':
        self.rnn = RNN(self.vocab_size, self.hidden_size)
    else:
        raise NotImplementedError()

    # The optimizer is created after the core so it sees the core's parameters.
    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.criterion = nn.CrossEntropyLoss()
def run_model(which='all'):
    """Train the selected model families on the CUDA device and plot their losses.

    Args:
        which: a model name ('ann', 'wann', 'rnn', 'exrnn', 'exmem', 'lstm',
            'gru') or a group name ('all', 'main', 'standard').  Every
            experiment whose selector list contains ``which`` is run, in the
            order listed below.
    """
    # (plot label, selectors that enable it, model factory, extra train kwargs)
    experiments = [
        ('ann', ('ann', 'all', 'main', 'standard'),
         lambda: ANN(emb_size, vocab_size, hid_dim, hid_num, class_num, sent_len),
         {'ann': True}),
        ('wann', ('wann', 'all', 'standard'),
         lambda: WANN(emb_size, vocab_size, hid_dim, hid_num, class_num, sent_len),
         {'ann': True}),
        ('rnn', ('rnn', 'all', 'main'),
         lambda: RNN(emb_size, vocab_size, hid_dim, hid_num, class_num),
         {}),
        ('exrnn', ('exrnn', 'all'),
         lambda: EXRNN(emb_size, vocab_size, hid_dim, hid_num, class_num, 2000, 2000),
         {}),
        ('exmem', ('exmem', 'all'),
         lambda: EXRNN(emb_size, vocab_size, hid_dim, hid_num, class_num, 2000,
                       forget_dim=None),
         {}),
        ('lstm', ('lstm', 'all', 'main'),
         lambda: LSTM(emb_size, vocab_size, hid_dim, hid_num, class_num),
         {}),
        ('gru', ('gru', 'all', 'main'),
         lambda: GRU(emb_size, vocab_size, hid_dim, hid_num, class_num),
         {}),
    ]

    for label, selectors, make_model, train_kwargs in experiments:
        if which not in selectors:
            continue
        # Models are built lazily so unselected ones are never constructed.
        model = make_model().cuda()
        loss_curve = train(model, x, target, **train_kwargs)
        plt.plot(loss_curve, label=label)

    # plt.ylim([0, 2])
    plt.legend()
    plt.grid(True)
    plt.show()
def build(self): print 'building rnn cell...' if self.cell == 'gru': hidden_layer = GRU(self.rng, self.n_input, self.n_hidden, self.n_batch, self.x, self.E, self.x_mask, self.is_train, self.p) else: hidden_layer = LSTM(self.rng, self.n_input, self.n_hidden, self.n_batch, self.x, self.E, self.x_mask, self.is_train, self.p) print 'building softmax output layer...' output_layer = level_softmax(self.n_hidden, self.n_output, hidden_layer.activation, self.y) cost = self.categorical_crossentropy(output_layer.activation) self.params = [ self.E, ] self.params += hidden_layer.params self.params += output_layer.params lr = T.scalar("lr") gparams = [T.clip(T.grad(cost, p), -10, 10) for p in self.params] updates = sgd(self.params, gparams, lr) self.train = theano.function( inputs=[ self.x, self.x_mask, self.y, self.y_mask, self.n_batch, lr ], outputs=cost, updates=updates, givens={self.is_train: np.cast['int32'](1)}) self.predict = theano.function( inputs=[self.x, self.x_mask, self.n_batch], outputs=output_layer.predicted, givens={self.is_train: np.cast['int32'](0)}) self.test = theano.function( inputs=[self.x, self.x_mask, self.y, self.y_mask, self.n_batch], outputs=cost, givens={self.is_train: np.cast['int32'](0)})
def gru(x, a, rew, rnn_state, n_hidden, n, activation, output_size):
    """Run a GRU over the concatenated inputs and project its outputs.

    ``x``, ``a`` and ``rew`` are concatenated along axis 1, given a leading
    axis of size one, and fed through ``tf.nn.dynamic_rnn``.  The outputs are
    reshaped to ``[-1, n_hidden]``, passed through a dense layer of
    ``output_size`` units and layer-normalised.

    Args:
        x, a, rew: tensors concatenated along axis 1.
        rnn_state: initial state handed to ``dynamic_rnn``.
        n_hidden: number of GRU units.
        n: upper bound on the sequence length given to ``dynamic_rnn``.
        activation: activation passed to the GRU cell.
        output_size: number of units of the dense layer.

    Returns:
        (normalised dense output, first row of the final GRU state).
    """
    features = tf.concat([x, a, rew], 1)

    # Project GRU cell; the original note says it applies layer normalization.
    # Stock alternative kept from the original:
    # gru_cell = tf.nn.rnn_cell.GRUCell(n_hidden, activation=activation, kernel_initializer=tf.initializers.orthogonal(), bias_initializer=tf.initializers.zeros())
    cell = GRU(n_hidden, activation=activation)

    batched = tf.expand_dims(features, [0])
    # Sequence length is the first dimension of `rew`, capped at n.
    seq_len = tf.minimum(tf.shape(rew)[:1], n)
    outputs, final_state = tf.nn.dynamic_rnn(
        cell,
        batched,
        initial_state=rnn_state,
        sequence_length=seq_len,
        time_major=False,
    )
    state_out = final_state[:1, :]

    flat_outputs = tf.reshape(outputs, [-1, n_hidden])
    projected = tf.layers.dense(
        flat_outputs,
        units=output_size,
        kernel_initializer=tf.initializers.glorot_normal(),
        bias_initializer=tf.zeros_initializer(),
    )
    # Layer normalization on the dense output.
    return tf.contrib.layers.layer_norm(projected), state_out
def train(self, epochs, learning_rate, kernel_size, hidden_size, model_cls, interaction, dropout):
    """Train a CNN or GRU two-input classifier and record the run.

    The model is validated every 100 iterations (skipping iteration 0 of each
    epoch).  After training, the run is handed to ``self.analyzer`` together
    with the mean of the most recent validation accuracies.

    Args:
        epochs: number of passes over the training loader.
        learning_rate: Adam learning rate.
        kernel_size: CNN kernel size (only used when ``model_cls == "cnn"``).
        hidden_size: CNN hidden size, or GRU encoding size.
        model_cls: "cnn" selects the CNN; any other value selects the GRU.
        interaction: forwarded to the model constructor.
        dropout: forwarded to the model constructor.
    """
    if model_cls == "cnn":
        model = CNN(embedding=self.data_train.vocab_embedding,
                    embedding_size=self.data_train.vocab_embedding_size,
                    lengths=self.data_train.lengths(),
                    kernel_size=kernel_size,
                    hidden_size=hidden_size,
                    interaction=interaction,
                    dropout=dropout)
    else:
        model = GRU(embedding=self.data_train.vocab_embedding,
                    embedding_size=self.data_train.vocab_embedding_size,
                    encoding_size=hidden_size,
                    interaction=interaction,
                    dropout=dropout)
    if self.use_gpu:
        model = model.cuda()

    loader = self.data_train.get_loader()
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    losses = []
    accuracies = []
    for epoch in range(1, epochs + 1):
        e_loss = []
        print("\nStarting epoch {}".format(epoch))
        for i, (s1, s2, labels) in enumerate(loader):
            if self.use_gpu:
                s1, s2, labels = s1.cuda(), s2.cuda(), labels.cuda()
            # Re-enter training mode on every step, after any validation pass.
            model.train()
            optimizer.zero_grad()

            # Forward pass
            logits = model(s1, s2)
            instance_loss = loss_fn(logits, labels)

            # Backward and optimize
            instance_loss.backward()
            optimizer.step()
            losses.append(instance_loss.item())
            e_loss.append(instance_loss.item())

            # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                val_acc = self.validate(model)
                accuracies.append(val_acc)
                print(
                    'Epoch: [{}/{}]\tStep: [{}/{}]\tValidation Acc: {:.4f}'
                    .format(epoch, epochs, i, len(loader), val_acc))
        # self.analyzer.plot_live_lr(e_loss, title="Epoch {}".format(epoch))

    # Average over the validations that actually happened: dividing by a fixed
    # 5 understated the accuracy whenever fewer than five were recorded.
    recent = accuracies[-5:]
    avg_acc = sum(recent) / len(recent) if recent else 0.0

    # NOTE(review): data_length assumes a batch size of 32 -- confirm against
    # the loader configuration.
    self.analyzer.record(model.cpu(),
                         losses,
                         epochs=epochs,
                         accuracies=accuracies,
                         learning_rate=learning_rate,
                         hidden_size=hidden_size,
                         kernel_size=kernel_size,
                         validation_accuracy=avg_acc,
                         model_name=model_cls,
                         dropout=dropout,
                         interaction=interaction,
                         data_length=32 * len(loader))
    self.analyzer.print_validation_results(self, model_cls, model)
    print("Final Accuracy: {}".format(avg_acc))
def split_input_target(sequence):
    """Return (input, target): the sequence without its last / first element."""
    return sequence[:-1], sequence[1:]


haikus_dataset = sequences.map(split_input_target)

BATCH_SIZE = 64
BUFFER_SIZE = 1000

haikus_dataset = (haikus_dataset.shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE))

model = GRU(vocab_size=len(ids_from_chars.get_vocabulary()),
            embedding_dim=256,
            rnn_units=1024)

# Smoke-test the untrained model on a single batch.
for input_example_batch, target_example_batch in haikus_dataset.take(1):
    # The model is built with an embedding_dim, so the input batch is taken to
    # hold token ids with no vocab axis; the previous label repeated the
    # prediction's 3-D shape.  NOTE(review): confirm against `sequences`.
    print(input_example_batch.shape, "# (batch_size, sequence_length)")
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape,
          "# (batch_size, sequence_length, vocab_size)")

# Draw one index per position from the first sequence's logits.
example_sequence_input = input_example_batch[0]
example_sequence_prediction_logits = example_batch_predictions[0]
example_sequence_prediction_indice = tf.squeeze(tf.random.categorical(
    example_sequence_prediction_logits, num_samples=1), axis=-1).numpy()
print(INPUT_DATA_FILE) PRINT_EVERY = int(os.environ.get("PRINT_EVERY", "25000")) if not MODEL_OUTPUT_FILE: ts = datetime.now().strftime("%Y-%m-%d-%H-%M") MODEL_OUTPUT_FILE = "GRU-%s-%s-%s-%s.dat" % (ts, VOCABULARY_SIZE, EMBEDDING_DIM, HIDDEN_DIM) # Load data x_train, y_train, word_to_index, index_to_word = load_data(INPUT_DATA_FILE, VOCABULARY_SIZE, max_sents=1000000) if not FLAGS.print_sentences: # Build model model = GRU(VOCABULARY_SIZE, hidden_dim=HIDDEN_DIM, bptt_truncate=-1) # Print SGD step time def sgd_callback(model, num_examples_seen): dt = datetime.now().isoformat() loss = model.calculate_loss(x_train[:10000], y_train[:10000]) print("\n%s (%d)" % (dt, num_examples_seen)) print("--------------------------------------------------") print("Loss: %f" % loss) generate_sentences_from_scratch(model, 10, index_to_word, word_to_index) save_model_parameters_theano(model, MODEL_OUTPUT_FILE) print("\n") sys.stdout.flush() for epoch in range(NEPOCH):
def train(config, seed):
    """Train the configured recurrent model, then evaluate it on fresh batches.

    ``config`` is mutated: ``num_classes`` (and for 'bss' also ``input_dim``)
    is set from the dataset, and ``input_length`` is rescaled for the 'bss'
    and 'bipalindrome' datasets.

    Args:
        config: run configuration (dataset, model_type, sizes, device, ...).
        seed: seed applied to NumPy and PyTorch.

    Returns:
        (per-step training losses, per-step training accuracies,
        mean accuracy over the evaluation batches).

    Raises:
        ValueError: if ``config.dataset`` or ``config.model_type`` is unknown.
            Previously this fell through to a NameError further down.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Load dataset
    if config.dataset == 'randomcomb':
        print('Load random combinations dataset ...')
        config.num_classes = config.input_length
        dataset = datasets.RandomCombinationsDataset(config.input_length)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                                 drop_last=True)
    elif config.dataset == 'bss':
        print('Load bss dataset ...')
        config.num_classes = 2
        config.input_dim = 3
        dataset = datasets.BaumSweetSequenceDataset(config.input_length)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                                 drop_last=True)
        # Rescaled only after the dataset has been built with the raw length.
        config.input_length = 4 * config.input_length
    elif config.dataset == 'bipalindrome':
        print('Load binary palindrome dataset ...')
        config.num_classes = 2
        dataset = datasets.BinaryPalindromeDataset(config.input_length)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                                 drop_last=True)
        config.input_length = config.input_length * 4 + 2 - 1
    else:
        raise ValueError('unknown dataset: {!r}'.format(config.dataset))

    # Setup the model that we are going to use; all variants share one signature.
    if config.model_type == 'LSTM':
        print("Initializing LSTM model ...")
        model_cls = LSTM
    elif config.model_type == 'biLSTM':
        print("Initializing bidirectional LSTM model...")
        model_cls = biLSTM
    elif config.model_type == 'GRU':
        print("Initializing GRU model ...")
        model_cls = GRU
    elif config.model_type == 'peepLSTM':
        print("Initializing peephole LSTM model ...")
        model_cls = peepLSTM
    else:
        raise ValueError('unknown model_type: {!r}'.format(config.model_type))
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, device).to(device)

    # Setup the loss and optimizer
    loss_function = torch.nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    losses = []
    train_accuracies = []
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # Move to GPU
        batch_inputs = batch_inputs.to(device)     # [batch_size, seq_length,1]
        batch_targets = batch_targets.to(device)   # [batch_size]

        # Reset for next iteration
        model.zero_grad()

        # Forward pass
        log_probs = model(batch_inputs)

        # Compute the loss, gradients and update network parameters
        loss = loss_function(log_probs, batch_targets)
        loss.backward()
        losses.append(loss.item())

        # Rescale gradients so their total norm is at most max_norm before
        # the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        predictions = torch.argmax(log_probs, dim=1)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / log_probs.size(0)
        train_accuracies.append(accuracy)
        # print(predictions[0, ...], batch_targets[0, ...])

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 60 == 0:
            # Adjacent literals replace the backslash continuation that used to
            # embed the source indentation in the middle of the message.
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, "
                  "Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size,
                      examples_per_second, accuracy, loss.item()))

        # Check if training is finished
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report
            # https://github.com/pytorch/pytorch/pull/9655
            break

        # Stop early once the most recent (up to 100) losses are all low enough.
        # NOTE(review): this can fire before 100 losses exist -- confirm intent.
        if all(x < 0.001 for x in losses[-100:]):
            break

    print('Done training.')

    # evaluate the model on new random data
    model.eval()
    test_accuracies = []
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Move to GPU
        batch_inputs = batch_inputs.to(device)     # [batch_size, seq_length,1]
        batch_targets = batch_targets.to(device)   # [batch_size]

        # Forward pass
        with torch.no_grad():
            log_probs = model(batch_inputs)

        predictions = torch.argmax(log_probs, dim=1)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / log_probs.size(0)
        test_accuracies.append(accuracy)

        # Evaluate on roughly 5000 examples.
        if step >= 5000 / config.batch_size:
            # If you receive a PyTorch data-loader error, check this bug report
            # https://github.com/pytorch/pytorch/pull/9655
            break

    return losses, train_accuracies, torch.tensor(
        test_accuracies).mean().item()
# Build the RNN from the parsed command-line options.  Layer sizes and the
# initial item embeddings come from local variables; the remaining keywords are
# read from `args`, with several of them wrapped in bool().
model = RNN(session_layers=session_layers,
            user_layers=user_layers,
            loss=args.loss,
            item_embedding=args.item_embedding,
            init_item_embeddings=item_embedding_values,
            hidden_act=args.hidden_act,
            dropout_p_hidden_usr=args.dropout_p_hidden_usr,
            dropout_p_hidden_ses=args.dropout_p_hidden_ses,
            dropout_p_init=args.dropout_p_init,
            lmbd=args.lmbd,
            decay=args.decay,
            grad_cap=args.grad_cap,
            sigma=args.sigma,
            adapt=args.adapt,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            momentum=args.momentum,
            init_as_normal=bool(args.init_as_normal),
            reset_after_session=bool(args.reset_after_session),
            train_random_order=bool(args.train_random_order),
            n_epochs=args.n_epochs,
            user_key=args.user_key,
            session_key=args.session_key,
            item_key=args.item_key,
            time_key=args.time_key,
            seed=args.rnd_seed,
            user_to_session_act=args.user_to_ses_act,
            user_propagation_mode=args.user_propagation_mode,
            user_to_output=bool(args.user_to_output))
def train(config, seed=0, seq_length=0):
    """Train the configured recurrent model and return its training history.

    ``config`` is mutated: ``input_length`` is overridden by a non-zero
    ``seq_length`` and rescaled for the 'bss' and 'bipalindrome' datasets;
    ``num_classes`` (and for 'bss' also ``input_dim``) is set from the dataset.

    Args:
        config: run configuration (dataset, model_type, sizes, device, ...).
        seed: seed applied to NumPy and PyTorch (CPU and, if available, CUDA).
        seq_length: when non-zero, replaces ``config.input_length``.

    Returns:
        (per-step loss history, per-step accuracy history).

    Raises:
        ValueError: if ``config.dataset`` or ``config.model_type`` is unknown.
            Previously this fell through to a NameError further down.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if seq_length != 0:
        config.input_length = seq_length

    # Initialize tensorboard writer
    # writer = SummaryWriter()

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Load dataset
    if config.dataset == 'randomcomb':
        print('Load random combinations dataset ...')
        config.num_classes = config.input_length
        dataset = datasets.RandomCombinationsDataset(config.input_length)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                                 drop_last=True)
    elif config.dataset == 'bss':
        print('Load bss dataset ...')
        config.num_classes = 2
        config.input_dim = 3
        dataset = datasets.BaumSweetSequenceDataset(config.input_length)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                                 drop_last=True)
        # Rescaled only after the dataset has been built with the raw length.
        config.input_length = 4 * config.input_length
    elif config.dataset == 'bipalindrome':
        print('Load binary palindrome dataset ...')
        config.num_classes = config.input_length
        dataset = datasets.BinaryPalindromeDataset(config.input_length)
        data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                                 drop_last=True)
        config.input_length = config.input_length * 4 + 2 - 1
    else:
        raise ValueError('unknown dataset: {!r}'.format(config.dataset))

    # Setup the model that we are going to use; all variants share one signature.
    if config.model_type == 'LSTM':
        print("Initializing LSTM model ...")
        model_cls = LSTM
    elif config.model_type == 'biLSTM':
        print("Initializing bidirectional LSTM model...")
        model_cls = biLSTM
    elif config.model_type == 'GRU':
        print("Initializing GRU model ...")
        model_cls = GRU
    elif config.model_type == 'peepLSTM':
        print("Initializing peephole LSTM model ...")
        model_cls = peepLSTM
    else:
        raise ValueError('unknown model_type: {!r}'.format(config.model_type))
    model = model_cls(config.input_length, config.input_dim,
                      config.num_hidden, config.num_classes,
                      config.batch_size, device).to(device)

    # Setup the loss and optimizer
    loss_function = torch.nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    loss_history = []
    acc_history = []
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # Move to GPU
        batch_inputs = batch_inputs.to(device)     # [batch_size, seq_length,1]
        batch_targets = batch_targets.to(device)   # [batch_size]

        # Reset for next iteration
        model.zero_grad()

        # Forward pass
        log_probs = model(batch_inputs)
        # print('log', log_probs.size())
        # print('batch', batch_targets.size)

        # Compute the loss, gradients and update network parameters
        loss = loss_function(log_probs, batch_targets)
        loss.backward()

        # Rescale gradients so their total norm is at most max_norm before
        # the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        predictions = torch.argmax(log_probs, dim=1)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / log_probs.size(0)

        loss_history.append(loss.item())
        acc_history.append(accuracy)

        if step % 200 == 0:
            print('\nLoss:', loss.item())
            print('Acc:', accuracy)
        # writer.add_scalar("Loss", loss, step)
        # writer.add_scalar("Accuracy", accuracy, step)
        # print(predictions[0, ...], batch_targets[0, ...])

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 60 == 0:
            # Adjacent literals replace the backslash continuation that used to
            # embed the source indentation in the middle of the message.
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, "
                  "Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size,
                      examples_per_second, accuracy, loss.item()))

        # Check if training is finished
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report
            # https://github.com/pytorch/pytorch/pull/9655
            break

    # writer.flush()
    # writer.close()
    print(f'Done training with seed {seed} and seq_length {seq_length}')
    # Guarded: an empty loader used to raise IndexError on the [-1] lookups.
    if loss_history:
        print('Final loss:', loss_history[-1])
        print('Final acc:', acc_history[-1])
    return loss_history, acc_history
def fit(self, X, Y, activation=T.tanh, learning_rate=1e-1, mu=0.5, reg=0, epochs=120, show_fig=False):
    """Build the GRU stack and train it with per-sample momentum SGD.

    One ``GRU`` layer is created for every entry of
    ``self.hidden_layer_sizes``; a linear read-out (``self.Wo``,
    ``self.bo``) is initialised, and the Theano functions
    ``self.predict_op`` and ``self.train_op`` are compiled. Training
    minimises the squared error between the target and the last element
    returned by ``self.forward``.

    Args:
        X: 3-D array; its shape is unpacked as ``(N, t, D)`` and ``X[j]``
            is fed to the network as one matrix.
        Y: targets indexed as ``Y[j]``, one scalar per sample.
        activation: activation passed to every ``GRU`` layer.
        learning_rate: initial step size; divided by 10 after every 500th
            epoch.
        mu: momentum coefficient.
        reg: accepted for interface compatibility; not used here.
        epochs: number of passes over the data.
        show_fig: when true, plot the per-epoch summed cost at the end.
    """
    N, _, D = X.shape

    # Stack the recurrent layers: each layer's output size is the next
    # layer's input size.
    self.hidden_layers = []
    Mi = D
    for Mo in self.hidden_layer_sizes:
        ru = GRU(Mi, Mo, activation)
        self.hidden_layers.append(ru)
        Mi = Mo

    # Linear read-out on top of the last recurrent layer.
    Wo = np.random.randn(Mi) / np.sqrt(Mi)
    bo = 0.0
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.Wo, self.bo]
    for ru in self.hidden_layers:
        self.params += ru.params

    lr = T.scalar('lr')
    thX = T.matrix('X')
    thY = T.scalar('Y')

    # Only the last element of the forward output is scored
    # (presumably the final time step -- confirm against self.forward).
    Yhat = self.forward(thX)[-1]

    self.predict_op = theano.function(
        inputs=[thX],
        outputs=Yhat,
        allow_input_downcast=True,
    )

    cost = T.mean((thY - Yhat) * (thY - Yhat))
    grads = T.grad(cost, self.params)

    # One zero-initialised velocity buffer per parameter.
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]

    # Momentum update: the parameter and its velocity receive the same step.
    updates = [
        (p, p + mu * dp - lr * g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads)
    ]

    self.train_op = theano.function(
        inputs=[lr, thX, thY],
        outputs=cost,
        updates=updates
    )

    costs = []
    for i in range(epochs):
        t0 = datetime.now()
        X, Y = shuffle(X, Y)

        # Sum of the per-sample costs over this epoch.
        epoch_cost = 0
        for j in range(N):
            epoch_cost += self.train_op(learning_rate, X[j], Y[j])

        if i % 10 == 0:
            print(
                "i:", i, "cost:", epoch_cost,
                "time for epoch:", (datetime.now() - t0))

        # Step decay of the learning rate.
        if (i + 1) % 500 == 0:
            learning_rate /= 10

        costs.append(epoch_cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
model_dim = 64 batch_size = 128 epochs = 10 print("Data downloading and pre-processing ... ") (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=max_len, num_words=vocab_size) x_train = sequence.pad_sequences(x_train, maxlen=max_len) x_test = sequence.pad_sequences(x_test, maxlen=max_len) y_train = to_categorical(y_train) y_test = to_categorical(y_test) print('Model building ... ') inputs = Input(shape=(max_len, ), name="inputs") embeddings = Embedding(vocab_size, model_dim, scale=False)(inputs) outputs = BiDirectional(GRU(model_dim, return_outputs=True))(embeddings) x = GlobalAveragePooling1D()(outputs) x = Dropout(0.2)(x) x = Dense(10, activation='relu')(x) outputs = Dense(2, activation='softmax')(x) model = Model(inputs=inputs, outputs=outputs) model.compile(optimizer=Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9), loss='categorical_crossentropy', metrics=['accuracy']) print("Model Training ... ") es = EarlyStopping(patience=5) model.fit(x_train, y_train, batch_size=batch_size,
import sys # Uncomment to remove determinism np.random.seed(0) # Set to True to perform gradient checking GRAD_CHECK = False vec_size = 8 out_size = vec_size # Size of output bit vector at each time step in_size = vec_size + 2 # Input vector size, bigger because of start+stop bits hidden_size = 100 # Size of hidden layer of neurons learning_rate = 1e-1 # An object that keeps the network state during training. model = GRU(in_size, out_size, hidden_size) # An object that keeps the optimizer state during training optimizer = Adagrad(model.weights,learning_rate) n = 0 # counts the number of sequences trained on while True: # train on sequences of length from 1 to 4 seq_length = np.random.randint(1,5) i, t = sequences.copy_sequence(seq_length, vec_size) inputs = np.matrix(i) targets = np.matrix(t) # forward seq_length characters through the net and fetch gradient
def train(config):
    """Train the configured recurrent model, then estimate its accuracy.

    The function selects a dataset and a model from ``config``, runs the
    training loop with gradient-norm clipping, and finally evaluates the
    model on batches drawn from the same data loader.

    ``config`` is mutated: ``num_classes`` is always set, ``input_dim`` is
    set for the ``'bss'`` dataset, and ``input_length`` is rewritten for
    the ``'bss'`` and ``'bipalindrome'`` datasets.

    Args:
        config: object providing ``device``, ``dataset``, ``input_length``,
            ``input_dim``, ``num_hidden``, ``batch_size``, ``model_type``,
            ``learning_rate``, ``max_norm`` and ``train_steps``.

    Returns:
        ``(accuracy_list, loss_list)``: per-step training accuracy and loss.

    Raises:
        ValueError: if ``config.dataset`` or ``config.model_type`` is not
            one of the supported names.
    """
    # NOTE: no RNG seeding is done here; seed numpy/torch in the caller
    # when reproducible runs are needed.

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Load dataset. Each dataset is built from the *original* input length;
    # config.input_length is rewritten afterwards for the model.
    if config.dataset == 'randomcomb':
        print('Load random combinations dataset ...')
        config.num_classes = config.input_length
        dataset = datasets.RandomCombinationsDataset(config.input_length)
    elif config.dataset == 'bss':
        print('Load bss dataset ...')
        config.num_classes = 2
        config.input_dim = 3
        dataset = datasets.BaumSweetSequenceDataset(config.input_length)
        config.input_length = 4 * config.input_length
    elif config.dataset == 'bipalindrome':
        print('Load binary palindrome dataset ...')
        config.num_classes = config.input_length
        dataset = datasets.BinaryPalindromeDataset(config.input_length)
        config.input_length = config.input_length * 4 + 2 - 1
    else:
        raise ValueError('Unknown dataset: {!r}'.format(config.dataset))

    data_loader = DataLoader(dataset, config.batch_size, num_workers=1,
                             drop_last=True)

    # Setup the model that we are going to use. All models share the same
    # constructor arguments.
    model_args = (config.input_length, config.input_dim, config.num_hidden,
                  config.num_classes, config.batch_size, device)
    if config.model_type == 'LSTM':
        print("Initializing LSTM model ...")
        model = LSTM(*model_args).to(device)
    elif config.model_type == 'biLSTM':
        print("Initializing bidirectional LSTM model...")
        model = biLSTM(*model_args).to(device)
    elif config.model_type == 'GRU':
        print("Initializing GRU model ...")
        model = GRU(*model_args).to(device)
    elif config.model_type == 'peepLSTM':
        print("Initializing peephole LSTM model ...")
        model = peepLSTM(*model_args).to(device)
    else:
        raise ValueError(
            'Unknown model type: {!r}'.format(config.model_type))

    # Setup the loss and optimizer
    loss_function = torch.nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    accuracy_list = []
    loss_list = []
    old_loss = 1.0

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        # Only for time measurement of step through network
        t1 = time.time()

        # Move to the target device
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Reset for next iteration
        model.zero_grad()

        # Forward pass
        log_probs = model(batch_inputs)

        # Compute the loss, gradients and update network parameters
        loss = loss_function(log_probs, batch_targets)
        loss.backward()

        # Rescale the gradients so that their total norm does not exceed
        # config.max_norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        predictions = torch.argmax(log_probs, dim=1)
        correct = (predictions == batch_targets).sum().item()
        accuracy = correct / log_probs.size(0)
        loss_value = loss.item()

        accuracy_list.append(accuracy)
        loss_list.append(loss_value)

        # Just for time measurement
        t2 = time.time()

        if step % 60 == 0:
            elapsed = t2 - t1
            # A coarse clock can report zero elapsed time; avoid dividing
            # by zero in that case.
            if elapsed > 0:
                examples_per_second = config.batch_size / elapsed
            else:
                examples_per_second = float('inf')
            # Adjacent literals keep the message on a single log line
            # without embedding source indentation in it.
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, "
                  "Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size,
                      examples_per_second, accuracy, loss_value))

        # Stop when the step budget is reached or when the loss is exactly
        # the same as on the previous step.
        if step == config.train_steps or old_loss == loss_value:
            # If you receive a PyTorch data-loader error, check this bug report
            # https://github.com/pytorch/pytorch/pull/9655
            break
        old_loss = loss_value

    print('Done training.')

    print('Evaluating...')
    acc = []
    # Evaluation needs no gradients: switch to eval mode once and disable
    # autograd bookkeeping for the whole evaluation.
    model.eval()
    with torch.no_grad():
        for _ in range(3):
            acc_sublist = []
            for step, (batch_inputs, batch_targets) in enumerate(data_loader):
                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)

                pred = model(batch_inputs)
                predictions = torch.argmax(pred, dim=1)
                correct = (predictions == batch_targets).sum().item()
                acc_sublist.append(correct / pred.size(0))

                # Each evaluation round uses batches 0..25 inclusive.
                if step == 25:
                    break
            acc.append(np.mean(acc_sublist))

    print('Mean accuracy is {} and standard deviation is {}'.format(
        np.mean(acc), np.std(acc)))
    return accuracy_list, loss_list
# Training configuration.
vocabulary_size = 2000
embedding_dim = 48
hidden_dim = 128
nepochs = 20
model_output_file = "model_output_file.mof"
input_data_file = "./data/reddit-comments-2015.csv"
print_every = 25000

# Fall back to a timestamped file name when no output file is configured.
# The result is stored in the same variable that is tested, so the fallback
# actually takes effect.
if not model_output_file:
    ts = datetime.now().strftime("%Y-%m-%d-%H-%M")
    model_output_file = "GRU-%s-%s-%s-%s.dat" % (
        ts, vocabulary_size, embedding_dim, hidden_dim)

# Load data
x_train, y_train, word2index, index2word = load_data(
    input_data_file, vocabulary_size)

model = GRU(vocabulary_size, hidden_dim=hidden_dim, bptt_truncate=-1)

# Time a single SGD step.
# NOTE(review): learning_rate is not defined in the visible code -- confirm
# it is defined earlier in the file.
t1 = time.time()
model.sgd_step(x_train[10], y_train[10], learning_rate)
t2 = time.time()
# print() with one pre-formatted argument behaves the same on Python 2 and 3.
print("SGD Step time: %f (ms)" % ((t2 - t1) * 1000))


def sgd_callback(model, num_examples_seen):
    """Report the loss on the first 10000 examples and sample sentences.

    Args:
        model: model exposing ``calculate_loss``.
        num_examples_seen: number of training examples processed so far;
            only echoed in the progress line.
    """
    loss = model.calculate_loss(x_train[:10000], y_train[:10000])
    print("train instance: ", num_examples_seen, "Loss:", loss)
    generate_sentences(model, 10, index2word, word2index)
import sys # Uncomment to remove determinism np.random.seed(0) # Set to True to perform gradient checking GRAD_CHECK = False vec_size = 8 out_size = vec_size # Size of output bit vector at each time step in_size = vec_size + 2 # Input vector size, bigger because of start+stop bits hidden_size = 100 # Size of hidden layer of neurons learning_rate = 1e-1 # An object that keeps the network state during training. model = GRU(in_size, out_size, hidden_size) # An object that keeps the optimizer state during training optimizer = Adagrad(model.weights, learning_rate) n = 0 # counts the number of sequences trained on while True: # train on sequences of length from 1 to 4 seq_length = np.random.randint(1, 5) i, t = sequences.copy_sequence(seq_length, vec_size) inputs = np.matrix(i) targets = np.matrix(t) # forward seq_length characters through the net and fetch gradient
session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement, gpu_options=gpu_options) with tf.Session(config=session_conf).as_default() as sess: initializer = tf.random_uniform_initializer( -1 * FLAGS.init_scale, 1 * FLAGS.init_scale) with tf.variable_scope("model", reuse=None, initializer=initializer): model = GRU(FLAGS.batch_size, FLAGS.sequence_len, embedding, FLAGS.embedding_size, FLAGS.attention_dim, FLAGS.rnn_size, FLAGS.num_rnn_layers, num_classes, FLAGS.max_grad_norm, dropout=FLAGS.dropout, is_training=True) with tf.variable_scope("model", reuse=True, initializer=initializer): valid_model = GRU(FLAGS.batch_size, FLAGS.sequence_len, embedding, FLAGS.embedding_size, FLAGS.attention_dim, FLAGS.rnn_size,
help='whether to use combined policy and value nets') args = parser.parse_args() env = gym.make(args.env_name) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] env.seed(args.seed) torch.manual_seed(args.seed) if args.use_joint_pol_val: ac_net = ActorCritic(num_inputs, num_actions) opt_ac = optim.Adam(ac_net.parameters(), lr=0.0003) else: policy_net = GRU(num_inputs, num_actions) old_policy_net = GRU(num_inputs, num_actions) value_net = Value(num_inputs) opt_policy = optim.Adam(policy_net.parameters(), lr=0.0003) opt_value = optim.Adam(value_net.parameters(), lr=0.0003) def create_batch_inputs(batch_states_list, batch_actions_list, batch_advantages_list, batch_targets_list): lengths = [] for states in batch_states_list: lengths.append(states.size(0)) max_length = max(lengths) batch_states = torch.zeros(len(batch_states_list), max_length, num_inputs) batch_actions = torch.zeros(len(batch_actions_list), max_length,
def build(self):
    """Assemble the symbolic graph and compile the Theano functions.

    Two recurrent encoders of the same type (one for ``self.x``, one for
    ``self.y``) feed a similarity layer, followed by a convolution/pooling
    layer and a final projection layer. The cost is the mean binary
    cross-entropy between the projection output and ``self.label``.

    Sets ``self.params`` and compiles ``self.train``, ``self.predict`` and
    ``self.test``.

    Raises:
        ValueError: if ``self.cell`` is not ``'gru'``/``'lstm'`` or
            ``self.optimizer`` is not ``'sgd'``/``'adam'``/``'rmsprop'``.
    """
    log.info('building rnn cell....')
    if self.cell == 'gru':
        cell_cls = GRU
    elif self.cell == 'lstm':
        cell_cls = LSTM
    else:
        # Fail here instead of with a NameError further down.
        raise ValueError('unsupported cell type: %r' % (self.cell,))

    # Both encoders share the embedding matrix self.E; they differ only in
    # the input sequence and its mask.
    encoder_x = cell_cls(self.rng, self.n_input, self.n_hidden, self.x,
                         self.E, self.xmask, self.is_train, self.dropout)
    encoder_y = cell_cls(self.rng, self.n_input, self.n_hidden, self.y,
                         self.E, self.ymask, self.is_train, self.dropout)

    log.info('build the sim matrix....')
    sim_layer = Similarity(encoder_x.activation, encoder_y.activation,
                           metrics=self.sim)

    log.info('building convolution pooling layer....')
    # NOTE(review): the 50x50 input size is hard-coded -- presumably the
    # padded sequence length; confirm against the data pipeline.
    conv_pool_layer = ConvPool(
        input=sim_layer.activation,
        filter_shape=(2, 1, 3, 3),  # feature_maps, 1, filter_h, filter_w
        input_shape=(self.batch_size, 1, 50, 50))

    # NOTE(review): 1152 is presumably the flattened size of the
    # conv/pool output -- confirm against ConvPool.
    projected_layer = basicLayer(conv_pool_layer.activation,
                                 input_shape=1152)

    rav_cost = T.nnet.binary_crossentropy(projected_layer.activation,
                                          self.label)
    cost = T.mean(rav_cost)
    # Element-wise correctness of the prediction thresholded at 0.5.
    acc = T.eq(projected_layer.activation > 0.5, self.label)
    log.info('cost calculated.....')

    self.params = [
        self.E,
    ]
    self.params += encoder_x.params
    self.params += encoder_y.params
    self.params += conv_pool_layer.params
    self.params += projected_layer.params

    lr = T.scalar('lr')
    # Clip every gradient element to [-3, 3].
    gparams = [T.clip(T.grad(cost, p), -3, 3) for p in self.params]

    if self.optimizer == 'sgd':
        updates = sgd(self.params, gparams, lr)
    elif self.optimizer == 'adam':
        updates = adam(self.params, gparams, lr)
    elif self.optimizer == 'rmsprop':
        updates = rmsprop(self.params, gparams, lr)
    else:
        raise ValueError('unsupported optimizer: %r' % (self.optimizer,))
    log.info('gradient calculated.....')

    # is_train is fixed through `givens`: 1 while training, 0 otherwise.
    # NOTE(review): np.cast was removed in NumPy 2.0 -- confirm the pinned
    # NumPy version before upgrading.
    self.train = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask, self.label, lr],
        outputs=[cost, acc],
        updates=updates,
        givens={self.is_train: np.cast['int32'](1)})

    # Returns the per-example cost and correctness for labelled data.
    self.predict = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask, self.label],
        outputs=[rav_cost, acc],
        givens={self.is_train: np.cast['int32'](0)})

    # Returns the projection-layer output for unlabelled data.
    self.test = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask],
        outputs=projected_layer.activation,
        givens={self.is_train: np.cast['int32'](0)})
def get_gru(options):
    """Create a ``GRU`` from ``options`` and return it."""
    return GRU(options)