class AdversarialDomainAdaptation(nn.Module):
    def __init__(self, input_dim, cnn_hidden_dim, filter_width,
                 lstm_hidden_dim, Lambda, use_cuda):
        super(AdversarialDomainAdaptation, self).__init__()
        self.question_encoder_cnn = CNNEncoder(
            input_dim, cnn_hidden_dim, filter_width, use_cuda=use_cuda)
        self.question_encoder_lstm = LSTMEncoder(
            input_dim, lstm_hidden_dim, use_cuda=use_cuda)
        self.gradient_reversal = GradientReversalLayer(Lambda, use_cuda)
        self.domain_classifier_cnn = DomainClassifier(
            input_dim=cnn_hidden_dim, use_cuda=use_cuda)
        self.domain_classifier_lstm = DomainClassifier(
            input_dim=lstm_hidden_dim, use_cuda=use_cuda)
        if use_cuda:
            self.cuda()

    def forward(self, title, body, title_mask, body_mask, use_cnn=True,
                use_domain_classifier=True, return_average=True):
        """
        Runs one forward pass on the input.

        Returns two things:
        - the embedding, so that we can feed it into the loss function for
          label prediction (only if the input came from the source dataset,
          not the target dataset)
        - the predicted domain label from softmax, so that we can feed it
          into the loss function for domain classification
        """
        if use_cnn:
            title_embedding = self.question_encoder_cnn.run_all(title, title_mask)
            body_embedding = self.question_encoder_cnn.run_all(body, body_mask)
        else:
            title_embedding = self.question_encoder_lstm.run_all(title, title_mask)
            body_embedding = self.question_encoder_lstm.run_all(body, body_mask)
        # Average the title and body encodings into a single question embedding.
        embedding = (title_embedding + body_embedding) / 2
        domain_label = None
        if use_domain_classifier:
            # Gradients flowing back through this layer are reversed, pushing
            # the encoder toward domain-invariant features.
            reverse = self.gradient_reversal(embedding)
            if use_cnn:
                domain_label = self.domain_classifier_cnn(reverse)
            else:
                domain_label = self.domain_classifier_lstm(reverse)
        return embedding, domain_label

    def change_lambda(self, Lambda):
        self.gradient_reversal.change_lambda(Lambda)
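# The GradientReversalLayer used above is not shown in this snippet. Below is
# a minimal sketch of one possible implementation, assuming the standard
# gradient-reversal trick from domain-adversarial training (Ganin & Lempitsky,
# 2015): identity on the forward pass, gradient scaled by -Lambda on the
# backward pass. The class and method names mirror the calls above, but the
# body is an assumption, not the original source.
import torch.nn as nn
from torch.autograd import Function


class _ReverseGradient(Function):
    @staticmethod
    def forward(ctx, x, lambda_):
        ctx.lambda_ = lambda_
        return x.view_as(x)  # identity on the forward pass

    @staticmethod
    def backward(ctx, grad_output):
        # Negate and scale the incoming gradient; no gradient w.r.t. lambda_.
        return grad_output.neg() * ctx.lambda_, None


class GradientReversalLayer(nn.Module):
    def __init__(self, Lambda, use_cuda=False):
        super(GradientReversalLayer, self).__init__()
        self.Lambda = Lambda

    def forward(self, x):
        return _ReverseGradient.apply(x, self.Lambda)

    def change_lambda(self, Lambda):
        self.Lambda = Lambda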
def part3(askubuntu_data, model_type, android_data):
    """
    Runs the model from part 3.

    If android_data is not None, also evaluates the model on android_data
    for the direct-transfer section of part 2.
    """
    if model_type == ModelType.LSTM:
        model = LSTMEncoder(EMBEDDING_LENGTH, LSTM_HIDDEN_DIM,
                            use_cuda=USE_CUDA, return_average=True)
    elif model_type == ModelType.CNN:
        model = CNNEncoder(EMBEDDING_LENGTH, CNN_HIDDEN_DIM, FILTER_WIDTH,
                           use_cuda=USE_CUDA, return_average=True)
    else:
        print "Error: unknown model type", model_type
        return
    train_model(model_type, askubuntu_data, model, NUM_EPOCHS, BATCH_SIZE,
                use_title=True, use_body=True, tfidf_weighting=True)
    print "----------Evaluating Part 2.3 on android dataset..."
    eval_part2(model, android_data, True, model_type,
               using_part1_model=True, tfidf_weighting=True)
    eval_part2(model, android_data, False, model_type,
               using_part1_model=True, tfidf_weighting=True)
def __init__(self, output_dim, hidden_dim, output_length,
             init='glorot_uniform', inner_init='orthogonal',
             forget_bias_init='one', activation='tanh',
             inner_activation='hard_sigmoid', weights=None,
             truncate_gradient=-1, input_dim=None, input_length=None,
             hidden_state=None, batch_size=None, depth=2,
             context_sensitive=False):
    # depth may be an int (same depth on both sides) or a [left, right] pair.
    if not isinstance(depth, list):
        depth = [depth, depth]
    n_lstms = sum(depth)
    if depth[1] < 2 and context_sensitive:
        print "Warning: Your model will not be able to remember its previous output!"
    if weights is None:
        weights = [None] * (n_lstms + 1)
    if hidden_state is None:
        hidden_state = [None] * (n_lstms + 1)
    encoder_index = depth[0] - 1
    decoder_index = depth[0] + 1
    decoder = LSTMDecoder2(dim=output_dim, hidden_dim=hidden_dim,
                           output_length=output_length, init=init,
                           inner_init=inner_init, activation=activation,
                           inner_activation=inner_activation,
                           weights=weights[decoder_index],
                           truncate_gradient=truncate_gradient,
                           hidden_state=hidden_state[decoder_index],
                           batch_size=batch_size,
                           remember_state=context_sensitive)
    encoder = LSTMEncoder(input_dim=input_dim, output_dim=hidden_dim,
                          init=init, inner_init=inner_init,
                          activation=activation,
                          inner_activation=inner_activation,
                          weights=weights[encoder_index],
                          truncate_gradient=truncate_gradient,
                          input_length=input_length,
                          hidden_state=hidden_state[encoder_index],
                          batch_size=batch_size,
                          remember_state=context_sensitive)
    left_deep = [LSTMEncoder(input_dim=input_dim, output_dim=input_dim,
                             init=init, inner_init=inner_init,
                             activation=activation,
                             inner_activation=inner_activation,
                             weights=weights[i],
                             truncate_gradient=truncate_gradient,
                             input_length=input_length,
                             hidden_state=hidden_state[i],
                             batch_size=batch_size, return_sequences=True,
                             remember_state=context_sensitive)
                 for i in range(depth[0] - 1)]
    right_deep = [LSTMEncoder(input_dim=output_dim, output_dim=output_dim,
                              init=init, inner_init=inner_init,
                              activation=activation,
                              inner_activation=inner_activation,
                              weights=weights[decoder_index + 1 + i],
                              truncate_gradient=truncate_gradient,
                              input_length=input_length,
                              hidden_state=hidden_state[decoder_index + 1 + i],
                              batch_size=batch_size, return_sequences=True,
                              remember_state=context_sensitive)
                  for i in range(depth[1] - 1)]
    dense = Dense(input_dim=hidden_dim, output_dim=output_dim)
    encoder.broadcast_state(decoder)
    if weights[depth[0]] is not None:
        dense.set_weights(weights[depth[0]])
    super(Seq2seq, self).__init__()
    for l in left_deep:
        self.add(l)
    self.add(encoder)
    self.add(dense)
    self.add(decoder)
    for l in right_deep:
        self.add(l)
    self.encoder = encoder
    self.dense = dense
    self.decoder = decoder
    self.left_deep = left_deep
    self.right_deep = right_deep
# embed_model = nn.Embedding(len(vocab), args.embed_size)
embed_model = None
dropout = 0.5
model = MMSeq2SeqModel(
    None,
    HLSTMEncoder(args.hist_enc_layers[0], args.hist_enc_layers[1],
                 len(vocab), args.hist_out_size, args.embed_size,
                 args.hist_enc_hsize, dropout=dropout, embed=embed_model),
    LSTMEncoder(args.in_enc_layers, len(vocab), args.in_enc_hsize,
                args.embed_size, dropout=dropout, embed=embed_model),
    # The decoder consumes the concatenated history and input encodings,
    # hence the summed input size.
    HLSTMDecoder(args.dec_layers, len(vocab), len(vocab), args.embed_size,
                 args.hist_out_size + args.in_enc_hsize,
                 args.dec_hsize, args.dec_psize,
                 independent=False, dropout=dropout, embed=embed_model))
initialize_model_weights(model, "he", "xavier")

# report data summary
logging.info('#vocab = %d' % len(vocab))
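# initialize_model_weights() is called above but not defined in this snippet.
# Below is a minimal sketch of one plausible implementation, assuming the two
# string arguments select initializers for recurrent and linear weights
# respectively; that mapping is an assumption, not taken from the original
# source.
import torch.nn as nn


def initialize_model_weights(model, rnn_init='he', linear_init='xavier'):
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Xavier/Glorot for projection layers by default.
            if linear_init == 'xavier':
                nn.init.xavier_uniform_(module.weight)
            else:
                nn.init.kaiming_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LSTM):
            # He/Kaiming for recurrent weight matrices by default.
            for name, param in module.named_parameters():
                if 'weight' in name:
                    if rnn_init == 'he':
                        nn.init.kaiming_uniform_(param)
                    else:
                        nn.init.xavier_uniform_(param)
                else:
                    nn.init.zeros_(param)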
def main():
    parser = argparse.ArgumentParser()
    # logging
    parser.add_argument('--logfile', '-l', default='', type=str,
                        help='write log data into a file')
    parser.add_argument('--debug', '-d', action='store_true',
                        help='run in debug mode')
    parser.add_argument('--silent', '-s', action='store_true',
                        help='run in silent mode')
    parser.add_argument('--no-progress-bar', action='store_true',
                        help='hide progress bar')
    # train and validate data
    parser.add_argument('--train', default='train.txt', type=str,
                        help='set filename of training data')
    parser.add_argument('--validate', default='dev.txt', type=str,
                        help='set filename of validation data')
    parser.add_argument('--vocab-size', '-V', default=0, type=int,
                        help='set vocabulary size (0 means no limitation)')
    parser.add_argument('--target-speaker', '-T', default='S',
                        help='set target speaker name to be learned for system output')
    # file settings
    parser.add_argument('--initial-model', '-i',
                        help='start training from an initial model')
    parser.add_argument('--model', '-m', required=True,
                        help='set prefix of output model files')
    parser.add_argument('--resume', action='store_true',
                        help='resume training from a previously saved snapshot')
    parser.add_argument('--snapshot', type=str,
                        help='dump a snapshot to a file after each epoch')
    # model structure
    parser.add_argument('--enc-layer', default=2, type=int,
                        help='number of encoder layers')
    parser.add_argument('--enc-esize', default=100, type=int,
                        help='number of encoder input-embedding units')
    parser.add_argument('--enc-hsize', default=512, type=int,
                        help='number of encoder hidden units')
    parser.add_argument('--dec-layer', default=2, type=int,
                        help='number of decoder layers')
    parser.add_argument('--dec-esize', default=100, type=int,
                        help='number of decoder input-embedding units')
    parser.add_argument('--dec-hsize', default=512, type=int,
                        help='number of decoder hidden units')
    parser.add_argument('--dec-psize', default=100, type=int,
                        help='number of decoder pre-output projection units')
    # training conditions
    parser.add_argument('--optimizer', default='Adam', type=str,
                        help='set optimizer (SGD, Adam, AdaDelta, RMSprop, ...)')
    parser.add_argument('--L2-weight', default=0.0, type=float,
                        help='set weight for L2-regularization term')
    parser.add_argument('--clip-grads', default=5., type=float,
                        help='set gradient clipping threshold')
    parser.add_argument('--dropout-rate', default=0.5, type=float,
                        help='set dropout rate in training')
    parser.add_argument('--num-epochs', '-e', default=20, type=int,
                        help='number of epochs to be trained')
    parser.add_argument('--learn-rate', '-R', default=1.0, type=float,
                        help='set initial learning rate for SGD')
    parser.add_argument('--learn-decay', default=1.0, type=float,
                        help='set decaying ratio of learning rate or epsilon')
    parser.add_argument('--lower-bound', default=1e-16, type=float,
                        help='set threshold of learning rate or epsilon for early stopping')
    parser.add_argument('--batch-size', '-b', default=50, type=int,
                        help='set batch size for training and validation')
    parser.add_argument('--max-batch-length', default=20, type=int,
                        help='set maximum sequence length to control batch size')
    parser.add_argument('--seed', default=99, type=int,
                        help='set a seed for random numbers')
    # select a GPU device
    parser.add_argument('--gpu', '-g', default=0, type=int,
                        help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    # flush stdout
    if six.PY2:
        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    # set up the logger
    tqdm_logging.config(logger, args.logfile,
                        mode=('a' if args.resume else 'w'),
                        silent=args.silent, debug=args.debug)
    # gpu setup
    if args.gpu >= 0:
        cuda.check_cuda_available()
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
        xp.random.seed(args.seed)
    else:
        xp = np
    # randomize
    np.random.seed(args.seed)
    random.seed(args.seed)

    logger.info('----------------------------------')
    logger.info('Train a neural conversation model')
    logger.info('----------------------------------')

    status = None
    if args.resume:
        if not args.snapshot:
            logger.error('snapshot file is not specified.')
            sys.exit()
        with open(args.snapshot, 'rb') as f:
            vocab, optimizer, status, args = pickle.load(f)
        logger.info('Resume training from epoch %d' % status.epoch)
        logger.info('Args ' + str(args))
        model = optimizer.target
    else:
        logger.info('Args ' + str(args))
        # prepare the RNN model and load data
        if args.initial_model:
            logger.info('Loading a model from ' + args.initial_model)
            with open(args.initial_model, 'rb') as f:
                vocab, model, tmp_args = pickle.load(f)
        else:
            logger.info('Making vocabulary from ' + args.train)
            vocab = dialog_corpus.get_vocabulary(args.train,
                                                 vocabsize=args.vocab_size)
            model = Sequence2SequenceModel(
                LSTMEncoder(args.enc_layer, len(vocab), args.enc_hsize,
                            args.enc_esize, dropout=args.dropout_rate),
                LSTMDecoder(args.dec_layer, len(vocab), len(vocab),
                            args.dec_esize, args.dec_hsize, args.dec_psize,
                            dropout=args.dropout_rate))
        # set up the optimizer
        optimizer = vars(optimizers)[args.optimizer]()
        if args.optimizer == 'SGD':
            optimizer.lr = args.learn_rate
        optimizer.use_cleargrads()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(args.clip_grads))
        if args.L2_weight > 0.:
            optimizer.add_hook(chainer.optimizer.WeightDecay(args.L2_weight))

    logger.info('Loading text data from ' + args.train)
    train_set = dialog_corpus.load(args.train, vocab, args.target_speaker)
    logger.info('Loading validation data from ' + args.validate)
    validate_set = dialog_corpus.load(args.validate, vocab,
                                      args.target_speaker)
    logger.info('Making mini batches')
    train_batchset = dialog_corpus.make_minibatches(
        train_set, batchsize=args.batch_size,
        max_length=args.max_batch_length)
    validate_batchset = dialog_corpus.make_minibatches(
        validate_set, batchsize=args.batch_size,
        max_length=args.max_batch_length)
    # report data summary
    logger.info('vocabulary size = %d' % len(vocab))
    logger.info('#train sample = %d #mini-batch = %d'
                % (len(train_set), len(train_batchset)))
    logger.info('#validate sample = %d #mini-batch = %d'
                % (len(validate_set), len(validate_batchset)))
    random.shuffle(train_batchset, random.random)

    # initialize status parameters
    if status is None:
        status = Status(max(round(len(train_batchset), -3) / 50, 500),
                        progress_bar=not args.no_progress_bar)
    else:
        status.progress_bar = not args.no_progress_bar

    # move the model to the gpu
    if args.gpu >= 0:
        model.to_gpu()

    while status.epoch <= args.num_epochs:
        logger.info('---------------------training--------------------------')
        if args.optimizer == 'SGD':
            logger.info('Epoch %d/%d : SGD learning rate = %g'
                        % (status.epoch, args.num_epochs, optimizer.lr))
        else:
            logger.info('Epoch %d/%d : %s eps = %g'
                        % (status.epoch, args.num_epochs, args.optimizer,
                           optimizer.eps))
        train_ppl = train_step(model, optimizer, train_set, train_batchset,
                               status, xp)
        logger.info('epoch %d training perplexity: %f'
                    % (status.epoch, train_ppl))
        # write the model params
        modelfile = args.model + '.' + str(status.epoch)
        logger.info('writing model params to ' + modelfile)
        model.to_cpu()
        with open(modelfile, 'wb') as f:
            pickle.dump((vocab, model, args), f, -1)
        if args.gpu >= 0:
            model.to_gpu()

        # start the validation step
        logger.info('---------------------validation------------------------')
        start_at = time.time()
        validate_ppl = validate_step(model, validate_set, validate_batchset,
                                     status, xp)
        logger.info('epoch %d validation perplexity: %.4f'
                    % (status.epoch, validate_ppl))

        # update the best model with the minimum perplexity
        if status.min_validate_ppl >= validate_ppl:
            status.bestmodel_num = status.epoch
            logger.info('validation perplexity reduced: %.4f -> %.4f'
                        % (status.min_validate_ppl, validate_ppl))
            status.min_validate_ppl = validate_ppl
        elif args.optimizer == 'SGD':
            # reload the best model and decay the learning rate
            modelfile = args.model + '.' + str(status.bestmodel_num)
            logger.info('reloading model params from ' + modelfile)
            with open(modelfile, 'rb') as f:
                vocab, model, tmp_args = pickle.load(f)
            if args.gpu >= 0:
                model.to_gpu()
            optimizer.lr *= args.learn_decay
            if optimizer.lr < args.lower_bound:
                break
            optimizer.setup(model)
        else:
            optimizer.eps *= args.learn_decay
            if optimizer.eps < args.lower_bound:
                break

        status.new_epoch(validate_time=time.time() - start_at)
        # dump a snapshot
        if args.snapshot:
            logger.info('writing snapshot to ' + args.snapshot)
            model.to_cpu()
            with open(args.snapshot, 'wb') as f:
                pickle.dump((vocab, optimizer, status, args), f, -1)
            if args.gpu >= 0:
                model.to_gpu()

    logger.info('----------------')
    # make a symbolic link to the best model
    logger.info('the best model is %s.%d.' % (args.model, status.bestmodel_num))
    logger.info('a symbolic link is made as ' + args.model + '.best')
    if os.path.exists(args.model + '.best'):
        os.remove(args.model + '.best')
    os.symlink(os.path.basename(args.model + '.' + str(status.bestmodel_num)),
               args.model + '.best')
    logger.info('done')
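# Illustrative invocation of the training script above (the script file name
# is a placeholder; all flags are defined in main()):
#
#   python train_conversation_model.py --train train.txt --validate dev.txt \
#       --model models/conv --gpu 0 --batch-size 50 --num-epochs 20 \
#       --snapshot models/conv.snapshot
#
# Passing --resume together with the same --snapshot continues training from
# the last saved epoch; the best model ends up symlinked as models/conv.best.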
def __init__(self, output_dim, hidden_dim, output_length,
             init='glorot_uniform', inner_init='orthogonal',
             forget_bias_init='one', activation='tanh',
             inner_activation='hard_sigmoid', weights=None,
             truncate_gradient=-1, input_dim=None, input_length=None,
             hidden_state=None, batch_size=None, depth=1,
             remember_state=False):
    # depth may be an int (same depth on both sides) or a [left, right] pair.
    if not isinstance(depth, list):
        depth = [depth, depth]
    n_lstms = sum(depth)
    if weights is None:
        weights = [None] * (n_lstms + 1)
    if hidden_state is None:
        hidden_state = [None] * (n_lstms + 1)
    encoder_index = depth[0] - 1
    decoder_index = depth[0] + 1
    decoder = LSTMDecoder(dim=output_dim, hidden_dim=hidden_dim,
                          output_length=output_length, init=init,
                          inner_init=inner_init, activation=activation,
                          inner_activation=inner_activation,
                          weights=weights[decoder_index],
                          truncate_gradient=truncate_gradient,
                          hidden_state=hidden_state[decoder_index],
                          batch_size=batch_size,
                          remember_state=remember_state)
    encoder = LSTMEncoder(input_dim=input_dim, output_dim=hidden_dim,
                          init=init, inner_init=inner_init,
                          activation=activation,
                          inner_activation=inner_activation,
                          weights=weights[encoder_index],
                          truncate_gradient=truncate_gradient,
                          input_length=input_length,
                          hidden_state=hidden_state[encoder_index],
                          batch_size=batch_size,
                          remember_state=remember_state)
    left_deep = [LSTMEncoder(input_dim=input_dim, output_dim=input_dim,
                             init=init, inner_init=inner_init,
                             activation=activation,
                             inner_activation=inner_activation,
                             weights=weights[i],
                             truncate_gradient=truncate_gradient,
                             input_length=input_length,
                             hidden_state=hidden_state[i],
                             batch_size=batch_size, return_sequences=True,
                             remember_state=remember_state)
                 for i in range(depth[0] - 1)]
    right_deep = [LSTMEncoder(input_dim=output_dim, output_dim=output_dim,
                              init=init, inner_init=inner_init,
                              activation=activation,
                              inner_activation=inner_activation,
                              weights=weights[decoder_index + 1 + i],
                              truncate_gradient=truncate_gradient,
                              input_length=input_length,
                              hidden_state=hidden_state[decoder_index + 1 + i],
                              batch_size=batch_size, return_sequences=True,
                              remember_state=remember_state)
                  for i in range(depth[1] - 1)]
    dense = Dense(input_dim=hidden_dim, output_dim=output_dim)
    encoder.broadcast_state(decoder)
    if weights[depth[0]] is not None:
        dense.set_weights(weights[depth[0]])
    super(Seq2seq, self).__init__()
    for l in left_deep:
        self.add(l)
    self.add(encoder)
    self.add(dense)
    self.add(decoder)
    for l in right_deep:
        self.add(l)
    self.encoder = encoder
    self.dense = dense
    self.decoder = decoder
    self.left_deep = left_deep
    self.right_deep = right_deep
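# A hedged construction sketch for the Seq2seq container above, assuming the
# surrounding Keras-style Sequential API (compile/fit). All dimensions are
# illustrative placeholders, not values from the original source.
model = Seq2seq(input_dim=128, input_length=30, hidden_dim=256,
                output_dim=128, output_length=20, batch_size=32, depth=2)
model.compile(loss='mse', optimizer='rmsprop')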