Example #1
def main(unused_argv):

  config = importlib.import_module('config.%s' % FLAGS.config)
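  # Apply comma-separated "name=value" overrides, casting each value to the type of the existing config attribute.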
  for argument in FLAGS.override.split(','):
    if '=' in argument:
      name = argument.split('=')[0]
      value = type(getattr(config, name))(argument.split('=')[1])
      setattr(config, name, value)
  config.input_vocab = data.Vocab(config.input_vocab_file,
                                   config.max_vocab_size)  # Max IDs
  if config.input_vocab.WordToId(data.PAD_TOKEN) <= 0:
    raise ValueError('Invalid PAD_TOKEN id.')
  # id of the UNKNOWN_TOKEN should be "0" for copynet model
  if config.input_vocab.WordToId(data.UNKNOWN_TOKEN) != 0:
    raise ValueError('Invalid UNKNOWN_TOKEN id.')
  if config.input_vocab.WordToId(data.SENTENCE_START) <= 0:
    raise ValueError('Invalid SENTENCE_START id.')
  if config.input_vocab.WordToId(data.SENTENCE_END) <= 0:
    raise ValueError('Invalid SENTENCE_END id.')

  if config.output_vocab_file:
    config.output_vocab = data.Vocab(config.output_vocab_file,
                                     config.max_vocab_size)  # Max IDs
    if config.output_vocab.WordToId(data.PAD_TOKEN) <= 0:
      raise ValueError('Invalid PAD_TOKEN id.')
    # id of the UNKNOWN_TOKEN should be "0" for copynet model
    if config.output_vocab.WordToId(data.UNKNOWN_TOKEN) != 0:
      raise ValueError('Invalid UNKNOWN_TOKEN id.')
    if config.output_vocab.WordToId(data.SENTENCE_START) <= 0:
      raise ValueError('Invalid SENTENCE_START id.')
    if config.output_vocab.WordToId(data.SENTENCE_END) <= 0:
      raise ValueError('Invalid SENTENCE_END id.')
  else:
    config.output_vocab = config.input_vocab

  train_batcher = config.Batcher(config.train_set, config)
  valid_batcher = config.Batcher(config.valid_set, config)
  tf.set_random_seed(config.random_seed)

  if FLAGS.mode == 'train':
    model = config.Model(config, 'train', num_gpus=FLAGS.num_gpus)
    _Train(model, config, train_batcher)
  elif FLAGS.mode == 'eval':
    config.dropout_rnn = 1.0
    config.dropout_emb = 1.0
    model = config.Model(config, 'eval', num_gpus=FLAGS.num_gpus)
    _Eval(model, config, valid_batcher)
  elif FLAGS.mode == 'decode':
    config.dropout_rnn = 1.0
    config.dropout_emb = 1.0
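    # Beam search fills the batch with beam hypotheses, so batch_size is set to beam_size.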
    config.batch_size = config.beam_size
    model = config.Model(config, 'decode', num_gpus=FLAGS.num_gpus)
    decoder = decode.BeamSearch(model, valid_batcher, config)
    decoder.DecodeLoop()
Example #2
File: batcher.py Project: lan2720/rl-GQ
def test_batcher():
    max_enc_steps = 65
    max_dec_steps = 65
    word_count_path = '/home/jiananwang/rl-QG/data/squad-v1/word_counter.json'
    glove_path = '/home/jiananwang/data/glove/glove.840B.300d.txt'
    embed_dim = 300
    max_vocab_size = 50000
    embedding_dict_file = '/home/jiananwang/rl-QG/data/squad-v1/emb_dict_%d.pkl' % max_vocab_size
    vocab = data.Vocab(word_count_path, glove_path, embed_dim, max_vocab_size, embedding_dict_file)
    data_path = '/home/jiananwang/rl-QG/data/squad-v1/train_raw.json'
    batch_size = 5
    dynamic_vocab = False
    batcher = Batcher(data_path,
                      vocab,
                      batch_size,
                      max_enc_steps, max_dec_steps,
                      mode='train',
                      dynamic_vocab=dynamic_vocab)
    batcher.setup()
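    # Drain batches until the input pipeline is exhausted, timing each fetch.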
    while True:
        try:
            start = time.time()
            batch = batcher.next_batch()
            print('time:', time.time()-start)
        except Exception:  # stop once the batcher is exhausted
            break
Example #3
    def __init__(self, vocab_path, ckpt_path):
        self._num_gpus = 0
        self._vocab_path = vocab_path
        self._ckpt_path = ckpt_path
        self._vocab = data.Vocab(self._vocab_path, 50000)  #1000000
        # Check for presence of required special tokens.
        assert self._vocab.WordToId(data.PAD_TOKEN) > 0
        assert self._vocab.WordToId(data.UNKNOWN_TOKEN) >= 0
        assert self._vocab.WordToId(data.SENTENCE_START) > 0
        assert self._vocab.WordToId(data.SENTENCE_END) > 0

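        # Decoding hyperparameters; dec_timesteps is replaced with 1 below because beam search feeds the decoder one step at a time.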
        self._decode_hps = seq2seq_attention_model.HParams(
            mode='decode',  # train, eval, decode
            min_lr=0.01,  # min learning rate.
            lr=0.15,  # learning rate
            batch_size=4,
            enc_layers=4,
            enc_timesteps=120,
            dec_timesteps=30,
            min_input_len=2,  # discard articles/summaries shorter than this
            num_hidden=256,  # for rnn cell
            emb_dim=128,  # If 0, don't use embedding
            max_grad_norm=2,
            num_softmax_samples=4096)  # If 0, no sampled softmax.

        self._hps = self._decode_hps._replace(dec_timesteps=1)
        print "=== Initilizaing... ==="
        self._model = seq2seq_attention_model.Seq2SeqAttentionModel(
            self._hps, self._vocab, num_gpus=self._num_gpus)
        print "=== Finish Initilizaing ==="
        self._decoder = seq2seq_attention_decode.BSDecoder(
            self._model, self._decode_hps, self._vocab, self._ckpt_path)

        print "==== Can Start to Answer the Question Now!!!!! ===="
def _extract_we_binary(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    output = codecs.open(output_file, "w", "utf-8")
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    with open(we_dic, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
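        # word2vec binary format: a text header "vocab_size dim", then per word a space-terminated token followed by dim float32 values.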
        print "layer1_size:", layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = data.GetWordIds(word, vocab)
            if idx is not None and idx != unknown_ids and word == "<s>":
                print idx, ":", word
                output.write(word + ' ' + ' '.join(
                    map(str, np.fromstring(f.read(binary_len),
                                           dtype='float32'))) + '\n')
            elif idx == unknown_ids:
                f.read(binary_len)
            else:
                f.read(binary_len)
    output.close()
Example #5
def main(unused_argv):
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.WordToId(data.PAD_TOKEN) > 0
    assert vocab.WordToId(data.UNKNOWN_TOKEN) >= 0
    assert vocab.WordToId(data.SENTENCE_START) > 0
    assert vocab.WordToId(data.SENTENCE_END) > 0

    batch_size = 1
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        #enc_layers=4,
        enc_layers=2,
        enc_timesteps=60,
        #enc_timesteps=120,
        #dec_timesteps=30,
        dec_timesteps=15,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=128,  # for rnn cell
        #num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=10)  # If 0, no sampled softmax.
    #num_softmax_samples=4096)  # If 0, no sampled softmax.

    batcher = batch_reader.Batcher(FLAGS.data_path,
                                   vocab,
                                   hps,
                                   FLAGS.article_key,
                                   FLAGS.abstract_key,
                                   FLAGS.max_article_sentences,
                                   FLAGS.max_abstract_sentences,
                                   bucketing=FLAGS.use_bucketing,
                                   truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Train(model, batcher)
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps,
                                                     vocab)
        decoder.DecodeLoop()
Example #6
    def __init__(self, hp, model_settings, extra_info, mode='decode'):
        vocab_file = hp.vocab_path
        max_size = hp.vocab_size
        self.vocab = data.Vocab(
            vocab_file=vocab_file,
            max_size=max_size)  # Construct the vocabulary manager

        self.model = model.SummarizationModel(
            hps=model_settings, vocab=self.vocab,
            extra_info=extra_info)  # Construct the model
        self.decode_wrapper = None
Example #7
File: batcher.py Project: lan2720/rl-GQ
def test_example():
    #batcher = Batcher(train_file, vocab) 
    #batch = batcher.next_batch()
    #batch.enc_batch
    #batch.dec_batch
    #batch.target_batch
    max_enc_steps = 65
    max_dec_steps = 65
    word_count_path = '/home/jiananwang/rl-QG/data/squad-v1/word_counter.json'
    glove_path = '/home/jiananwang/data/glove/glove.840B.300d.txt'
    embed_dim = 300
    max_vocab_size = 50000
    embedding_dict_file = '/home/jiananwang/rl-QG/data/squad-v1/emb_dict_50000.pkl'
    vocab = data.Vocab(word_count_path, glove_path, embed_dim, max_vocab_size, embedding_dict_file)
    with open('/home/jiananwang/rl-QG/data/squad-v1/dev_raw.json') as f:
        d = json.load(f)
        for ex in d:
            if ex['ifkeep']:
                para = ex['correct_sentence']
                ques = ex['question']
                ans = ex['valid_answer'][0]
                ans_pos = (ex['ans_start_in_sent'], ex['ans_end_in_sent'])
                case = Example(para, ques, ans, ans_pos,
                               vocab, max_enc_steps, max_dec_steps, dynamic_vocab=True)
                print('enc len:', case.enc_len)
                #if not case.dynamic_vocab:
                print('enc input:', case.enc_input)
                print('decoding:', ' '.join([vocab.id2word(i) for i in case.enc_input]))
                
                print('dec len:', case.dec_len)
                print('dec input:', case.dec_input)
                print('decoding:', ' '.join([vocab.id2word(i) for i in case.dec_input]))
                if not case.dynamic_vocab:
                    print('target:', case.target)
                    print('decoding:', ' '.join([vocab.id2word(i) for i in case.target]))
                print('orig para:', case.original_paragraph)
                print('orig ques:', case.original_question)
                print('orig ans:', case.original_answer)
                print('ans start:', case.answer_start_idx)
                print('ans end:', case.answer_end_idx)
                
                vocab_size = max_vocab_size + 4
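                # assumption: the +4 accounts for the special tokens (e.g. PAD/UNK/START/STOP) added on top of the word list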
                if case.dynamic_vocab:
                    print('-'*10, 'dynamic vocab', '-'*10)
                    print('enc input extend vocab:', case.enc_input_extend_vocab)
                    words = decoding(case.enc_input_extend_vocab, vocab, case.enc_oovs)
                    print('decoding:', ' '.join(words))
                    print('new dec input:', case.dec_input_extend_vocab)
                    words = decoding(case.dec_input_extend_vocab, vocab, case.enc_oovs)
                    print('decoding:', ' '.join(words))
                    print('new target:', case.target)
                    words = decoding(case.target, vocab, case.enc_oovs)
                    print('decoding:', ' '.join(words))
                break
Example #8
def editnet_data_to_editnetID(df, output_path):
    """
    this function reads from df.columns=['comp_tokens', 'simp_tokens', 'edit_labels','comp_pos_tags','comp_pos_ids']
    and add vocab ids for comp_tokens, simp_tokens, and edit_labels
    :param df: df.columns=['comp_tokens', 'simp_tokens', 'edit_labels','comp_pos_tags','comp_pos_ids']
    :param output_path: the path to store the df
    :return: a dataframe with df.columns=['comp_tokens', 'simp_tokens', 'edit_labels',
                                            'comp_ids','simp_id','edit_ids',
                                            'comp_pos_tags','comp_pos_ids'])
    """
    out_list = []
    vocab = data.Vocab()
    vocab.add_vocab_from_file('./vocab_data/vocab.txt', 30000)

    def prepare_example(example, vocab):
        """
        :param example: one row in pandas dataframe with feild ['comp_tokens', 'simp_tokens', 'edit_labels']
        :param vocab: vocab object for translation
        :return: inp: original input sentence,
        """
        comp_id = np.array([
            vocab.w2i[i] if i in vocab.w2i else vocab.w2i[UNK]
            for i in example['comp_tokens']
        ])
        simp_id = np.array([
            vocab.w2i[i] if i in vocab.w2i else vocab.w2i[UNK]
            for i in example['simp_tokens']
        ])
        edit_id = np.array([
            vocab.w2i[i] if i in vocab.w2i else vocab.w2i[UNK]
            for i in example['edit_labels']
        ])
        return comp_id, simp_id, edit_id

    for i, example in df.iterrows():
        print(i)
        comp_id, simp_id, edit_id = prepare_example(example, vocab)
        ex = [
            example['comp_tokens'], comp_id, example['simp_tokens'], simp_id,
            example['edit_labels'], edit_id, example['comp_pos_tags'],
            example['comp_pos_ids']
        ]
        out_list.append(ex)
    outdf = pd.DataFrame(out_list,
                         columns=[
                             'comp_tokens', 'comp_ids', 'simp_tokens',
                             'simp_ids', 'edit_labels', 'new_edit_ids',
                             'comp_pos_tags', 'comp_pos_ids'
                         ])
    outdf.to_pickle(output_path)
    print('saved to %s' % output_path)

    return outdf
Example #9
File: batcher.py Project: lan2720/rl-GQ
def test_batch():
    max_enc_steps = 65
    max_dec_steps = 65
    word_count_path = '/home/jiananwang/rl-QG/data/squad-v1/word_counter.json'
    glove_path = '/home/jiananwang/data/glove/glove.840B.300d.txt'
    embed_dim = 300
    max_vocab_size = 50000
    embedding_dict_file = '/home/jiananwang/rl-QG/data/squad-v1/emb_dict_50000.pkl'
    vocab = data.Vocab(word_count_path, glove_path, embed_dim, max_vocab_size, embedding_dict_file)
    dynamic_vocab = True
    with open('/home/jiananwang/rl-QG/data/squad-v1/dev_raw.json') as f:
        d = json.load(f)
    example_list = []
    for ex in d:
        if ex['ifkeep']:
            para = ex['correct_sentence']
            ques = ex['question']
            ans = ex['valid_answer'][0]
            ans_pos = (ex['ans_start_in_sent'], ex['ans_end_in_sent'])
            case = Example(para, ques, ans, ans_pos,
                           vocab, max_enc_steps, max_dec_steps, dynamic_vocab)
            example_list.append(case)
            if len(example_list) == 5:
                break
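    # With dynamic_vocab=True the batch additionally carries extend-vocab ids and per-paragraph OOV lists, used in the else-branch below.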
    batch = Batch(example_list, vocab, max_dec_steps, dynamic_vocab)
    print('enc batch:', batch.enc_batch)
    if not dynamic_vocab:
        for i in range(batch.enc_batch.shape[0]):
            enc = batch.enc_batch[i]
            dec = batch.dec_batch[i]
            tgt = batch.target_batch[i]
            words = decoding(enc.tolist(), vocab)
            print('enc:', ' '.join(words))
            words = decoding(dec.tolist(), vocab)
            print('dec:', ' '.join(words))
            words = decoding(tgt.tolist(), vocab)
            print('tgt:', ' '.join(words))
            print('-'*20)
    else:
        for i in range(batch.enc_batch.shape[0]):
            enc = batch.enc_batch_extend_vocab[i]
            dec = batch.dec_batch[i]
            tgt = batch.target_batch[i]
            oov = batch.para_oovs_batch[i]
            words = decoding(enc.tolist(), vocab, oov)
            print('enc one case:', ' '.join(words))
            words = decoding(dec.tolist(), vocab, oov)
            print('dec one case:', ' '.join(words))
            words = decoding(tgt.tolist(), vocab, oov)
            print('tgt one case:', ' '.join(words))
            print('-'*20)
Example #10
    def __init__(self, hp, model_settings, extra_info):
        vocab_file = hp.vocab_path
        max_size = hp.vocab_size
        self.vocab = data.Vocab(vocab_file=vocab_file, max_size=max_size)  # Construct the vocabulary manager

        # model_hp_list=['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm',
        #     'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen']
        # model_hp_dict={}
        # for key,value in FLAGS.__flags.iteritems():
        #     if key in model_hp_list:
        #         model_hp_dict[key]=value
        # model_settings=namedtuple('HParams',model_hp_dict.keys())(**model_hp_dict)

        self.model = model.SummarizationModel(hps=model_settings, vocab=self.vocab, extra_info=extra_info)  # Construct the model
        self.decode_wrapper = None
Example #11
    def __init__(self, config):
        self.corpus_files = config["corpus_files"]
        # self.jamo_processor = Han2Jamo()

        # self.char_vocab = CharWordVocab.load_vocab(config["char_vocab_path"])
        self.char_vocab = data.CharWordVocab(config["char_vocab_path"])
        # self.word_vocab = WordVocab.load_vocab(config["word_vocab_path"])
        self.word_vocab = data.Vocab(config["word_vocab_path"])

        self.seq_len = config["word_seq_len"]
        self.char_seq_len = config["char_seq_len"]
        self.corpus_size = self.get_corpus_size()
        print("DataSet Size:", self.corpus_size)

        config["char_vocab_size"] = len(self.char_vocab)
        config["word_vocab_size"] = len(self.word_vocab)
Example #12
File: batcher.py Project: zqcccc/MLDemo
def main():
    from torch.nn import init
    config = util.read_config('../configs/process.yaml')
    # get_temp_vocab(config)
    vocab = data.Vocab(config.vocab_path, max_size=config.max_size)
    # vocab.build_vectors(config.pre_word_embedding_path, 300, unk_init=init.xavier_uniform)
    if config.save:
        torch.save(vocab, config.vocab_path_50)
    val_data = DocDataset(config.val_path, vocab, config)
    test_data = DocDataset(config.test_path, vocab, config)
    if config.save:
        torch.save(val_data, config.val_data_path)
        torch.save(test_data, config.test_data_path)

    train_data = DocDataset(config.train_path, vocab, config)
    if config.save:
        torch.save(train_data, config.train_data_path)
Example #13
def run(args):
    torch.set_default_dtype(torch.float64)
    vocab = data.Vocab()

    if args.generate_data:
        generate_train_val_test(args.generate_num, vocab, 0.7, 0.2, args.generate_path)
        return

    batch_size = args.batch_size
    if args.load_data:
        data_path = args.data_path
        with open(data_path, 'rb') as f:
            train_questions, train_ans, val_questions, val_ans, test_questions, test_ans = pkl.load(f)
            train_generator = data.BatchGenerator(train_questions, train_ans, batch_size)
            val_generator = data.BatchGenerator(val_questions, val_ans, batch_size)

    lr = float(args.lr) or 0.01  # fall back to 0.01 if --lr is 0
    num_layers = 1

    if args.load_model:
        model_path = args.model_path
        checkpoint = torch.load(model_path)
        rnn = RNNCalc(*checkpoint['model_hyper'])
        rnn.load_state_dict(checkpoint['model'])
        optimizer = torch.optim.SGD(rnn.parameters(), lr=lr, momentum=0.9, nesterov=True)
        # optimizer = optim.Adam(rnn.parameters())
        optimizer.load_state_dict(checkpoint['optimzer'])  # saved state must match the optimizer class
    else:
        # create a new model
        embedding_dim, vocab_size, rnn_units = 32, vocab.size(), 128
        rnn = RNNCalc(num_layers, embedding_dim, vocab_size, rnn_units)
        optimizer = optim.Adam(rnn.parameters(), lr=lr)

    if args.mode == 'train':
        assert optimizer is not None
        trainer = Trainer(rnn, optimizer, args.epoch_num)
        trainer.train(train_generator, val_generator, vocab, lr, 10, 10, args.checkpoint_path)
    elif args.mode == 'test':
        eva_generator = data.BatchGenerator(train_questions, train_ans, len(train_questions))
        evaluate.evaluate(rnn, eva_generator, vocab)
Example #14
    def train(cls):
        cls.vocab = data.Vocab(FLAGS.vocab_path, 1000000)
        batch_size = FLAGS.beam_size
        hps = seq2seq_attention_model.HParams(
            mode=FLAGS.mode,  # train, eval, decode
            min_lr=0.01,  # min learning rate.
            lr=0.15,  # learning rate
            batch_size=batch_size,
            enc_layers=4,
            enc_timesteps=120,  #120
            dec_timesteps=120,  #30
            min_input_len=0,  # discard articles/summaries shorter than this
            num_hidden=256,  # for rnn cell
            emb_dim=128,  # If 0, don't use embedding
            max_grad_norm=2,
            num_softmax_samples=0)  # 4096,If 0, no sampled softmax.

        cls.batcher = Batcher(cls.vocab,
                              hps,
                              FLAGS.article_key,
                              FLAGS.abstract_key,
                              FLAGS.max_article_sentences,
                              FLAGS.max_abstract_sentences,
                              bucketing=FLAGS.use_bucketing,
                              truncate_input=FLAGS.truncate_input)
        tf.set_random_seed(FLAGS.random_seed)

        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, cls.vocab, num_gpus=FLAGS.num_gpus)
        cls.decoder = seq2seq_attention_decode.BSDecoder(
            model, cls.batcher, hps, cls.vocab)
        # Load the trained model checkpoint
        cls.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            print('No model to decode yet at %s' % FLAGS.log_root)
            return

        tf.logging.info('checkpoint path %s', ckpt_state.model_checkpoint_path)
        ckpt_path = os.path.join(
            FLAGS.log_root, os.path.basename(ckpt_state.model_checkpoint_path))
        tf.logging.info('renamed checkpoint path %s', ckpt_path)
        cls.decoder._saver.restore(cls.sess, ckpt_path)
Example #15
def run(args):
    vocab = data.Vocab()

    if args.generate_data:
        generate_train_val_test(args.generate_num, vocab, 0.7, 0.2,
                                args.generate_path)
        return

    batch_size = args.batch_size
    if args.load_data:
        data_path = args.data_path
        with open(data_path, 'rb') as f:
            train_questions, train_ans, val_questions, val_ans, test_questions, test_ans = pkl.load(
                f)
            train_generator = data.BatchGenerator(train_questions, train_ans,
                                                  batch_size)
            val_generator = data.BatchGenerator(val_questions, val_ans,
                                                batch_size)

    lr = 0.01
    if args.load_model:
        model_path = args.model_path
        checkpoint = torch.load(model_path)
        seq2seq = Seq2seqCalc(*checkpoint['model_hyper'])
        seq2seq.load_state_dict(checkpoint['model'])
        optimizer = optim.Adam(seq2seq.parameters())
        optimizer.load_state_dict(checkpoint['optimzer'])
    else:
        # create a new model
        embedding_dim, vocab_size, digit_rnn_units, decoder_rnn_units = 32, vocab.size(), 256, 128
        seq2seq = Seq2seqCalc(embedding_dim, vocab_size, digit_rnn_units,
                              decoder_rnn_units)
        optimizer = optim.Adam(seq2seq.parameters(), lr=lr)

    if args.mode == 'train':
        assert optimizer is not None
        trainer = Trainer(seq2seq, optimizer, args.epoch_num)
        trainer.train(train_generator, val_generator, vocab, lr, 10, 10,
                      args.checkpoint_path)
    elif args.mode == 'test':
        eva_generator = data.BatchGenerator(train_questions, train_ans,
                                            len(train_questions))
        evaluate.evaluate(seq2seq, eva_generator, vocab)
Example #16
def _extract_we_text(output_file, vocab_file, we_dic):
    vocab = data.Vocab(vocab_file, 1000000)
    vsize = vocab.NumIds()
    m = copy.deepcopy(vocab._word_to_id)
    unknown_ids = [vocab.WordToId(UNKNOWN_TOKEN)]
    output = codecs.open(output_file, "w", "utf-8")
    with open(we_dic, "rb") as f:
        for line in f:
            string = line.split(" ")
            word = string[0].strip()
            value = " ".join(x for x in string[1:])
            idx = data.GetWordIds(word, vocab)
            if idx is not None and idx != unknown_ids and word in m:
                del m[word]
                output.write(word + ' ' + value)
    print "====:", m
    print "---:", len(m)

    output.close()

    # this operation guarantees that words in the WE file and words in the vocab file are the same
    del m['<s>']
    del m['</s>']
    del m['<d>']
    del m['</d>']
    del m['<p>']
    del m['</p>']
    tt = m.keys()

    vocab_new = vocab_file + "_new"
    with open(vocab_file, 'r') as f:
        with open(vocab_new, 'w') as g:
            for line in f.readlines():
                if all(string not in line for string in tt):
                    g.write(line)
            if '<UNK>' in m:
                g.write('<UNK> 0\n')
            if '<PAD>' in m:
                g.write('<PAD> 0\n')
    shutil.move(vocab_new, vocab_file)
Example #17
File: run.py Project: ScottLiao920/URECA
def main(unused_argv):
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.CheckVocab(data.PAD_TOKEN) > 0
    assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
    assert vocab.CheckVocab(data.SENTENCE_START) > 0
    assert vocab.CheckVocab(data.SENTENCE_END) > 0

    batch_size = 64
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.

    eval_hps = seq2seq_attention_model.HParams(
        mode='eval',  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.

    batcher = batch_reader.Batcher(
        FLAGS.data_path, vocab, hps, FLAGS.article_key,
        FLAGS.abstract_key, FLAGS.max_article_sentences,
        FLAGS.max_abstract_sentences, bucketing=FLAGS.use_bucketing,
        truncate_input=FLAGS.truncate_input)
    eval_batcher = batch_reader.Batcher(
        FLAGS.eval_data_path, vocab, eval_hps, FLAGS.article_key,
        FLAGS.abstract_key, FLAGS.max_article_sentences,
        FLAGS.max_abstract_sentences, bucketing=FLAGS.use_bucketing,
        truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        eval_model = seq2seq_attention_model.Seq2SeqAttentionModel(
            eval_hps, vocab, num_gpus=FLAGS.num_gpus
        )
        count = 0
        while count * FLAGS.eval_every_iteration < FLAGS.max_run_steps:
            _Train(model, batcher)
            eval_avg_loss = 0
            # read previous loss from eval_dir (if any)
            try:
                eval_results = tf.contrib.estimator.read_eval_metrics(FLAGS.eval_dir)
                i = 0
                for step, metrics in eval_results.items():
                    eval_avg_loss += metrics['running_avg_loss']
                    i += 1
                prev_avg_loss = eval_avg_loss / i
            except FileNotFoundError:
                print("Haven't run evaluation yet.")
            cur_loss = _Eval(eval_model, eval_batcher, 20, vocab=vocab)
            if eval_avg_loss != 0 and prev_avg_loss < cur_loss:
                print("Early stopping!")
                break
            count += 1

    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, eval_batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps, vocab)
        decoder.DecodeLoop()
Example #18
File: main.py Project: lmvasque/EditNTS
def main():
    # torch.manual_seed(233)
    set_seed(233)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_path',
        type=str,
        dest='data_path',
        default='/home/ml/ydong26/data/EditNTS_data/editnet_data/%s/' %
        dataset,
        help='Path to train vocab_data')
    parser.add_argument('--store_dir',
                        action='store',
                        dest='store_dir',
                        default='/home/ml/ydong26/tmp_store/editNTS_%s' %
                        dataset,
                        help='Path to exp storage directory.')
    parser.add_argument('--vocab_path',
                        type=str,
                        dest='vocab_path',
                        default='../vocab_data/',
                        help='Path contains vocab, embedding, postag_set')
    parser.add_argument(
        '--load_model',
        type=str,
        dest='load_model',
        default=None,
        help='Path for loading pre-trained model for further training')

    parser.add_argument('--vocab_size',
                        dest='vocab_size',
                        default=30000,
                        type=int)
    parser.add_argument('--batch_size',
                        dest='batch_size',
                        default=32,
                        type=int)
    parser.add_argument('--max_seq_len', dest='max_seq_len', default=100)

    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--hidden', type=int, default=200)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--device', type=int, default=1, help='select GPU')
    parser.add_argument('--test',
                        action='store_true',
                        default=False,
                        dest='test_enabled')
    parser.add_argument('--run_eval',
                        action='store_true',
                        default=False,
                        dest='run_eval')
    parser.add_argument('--run_training',
                        action='store_true',
                        default=False,
                        dest='run_training')

    #train_file = '/media/vocab_data/yue/TS/editnet_data/%s/train.df.filtered.pos'%dataset
    # test='/media/vocab_data/yue/TS/editnet_data/%s/test.df.pos' % args.dataset
    args = parser.parse_args()
    print(args)
    torch.cuda.set_device(args.device)

    # load vocab-related files and init vocab
    print('*' * 10)
    vocab = data.Vocab()
    vocab.add_vocab_from_file(args.vocab_path + 'vocab.txt', args.vocab_size)
    vocab.add_embedding(gloveFile=args.vocab_path + 'glove.6B.100d.txt')
    pos_vocab = data.POSvocab(args.vocab_path)  #load pos-tags embeddings
    print('*' * 10)

    print("generating config")
    hyperparams = collections.namedtuple(
        'hps',  # hyperparameters
        [
            'vocab_size', 'embedding_dim', 'word_hidden_units',
            'sent_hidden_units', 'pretrained_embedding', 'word2id', 'id2word',
            'pos_vocab_size', 'pos_embedding_dim'
        ])
    hps = hyperparams(vocab_size=vocab.count,
                      embedding_dim=100,
                      word_hidden_units=args.hidden,
                      sent_hidden_units=args.hidden,
                      pretrained_embedding=vocab.embedding,
                      word2id=vocab.w2i,
                      id2word=vocab.i2w,
                      pos_vocab_size=pos_vocab.count,
                      pos_embedding_dim=30)

    print('init editNTS model')
    edit_net = EditNTS(hps, n_layers=1)
    edit_net.cuda()

    if args.load_model is not None:
        print("load edit_net for further training")
        ckpt_path = args.load_model
        ckpt = Checkpoint.load(ckpt_path)
        print("Epoch: {} | Step: {}".format(ckpt.epoch, ckpt.step))
        edit_net = ckpt.model
        edit_net.cuda()
        edit_net.train()

    if args.run_eval:
        print("Running Evaluation..")
        eval_standalone(edit_net, args, vocab, ckpt)  # note: ckpt is only defined when --load_model is given
    elif args.run_training:
        print("Running Training..")
        training(edit_net, args.epochs, args, vocab, test=args.test_enabled)
    else:
        print("ERROR: No running mode selected")
# DATA_PATH = "/home/synerzip/Sasidhar/Learning/Tensorflow/textsum/data/reviews"
# VOCAB_PATH = "/home/synerzip/Sasidhar/Learning/Tensorflow/textsum/data/vocab_1"
# LOG_ROOT = "/home/synerzip/Sasidhar/Learning/Tensorflow/textsum/log_root"

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('data_path', DATA_PATH, 'data path')
tf.app.flags.DEFINE_string('vocab_path', VOCAB_PATH,
                           'Path expression to text vocabulary file.')
tf.app.flags.DEFINE_string('log_root', LOG_ROOT, 'Directory for model root.')

tf.app.flags.DEFINE_integer('beam_size', 4,
                            'beam size for beam search decoding.')
tf.app.flags.DEFINE_integer('random_seed', 111, 'A seed value for randomness.')
tf.app.flags.DEFINE_integer('num_gpus', 0, 'Number of gpus used.')

vocab = data.Vocab(FLAGS.vocab_path, 10003)

batch_size = 4

hps = seq2seq_attention_model.HParams(
    mode='decode',
    min_lr=0.01,  # min learning rate.
    lr=0.15,  # learning rate
    batch_size=batch_size,
    enc_layers=4,
    enc_timesteps=200,
    dec_timesteps=30,
    min_input_len=2,  # discard articles/summaries shorter than this
    num_hidden=256,  # for rnn cell
    emb_dim=128,  # If 0, don't use embedding
    max_grad_norm=2,
Example #20
            else:
                yield (background_text, context_text, response_text, span_text,
                       b_start, b_end, r_start, r_end, example_id)


if __name__ == '__main__':

    hps_dict = {
        'mode': 'train',
        'batch_size': 16,
        'max_bac_enc_steps': 300,
        'max_con_enc_steps': 65,
        'max_dec_steps': 95
    }
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
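    # The namedtuple gives a lightweight, immutable hps object keyed by the dict entries above.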
    vocab = data.Vocab('data/mixed_context/finished_files/vocab', 25000)
    batcher = Batcher('data/mixed_context/finished_files/chunked/train_*',
                      vocab,
                      hps,
                      single_pass=False)
    batch = batcher.next_batch()

    # print("batch.target_batch: ",batch.target_batch)
    i = 0
    print()
    print("backgrounds: ", batch.original_backgrounds[i], "\n")
    print("contexts: ", batch.original_contexts[i], "\n")
    print("responses: ", batch.original_responses[i], "\n")
    print("spans: ", batch.original_spans[i], "\n")
    print("b_starts: ", batch.original_b_starts[i], "\n")
    print("b_ends: ", batch.original_b_ends[i], "\n")
if __name__ == '__main__':
    args = get_parser().parse_args()

    os.makedirs(args.prediction_log_dir, exist_ok=True)

    sequentialization_client = AstSequentializationApiClient(
        args.sequentialization_api_host,
        args.sequentialization_api_port,
    )
    if args.format == 'AST':
        source_code_processor = AstSequenceProcessor(sequentialization_client)
    else:
        source_code_processor = TokenizedCodeProcessor(
            sequentialization_client)
    Evaluator(
        GwtSectionPredictionTransformer,
        parse_sampler_settings(args.sampler_settings),
        args.evaluation_dataset_path,
        data.Vocab(args.vocab_path),
        bpe.BpeProcessor(args.bpe_model_path),
        source_code_processor,
        args.max_prediction_length,
        args.num_workers,
        args.device,
        args.write_results_to_tensorboard,
        args.prediction_log_dir,
        args.log_interval,
        args.evaluation_dataset_ids_path,
    ).evaluate(args.tensorboard_log_dir, args.max_number_of_checkpoints)
Example #22
def main(mode_type):
    # Load the vocabulary
    vocab = data.Vocab(
        os.path.join(parameter_config.VOCAB_DIR,
                     parameter_config.VOCAB_FILE_NAME),
        parameter_config.VOCAB_SIZE)
    batch_size = parameter_config.BATCH_SIZE
    if mode_type == 'decode':
        batch_size = 1

    # Set the model hyperparameters
    hps = seq2seq_model.HParams(
        mode=mode_type,  # train, eval, decode
        batch_size=batch_size,
        enc_timesteps=parameter_config.ENC_TIMESTEPS,
        emb_dim=parameter_config.EMB_DIM,
        min_input_len=parameter_config.MIN_INPUT_LEN,
        num_hidden=parameter_config.NUM_HIDDEN,
        enc_layers=parameter_config.ENC_LAYERS,
        min_lr=parameter_config.MIN_LR,
        lr=parameter_config.LR,
        max_grad_norm=parameter_config.MAX_GRAD_NORM)

    tf.set_random_seed(111)

    if hps.mode == 'train':
        batcher = batch_reader.Batcher(parameter_config.TRAIN_DIR,
                                       vocab,
                                       'index',
                                       'target',
                                       'sentence',
                                       hps,
                                       bucketing=False,
                                       truncate_input=True)
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        _Train(model, batcher, parameter_config.TRAIN_STEP)
    elif hps.mode == 'eval':
        batcher = batch_reader.Batcher(parameter_config.EVALUATION_SET,
                                       vocab,
                                       'index',
                                       'target',
                                       'sentence',
                                       hps,
                                       bucketing=False,
                                       truncate_input=True)
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        _Eval(model, batcher)
    elif hps.mode == 'decode':
        batcher = batch_reader.Batcher(parameter_config.DECODE_DIR,
                                       vocab,
                                       'index',
                                       'target',
                                       'sentence',
                                       hps,
                                       bucketing=False,
                                       truncate_input=True)
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        if not os.path.exists(
                os.path.join(os.getcwd(), parameter_config.DECODE_STORE_DIR)):
            os.mkdir(
                os.path.join(os.getcwd(), parameter_config.DECODE_STORE_DIR))
        _Decode(
            model, batcher,
            os.path.join(parameter_config.DECODE_STORE_DIR,
                         parameter_config.DECODE_STORE_FILE))
    elif hps.mode == 'eval_step':
        model = seq2seq_model.Seq2SeqModel(hps, vocab, num_gpus=0)
        _Eval_Step(model)
    else:
        print('mode_type must be train, eval, decode, or eval_step')
Example #23
def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.CheckVocab(data.PAD_TOKEN) > 0
    assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
    assert vocab.CheckVocab(data.START_DECODING) > 0
    assert vocab.CheckVocab(data.STOP_DECODING) > 0

    batch_size = 4
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=1,
        enc_timesteps=800,
        dec_timesteps=200,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096,  # If 0, no sampled softmax.
        trunc_norm_init_std=0.05)

    batcher = batch_reader.Batcher(FLAGS.data_path,
                                   vocab,
                                   hps,
                                   FLAGS.article_id_key,
                                   FLAGS.article_key,
                                   FLAGS.abstract_key,
                                   FLAGS.labels_key,
                                   FLAGS.section_names_key,
                                   FLAGS.sections_key,
                                   FLAGS.max_article_sentences,
                                   FLAGS.max_abstract_sentences,
                                   bucketing=FLAGS.use_bucketing,
                                   truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Train(model, batcher)
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
        decoder = seq2seq_attention_decode.BeamSearchDecoder(
            model, batcher, hps, vocab)
        decoder.decode_loop()
Example #24
    vocab_path = os.path.join(pkg_path, "data/textsum/data/vocab.txt")
    data_path = os.path.join(pkg_path, "data/textsum/data/train.txt")
    hps = seq2seq_attention_model.HParams(
        mode='train',  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=2,
        enc_layers=2,
        enc_timesteps=100,
        dec_timesteps=20,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=0)  # If 0, no sampled softmax.
    vocab = data.Vocab(vocab_path, 10000)
    dataset = read_data_sets(data_path, vocab, hps)

    article_batch, abstract_batch, targets, source_article, source_abstract = dataset.next_batch(
        hps.batch_size)
    print(article_batch)
    print(source_article)

    print(abstract_batch)
    print(source_abstract)

    print(targets)
    print("\n")

    article_batch1, abstract_batch1, targets1, _, _ = dataset.next_batch(
        hps.batch_size)
Example #25
def main(unused_argv):
    vocab = data.Vocab(FLAGS.vocab_path, 1000000)
    # Check for presence of required special tokens.
    assert vocab.CheckVocab(data.PAD_TOKEN) > 0
    assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
    assert vocab.CheckVocab(data.SENTENCE_START) > 0
    assert vocab.CheckVocab(data.SENTENCE_END) > 0

    batch_size = 4
    if FLAGS.mode == 'decode':
        batch_size = FLAGS.beam_size

    hps = seq2seq_attention_model.HParams(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=batch_size,
        enc_layers=1,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=128,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)  # If 0, no sampled softmax.

    batcher = batch_reader.Batcher(FLAGS.data_path,
                                   vocab,
                                   hps,
                                   FLAGS.article_key,
                                   FLAGS.abstract_key,
                                   FLAGS.max_article_sentences,
                                   FLAGS.max_abstract_sentences,
                                   bucketing=FLAGS.use_bucketing,
                                   truncate_input=FLAGS.truncate_input)
    tf.set_random_seed(FLAGS.random_seed)

    if hps.mode == 'train':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Train(model, batcher)
    elif hps.mode == 'eval':
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            hps, vocab, num_gpus=FLAGS.num_gpus)
        _Eval(model, batcher, vocab=vocab)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps
        # Only need to restore the 1st step and reuse it since
        # we keep and feed in state for each step's output.
        decode_mdl_hps = hps._replace(dec_timesteps=1)
        model = seq2seq_attention_model.Seq2SeqAttentionModel(
            decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)

        to_build_grapth = True
        p = preprocessing(FLAGS.vocab_path)

        # Old decode loop
        # while True:
        #     kb_input = input('> ')
        #     if kb_input == 'c':
        #         description_str = input('Enter description > ')
        #         context_str = input('Enter context > ')
        #         input_data = p.get_data(description=description_str, context=context_str)
        #         print('Input data:')
        #         pprint(input_data)
        #     elif kb_input == 'q':
        #         break
        #     else:
        #         try:
        #             text_to_binary('yahoo_knowledge_data/decode/ver_5/dataset_ready/data_ready_' + kb_input,
        #                     'yahoo_knowledge_data/decode/decode_data')
        #         except Exception:
        #             print('Error in the default testing data')
        #     decoder = seq2seq_attention_decode.BSDecoder(model, hps, vocab, to_build_grapth)
        #     to_build_grapth = False
        #     decoder.DecodeLoop()

        # Decode loop used for the paper
        file_num = 1
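        # Decode each prepared file in turn; the TF graph is built only on the first iteration (to_build_grapth flips to False afterwards).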
        while True:
            if file_num % 60 == 0:
                print('Printed 60 examples')
                break
            try:
                text_to_binary(
                    'yahoo_knowledge_data/decode/ver_5/dataset_ready/data_ready_'
                    + str(file_num), 'yahoo_knowledge_data/decode/decode_data')
            except Exception:
                print('Error in the default testing data')
                break
            decoder = seq2seq_attention_decode.BSDecoder(
                model, hps, vocab, to_build_grapth)
            to_build_grapth = False
            decoder.DecodeLoop()
            print('==================', file_num, '==================')
            file_num += 1
Example #26
import data
from checkpoint import Checkpoint
from editnts import EditNTS
from evaluator import Evaluator

import torch
from torch import nn
from argparse import ArgumentParser
import collections

vocab = data.Vocab()
vocab.add_vocab_from_file('vocab_data/vocab.txt', 30000)
vocab.add_embedding(gloveFile='vocab_data/glove.6B.100d.txt')
pos_vocab = data.POSvocab('vocab_data')

print("generating config")
hyperparams = collections.namedtuple(
    'hps',  # hyperparameters
    [
        'vocab_size', 'embedding_dim', 'word_hidden_units',
        'sent_hidden_units', 'pretrained_embedding', 'word2id', 'id2word',
        'pos_vocab_size', 'pos_embedding_dim'
    ])
# hps = hyperparams(
#     vocab_size=vocab.count,
#     embedding_dim=100,
#     word_hidden_units=200,
#     sent_hidden_units=200,
#     pretrained_embedding=vocab.embedding,
#     word2id=vocab.w2i,
#     id2word=vocab.i2w,
Example #27
def train(hps, device, summary):
    train_corpus = data.Corpus(hps['train_corpus'], hps['tokenization'])
    eval_corpus = data.Corpus(hps['eval_corpus'], hps['tokenization'])
    test_corpus = None
    token_list = train_corpus.export_token_list(
    ) + eval_corpus.export_token_list()
    if hps['test_corpus']:
        test_corpus = data.Corpus(hps['test_corpus'], hps['tokenization'])
        token_list += test_corpus.export_token_list()

    vocab = data.Vocab(token_list)
    vocab.save(hps['vocab_file'])
    vocab = data.Vocab.load(hps['vocab_file'])
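    # Save-then-reload round-trip ensures training uses exactly the serialized vocabulary.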

    train_corpus.tokenize(vocab)
    eval_corpus.tokenize(vocab)

    ntokens = vocab.size()

    m = model.RNNModel(
        ntokens,
        hps['emsize'],
        hps['nhid'],
        hps['nlayers'],
        hps['dropout'],
        hps['tied'],
    ).to(device)

    criterion = torch.nn.CrossEntropyLoss()

    train_data = batchify(train_corpus.ids, hps['batch_size'], device)
    eval_data = batchify(eval_corpus.ids, hps['batch_size'], device)
    if test_corpus is not None:
        test_corpus.tokenize(vocab)
        test_data = batchify(test_corpus.ids, hps['batch_size'], device)
    else:
        test_data = None

    lr = hps['lr']
    best_val_loss = None
    val_loss_not_improved = 0
    n_batches = len(train_data) // hps['bptt']
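    # Number of BPTT windows per epoch, used to compute the global step below.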
    test_loss, test_perp = -1, -1

    # At any point you can hit Ctrl + C to break out of training early.
    print('-' * 95)
    try:
        for epoch in range(1, hps['epochs'] + 1):

            # Train for one epoch
            epoch_start_time = time.time()
            train_epoch(m, criterion, train_data, vocab, hps, lr, epoch,
                        device, summary)
            elapsed = time.time() - epoch_start_time
            step = hps['batch_size'] * helpers.get_num_batches_seen(
                epoch, n_batches, n_batches)

            # Evaluate model on validation set
            with torch.no_grad():
                val_loss = evaluate(m, criterion, eval_data, vocab, hps)
                val_perp = math.exp(val_loss)

            summary.add_scalar('ValidationLoss', val_loss, step)
            summary.add_scalar('ValidationPerp', val_perp, step)

            # Evaluate model on test set
            if test_data is not None:
                with torch.no_grad():
                    test_loss = evaluate(m, criterion, test_data, vocab, hps)
                    test_perp = math.exp(test_loss)
                    test_bpc = test_loss * math.log2(math.e)

                summary.add_scalar('TestLoss', test_loss, step)
                summary.add_scalar('TestPerp', test_perp, step)
                summary.add_scalar('TestBPC', test_bpc, step)

            helpers.log_end_of_epoch(epoch, elapsed, val_loss, val_perp,
                                     test_loss, test_perp)

            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                os.makedirs(os.path.dirname(hps['save']), exist_ok=True)
                with open(hps['save'], 'wb') as f:
                    torch.save(m, f)

                best_val_loss = val_loss
            else:
                val_loss_not_improved += 1
                if val_loss_not_improved == 3:
                    # Anneal the learning rate if no improvement has been seen in the validation dataset.
                    lr /= 4
                    val_loss_not_improved = 0

            summary.add_scalar('LR', lr, step)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
Example #28
                    self.example_q_threads[idx] = new_t
                    new_t.daemon = True
                    new_t.start()
            for idx, t in enumerate(self.batch_q_threads):
                if not t.is_alive():  # if the thread is dead
                    tf.logging.error(
                        "Found batch queue thread dead. Restarting.")
                    new_t = Thread(target=self.fill_batch_queue)
                    self.batch_q_threads[idx] = new_t
                    new_t.daemon = True
                    new_t.start()


###############################################################################
if __name__ == "__main__":
    vocab = data.Vocab("./data/vocab.txt", 0, 50)
    print vocab.tag_to_id
    print vocab.id_to_tag

    hps_dict = {
        "batch_size": 4,
        "max_steps": 50,
        "mode": "train",
        "single_pass": False,
    }
    hps = namedtuple("hps", hps_dict.keys())(**hps_dict)

    example = Example("现代化的战舰上", "BMESMES", vocab, hps)
    print example.sentence
    print example.label
    print example.len
Example #29
import data

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--content', metavar='content')
    args = parser.parse_args()
    hps = seq2seq_attention_model.HParams(
        mode='eval',  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.15,  # learning rate
        batch_size=1,
        enc_layers=4,
        enc_timesteps=120,
        dec_timesteps=30,
        min_input_len=2,  # discard articles/summaries shorter than this
        num_hidden=256,  # for rnn cell
        emb_dim=128,  # If 0, don't use embedding
        max_grad_norm=2,
        num_softmax_samples=4096)

    vocab = data.Vocab('vocabulary.txt', 1000000)
    model = seq2seq_attention_model.Seq2SeqAttentionModel(hps, vocab, num_gpus=0)
    model.build_graph()
    saver = tf.train.Saver()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    ckpt_state = tf.train.get_checkpoint_state('model_save')
    saver.restore(sess, ckpt_state.model_checkpoint_path)
    # targets, article_lens, abstract_lens, and loss_weights are prepared elsewhere in the original script
    (summaries, loss, train_step) = model.run_eval_step(
        sess, [args.content], [''], targets, article_lens,
        abstract_lens, loss_weights)
Example #30
def train(hps):
    train_corpus = data.Corpus(hps['train_corpus'], hps['tokenization'])
    eval_corpus = data.Corpus(hps['eval_corpus'], hps['tokenization'])
    token_list = train_corpus.export_token_list() + eval_corpus.export_token_list()
    if hps['test_corpus']:
        test_corpus = data.Corpus(hps['test_corpus'], hps['tokenization'])
        token_list += test_corpus.export_token_list()
    else:
        test_corpus = None

    vocab = data.Vocab(token_list)
    vocab.save(hps['vocab_file'])
    vocab = data.Vocab.load(hps['vocab_file'])

    train_corpus.tokenize(vocab)
    eval_corpus.tokenize(vocab)

    ntokens = vocab.size()
    m = model.RNNModel(
        ntokens,
        hps['emsize'],
        hps['nhid'],
        hps['nlayers'],
        hps['dropout'],
        hps['tied'])
    if hps['cuda']:
        m.cuda()

    criterion = torch.nn.CrossEntropyLoss()

    train_data = batchify(train_corpus.ids, hps['batch_size'], hps['cuda'])
    eval_data = batchify(eval_corpus.ids, hps['batch_size'], hps['cuda'])
    if test_corpus is not None:
        test_corpus.tokenize(vocab)
        test_data = batchify(test_corpus.ids, hps['batch_size'], hps['cuda'])
    else:
        test_data = None

    lr = hps['lr']
    best_val_loss = None

    # At any point you can hit Ctrl + C to break out of training early.
    print('-' * 95)
    try:
        for epoch in range(1, hps['epochs'] + 1):
            epoch_start_time = time.time()
            train_epoch(m, criterion, train_data, vocab, hps, lr, epoch)
            val_loss = evaluate(m, criterion, eval_data, vocab, hps)
            print('-' * 95)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:14.8f}'.format(
                    epoch, (time.time() - epoch_start_time), val_loss,
                    math.exp(val_loss)))
            if test_data is not None:
                test_loss = evaluate(m, criterion, test_data, vocab, hps)
                print(
                    '|                                  |  test loss {:5.2f} | '
                    ' test ppl {:14.8f}'.format(test_loss, math.exp(test_loss)))
            print('-' * 95)
            # Save the model if the validation loss is the best we've seen so
            # far.
            if not best_val_loss or val_loss < best_val_loss:
                os.makedirs(os.path.dirname(hps['save']), exist_ok=True)
                if hps['cuda']:
                    with open(hps['save'] + '.gpu', 'wb') as f:
                        torch.save(m, f)
                    m.cpu()
                    with open(hps['save'] + '.cpu', 'wb') as f:
                        torch.save(m, f)
                    m.cuda()
                else:
                    with open(hps['save'] + '.cpu', 'wb') as f:
                        torch.save(m, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen in
                # the validation dataset.
                lr /= 4
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')