Example #1
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''
  
    setup()
    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train'], tokenizer=tok)

    # Create validation environments
    val_envs = {split: (R2RBatch(features, batch_size=batch_size, splits=[split], 
                tokenizer=tok), Evaluation([split])) for split in ['val_seen', 'val_unseen']}

    # Build models and train
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx, 
                  dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                  action_embedding_size, hidden_size, dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, val_envs=val_envs)
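
For orientation, a minimal sketch of the vocab/tokenizer round-trip used above (hedged: it assumes the R2R baseline's Tokenizer.encode_sentence, which pads or truncates to encoding_length):

# Hedged sketch, not part of the original example.
vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
encoding = tok.encode_sentence('turn left and walk to the sofa')
assert len(encoding) == MAX_INPUT_LENGTH  # fixed-length array of word ids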
Example #2
def test_submission():
    ''' Train on combined training and validation sets, and generate test submission. '''
  
    setup()
    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAINVAL_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train', 'val_seen', 'val_unseen'], tokenizer=tok)
    
    # Build models and train
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx, 
                  dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
                  action_embedding_size, hidden_size, dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters)

    # Generate test submission
    test_env = R2RBatch(features, batch_size=batch_size, splits=['test'], tokenizer=tok)
    agent = Seq2SeqAgent(test_env, "", encoder, decoder, max_episode_len)
    agent.results_path = '%s%s_%s_iter_%d.json' % (RESULT_DIR, model_prefix, 'test', 20000)
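    # feedback='argmax' selects greedy decoding; dropout stays off for a deterministic test run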
    agent.test(use_dropout=False, feedback='argmax')
    agent.write_results()
Example #3
def eval(args):
    transsys_lookup = {
        "Cov": Covington,
        "NCov": NewCovington,
        "Cov2": Covington2,
        "Cov3": Covington3
    }
    transsys = transsys_lookup[args.transsys]

    vocab, vecs, pretrained = read_vocab(conll_file=args.conll_file,
                                         wordvec_file=args.wordvec_file,
                                         vocab_file=args.vocab_file,
                                         wordvec_dim=args.wordvec_dim,
                                         min_count=args.min_count,
                                         log=log)
    mappings, invmappings = read_mappings(args.mappings_file,
                                          transsys,
                                          log=log)
    data, sent_length, trans_length = read_data(conll_file=args.conll_file,
                                                seq_file=args.seq_file,
                                                vocab=vocab,
                                                mappings=mappings,
                                                transsys=transsys,
                                                fpos=args.fpos,
                                                log=log)

    if args.transsys == 'NCov':
        sent_length = 70

    feat_shape = [5] if args.transsys == 'Cov' else [sent_length, 5]

    transsys = transsys(mappings, invmappings)

    parser = Parser(args,
                    vecs,
                    pretrained,
                    mappings,
                    invmappings,
                    sent_length,
                    trans_length,
                    -1,
                    log,
                    train=False)

    trans_predictors = parser.trans_predictors

    log.info('Computational graph successfully built.')
    log.info('Setting up tensorflow session...')

    saver = tf.train.Saver(max_to_keep=10000)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)
    for epoch in reversed(xrange(int(args.epochs * args.epoch_multiplier))):
        #with tf.Session(config=config) as sess:
        savedpath = '%s/model_epoch%d' % (args.model_dir, epoch)
        if not op.exists(savedpath + '.meta'):
            continue
        log.info('Evaluating Epoch %3d...' % (epoch))
        saver.restore(sess, savedpath)

        #print "aki empieza a crear los estados"
        states = [[(0, ParserState(datum[0], transsys=transsys))]
                  for datum in data]
        #print "aki termina++++++++++++++++++++++++++++++++++++++++++++++"

        with smart_open('%s/%s_pos_eval_beam_%d_output_epoch%d.txt' %
                        (args.model_dir, args.eval_dataset, args.beam_size, epoch),
                        'w') as outf2:
            with smart_open('%s/%s_eval_beam_%d_output_epoch%d.txt' %
                            (args.model_dir, args.eval_dataset, args.beam_size, epoch),
                            'w') as outf:
                for batch in xrange(
                        (len(data) + args.batch_size - 1) / args.batch_size):
                    idx = range(batch * args.batch_size,
                                min((batch + 1) * args.batch_size, len(data)))

                    batch_size = len(idx)

                    batch_data = [data[i] for i in idx]
                    batch_states = [states[i] for i in idx]

                    # prepare data in tensor shape
                    batch_sent_lengths = np.array(
                        [len(datum[0]) for datum in batch_data] +
                        [sent_length] * (args.batch_size - batch_size),
                        dtype=np.int32)
                    batch_words = np.zeros((args.batch_size, sent_length),
                                           dtype=np.int32)
                    batch_words2 = np.zeros((args.batch_size, sent_length),
                                            dtype=np.int32)
                    batch_gold_pos = np.zeros((args.batch_size, sent_length),
                                              dtype=np.int32)
                    for i in xrange(batch_size):
                        batch_words[
                            i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_words2[
                            i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_gold_pos[
                            i, :batch_sent_lengths[i]] = batch_data[i][2]

                    batch_trans_feat_ids = np.zeros(
                        tuple([args.batch_size * args.beam_size] + feat_shape),
                        dtype=np.int32)
                    batch_trans_feat_sizes = np.zeros(
                        (args.batch_size * args.beam_size), dtype=np.int32)

                    preds_list = [
                        parser.combined_head, parser.combined_dep,
                        parser.pos_preds
                    ]

                    if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                        preds_list += [parser.transition_logit]
                    if args.fpos:
                        preds_list += [parser.fpos_preds]

                    preds = sess.run(preds_list,
                                     feed_dict={
                                         parser.words: batch_words,
                                         parser.words2: batch_words2,
                                         parser.sent_lengths:
                                         batch_sent_lengths,
                                         parser.gold_pos: batch_gold_pos,
                                     })
                    # unpack predictions
                    batch_combined_head, batch_combined_dep, pos_preds = preds[:3]
                    preds = preds[3:]
                    if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                        batch_trans_logit = preds[0]
                        preds = preds[1:]
                    if args.fpos:
                        fpos_preds = preds[0]
                        preds = preds[1:]

                    if args.fpos:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write(
                                    "%s\t%s\n" %
                                    (invmappings['pos'][pos_preds[i][j]],
                                     invmappings['fpos'][fpos_preds[i][j]]))
                            outf2.write("\n")
                    else:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i] - 1):
                                outf2.write(
                                    "%s\t_\n" %
                                    invmappings['pos'][pos_preds[i][j]])
                            outf2.write("\n")

                    j = 0
                    updated = range(batch_size)
                    batch_finished = [[] for _ in range(batch_size)]

                    feat_lengths = [[] for _ in range(batch_size)]
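                    # Beam search over transition sequences: batch_states[i] holds up to
                    # beam_size (score, ParserState) hypotheses per sentence; finished
                    # parses accumulate in the min-heap batch_finished[i].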

                    while True:
                        batch_feats = [[
                            featurize_state(batch_states[i][k][1], mappings)
                            for k in range(len(batch_states[i]))
                        ] for i in updated]

                        for i, beam_feats in zip(updated, batch_feats):
                            feats = beam_feats[0]
                            if len(feats) > 0:
                                if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                                    feat_lengths[i] += [len(feats)]
                                else:
                                    feat_lengths[i] += [
                                        len(batch_states[i][0][1].transitionset())
                                    ]

                        preds = []
                        predsid = []
                        for i, beam_feats in zip(updated, batch_feats):
                            for k, feats in enumerate(beam_feats):
                                if len(feats) <= 0:
                                    if len(batch_finished[i]) < args.beam_size:
                                        heappush(batch_finished[i],
                                                 batch_states[i][k])
                                    else:
                                        heappushpop(batch_finished[i],
                                                    batch_states[i][k])

                                    continue

                                beamidx = i * args.beam_size + k
                                if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                                    batch_trans_feat_ids[beamidx, :len(feats)] = feats
                                else:
                                    batch_trans_feat_ids[beamidx] = feats

                                batch_trans_feat_sizes[beamidx] = len(feats)

                                assert (batch_trans_feat_sizes[beamidx] > 0)

                                predsid.append((i, k))
                                preds.append(trans_predictors[i][k])

                        if len(predsid) <= 0:
                            break

                        if args.transsys in ('NCov', 'Cov2', 'Cov3'):
                            p = sess.run(preds,
                                         feed_dict={
                                             parser.combined_head_placeholder:
                                             batch_combined_head,
                                             parser.combined_dep_placeholder:
                                             batch_combined_dep,
                                             parser.trans_logit_placeholder:
                                             batch_trans_logit,
                                             parser.trans_feat_ids:
                                             batch_trans_feat_ids,
                                             parser.trans_feat_sizes:
                                             batch_trans_feat_sizes
                                         })
                        else:
                            p = sess.run(preds,
                                         feed_dict={
                                             parser.combined_head_placeholder:
                                             batch_combined_head,
                                             parser.combined_dep_placeholder:
                                             batch_combined_dep,
                                             parser.trans_feat_ids:
                                             batch_trans_feat_ids,
                                             parser.trans_feat_sizes:
                                             batch_trans_feat_sizes
                                         })

                        next_batchstates = [[] for _ in xrange(batch_size)]
                        updated = set()

                        for ik, pred in izip(predsid, p):
                            i, k = ik

                            updated.add(i)

                            #print("deberia ser 0 al final", len(batch_states[i][k][1].transitionset()))
                            if len(batch_states[i][k][1].transitionset()) > 0:
                                # model outputs NLLs so the lower the better
                                sort = sorted(enumerate(pred),
                                              key=lambda x: x[1])
                                expanded_beams = 0
                                for choice, score in sort:
                                    newscore = batch_states[i][k][0] - score

                                    #print 'transition set'
                                    #print sort
                                    #print( 'choice', choice)

                                    #print  transsys.tuple_trans_from_int(batch_states[i][k][1].transitionset(), choice)[0]
                                    #print 'allowed antes de entrar'
                                    #print  batch_states[i][k][1].transitionset()
                                    if transsys.tuple_trans_from_int(
                                            batch_states[i][k]
                                        [1].transitionset(),
                                            choice)[0] in batch_states[i][k][
                                                1].transitionset():
                                        candidate = (newscore,
                                                     batch_states[i][k][1],
                                                     choice)
                                        if len(next_batchstates[i]
                                               ) < args.beam_size:
                                            heappush(next_batchstates[i],
                                                     candidate)
                                        elif newscore > next_batchstates[i][0][
                                                0]:
                                            heappushpop(
                                                next_batchstates[i], candidate)

                                        #print 'candidadte'
                                        #print candidate
                                        expanded_beams += 1
                                        if expanded_beams >= args.beam_size:
                                            break

                        for i in updated:
                            next_batchstates[i] = nlargest(args.beam_size,
                                                           next_batchstates[i],
                                                           key=lambda x: x[0])
                            for k, t in enumerate(next_batchstates[i]):
                                score, state, choice = t
                                state = state.clone()
                                transsys.advance(state, choice)
                                next_batchstates[i][k] = (score, state)

                        batch_states = next_batchstates

                        j += 1

                    for i in xrange(batch_size):
                        assert len(batch_finished) == batch_size
                        assert len(batch_finished[i]) > 0, "nothing finished: %d" % i
                        assert len(batch_finished[i][0]) > 1, "%s" % batch_finished[i][0]
                        state_pred = nlargest(1,
                                              batch_finished[i],
                                              key=lambda x: x[0])[0][1]
                        for t in state_pred.head[1:]:
                            outf.write("%d\t%s\n" %
                                       (t[0], invmappings['rel'][t[1]]))
                        outf.write("\n")

                    log.info('Epoch %3d batch %4d' % (epoch, batch))
        log.info('Using exclusively the model from the last epoch')  # added to evaluate only the last saved model
        break
    sess.close()
Example #4
def train(language, embed_size, mode='proj',
          model_save_path=utils.DEFAULT_MODEL_PATH,
          data_path=utils.DEFAULT_DATA_PATH,
          batch_size=DEFAULT_BATCH_SIZE,
          weighted=False, normalize=False, use_bias=True):
    # type: (str, int, str, str, str, int, bool, bool, bool) -> None
    assert os.path.isdir(data_path), 'Data path %s doesn\'t exist' % data_path
    assert language == 'global' or language in utils.ECOSYSTEMS, \
        'Unknown programming language'
    assert mode in _MODE_FNAMES, (
            "Invalid mode '%s'. Should be one of %s" % (
                mode, ','.join(_MODE_FNAMES)))

    model_save_fname = os.path.join(model_save_path, '%s_%s_%d%s%s%s.model' % (
        language, mode, embed_size, '_norm' if normalize else '',
        '_no_bias' if not use_bias else '',
        '_weighted' if weighted else '',
    ))
    sys.stderr.write('The model will be saved to: %s\n' % model_save_fname)

    if not os.path.isdir(model_save_path):
        os.mkdir(model_save_path)

    vocab_path = os.path.join(data_path, language + '_vocab.csv')
    if language == 'global' and not os.path.isfile(vocab_path):
        # TODO: implement
        raise NotImplementedError
        # build_global_data(language)

    sys.stderr.write('Reading vocabulary..\n')
    idx2namespace, namespace2idx = utils.read_vocab(vocab_path)
    vocab_size = len(idx2namespace)

    sys.stderr.write('Reading dataset..\n')
    csv.field_size_limit(2147483647)  # DANGER ZONE, but won't read otherwise
    imports_prefix = '%s_%s_imports_' % (language, mode)

    input_data_train, input_offsets_train = utils.read_dev(
        os.path.join(data_path, imports_prefix + 'train.csv'), namespace2idx)
    input_data_val, input_offsets_val = utils.read_dev(
        os.path.join(data_path, imports_prefix + 'val.csv'), namespace2idx)

    dataset_train = dev2vecSequence(
        input_data_train, input_offsets_train,
        vocab_size=vocab_size, batch_size=batch_size)
    dataset_val = dev2vecSequence(
        input_data_val, input_offsets_val,
        vocab_size=vocab_size, batch_size=batch_size)

    model = get_nn_model(
        vocab_size, embed_size, normalize=normalize, batch_size=batch_size)

    loss_fn = 'binary_crossentropy'
    if weighted:
        counts_fname = os.path.join(
            data_path, '%s_namespace_counts_by_%s.csv' % (
                language, 'projects' if mode == 'proj' else mode))
        counts = pd.read_csv(
            counts_fname, header=None, index_col=0, squeeze=True)
        counts = counts[
            [idx2namespace[idx] for idx in range(len(idx2namespace))]]
        label_weights = counts / counts.median()
        loss_fn = get_weighted_loss_fn(label_weights)

    model.compile(optimizer='adam', loss=loss_fn)
    # in most cases, model starts to overfit after less than one epoch
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4)

    sys.stderr.write('Training..\n')
    model.fit(dataset_train, epochs=2, callbacks=[callback],
              validation_data=dataset_val)

    sys.stderr.write('Saving the model..\n')
    model.save(model_save_fname)
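
For reference, a minimal sketch of the kind of per-label weighted binary cross-entropy that get_weighted_loss_fn presumably returns (hedged: the repo's actual implementation is not shown in this example):

import tensorflow as tf

# Hedged sketch, not the repo's code: scale per-label binary cross-entropy by
# the precomputed label weights (a pandas Series aligned with the label axis).
def get_weighted_loss_fn(label_weights):
    w = tf.constant(label_weights.values, dtype=tf.float32)

    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        bce = -(y_true * tf.math.log(y_pred) +
                (1.0 - y_true) * tf.math.log(1.0 - y_pred))
        return tf.reduce_mean(bce * w, axis=-1)

    return loss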
Example #5
def eval(args):
    transsys_lookup = {"ASw": ArcSwift,
                       "AER": ArcEagerReduce,
                       "AES": ArcEagerShift,
                       "ASd": ArcStandard,
                       "AH": ArcHybrid}
    transsys = transsys_lookup[args.transsys]

    vocab, vecs, pretrained = read_vocab(conll_file=args.conll_file, wordvec_file=args.wordvec_file, vocab_file=args.vocab_file, wordvec_dim=args.wordvec_dim, min_count=args.min_count, log=log)
    mappings, invmappings = read_mappings(args.mappings_file, transsys, log=log)
    data, sent_length, trans_length = read_data(conll_file=args.conll_file, seq_file=args.seq_file, vocab=vocab, mappings=mappings, transsys=transsys, fpos=args.fpos, log=log)

    feat_shape = [5] if args.transsys != 'ASw' else [sent_length, 5]

    transsys = transsys(mappings, invmappings)

    parser = Parser(args, vecs, pretrained, mappings, invmappings, sent_length, trans_length, -1, log, train=False)

    trans_predictors = parser.trans_predictors

    log.info('Computational graph successfully built.')
    log.info('Setting up tensorflow session...')

    saver = tf.train.Saver(max_to_keep=10000)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)
    for epoch in reversed(xrange(int(args.epochs * args.epoch_multiplier))):
        #with tf.Session(config=config) as sess:
        savedpath = '%s/model_epoch%d' % (args.model_dir, epoch)
        if not op.exists(savedpath + '.meta'):
            continue
        log.info('Evaluating Epoch %3d...' % (epoch))
        saver.restore(sess, savedpath)

        states = [[(0, ParserState(datum[0], transsys=transsys))] for datum in data]
        with smart_open('%s/%s_pos_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf2:
            with smart_open('%s/%s_eval_beam_%d_output_epoch%d.txt' % (args.model_dir, args.eval_dataset, args.beam_size, epoch), 'w') as outf:
                for batch in xrange((len(data)+args.batch_size-1) / args.batch_size):
                    idx = range(batch * args.batch_size, min((batch+1) * args.batch_size, len(data)))

                    batch_size = len(idx)

                    batch_data = [data[i] for i in idx]
                    batch_states = [states[i] for i in idx]

                    # prepare data in tensor shape
                    batch_sent_lengths = np.array([len(datum[0]) for datum in batch_data] + [sent_length] * (args.batch_size - batch_size), dtype=np.int32)
                    batch_words = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_words2 = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    batch_gold_pos = np.zeros((args.batch_size, sent_length), dtype=np.int32)
                    for i in xrange(batch_size):
                        batch_words[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_words2[i, :batch_sent_lengths[i]] = batch_data[i][0]
                        batch_gold_pos[i, :batch_sent_lengths[i]] = batch_data[i][2]

                    batch_trans_feat_ids = np.zeros(tuple([args.batch_size * args.beam_size] + feat_shape), dtype=np.int32)
                    batch_trans_feat_sizes = np.zeros((args.batch_size * args.beam_size), dtype=np.int32)

                    preds_list = [parser.combined_head, parser.combined_dep, parser.pos_preds]
                    if args.transsys == 'ASw':
                        preds_list += [parser.transition_logit]
                    if args.fpos:
                        preds_list += [parser.fpos_preds]

                    preds = sess.run(preds_list,
                               feed_dict={parser.words: batch_words,
                                          parser.words2: batch_words2,
                                          parser.sent_lengths: batch_sent_lengths,
                                          parser.gold_pos: batch_gold_pos,})
                    # unpack predictions
                    batch_combined_head, batch_combined_dep, pos_preds = preds[:3]
                    preds = preds[3:]
                    if args.transsys == 'ASw':
                        batch_trans_logit = preds[0]
                        preds = preds[1:]
                    if args.fpos:
                        fpos_preds = preds[0]
                        preds = preds[1:]

                    if args.fpos:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i]-1):
                                outf2.write("%s\t%s\n" % (invmappings['pos'][pos_preds[i][j]], invmappings['fpos'][fpos_preds[i][j]]))
                            outf2.write("\n")
                    else:
                        for i in xrange(batch_size):
                            for j in xrange(batch_sent_lengths[i]-1):
                                outf2.write("%s\t_\n" % invmappings['pos'][pos_preds[i][j]])
                            outf2.write("\n")

                    j = 0
                    updated = range(batch_size)
                    batch_finished = [[] for _ in range(batch_size)]

                    feat_lengths = [[] for _ in range(batch_size)]
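                    # Beam search over transition sequences: batch_states[i] holds up to
                    # beam_size (score, ParserState) hypotheses per sentence; finished
                    # parses accumulate in the min-heap batch_finished[i].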

                    while True:
                        batch_feats = [[featurize_state(batch_states[i][k][1], mappings) for k in range(len(batch_states[i]))] for i in updated]
                        for i, beam_feats in zip(updated, batch_feats):
                            feats = beam_feats[0]
                            if len(feats) > 0:
                                if args.transsys == 'ASw':
                                    feat_lengths[i] += [len(feats)]
                                else:
                                    feat_lengths[i] += [len(batch_states[i][0][1].transitionset())]

                        preds = []
                        predsid = []
                        for i, beam_feats in zip(updated, batch_feats):
                            for k, feats in enumerate(beam_feats):
                                if len(feats) <= 0:
                                    if len(batch_finished[i]) < args.beam_size:
                                        heappush(batch_finished[i], batch_states[i][k])
                                    else:
                                        heappushpop(batch_finished[i], batch_states[i][k])

                                    continue

                                beamidx = i * args.beam_size + k
                                if args.transsys == 'ASw':
                                    batch_trans_feat_ids[beamidx, :len(feats)] = feats
                                else:
                                    batch_trans_feat_ids[beamidx] = feats

                                batch_trans_feat_sizes[beamidx] = len(feats)

                                assert(batch_trans_feat_sizes[beamidx] > 0)

                                predsid.append((i, k))
                                preds.append(trans_predictors[i][k])

                        if len(predsid) <= 0:
                            break

                        if args.transsys == 'ASw':
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                       parser.combined_dep_placeholder: batch_combined_dep,
                                                       parser.trans_logit_placeholder:batch_trans_logit,
                                                       parser.trans_feat_ids: batch_trans_feat_ids,
                                                       parser.trans_feat_sizes: batch_trans_feat_sizes})
                        else:
                            p = sess.run(preds, feed_dict={parser.combined_head_placeholder: batch_combined_head,
                                                       parser.combined_dep_placeholder: batch_combined_dep,
                                                       parser.trans_feat_ids: batch_trans_feat_ids,
                                                       parser.trans_feat_sizes: batch_trans_feat_sizes})

                        next_batchstates = [[] for _ in xrange(batch_size)]
                        updated = set()
                        for ik, pred in izip(predsid, p):
                            i, k = ik

                            updated.add(i)

                            if len(batch_states[i][k][1].transitionset()) > 0:
                                # model outputs NLLs so the lower the better
                                sort = sorted(enumerate(pred), key=lambda x: x[1])
                                expanded_beams = 0
                                for choice, score in sort:
                                    newscore = batch_states[i][k][0] - score

                                    if transsys.tuple_trans_from_int(batch_states[i][k][1].transitionset(), choice)[0] in batch_states[i][k][1].transitionset():
                                        candidate = (newscore, batch_states[i][k][1], choice)
                                        if len(next_batchstates[i]) < args.beam_size:
                                            heappush(next_batchstates[i], candidate)
                                        elif newscore > next_batchstates[i][0][0]:
                                            heappushpop(next_batchstates[i], candidate)

                                        expanded_beams += 1
                                        if expanded_beams >= args.beam_size:
                                            break

                        for i in updated:
                            next_batchstates[i] = nlargest(args.beam_size, next_batchstates[i], key=lambda x:x[0])
                            for k, t in enumerate(next_batchstates[i]):
                                score, state, choice = t
                                state = state.clone()
                                transsys.advance(state, choice)
                                next_batchstates[i][k] = (score, state)

                        batch_states = next_batchstates

                        j += 1

                    for i in xrange(batch_size):
                        assert len(batch_finished) == batch_size
                        assert len(batch_finished[i]) > 0, "nothing finished: %d" % (i)
                        assert len(batch_finished[i][0]) > 1, "%s" % (batch_finished[i][0])
                        state_pred = nlargest(1, batch_finished[i], key=lambda x:x[0])[0][1]
                        for t in state_pred.head[1:]:
                            outf.write("%d\t%s\n" % (t[0], invmappings['rel'][t[1]]))
                        outf.write("\n")

                    log.info('Epoch %3d batch %4d' % (epoch, batch))
    sess.close()
Example #6
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''

    # Set which GPU to use
    device = torch.device('cuda', hparams.device_id)

    # Load hyperparameters from checkpoint (if it exists)
    start_iter = 0
    if os.path.exists(hparams.load_path):
        print('Load model from %s' % hparams.load_path)
        ckpt = load(hparams.load_path, device)
        start_iter = ckpt['iter']
    else:
        if not hparams.forward_agent and not hparams.random_agent and not hparams.shortest_agent:
            if hasattr(hparams, 'load_path') and hasattr(
                    hparams, 'eval_only') and hparams.eval_only:
                sys.exit('load_path %s does not exist!' % hparams.load_path)
        ckpt = None
    end_iter = hparams.n_iters

    if not hasattr(hparams, 'ask_baseline'):
        hparams.ask_baseline = None
    if not hasattr(hparams, 'instruction_baseline'):
        hparams.instruction_baseline = None

    # Set random seeds
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    np.random.seed(hparams.seed)
    random.seed(hparams.seed)

    # Create or load vocab
    train_vocab_path = os.path.join(hparams.data_path, 'vocab.txt')
    if not os.path.exists(train_vocab_path):
        raise Exception('Vocab file not found at %s' % train_vocab_path)
    vocab = read_vocab([train_vocab_path])
    hparams.instr_padding_idx = vocab.index('<PAD>')

    tokenizer = Tokenizer(vocab=vocab, encoding_length=hparams.max_instr_len)
    featurizer = ImageFeatures(hparams.img_features, device)
    simulator = Simulator(hparams)

    # Create train environment
    train_env = Batch(hparams, simulator, featurizer, tokenizer, split='train')

    # Create validation environments
    val_splits = ['val_seen', 'val_unseen']
    eval_mode = hasattr(hparams, 'eval_only') and hparams.eval_only
    if eval_mode:
        if 'val_seen' in hparams.load_path:
            val_splits = ['test_seen']
        elif 'val_unseen' in hparams.load_path:
            val_splits = ['test_unseen']
        else:
            val_splits = ['test_seen', 'test_unseen']
        end_iter = start_iter + 1

    if hparams.eval_on_val:
        val_splits = [x.replace('test_', 'val_') for x in val_splits]

    val_envs_tmp = {
        split:
        (Batch(hparams, simulator, featurizer, tokenizer,
               split=split), Evaluation(hparams, [split], hparams.data_path))
        for split in val_splits
    }

    val_envs = {}
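    # Each *_seen split is registered under two keys (seen/unseen ANNA variants)
    # that share the same underlying environment; *_unseen splits keep one key.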
    for key, value in val_envs_tmp.items():
        if '_seen' in key:
            val_envs[key + '_env_seen_anna'] = value
            val_envs[key + '_env_unseen_anna'] = value
        else:
            assert '_unseen' in key
            val_envs[key] = value

    # Build model and optimizer
    model = AgentModel(len(vocab), hparams, device).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=hparams.lr,
                           weight_decay=hparams.weight_decay)

    best_metrics = {env_name: -1 for env_name in val_envs.keys()}
    best_metrics['combined'] = -1

    # Load model parameters from checkpoint (if it exists)
    if ckpt is not None:
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optim_state_dict'])
        best_metrics = ckpt['best_metrics']
        train_env.ix = ckpt['data_idx']

    if hparams.log_every == -1:
        hparams.log_every = round(
            len(train_env.data) / (hparams.batch_size * 100)) * 100

    print('')
    pprint(vars(hparams), width=1)
    print('')
    print(model)
    print('Number of parameters:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    if hparams.random_agent or hparams.forward_agent or hparams.shortest_agent:
        assert eval_mode
        agent = SimpleAgent(hparams)
    else:
        agent = VerbalAskAgent(model, hparams, device)

    return train(train_env, val_envs, agent, model, optimizer, start_iter,
                 end_iter, best_metrics, eval_mode)
Example #7
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import tensorflow as tf
from model import NerModel
from utils import tokenize, read_vocab, format_result
import tensorflow_addons as tf_ad
from args_help import args
import json


vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
# labels {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
text_sequences, label_sequences = tokenize(args.test_path, vocab2id, tag2id)


optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num, vocab_size=len(vocab2id), label_size=len(tag2id), embedding_size=args.embedding_size)
# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))


while True:
    text = input("input:")
    dataset = tf.keras.preprocessing.sequence.pad_sequences([[vocab2id.get(char, 0) for char in text]], padding='post')
    print('dataset', dataset)
    logits, text_lens = model.predict(dataset)

    print('logits.shape', logits.shape)
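
The loop above stops after printing the logits. A hedged continuation sketch (it assumes the model exposes a CRF transition matrix as model.transition_params, which is common in tensorflow_addons-based NER models but is not confirmed by this snippet):

    # Hedged sketch, not part of the original example: Viterbi-decode each
    # sequence of logits into tag ids, then map the ids back to tag names.
    for logit, text_len in zip(logits, text_lens):
        viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params)
        print([id2tag[t] for t in viterbi_path])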
Example #8
def main(args):
    model_prefix = '{}_{}'.format(args.model_type, args.train_id)
    
    log_path = args.LOG_DIR + model_prefix + '/'
    checkpoint_path = args.CHK_DIR + model_prefix + '/'
    result_path = args.RESULT_DIR + model_prefix + '/'
    cp_file = checkpoint_path + "best_model.pth.tar"
    init_epoch = 0

    if not os.path.exists(log_path):
        os.makedirs(log_path)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    ## set up the logger
    set_logger(os.path.join(log_path, 'train.log'))

    ## save argparse parameters
    with open(log_path+'args.yaml', 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))

    logging.info('Training model: {}'.format(model_prefix))

    ## set up vocab txt
    # create txt here
    setup(args, clear=True)
    print(args.__dict__)

    # indicate src and tgt language
    src, tgt = 'en', 'zh'
    maps = {'en':args.TRAIN_VOCAB_EN, 'zh':args.TRAIN_VOCAB_ZH}
    vocab_src = read_vocab(maps[src])
    tok_src = Tokenizer(language=src, vocab=vocab_src, encoding_length=args.MAX_INPUT_LENGTH, zh_tok='jieba')
    vocab_tgt = read_vocab(maps[tgt])
    tok_tgt = Tokenizer(language=tgt, vocab=vocab_tgt, encoding_length=args.MAX_INPUT_LENGTH, zh_tok='jieba')
    logging.info('Vocab size src/tgt:{}/{}'.format( len(vocab_src), len(vocab_tgt)) )

    ## Setup the training, validation, and testing dataloaders
    train_loader, val_loader, test_loader = create_split_loaders(args.DATA_DIR, (tok_src, tok_tgt), args.batch_size, args.MAX_VID_LENGTH, (src, tgt), num_workers=4, pin_memory=True)
    logging.info('train/val/test size: {}/{}/{}'.format( len(train_loader), len(val_loader), len(test_loader) ))

    ## init model
    if args.model_type == 's2s':
        encoder = Encoder(vocab_size=len(vocab_src), embed_size=args.wordembed_dim, hidden_size=args.enc_hid_size).cuda()
        decoder = Decoder(embed_size=args.wordembed_dim, hidden_size=args.dec_hid_size, vocab_size=len(vocab_tgt)).cuda()
    else:
        raise ValueError('Unsupported model_type: {}'.format(args.model_type))

    encoder.train()
    decoder.train()

    ## define loss
    criterion = nn.CrossEntropyLoss(ignore_index=padding_idx).cuda()
    ## init optimizer 
    dec_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                             lr=args.decoder_lr, weight_decay=args.weight_decay)
    enc_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=args.encoder_lr, weight_decay=args.weight_decay)

    count_paras(encoder, decoder, logging)

    ## track loss during training
    total_train_loss, total_val_loss = [], []
    best_val_bleu, best_epoch = 0, 0

    ## init time
    zero_time = time.time()

    # Begin training procedure
    earlystop_flag = False
    rising_count = 0

    for epoch in range(init_epoch, args.epochs):
        ## train for one epoch
        start_time = time.time()
        train_loss = train(train_loader, encoder, decoder, criterion, enc_optimizer, dec_optimizer, epoch)

        val_loss, sentbleu, corpbleu = validate(val_loader, encoder, decoder, criterion, tok_tgt)
        end_time = time.time()

        epoch_time = end_time - start_time
        total_time = end_time - zero_time
        
        logging.info('Total time used: %s Epoch %d time used: %s train loss: %.4f val loss: %.4f sentbleu: %.4f corpbleu: %.4f' % (
                str(datetime.timedelta(seconds=int(total_time))),
                epoch, str(datetime.timedelta(seconds=int(epoch_time))), train_loss, val_loss, sentbleu, corpbleu))

        if corpbleu > best_val_bleu:
            best_val_bleu = corpbleu
            save_checkpoint({ 'epoch': epoch, 
                'enc_state_dict': encoder.state_dict(), 'dec_state_dict': decoder.state_dict(),
                'enc_optimizer': enc_optimizer.state_dict(), 'dec_optimizer': dec_optimizer.state_dict(),
                }, cp_file)
            best_epoch = epoch

        logging.info("Finished {0} epochs of training".format(epoch+1))

        total_train_loss.append(train_loss)
        total_val_loss.append(val_loss)

    logging.info('Best corpus bleu score {:.4f} at epoch {}'.format(best_val_bleu, best_epoch))

    ### the best model is the last model saved in our implementation
    logging.info('************ Start eval... ************')
    # Evaluate on the test dataset
    eval(test_loader, encoder, decoder, cp_file, tok_tgt, result_path)
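
For reference, a hedged sketch of reloading the checkpoint saved above (the key names come from the save_checkpoint call in this example):

# Hypothetical reload of the best checkpoint; not part of the original example.
ckpt = torch.load(cp_file)
encoder.load_state_dict(ckpt['enc_state_dict'])
decoder.load_state_dict(ckpt['dec_state_dict'])
enc_optimizer.load_state_dict(ckpt['enc_optimizer'])
dec_optimizer.load_state_dict(ckpt['dec_optimizer'])
init_epoch = ckpt['epoch'] + 1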
Example #9
def train(opt):
    device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    output_file = open(opt.saved_path + os.sep + "logs.txt", "w")
    output_file.write("Model's parameters: {}".format(vars(opt)))
    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True
    }
    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False
    }

    # max_word_length, max_sent_length = get_max_lengths(opt.train_set)
    max_word_length, max_sent_length = 13, 24
    vocab = read_vocab('data/yelp_review_full_csv/train.csv.txt')
    emb, word_to_ix = get_pretrained_word_embedding(opt.word2vec_path, vocab)
    df = pd.read_csv(opt.train_set, names=['label', 'text'])
    texts = np.array(df['text'])
    labels = np.array(df['label'])
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    for train_index, test_index in sss.split(texts, labels):
        X_train, X_valid = texts[train_index], texts[test_index]
        y_train, y_valid = labels[train_index], labels[test_index]
    training_set = Custom_Dataset(X_train, y_train, word_to_ix,
                                  max_sent_length, max_word_length)
    valid_set = Custom_Dataset(X_valid, y_valid, word_to_ix, max_sent_length,
                               max_word_length)
    training_generator = DataLoader(training_set,
                                    num_workers=32,
                                    **training_params)
    valid_generator = DataLoader(valid_set, num_workers=32, **training_params)
    df_test = pd.read_csv(opt.test_set, names=['label', 'text'])
    test_texts = np.array(df_test['text'])
    test_labels = np.array(df_test['label'])
    test_set = Custom_Dataset(test_texts, test_labels, word_to_ix,
                              max_sent_length, max_word_length)
    test_generator = DataLoader(test_set, num_workers=32, **test_params)

    model = nn.DataParallel(
        HierarchicalAttention(opt.word_hidden_size, opt.sent_hidden_size,
                              opt.batch_size, training_set.num_classes, emb,
                              max_sent_length, max_word_length))

    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)

    if torch.cuda.is_available():
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=opt.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           factor=0.1,
                                                           patience=5,
                                                           verbose=True,
                                                           min_lr=1e-8)
    best_acc = 0.
    best_epoch = 0
    model.train()
    num_iter_per_epoch = len(training_generator)

    for epoch in range(opt.num_epoches):
        print("Epoch " + str(epoch))
        for feature, label, doc_len, sent_len in training_generator:
            if torch.cuda.is_available():
                sent_len = torch.stack(sent_len, dim=1).to(device)
                doc_len = doc_len.to(device)
                feature = feature.to(device)
                label = label.to(device)
            optimizer.zero_grad()
            predictions = model(feature, sent_len, doc_len)
            loss = criterion(predictions, label)
            loss.backward()
            optimizer.step()
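        # Note: the training metrics below are computed on the last batch of
        # the epoch only, not averaged over the whole epoch.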
        training_metrics = get_evaluation(label.cpu().numpy(),
                                          predictions.cpu().detach().numpy(),
                                          list_metrics=["accuracy"])
        print("Epoch: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
            epoch + 1, opt.num_epoches, optimizer.param_groups[0]['lr'], loss,
            training_metrics["accuracy"]))

        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            te_label_ls = []
            te_pred_ls = []
            for te_feature, te_label, te_doc_len, te_sent_len in test_generator:
                num_sample = len(te_label)
                if torch.cuda.is_available():
                    te_sent_len = torch.stack(te_sent_len, dim=1).to(device)
                    te_doc_len = te_doc_len.to(device)
                    te_feature = te_feature.to(device)
                    te_label = te_label.to(device)

                with torch.no_grad():
                    te_predictions = model(te_feature, te_sent_len, te_doc_len)
                te_loss = criterion(te_predictions, te_label)
                loss_ls.append(te_loss * num_sample)
                te_label_ls.extend(te_label.clone().cpu())
                te_pred_ls.append(te_predictions.clone().cpu())
            te_loss = sum(loss_ls) / len(test_set)
            te_pred = torch.cat(te_pred_ls, 0)
            te_label = np.array(te_label_ls)
            test_metrics = get_evaluation(
                te_label,
                te_pred.numpy(),
                list_metrics=["accuracy", "confusion_matrix"])
            vl_loss_ls = []
            vl_label_ls = []
            vl_pred_ls = []
            for vl_feature, vl_label, vl_doc_len, vl_sent_len in valid_generator:
                num_sample = len(vl_label)
                if torch.cuda.is_available():
                    vl_sent_len = torch.stack(vl_sent_len, dim=1).to(device)
                    vl_doc_len = vl_doc_len.to(device)
                    vl_feature = vl_feature.to(device)
                    vl_label = vl_label.to(device)
                with torch.no_grad():
                    vl_predictions = model(vl_feature, vl_sent_len, vl_doc_len)
                vl_loss = criterion(vl_predictions, vl_label)
                vl_loss_ls.append(vl_loss * num_sample)
                vl_label_ls.extend(vl_label.clone().cpu())
                vl_pred_ls.append(vl_predictions.clone().cpu())
            vl_loss = sum(vl_loss_ls) / len(valid_set)
            vl_pred = torch.cat(vl_pred_ls, 0)
            vl_label = np.array(vl_label_ls)
            vl_metrics = get_evaluation(
                vl_label,
                vl_pred.numpy(),
                list_metrics=["accuracy", "confusion_matrix"])

            output_file.write(
                "Epoch: {}/{} \nValid loss: {} Valid accuracy: {} \nValid confusion matrix: \n{}\nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n"
                .format(epoch + 1, opt.num_epoches, vl_loss,
                        vl_metrics["accuracy"], vl_metrics["confusion_matrix"],
                        te_loss, test_metrics["accuracy"],
                        test_metrics["confusion_matrix"]))
            print(
                "Epoch: {}/{}, Lr: {},Valid Loss: {}, Valid Accuracy: {}, Test Loss: {}, Test Accuracy: {}"
                .format(epoch + 1, opt.num_epoches,
                        optimizer.param_groups[0]['lr'], vl_loss,
                        vl_metrics["accuracy"], te_loss,
                        test_metrics["accuracy"]))
            scheduler.step(vl_metrics["accuracy"])
            model.train()
            if vl_metrics["accuracy"] > best_acc:
                best_acc = vl_metrics["accuracy"]
                best_epoch = epoch
                torch.save(model, opt.saved_path + os.sep + "whole_model_han")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    "Stop training at epoch {}. The lowest loss achieved is {}"
                    .format(epoch, te_loss))
                break
Example #10
def read_words(filename, vocab_filename):
    vocab = read_vocab(vocab_filename)
    return [
        word for line in read_file(filename) for word in line.split()
        if word in vocab
    ]
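
A hedged usage sketch (it assumes a whitespace-tokenized corpus file and a vocabulary file in whatever format this module's read_vocab expects):

# Hypothetical call, not part of the original example.
words = read_words('corpus.txt', 'vocab.txt')
print(len(words), 'in-vocabulary tokens')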
Example #11
def main(args):

    ####################
    # Arguments
    gpu = args.gpu
    model_name = args.model
    initial_tree_sampling = args.initial_tree_sampling
    path_config = args.config
    data_augmentation = args.data_augmentation
    trial_name = args.name
    actiontype = args.actiontype
    max_epoch = args.max_epoch
    dev_size = args.dev_size

    # Check
    assert actiontype in ["train", "evaluate"]
    if actiontype == "train":
        assert max_epoch > 0
    assert len(initial_tree_sampling.split("_")) == 3
    for type_ in initial_tree_sampling.split("_"):
        assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"]
    assert initial_tree_sampling.split("_")[2] != "X"
    assert initial_tree_sampling.split("_")[1] != "RB2"
    assert initial_tree_sampling.split("_")[2] != "RB2"

    if trial_name is None or trial_name == "None":
        trial_name = utils.get_current_time()

    ####################
    # Path setting
    config = utils.Config(path_config)

    basename = "%s.%s.%s.aug_%s.%s" \
            % (model_name,
               initial_tree_sampling,
               utils.get_basename_without_ext(path_config),
               data_augmentation,
               trial_name)

    if actiontype == "train":
        path_log = os.path.join(config.getpath("results"),
                                basename + ".training.log")
    elif actiontype == "evaluate":
        path_log = os.path.join(config.getpath("results"),
                                basename + ".evaluation.log")
    path_train = os.path.join(config.getpath("results"),
                              basename + ".training.jsonl")
    path_valid = os.path.join(config.getpath("results"),
                              basename + ".validation.jsonl")
    path_snapshot = os.path.join(config.getpath("results"),
                                 basename + ".model")
    path_pred = os.path.join(config.getpath("results"),
                             basename + ".evaluation.ctrees")
    path_eval = os.path.join(config.getpath("results"),
                             basename + ".evaluation.json")

    utils.set_logger(path_log)

    ####################
    # Random seed
    random_seed = trial_name
    random_seed = utils.hash_string(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    cuda.cupy.random.seed(random_seed)

    ####################
    # Log so far
    utils.writelog("gpu=%d" % gpu)
    utils.writelog("model_name=%s" % model_name)
    utils.writelog("initial_tree_sampling=%s" % initial_tree_sampling)
    utils.writelog("path_config=%s" % path_config)
    utils.writelog("data_augmentation=%s" % data_augmentation)
    utils.writelog("trial_name=%s" % trial_name)
    utils.writelog("actiontype=%s" % actiontype)
    utils.writelog("max_epoch=%s" % max_epoch)
    utils.writelog("dev_size=%s" % dev_size)

    utils.writelog("path_log=%s" % path_log)
    utils.writelog("path_train=%s" % path_train)
    utils.writelog("path_valid=%s" % path_valid)
    utils.writelog("path_snapshot=%s" % path_snapshot)
    utils.writelog("path_pred=%s" % path_pred)
    utils.writelog("path_eval=%s" % path_eval)

    utils.writelog("random_seed=%d" % random_seed)

    ####################
    # Data preparation
    begin_time = time.time()

    train_databatch = dataloader.read_rstdt("train",
                                            relation_level="coarse-grained",
                                            with_root=False)
    test_databatch = dataloader.read_rstdt("test",
                                           relation_level="coarse-grained",
                                           with_root=False)
    vocab_word = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt"))
    vocab_postag = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "postags.vocab.txt"))
    vocab_deprel = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "deprels.vocab.txt"))

    if data_augmentation:
        external_train_databatch = dataloader.read_ptbwsj_wo_rstdt(
            with_root=False)
        # Remove documents with only one leaf node
        filtering_function = lambda d, i: len(d.batch_edu_ids[i]) == 1
        external_train_databatch = utils.filter_databatch(
            external_train_databatch, filtering_function)

    end_time = time.time()
    utils.writelog("Loaded the corpus. %f [sec.]" % (end_time - begin_time))

    ####################
    # Hyper parameters
    word_dim = config.getint("word_dim")
    postag_dim = config.getint("postag_dim")
    deprel_dim = config.getint("deprel_dim")
    lstm_dim = config.getint("lstm_dim")
    mlp_dim = config.getint("mlp_dim")
    n_init_epochs = config.getint("n_init_epochs")
    negative_size = config.getint("negative_size")
    batch_size = config.getint("batch_size")
    weight_decay = config.getfloat("weight_decay")
    gradient_clipping = config.getfloat("gradient_clipping")
    optimizer_name = config.getstr("optimizer_name")

    utils.writelog("word_dim=%d" % word_dim)
    utils.writelog("postag_dim=%d" % postag_dim)
    utils.writelog("deprel_dim=%d" % deprel_dim)
    utils.writelog("lstm_dim=%d" % lstm_dim)
    utils.writelog("mlp_dim=%d" % mlp_dim)
    utils.writelog("n_init_epochs=%d" % n_init_epochs)
    utils.writelog("negative_size=%d" % negative_size)
    utils.writelog("batch_size=%d" % batch_size)
    utils.writelog("weight_decay=%f" % weight_decay)
    utils.writelog("gradient_clipping=%f" % gradient_clipping)
    utils.writelog("optimizer_name=%s" % optimizer_name)

    ####################
    # Model preparation
    cuda.get_device(gpu).use()

    # Initialize a model
    utils.mkdir(os.path.join(config.getpath("data"), "caches"))
    path_embed = config.getpath("pretrained_word_embeddings")
    path_caches = os.path.join(
        config.getpath("data"), "caches",
        "cached." + os.path.basename(path_embed) + ".npy")
    if os.path.exists(path_caches):
        utils.writelog("Loading cached word embeddings ...")
        initialW = np.load(path_caches)
    else:
        initialW = utils.read_word_embedding_matrix(path=path_embed,
                                                    dim=word_dim,
                                                    vocab=vocab_word,
                                                    scale=0.0)
        np.save(path_caches, initialW)

    if model_name == "spanbasedmodel":
        # Span-based model w/ template features
        template_feature_extractor = models.TemplateFeatureExtractor(
            databatch=train_databatch)
        utils.writelog("Template feature size=%d" %
                       template_feature_extractor.feature_size)
        if actiontype == "train":
            for template in template_feature_extractor.templates:
                dim = template_feature_extractor.template2dim[template]
                utils.writelog("Template feature #%s %s" % (dim, template))
        model = models.SpanBasedModel(
            vocab_word=vocab_word,
            vocab_postag=vocab_postag,
            vocab_deprel=vocab_deprel,
            word_dim=word_dim,
            postag_dim=postag_dim,
            deprel_dim=deprel_dim,
            lstm_dim=lstm_dim,
            mlp_dim=mlp_dim,
            initialW=initialW,
            template_feature_extractor=template_feature_extractor)
    elif model_name == "spanbasedmodel2":
        # Span-based model w/o template features
        model = models.SpanBasedModel2(vocab_word=vocab_word,
                                       vocab_postag=vocab_postag,
                                       vocab_deprel=vocab_deprel,
                                       word_dim=word_dim,
                                       postag_dim=postag_dim,
                                       deprel_dim=deprel_dim,
                                       lstm_dim=lstm_dim,
                                       mlp_dim=mlp_dim,
                                       initialW=initialW)
    else:
        raise ValueError("Invalid model_name=%s" % model_name)
    utils.writelog("Initialized the model ``%s''" % model_name)

    # Load pre-trained parameters
    if actiontype != "train":
        serializers.load_npz(path_snapshot, model)
        utils.writelog("Loaded trained parameters from %s" % path_snapshot)

    model.to_gpu(gpu)

    ####################
    # Decoder preparation
    decoder = decoders.IncrementalCKYDecoder()

    ####################
    # Initializer preparation
    sampler = treesamplers.TreeSampler(initial_tree_sampling.split("_"))

    ####################
    # Training / evaluation
    if actiontype == "train":
        with chainer.using_config("train", True):
            if dev_size > 0:
                # Training with a held-out development set
                train_databatch, dev_databatch = dataloader.randomsplit(
                    n_dev=dev_size, databatch=train_databatch)
                with open(
                        os.path.join(config.getpath("results"),
                                     basename + ".valid_gold.ctrees"),
                        "w") as f:
                    for sexp in dev_databatch.batch_nary_sexp:
                        f.write("%s\n" % " ".join(sexp))
            else:
                # Training with the full training set
                dev_databatch = None

            if data_augmentation:
                train_databatch = utils.concat_databatch(
                    train_databatch, external_train_databatch)
            training.train(
                model=model,
                decoder=decoder,
                sampler=sampler,
                max_epoch=max_epoch,
                n_init_epochs=n_init_epochs,
                negative_size=negative_size,
                batch_size=batch_size,
                weight_decay=weight_decay,
                gradient_clipping=gradient_clipping,
                optimizer_name=optimizer_name,
                train_databatch=train_databatch,
                dev_databatch=dev_databatch,
                path_train=path_train,
                path_valid=path_valid,
                path_snapshot=path_snapshot,
                path_pred=os.path.join(config.getpath("results"),
                                       basename + ".valid_pred.ctrees"),
                path_gold=os.path.join(config.getpath("results"),
                                       basename + ".valid_gold.ctrees"))

    elif actiontype == "evaluate":
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            # Test
            parsing.parse(model=model,
                          decoder=decoder,
                          databatch=test_databatch,
                          path_pred=path_pred)
            scores = rst_parseval.evaluate(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt",
                                       "renamed", "test.labeled.nary.ctrees"))
            old_scores = old_rst_parseval.evaluate(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt",
                                       "renamed", "test.labeled.nary.ctrees"))
            out = {
                "Morey2018": {
                    "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                    "Precision_info": scores["S"]["Precision_info"],
                    "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                    "Recall_info": scores["S"]["Recall_info"],
                    "Micro F1": scores["S"]["Micro F1"] * 100.0
                },
                "Marcu2000": {
                    "Unlabeled Precision":
                    old_scores["S"]["Precision"] * 100.0,
                    "Precision_info": old_scores["S"]["Precision_info"],
                    "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                    "Recall_info": old_scores["S"]["Recall_info"],
                    "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                }
            }
            utils.write_json(path_eval, out)
            utils.writelog(utils.pretty_format_dict(out))

    utils.writelog("Done: %s" % basename)
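A note on the embedding block in the example above: it avoids re-parsing the raw vector file on every run by caching the parsed matrix as a .npy file named after the embedding file. A minimal sketch of that caching pattern, with a hypothetical load_raw_embeddings callable standing in for utils.read_word_embedding_matrix:

import os
import numpy as np

def load_embeddings_cached(path_embed, cache_dir, load_raw_embeddings):
    # the cache file is keyed by the raw file's basename, e.g. "cached.glove.840B.300d.txt.npy"
    os.makedirs(cache_dir, exist_ok=True)
    path_cache = os.path.join(cache_dir,
                              "cached." + os.path.basename(path_embed) + ".npy")
    if os.path.exists(path_cache):
        return np.load(path_cache)            # fast path: reuse the parsed matrix
    matrix = load_raw_embeddings(path_embed)  # slow path: parse the text file once
    np.save(path_cache, matrix)
    return matrix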
Example #12
0
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                          + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(
                        msg.format(total_batch, loss_train, acc_train,
                                   loss_val, acc_val, time_dif, improved_str))

                model.session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # validation accuracy has not improved for a long time; stop training early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # break out of the batch loop
            if flag:  # likewise, break out of the epoch loop
                break


print('Configuring CNN model...')
config = TCNNConfig()
if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it is missing
    build_vocab(train_dir, vocab_dir, config.vocab_size)

categories, cat_to_id = read_sentiment_category()  # train on sentiment categories
# categories, cat_to_id = read_type_category()   # train on news-type categories

print(cat_to_id)
words, word_to_id = read_vocab(vocab_dir)
config.vocab_size = len(words)
model = TextCNN(config)
train()
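The early-stopping logic inside Example #12's training loop is a standard patience pattern: stop once require_improvement batches pass without a new best validation score. A self-contained sketch of the same control flow (all names here are illustrative):

def train_with_patience(train_step, evaluate, max_batches, require_improvement=1000):
    # stop training once the validation score stops improving for a while
    best_val = float('-inf')
    last_improved = 0
    for total_batch in range(max_batches):
        train_step()
        acc_val = evaluate()  # the real loop only evaluates every few batches
        if acc_val > best_val:
            best_val, last_improved = acc_val, total_batch
        if total_batch - last_improved > require_improvement:
            print("No optimization for a long time, auto-stopping...")
            break
    return best_val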
Example #13
0
def train_val(seed=None):
    ''' Train on the training set, and validate on seen and unseen splits. '''

    # which GPU to use
    device = torch.device('cuda', hparams.device_id)

    # Resume from the latest checkpoint (if any)
    if os.path.exists(hparams.load_path): # the checkpoint file exists on disk only after a previous run
        ckpt = load(hparams.load_path, device)
        start_iter = ckpt['iter'] # 'iter' records the iteration to resume from
    else:
        if hasattr(args, 'load_path') and hasattr(args, 'eval_only') and args.eval_only:
            sys.exit('load_path %s does not exist!' % hparams.load_path) # exit only in eval mode; training can still start from scratch
        ckpt = None
        start_iter = 0
    end_iter = hparams.n_iters # from config

    # Setup seed and read vocab
    setup(seed=seed)

    train_vocab_path = os.path.join(hparams.data_path, 'train_vocab.txt')
    if hasattr(hparams, 'external_main_vocab') and hparams.external_main_vocab:
        train_vocab_path = hparams.external_main_vocab # external vocab path, typically supplied on the command line

    # With a verbal advisor, the vocab also includes the agent's navigation-action (subgoal) tokens.
    if 'verbal' in hparams.advisor:
        subgoal_vocab_path = os.path.join(hparams.data_path, hparams.subgoal_vocab) # e.g., data/asknav/verbal_hard_vocab.txt
        vocab = read_vocab([train_vocab_path, subgoal_vocab_path])
    else:
        vocab = read_vocab([train_vocab_path])
    tok = Tokenizer(vocab=vocab, encoding_length=hparams.max_input_length) # tokenizer over the combined vocab

    # Create a training environment
    train_env = VNLABatch(hparams, split='train', tokenizer=tok)

    # Create validation environments
    val_splits = ['val_seen', 'val_unseen']
    
    # Evaluation-only mode: the checkpoint name selects the test split
    eval_mode = hasattr(hparams, 'eval_only') and hparams.eval_only
    if eval_mode:
        if '_unseen' in hparams.load_path:
            val_splits = ['test_unseen']
        if '_seen' in hparams.load_path:
            val_splits = ['test_seen']
        end_iter = start_iter + hparams.log_every

    # Map each split name to its (VNLABatch env, Evaluation) pair
    val_envs = { split: (VNLABatch(hparams, split=split, tokenizer=tok,
        from_train_env=train_env, traj_len_estimates=train_env.traj_len_estimates),
        Evaluation(hparams, [split], hparams.data_path)) for split in val_splits}

    # Build models
    model = AttentionSeq2SeqModel(len(vocab), hparams, device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=hparams.lr,
        weight_decay=hparams.weight_decay)

    best_metrics = { 'val_seen'  : -1,
                     'val_unseen': -1,
                     'combined'  : -1 } # best scores so far; overwritten from the checkpoint if one is loaded

    # Load model parameters from a checkpoint (if any)
    if ckpt is not None:
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optim_state_dict'])
        best_metrics = ckpt['best_metrics']
        train_env.ix = ckpt['data_idx']

    print('')
    pprint(vars(hparams), width=1)
    print('')
    print(model)

    # Initialize agent
    if 'verbal' in hparams.advisor:
        agent = VerbalAskAgent(model, hparams, device)
    elif hparams.advisor == 'direct':
        agent = AskAgent(model, hparams, device) # both the agent and the model depend on whether the advisor is direct or verbal

    # Train
    return train(train_env, val_envs, agent, model, optimizer, start_iter, end_iter,
          best_metrics, eval_mode) # in eval mode, val_envs holds test environments instead
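Example #13 packs everything needed to resume training into a single checkpoint dict. A hedged sketch of matching save/load helpers; the key names follow the example, the rest is assumption:

import torch

def save_ckpt(path, model, optimizer, it, best_metrics, data_idx):
    torch.save({
        'iter': it,                                  # iteration to resume from
        'model_state_dict': model.state_dict(),
        'optim_state_dict': optimizer.state_dict(),
        'best_metrics': best_metrics,                # best scores seen so far
        'data_idx': data_idx,                        # position in the training data
    }, path)

def load(path, device):
    # map_location keeps GPU checkpoints loadable on any device
    return torch.load(path, map_location=device)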
Example #14
0
def train_val_augment(test_only=False):
    """
    Train the listener with the augmented data
    """
    setup()
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    feat_dict = read_img_features(features, test_only=test_only)

    if test_only:
        featurized_scans = None
        val_env_names = ['val_train_seen']
    else:
        featurized_scans = set(
            [key.split("_")[0] for key in list(feat_dict.keys())])
        val_env_names = ['val_train_seen', 'val_seen', 'val_unseen']

    if not args.test_obj:
        print('Loading compact pano-caffe object features ... (~3 seconds)')
        import pickle as pkl
        with open('img_features/objects/pano_object_class.pkl', 'rb') as f_pc:
            pano_caffe = pkl.load(f_pc)
    else:
        pano_caffe = None

    aug_path = args.aug

    # Create the training environment
    train_env = R2RBatch(feat_dict,
                         pano_caffe,
                         batch_size=args.batchSize,
                         splits=['train'],
                         tokenizer=tok)
    aug_env = R2RBatch(feat_dict,
                       pano_caffe,
                       batch_size=args.batchSize,
                       splits=[aug_path],
                       tokenizer=tok,
                       name='aug')

    stats = train_env.get_statistics()
    print("The training data_size is : %d" % train_env.size())
    print("The average instruction length of the dataset is %0.4f." %
          (stats['length']))
    print("The average action length of the dataset is %0.4f." %
          (stats['path']))
    stats = aug_env.get_statistics()
    print("The augmentation data size is %d" % aug_env.size())
    print("The average instruction length of the dataset is %0.4f." %
          (stats['length']))
    print("The average action length of the dataset is %0.4f." %
          (stats['path']))

    val_envs = {
        split:
        (R2RBatch(feat_dict,
                  pano_caffe,
                  batch_size=args.batchSize,
                  splits=[split],
                  tokenizer=tok), Evaluation([split], featurized_scans, tok))
        for split in val_env_names
    }

    train(train_env, tok, args.iters, val_envs=val_envs, aug_env=aug_env)
Example #15
0
def main(_):
    vocab = read_vocab('data/yelp-2013-w2i.pkl')
    glove_embs = load_glove('glove.6B.{}d.txt'.format(FLAGS.emb_size),
                            FLAGS.emb_size, vocab)
    data_reader = DataReader(train_file='data/yelp-2013-train.pkl',
                             dev_file='data/yelp-2013-dev.pkl',
                             test_file='data/yelp-2013-test.pkl')

    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    with tf.Session(config=config) as sess:
        model = Model(cell_dim=FLAGS.cell_dim,
                      att_dim=FLAGS.att_dim,
                      vocab_size=len(vocab),
                      emb_size=FLAGS.emb_size,
                      num_classes=FLAGS.num_classes,
                      dropout_rate=FLAGS.dropout_rate,
                      pretrained_embs=glove_embs)

        loss = loss_fn(model.labels, model.logits)
        train_op, global_step = train_fn(loss)
        batch_acc, total_acc, acc_update, metrics_init = eval_fn(
            model.labels, model.logits)
        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())

        train_writer.add_graph(sess.graph)
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints)

        print('\n{}> Start training'.format(datetime.now()))

        epoch = 0
        valid_step = 0
        test_step = 0
        train_test_prop = len(data_reader.train_data) / len(
            data_reader.test_data)
        test_batch_size = int(FLAGS.batch_size / train_test_prop)
        best_acc = float('-inf')

        while epoch < FLAGS.num_epochs:
            epoch += 1
            print('\n{}> Epoch: {}'.format(datetime.now(), epoch))

            sess.run(metrics_init)
            for batch_docs, batch_labels in data_reader.read_train_set(
                    FLAGS.batch_size, shuffle=True):
                _step, _, _loss, _acc, _ = sess.run(
                    [global_step, train_op, loss, batch_acc, acc_update],
                    feed_dict=model.get_feed_dict(batch_docs,
                                                  batch_labels,
                                                  training=True))
                if _step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    train_writer.add_summary(_summary, global_step=_step)
            print('Training accuracy = {:.2f}'.format(
                sess.run(total_acc) * 100))

            sess.run(metrics_init)
            for batch_docs, batch_labels in data_reader.read_valid_set(
                    test_batch_size):
                _loss, _acc, _ = sess.run([loss, batch_acc, acc_update],
                                          feed_dict=model.get_feed_dict(
                                              batch_docs, batch_labels))
                valid_step += 1
                if valid_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    valid_writer.add_summary(_summary, global_step=valid_step)
            print('Validation accuracy = {:.2f}'.format(
                sess.run(total_acc) * 100))

            sess.run(metrics_init)
            for batch_docs, batch_labels in data_reader.read_test_set(
                    test_batch_size):
                _loss, _acc, _ = sess.run([loss, batch_acc, acc_update],
                                          feed_dict=model.get_feed_dict(
                                              batch_docs, batch_labels))
                test_step += 1
                if test_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    test_writer.add_summary(_summary, global_step=test_step)
            test_acc = sess.run(total_acc) * 100
            print('Testing accuracy = {:.2f}'.format(test_acc))

            if test_acc > best_acc:
                best_acc = test_acc
                saver.save(sess, './' + FLAGS.checkpoint_dir)
            print('Best testing accuracy = {:.2f}'.format(best_acc))

    print("{} Optimization Finished!".format(datetime.now()))
    print('Best testing accuracy = {:.2f}'.format(best_acc))
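Example #15 sizes the validation/test batches in proportion to split size, so one pass over each split takes about as many batches as a training epoch. For instance, with batch_size 64, 80,000 training documents, and 10,000 test documents:

train_test_prop = 80000 / 10000              # train is 8x larger than test
test_batch_size = int(64 / train_test_prop)  # 8, so both splits run ~1250 batches per epoch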
Example #16
0
import math
from lm import UnigramLM,BigramLM,InterpolatedBigramModel
from utils import read_sentence, read_vocab, calculate_bigram_perplexity, calc_average_log_likelihood,calculate_bigram_document_perplexity,calc_average_document_log_likelihood
import numpy as np

# reading train/valid/test data
train_data = read_sentence('train.txt')
valid_data = read_sentence('valid.txt')
test_data = read_sentence('test.txt')
vocab = read_vocab('vocab.txt')

#run
print("--------------------")
print("Backoff Bigram Model")
print("--------------------")
bigram = BigramLM(train_data, vocab, smoothing=True)
print("Average log likelihood of first line in test: %s" % (calc_average_log_likelihood(bigram,test_data[:1])))
print("Average ppl of first line in test: %s" % (calculate_bigram_perplexity(bigram, test_data[:1])))
loglikelihood = calc_average_log_likelihood(bigram,test_data[:100])
ppl = calculate_bigram_perplexity(bigram, test_data[:100])
print("Mean of loglikelihood of first 100 line: %s" % np.mean(loglikelihood))
print("Variance of loglikelihood of first 100 line: %s" % np.var(loglikelihood))
print("Mean of ppl of first 100 line: %s" % np.mean(ppl))
print("Variance of ppl of first 100 line: %s" % np.var(ppl))
print("Average ppl of document: %s" % (calculate_bigram_document_perplexity(bigram,test_data)))
print("Average log likelihood of document: %s" % (calc_average_document_log_likelihood(bigram,test_data)))




print("\n")
Example #17
0
PLACE365_CANDIDATE_FEATURES = 'img_features/ResNet-152-places365-candidate.tsv'

if args.place365:
    features = PLACE365_FEATURES
    CANDIDATE_FEATURES = PLACE365_CANDIDATE_FEATURES
else:
    features = IMAGENET_FEATURES
    CANDIDATE_FEATURES = IMAGENET_CANDIDATE_FEATURES

#load features and feature_candidates
feature_dict = read_img_features(features)
candidate_dict = utils.read_candidates(CANDIDATE_FEATURES)
#load glove and vocab
glove_path = 'tasks/R2R/data/train_glove.npy'
glove = np.load(glove_path)
vocab = read_vocab(TRAIN_VOCAB)
tok = Tokenizer(vocab=vocab)

#instantiate listener and load pre-trained model
listner = Seq2SeqAgent(None,
                       "",
                       tok,
                       feat=feature_dict,
                       candidates=candidate_dict,
                       episode_len=args.maxAction)
listner.load(
    'snap/long/ablation_cand_0208_accuGrad_envdrop_ty/state_dict/best_val_unseen'
)


# nav graph loader from env.py
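Most of the read_vocab variants on this page load a plain text file with one token per line (the .pkl variants in Examples #15 and #25 appear to load a pickled word-to-index dict instead). A minimal sketch of the text-file form, assuming that format:

def read_vocab_minimal(path):
    # one token per line; a token's index is its line number
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]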
Example #18
0
def train_val(path_type, max_episode_len, history, MAX_INPUT_LENGTH,
              feedback_method, n_iters, model_prefix, blind, args):
    ''' Train on the training set, and validate on seen and unseen splits. '''

    nav_graphs = setup(args.action_space, args.navigable_locs_path)
    # Create a batch training environment that will also preprocess text
    use_bert = (args.encoder_type
                in ['bert', 'vlbert'])  # for tokenizer and dataloader
    if use_bert:
        tok = BTokenizer(MAX_INPUT_LENGTH)
    else:
        vocab = read_vocab(TRAIN_VOCAB)
        tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    #train_env = R2RBatch(features, batch_size=batch_size, splits=['train'], tokenizer=tok,
    #                     path_type=path_type, history=history, blind=blind)

    feature_store = Feature(features, args.panoramic)
    train_env = R2RBatch(feature_store,
                         nav_graphs,
                         args.panoramic,
                         args.action_space,
                         batch_size=args.batch_size,
                         splits=['train'],
                         tokenizer=tok,
                         path_type=path_type,
                         history=history,
                         blind=blind)

    # Create validation environments
    #val_envs = {split: (R2RBatch(features, batch_size=batch_size, splits=[split],
    #            tokenizer=tok, path_type=path_type, history=history, blind=blind),
    #            Evaluation([split], path_type=path_type)) for split in ['val_seen', 'val_unseen']}

    val_envs = {
        split: (R2RBatch(feature_store,
                         nav_graphs,
                         args.panoramic,
                         args.action_space,
                         batch_size=args.batch_size,
                         splits=[split],
                         tokenizer=tok,
                         path_type=path_type,
                         history=history,
                         blind=blind), Evaluation([split],
                                                  path_type=path_type))
        for split in ['val_seen', 'val_unseen']
    }

    # Build models and train
    #enc_hidden_size = hidden_size//2 if bidirectional else hidden_size

    if args.encoder_type == 'vlbert':
        if args.pretrain_model_name is not None:
            print("Using the pretrained lm model from %s" %
                  (args.pretrain_model_name))
            encoder = DicEncoder(FEATURE_ALL_SIZE, args.enc_hidden_size,
                                 args.hidden_size, args.dropout_ratio,
                                 args.bidirectional, args.transformer_update,
                                 args.bert_n_layers, args.reverse_input,
                                 args.top_lstm, args.vl_layers, args.la_layers,
                                 args.bert_type)
            premodel = DicAddActionPreTrain.from_pretrained(
                args.pretrain_model_name)
            encoder.bert = premodel.bert
            encoder.drop = nn.Dropout(p=args.dropout_ratio)
            encoder.bert._resize_token_embeddings(
                len(tok))  # remember to resize tok embedding size
            encoder.bert.update_lang_bert, encoder.bert.config.update_lang_bert = args.transformer_update, args.transformer_update
            encoder.bert.update_add_layer, encoder.bert.config.update_add_layer = args.update_add_layer, args.update_add_layer
            encoder = encoder.cuda()

        else:
            encoder = DicEncoder(FEATURE_ALL_SIZE, args.enc_hidden_size,
                                 args.hidden_size, args.dropout_ratio,
                                 args.bidirectional, args.transformer_update,
                                 args.bert_n_layers, args.reverse_input,
                                 args.top_lstm, args.vl_layers, args.la_layers,
                                 args.bert_type).cuda()
            encoder.bert._resize_token_embeddings(
                len(tok))  # remember to resize tok embedding size

    elif args.encoder_type == 'bert':
        if args.pretrain_model_name is not None:
            print("Using the pretrained lm model from %s" %
                  (args.pretrain_model_name))
            encoder = BertEncoder(args.enc_hidden_size, args.hidden_size,
                                  args.dropout_ratio, args.bidirectional,
                                  args.transformer_update, args.bert_n_layers,
                                  args.reverse_input, args.top_lstm,
                                  args.bert_type)
            premodel = BertForMaskedLM.from_pretrained(
                args.pretrain_model_name)
            encoder.bert = premodel.bert
            encoder.drop = nn.Dropout(p=args.dropout_ratio)
            encoder.bert._resize_token_embeddings(
                len(tok))  # remember to resize tok embedding size
            #encoder.bert.update_lang_bert, encoder.bert.config.update_lang_bert = args.transformer_update, args.transformer_update
            #encoder.bert.update_add_layer, encoder.bert.config.update_add_layer = args.update_add_layer, args.update_add_layer
            encoder = encoder.cuda()
            pdb.set_trace()  # debugging breakpoint
        else:
            encoder = BertEncoder(args.enc_hidden_size, args.hidden_size,
                                  args.dropout_ratio, args.bidirectional,
                                  args.transformer_update, args.bert_n_layers,
                                  args.reverse_input, args.top_lstm,
                                  args.bert_type).cuda()
            encoder.bert._resize_token_embeddings(len(tok))
    else:
        enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
        encoder = EncoderLSTM(len(vocab),
                              word_embedding_size,
                              enc_hidden_size,
                              padding_idx,
                              dropout_ratio,
                              bidirectional=bidirectional).cuda()

    #decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(), Seq2SeqAgent.n_outputs(),
    #              action_embedding_size, args.hidden_size, args.dropout_ratio).cuda()
    ctx_hidden_size = args.enc_hidden_size * (2 if args.bidirectional else 1)
    if use_bert and not args.top_lstm:
        ctx_hidden_size = 768

    decoder = R2RAttnDecoderLSTM(Seq2SeqAgent.n_inputs(),
                                 Seq2SeqAgent.n_outputs(),
                                 action_embedding_size, ctx_hidden_size,
                                 args.hidden_size, args.dropout_ratio,
                                 FEATURE_SIZE, args.panoramic,
                                 args.action_space, args.dec_h_type).cuda()

    train(train_env,
          encoder,
          decoder,
          n_iters,
          path_type,
          history,
          feedback_method,
          max_episode_len,
          MAX_INPUT_LENGTH,
          model_prefix,
          val_envs=val_envs,
          args=args)
Example #19
0
def make_env_and_models(args,
                        train_vocab_path,
                        train_splits,
                        test_splits,
                        test_instruction_limit=None,
                        instructions_per_path=None):
    setup()
    if args.env == 'r2r':
        EnvBatch = R2RBatch
        ImgFeatures = ImageFeatures
    elif args.env == 'refer360':
        EnvBatch = Refer360Batch
        ImgFeatures = Refer360ImageFeatures
    else:
        raise NotImplementedError(
            'this {} environment is not implemented.'.format(args.env))

    image_features_list = ImgFeatures.from_args(args)
    feature_size = sum(
        [featurizer.feature_dim for featurizer in image_features_list]) + 128
    if args.use_visited_embeddings:
        feature_size += 64
    if args.use_oracle_embeddings:
        feature_size += 64
    action_embedding_size = feature_size

    vocab = read_vocab(train_vocab_path, args.language)
    tok = Tokenizer(vocab=vocab)

    train_env = EnvBatch(image_features_list,
                         splits=train_splits,
                         tokenizer=tok,
                         args=args)

    enc_hidden_size = args.hidden_size // 2 if args.bidirectional else args.hidden_size
    wordvec = np.load(args.wordvec_path)

    word_embedding_size = get_word_embedding_size(args)
    enc_hidden_size = 600  # refer360 >>>
    enc_hidden_size = 512  # refer360 >>>
    # enc_hidden_size = 512  # r2r >>>

    encoder = try_cuda(
        SpeakerEncoderLSTM(action_embedding_size,
                           feature_size,
                           enc_hidden_size,
                           args.dropout_ratio,
                           bidirectional=args.bidirectional))
    word_embedding_size = 300  # refer360 >>>>
    word_embedding_size = 300  # r2r >>>>
    hidden_size = 600  # refer360 >>>
    hidden_size = 512  # refer360 >>>
    # hidden_size = 512  # >>> r2r
    #hidden_size = args.hidden_size

    decoder = try_cuda(
        SpeakerDecoderLSTM(len(vocab),
                           word_embedding_size,
                           hidden_size,
                           args.dropout_ratio,
                           wordvec=wordvec,
                           wordvec_finetune=args.wordvec_finetune))

    test_envs = {}
    for split in test_splits:
        b = EnvBatch(image_features_list,
                     splits=[split],
                     tokenizer=tok,
                     args=args)
        e = eval_speaker.SpeakerEvaluation(
            [split], instructions_per_path=instructions_per_path, args=args)
        test_envs[split] = (b, e)

    # TODO
    # test_envs = {
    #     split: (BatchEnv(image_features_list, batch_size=batch_size,
    #                      splits=[split], tokenizer=tok,
    #                      instruction_limit=test_instruction_limit,
    #                      prefix=args.prefix),
    #             eval_speaker.SpeakerEvaluation(
    #                 [split], instructions_per_path=instructions_per_path, ))
    #     for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example #20
0
def test():
    print('current directory', os.getcwd())
    os.chdir('..')
    print('current directory', os.getcwd())

    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    visible_gpu = "0"
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu

    args.name = 'SSM'
    args.attn = 'soft'
    args.train = 'listener'
    args.featdropout = 0.3
    args.angle_feat_size = 128
    args.feedback = 'sample'
    args.ml_weight = 0.2
    args.sub_out = 'max'
    args.dropout = 0.5
    args.optim = 'adam'
    args.lr = 3e-4
    args.iters = 80000
    args.maxAction = 15
    args.batchSize = 4
    args.target_batch_size = 4
    args.pe_dim = 128

    args.self_train = True
    args.aug = 'tasks/R2R/data/aug_paths.json'

    args.featdropout = 0.4
    args.iters = 200000

    if args.optim == 'rms':
        print("Optimizer: Using RMSProp")
        args.optimizer = torch.optim.RMSprop
    elif args.optim == 'adam':
        print("Optimizer: Using Adam")
        args.optimizer = torch.optim.Adam
    elif args.optim == 'sgd':
        print("Optimizer: sgd")
        args.optimizer = torch.optim.SGD

    TRAIN_VOCAB = 'tasks/R2R/data/train_vocab.txt'
    TRAINVAL_VOCAB = 'tasks/R2R/data/trainval_vocab.txt'

    IMAGENET_FEATURES = 'img_features/ResNet-152-imagenet.tsv'

    if args.features == 'imagenet':
        features = IMAGENET_FEATURES

    if args.fast_train:
        name, ext = os.path.splitext(features)
        features = name + "-fast" + ext

    print(args)

    def setup():
        torch.manual_seed(1)
        torch.cuda.manual_seed(1)
        # Check for vocabs
        if not os.path.exists(TRAIN_VOCAB):
            write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
        if not os.path.exists(TRAINVAL_VOCAB):
            write_vocab(
                build_vocab(splits=['train', 'val_seen', 'val_unseen']),
                TRAINVAL_VOCAB)

    #
    setup()

    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    feat_dict = read_img_features(features)

    print('start extracting keys ...')
    featurized_scans = set(
        [key.split("_")[0] for key in list(feat_dict.keys())])
    print('keys extracted...')

    val_envs = {
        split: R2RBatch(feat_dict,
                        batch_size=args.batchSize,
                        splits=[split],
                        tokenizer=tok)
        for split in ['train', 'val_seen', 'val_unseen']
    }

    evaluators = {
        split: Evaluation([split], featurized_scans, tok)
        for split in ['train', 'val_seen', 'val_unseen']
    }

    learner = Learner(val_envs,
                      "",
                      tok,
                      args.maxAction,
                      process_num=4,
                      max_node=17,
                      visible_gpu=visible_gpu)
    learner.eval_init()

    ckpt = 'snap/%s/state_dict/ssm_ckpt' % args.name

    learner.load_eval(ckpt)

    results = learner.eval()
    loss_str = ''
    for key in results:
        evaluator = evaluators[key]
        result = results[key]

        score_summary, score_details = evaluator.score(result)

        loss_str += ", %s \n" % key

        for metric, val in score_summary.items():
            loss_str += ', %s: %.3f' % (metric, val)

        loss_str += '\n'

    print(loss_str)
Example #21
0
from env import R2RBatch
from refer360_env import Refer360Batch
from utils import Tokenizer, read_vocab
from vocab import TRAIN_VOCAB
from train import make_arg_parser
from utils import get_arguments
from pprint import pprint
import os
arg_parser = make_arg_parser()
arg_parser.add_argument('--cache_path', type=str,
                        required=True)
args = get_arguments(arg_parser)
vocab = read_vocab(TRAIN_VOCAB, args.language)
tok = Tokenizer(vocab)

if args.env == 'r2r':
  EnvBatch = R2RBatch
elif args.env in ['refer360']:
  EnvBatch = Refer360Batch
if args.prefix in ['refer360', 'r2r', 'R2R', 'REVERIE', 'r360tiny', 'RxR_en-ALL']:
  val_splits = ['val_unseen', 'val_seen']
  target = 'val_unseen'
elif args.prefix in ['touchdown', 'td']:
  val_splits = ['dev']
  target = 'dev'

env = EnvBatch(['none'],
               splits=['train'] + val_splits,
               tokenizer=tok,
               args=args)
if args.env == 'r2r':
Example #22
0
def main(opts):

    # set manual_seed and build vocab
    setup(opts, opts.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # create a batch training environment that will also preprocess text
    vocab = read_vocab(opts.train_vocab)
    tok = Tokenizer(opts.remove_punctuation == 1, opts.reversed == 1, vocab=vocab, encoding_length=opts.max_cap_length)

    # create language instruction encoder
    encoder_kwargs = {
        'opts': opts,
        'vocab_size': len(vocab),
        'embedding_size': opts.word_embedding_size,
        'hidden_size': opts.rnn_hidden_size,
        'padding_idx': padding_idx,
        'dropout_ratio': opts.rnn_dropout,
        'bidirectional': opts.bidirectional == 1,
        'num_layers': opts.rnn_num_layers
    }
    print('Using {} as encoder ...'.format(opts.lang_embed))
    if 'lstm' in opts.lang_embed:
        encoder = EncoderRNN(**encoder_kwargs)
    else:
        raise ValueError('Unknown {} language embedding'.format(opts.lang_embed))
    print(encoder)

    # create policy model
    policy_model_kwargs = {
        'opts':opts,
        'img_fc_dim': opts.img_fc_dim,
        'img_fc_use_batchnorm': opts.img_fc_use_batchnorm == 1,
        'img_dropout': opts.img_dropout,
        'img_feat_input_dim': opts.img_feat_input_dim,
        'rnn_hidden_size': opts.rnn_hidden_size,
        'rnn_dropout': opts.rnn_dropout,
        'max_len': opts.max_cap_length,
        'max_navigable': opts.max_navigable
    }

    if opts.arch == 'self-monitoring':
        model = SelfMonitoring(**policy_model_kwargs)
    elif opts.arch == 'speaker-baseline':
        model = SpeakerFollowerBaseline(**policy_model_kwargs)
    else:
        raise ValueError('Unknown {} model for seq2seq agent'.format(opts.arch))
    print(model)

    encoder = encoder.to(device)
    model = model.to(device)

    params = list(encoder.parameters()) + list(model.parameters())
    optimizer = torch.optim.Adam(params, lr=opts.learning_rate)

    # optionally resume from a checkpoint
    if opts.resume:
        model, encoder, optimizer, best_success_rate = resume_training(opts, model, encoder, optimizer)

    # if a secondary exp name is specified, it is appended; this is useful when resuming from a previously
    # saved experiment and saving to another one, e.g., pre-training on synthetic data and fine-tuning on real data
    if opts.exp_name_secondary:
        opts.exp_name += opts.exp_name_secondary

    feature, img_spec = load_features(opts.img_feat_dir)

    if opts.test_submission:
        assert opts.resume, 'The model was not resumed before running for submission.'
        test_env = ('test', (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                 splits=['test'], tokenizer=tok), Evaluation(['test'])))
        agent_kwargs = {
            'opts': opts,
            'env': test_env[1][0],
            'results_path': "",
            'encoder': encoder,
            'model': model,
            'feedback': opts.feedback
        }
        agent = PanoSeq2SeqAgent(**agent_kwargs)
        # setup trainer
        trainer = PanoSeq2SeqTrainer(opts, agent, optimizer)
        epoch = opts.start_epoch - 1
        trainer.eval(epoch, test_env)
        return

    # set up R2R environments
    if not opts.train_data_augmentation:
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size, seed=opts.seed,
                                 splits=['train'], tokenizer=tok)
    else:
        train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size, seed=opts.seed,
                                 splits=['synthetic'], tokenizer=tok)

    val_envs = {split: (R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size,
                                     splits=[split], tokenizer=tok), Evaluation([split]))
                for split in ['val_seen', 'val_unseen']}

    # create agent
    agent_kwargs = {
        'opts': opts,
        'env': train_env,
        'results_path': "",
        'encoder': encoder,
        'model': model,
        'feedback': opts.feedback
    }
    agent = PanoSeq2SeqAgent(**agent_kwargs)

    # setup trainer
    trainer = PanoSeq2SeqTrainer(opts, agent, optimizer, opts.train_iters_epoch)

    if opts.eval_beam or opts.eval_only:
        success_rate = []
        for val_env in val_envs.items():
            success_rate.append(trainer.eval(opts.start_epoch - 1, val_env, tb_logger=None))
        return

    # set up tensorboard logger
    tb_logger = set_tb_logger(opts.log_dir, opts.exp_name, opts.resume)

    best_success_rate = best_success_rate if opts.resume else 0.0

    for epoch in range(opts.start_epoch, opts.max_num_epochs + 1):
        trainer.train(epoch, train_env, tb_logger)

        if epoch % opts.eval_every_epochs == 0:
            success_rate = []
            for val_env in val_envs.items():
                success_rate.append(trainer.eval(epoch, val_env, tb_logger))

            success_rate_compare = success_rate[1]

            if is_experiment():
                # remember the best val_unseen success rate and save a checkpoint
                is_best = success_rate_compare >= best_success_rate
                best_success_rate = max(success_rate_compare, best_success_rate)
                print("--> Highest val_unseen success rate: {}".format(best_success_rate))

                # save the model if it is the best so far
                save_checkpoint({
                    'opts': opts,
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'encoder_state_dict': encoder.state_dict(),
                    'best_success_rate': best_success_rate,
                    'optimizer': optimizer.state_dict(),
                    'max_episode_len': opts.max_episode_len,
                }, is_best, checkpoint_dir=opts.checkpoint_dir, name=opts.exp_name)

        if opts.train_data_augmentation and epoch == opts.epochs_data_augmentation:
            train_env = R2RPanoBatch(opts, feature, img_spec, batch_size=opts.batch_size, seed=opts.seed,
                                     splits=['train'], tokenizer=tok)

    print("--> Finished training")
Example #23
0
import numpy as np


def check(ar):
    ar = ar.cpu().detach().numpy()
    return np.any(np.isnan(ar))


def check2(ar):
    # ar = ar.cpu().numpy()
    return np.any(np.isnan(ar))


import utils
TRAIN_VOCAB = '../tasks/R2R/data/train_vocab.txt'
vocab = utils.read_vocab(TRAIN_VOCAB)
tok = utils.Tokenizer(vocab=vocab, encoding_length=args.maxInput)


#
class EncoderLSTM(nn.Module):
    ''' Encodes navigation instructions, returning hidden state context (for
        attention methods) and a decoder initial state. '''
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 padding_idx,
                 dropout_ratio,
                 bidirectional=False,
                 num_layers=1):
Example #24
0
def get_dataloaders(args):
    model_prefix = '{}_{}'.format(args.model_type, args.train_id)

    log_path = args.LOG_DIR + model_prefix + '/'
    checkpoint_path = args.CHK_DIR + model_prefix + '/'
    result_path = args.RESULT_DIR + model_prefix + '/'
    cp_file = checkpoint_path + "best_model.pth.tar"
    init_epoch = 0

    if not os.path.exists(log_path):
        os.makedirs(log_path)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    ## set up the logger
    set_logger(os.path.join(log_path, 'train.log'))

    ## save argparse parameters
    with open(log_path + 'args.yaml', 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))

    logging.info('Training model: {}'.format(model_prefix))

    ## set up vocab txt files
    # the vocab txt files are created here if missing
    print('running setup')
    setup(args, clear=False)
    print(args.__dict__)

    # indicate src and tgt language
    if args.source_language == 'en':
        src, tgt = 'en', 'zh'
    else:
        src, tgt = 'zh', 'en'

    maps = {'en': args.TRAIN_VOCAB_EN, 'zh': args.TRAIN_VOCAB_ZH}
    vocab_src = read_vocab(maps[src])
    tok_src = Tokenizer(language=src,
                        vocab=vocab_src,
                        encoding_length=args.MAX_INPUT_LENGTH,
                        zh_tok='jieba')
    vocab_tgt = read_vocab(maps[tgt])
    tok_tgt = Tokenizer(language=tgt,
                        vocab=vocab_tgt,
                        encoding_length=args.MAX_INPUT_LENGTH,
                        zh_tok='jieba')
    logging.info('Vocab size src/tgt:{}/{}'.format(len(vocab_src),
                                                   len(vocab_tgt)))

    ## Setup the training, validation, and testing dataloaders
    train_loader, val_loader, test_loader = create_split_loaders(
        args.DATA_DIR, (tok_src, tok_tgt),
        args.batch_size,
        args.MAX_VID_LENGTH, (src, tgt),
        num_workers=4,
        pin_memory=True)
    logging.info('train/val/test size: {}/{}/{}'.format(
        len(train_loader), len(val_loader), len(test_loader)))

    return train_loader, val_loader, test_loader, tok_src, tok_tgt, len(
        vocab_src), len(vocab_tgt)
Example #25
0
def main(_):
    vocab = read_vocab('data/ICLR_Review_all-w2i.pkl')
    glove_embs = load_glove('glove.6B.{}d.txt'.format(FLAGS.emb_size),
                            FLAGS.emb_size, vocab)
    data_reader = DataReader(train_file='data/ICLR_Review_all-train.pkl',
                             dev_file='data/ICLR_Review_all-dev.pkl',
                             test_file='data/ICLR_Review_all-test.pkl')

    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    with tf.Session(config=config) as sess:
        model = Model(cell_dim=FLAGS.cell_dim,
                      att_dim=FLAGS.att_dim,
                      vocab_size=len(vocab),
                      emb_size=FLAGS.emb_size,
                      num_classes=FLAGS.num_classes,
                      dropout_rate=FLAGS.dropout_rate,
                      pretrained_embs=glove_embs)

        loss = loss_fn(model.labels, model.logits)
        train_op, global_step = train_fn(loss)
        batch_acc, total_acc, acc_update, metrics_init, predictions = eval_fn(
            model.labels, model.logits)
        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())

        train_writer.add_graph(sess.graph)
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints)

        print('\n{}> Start training'.format(datetime.now()))
        result_save_folder = str(datetime.now())
        output_folder = os.path.join('.', 'output')
        create_folder_if_not_exists(output_folder)

        stats_graph_folder = os.path.join(
            output_folder, result_save_folder)  # Folder where to save graphs
        create_folder_if_not_exists(stats_graph_folder)

        epoch = 0
        valid_step = 0
        test_step = 0
        train_test_prop = len(data_reader.train_data) / len(
            data_reader.test_data)
        test_batch_size = int(FLAGS.batch_size / train_test_prop)
        best_acc = float('-inf')

        while epoch < FLAGS.num_epochs:
            epoch += 1
            print('\n{}> Epoch: {}'.format(datetime.now(), epoch))

            sess.run(metrics_init)
            all_labels = []
            all_y_pred = []
            for batch_docs, batch_labels in data_reader.read_train_set(
                    FLAGS.batch_size, shuffle=True):
                _step, _, _loss, _acc, _, y_pred_batch = sess.run(
                    [
                        global_step, train_op, loss, batch_acc, acc_update,
                        predictions
                    ],
                    feed_dict=model.get_feed_dict(batch_docs,
                                                  batch_labels,
                                                  training=True))
                all_labels += batch_labels
                #y_pred_batch_array = y_pred_batch.eval(session=sess)
                y_pred_batch_list = y_pred_batch.tolist()
                all_y_pred += y_pred_batch_list
                if _step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    train_writer.add_summary(_summary, global_step=_step)
            print('Training accuracy = {:.2f}'.format(
                sess.run(total_acc) * 100))
            save_results(all_labels, all_y_pred, stats_graph_folder, 'train',
                         epoch)

            sess.run(metrics_init)
            all_valid_labels = []
            all_valid_y_pred = []
            for batch_docs, batch_labels in data_reader.read_valid_set(
                    test_batch_size):
                _loss, _acc, _, valid_y_pred_batch = sess.run(
                    [loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                all_valid_labels += batch_labels
                valid_y_pred_batch_list = valid_y_pred_batch.tolist()
                all_valid_y_pred += valid_y_pred_batch_list

                valid_step += 1
                if valid_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    valid_writer.add_summary(_summary, global_step=valid_step)
            print('Validation accuracy = {:.2f}'.format(
                sess.run(total_acc) * 100))
            save_optimized_presicion(all_valid_labels, all_valid_y_pred,
                                     stats_graph_folder, 'valid', epoch)
            save_distance_measure(all_valid_labels, all_valid_y_pred,
                                  stats_graph_folder, 'valid', epoch)
            save_results(all_valid_labels, all_valid_y_pred,
                         stats_graph_folder, 'valid', epoch)

            sess.run(metrics_init)
            all_test_labels = []
            all_test_y_pred = []
            for batch_docs, batch_labels in data_reader.read_test_set(
                    test_batch_size):
                _loss, _acc, _, test_y_pred_batch = sess.run(
                    [loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                all_test_labels += batch_labels
                test_y_pred_batch_list = test_y_pred_batch.tolist()
                all_test_y_pred += test_y_pred_batch_list

                test_step += 1
                if test_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    test_writer.add_summary(_summary, global_step=test_step)
            test_acc = sess.run(total_acc) * 100
            print('Testing accuracy = {:.2f}'.format(test_acc))
            save_optimized_presicion(all_test_labels, all_test_y_pred,
                                     stats_graph_folder, 'test', epoch)
            save_distance_measure(all_test_labels, all_test_y_pred,
                                  stats_graph_folder, 'test', epoch)
            save_results(all_test_labels, all_test_y_pred, stats_graph_folder,
                         'test', epoch)

            if test_acc > best_acc:
                best_acc = test_acc
                saver.save(sess, FLAGS.checkpoint_dir)
            print('Best testing accuracy = {:.2f}'.format(best_acc))

    print("{} Optimization Finished!".format(datetime.now()))
    print('Best testing accuracy = {:.2f}'.format(best_acc))
Example #26
0
        raise ValueError(
            """usage: python run_cnn.py [train / test] [cnn/rnn]""")

    if sys.argv[2] not in ['cnn', 'rnn']:
        raise ValueError(
            """usage: python run_cnn.py [train / test] [cnn/rnn]""")

    print('Configuring {0} model...'.format(sys.argv[2]))
    model_name = sys.argv[2]
    if model_name == 'cnn':
        config = TCNNconfig()
    elif model_name == 'rnn':
        config = TRNNconfig()

    word2id, id2word = read_vocab(
        "/search/odin/wts/my-pytorch-try/text_cnn_rnn/data/cnews/cnews.vocab.txt"
    )
    config.word2index = word2id
    config.index2word = id2word
    config.vocab_size = len(word2id)  # vocabulary size
    config.category2index = {
        "财经": 0,
        "房产": 1,
        "家居": 2,
        "教育": 3,
        "科技": 4,
        "时尚": 5,
        "时政": 6,
        "体育": 7,
        "游戏": 8,
        "娱乐": 9
Example #27
0
import utils as utils
import numpy as np


path = '20news-bydate-matlab/matlab'
features = utils.read_features("expanded.txt")
label_array = utils.read_label(path, 'train.label')
print(len(label_array))




answer_label_array = utils.read_label(path, 'test.label')
test_features = utils.read_features("test_expanded.txt")

vocab = utils.read_vocab("vocabulary.txt")

#remove stop words
from stop_words import get_stop_words
stop_words = get_stop_words('en')
# vocab = set(vocab)

# example_count=11269# total examples in training set
#nb.train(features[0:example_count,],label_array[0:example_count,])
clf.train(features, label_array, vocab=vocab)

#
# Y=[9,6]
# print "\nTrying to predict: "+str(Y)

correct_count = 0
Example #28
0
def train():
    vocab = read_vocab(FLAGS.vocab_data)
    glove = load_glove("glove.6B.{}d.txt".format(FLAGS.emb_size),
                       FLAGS.emb_size, vocab)
    train = Dataset(filepath=FLAGS.train_data)
    valid = Dataset(filepath=FLAGS.valid_data)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            han = HieAttNet(cell_type=FLAGS.cell_type,
                            hid_size=FLAGS.hid_size,
                            att_size=FLAGS.att_size,
                            vocab_size=len(vocab),
                            emb_size=FLAGS.emb_size,
                            num_classes=FLAGS.num_classes,
                            pretrained_embs=glove,
                            l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                han.loss, global_step=global_step)
            acc, acc_op = tf.metrics.accuracy(labels=han.labels,
                                              predictions=han.predictions,
                                              name="metrics/acc")
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="metrics")
            metrics_init_op = tf.variables_initializer(var_list=metrics_vars)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", han.loss)
            acc_summary = tf.summary.scalar("accuracy", han.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Valid summaries
            valid_step = 0
            valid_summary_op = tf.summary.merge([loss_summary, acc_summary])
            valid_summary_dir = os.path.join(out_dir, "summaries", "valid")
            valid_summary_writer = tf.summary.FileWriter(
                valid_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # initialize all variables
            best_valid_acc = 0.0
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            # training and validating loop
            for epoch in range(FLAGS.num_epochs):
                print('-' * 100)
                print('\n{}> epoch: {}\n'.format(
                    datetime.datetime.now().isoformat(), epoch))
                sess.run(metrics_init_op)
                # Training process
                for batch in train.bacth_iter(FLAGS.batch_size,
                                              desc="Training",
                                              shuffle=True):
                    labels, docs = zip(*batch)
                    padded_docs, sent_length, max_sent_length, word_length, max_word_length = normalize(
                        docs)
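                    # normalize() is defined elsewhere; from its use here it appears to
                    # pad each document to a rectangular [batch, max_sents, max_words]
                    # array and return the true sentence/word lengths used for masking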
                    feed_dict = {
                        han.docs: padded_docs,
                        han.labels: labels,
                        han.sent_length: sent_length,
                        han.word_length: word_length,
                        han.max_sent_length: max_sent_length,
                        han.max_word_length: max_word_length,
                        han.is_training: True,
                        han.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, summaries, loss, accuracy, _ = sess.run([
                        train_op, global_step, train_summary_op, han.loss,
                        han.accuracy, acc_op
                    ], feed_dict)
                    train_summary_writer.add_summary(summaries, step)

                    # training log display
                    # if step % FLAGS.display_every == 0:
                    #     time_str = datetime.datetime.now().isoformat()
                    #     print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

                    # Model checkpoint
                    # if step % FLAGS.checkpoint_every == 0:
                    #     path = saver.save(sess, checkpoint_prefix, global_step=step)
                    #     print("saved model checkpoint to {}\n".format(path))
                print("\ntraining accuracy = {:.2f}\n".format(
                    sess.run(acc) * 100))

                sess.run(metrics_init_op)
                # Validation process
                for batch in valid.batch_iter(FLAGS.batch_size,
                                              desc="Validating",
                                              shuffle=False):
                    valid_step += 1
                    labels, docs = zip(*batch)
                    padded_docs, sent_length, max_sent_length, word_length, max_word_length = normalize(
                        docs)
                    feed_dict = {
                        han.docs: padded_docs,
                        han.labels: labels,
                        han.sent_length: sent_length,
                        han.max_sent_length: max_sent_length,
                        han.word_length: word_length,
                        han.max_word_length: max_word_length,
                        han.is_training: False,
                        han.dropout_keep_prob: 1.0
                    }
                    summaries, loss, accuracy, _ = sess.run(
                        [valid_summary_op, han.loss, han.accuracy, acc_op],
                        feed_dict)
                    valid_summary_writer.add_summary(summaries,
                                                     global_step=valid_step)

                valid_acc = sess.run(acc) * 100
                print("\nvalidation accuracy = {:.2f}\n".format(valid_acc))
                print("previous best validation accuracy = {:.2f}\n".format(
                    best_valid_acc))

                # Save a checkpoint whenever validation accuracy improves
                if valid_acc > best_valid_acc:
                    best_valid_acc = valid_acc
                    path = saver.save(sess, checkpoint_prefix)
                    print("saved model checkpoint to {}\n".format(path))

            print("{} optimization finished!\n".format(
                datetime.datetime.now()))
            print("best validating accuracy = {:.2f}\n".format(best_valid_acc))
Example #29
0
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''
    # args.fast_train = True
    setup()
    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)

    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    feat_dict = read_img_features(features)

    # Load the object features
    obj_s_feat = None
    if args.sparseObj:
        print("Start loading the object sparse feature")
        start = time.time()
        obj_s_feat = np.load(sparse_obj_feat, allow_pickle=True).item()
        print(
            "Finished loading the sparse object features from %s in %0.4f seconds"
            % (sparse_obj_feat, time.time() - start))

    obj_d_feat = None
    if args.denseObj:
        print("Start loading the object dense feature")
        start = time.time()
        obj_d_feat1 = np.load(dense_obj_feat1, allow_pickle=True).item()
        obj_d_feat2 = np.load(dense_obj_feat2, allow_pickle=True).item()
        obj_d_feat = {**obj_d_feat1, **obj_d_feat2}
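        # dict-unpacking merge: keys present in both shards resolve to obj_d_feat2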
        print(
            "Finished loading the dense object features from %s and %s in %0.4f seconds"
            % (dense_obj_feat1, dense_obj_feat2, time.time() - start))

    featurized_scans = {key.split("_")[0] for key in feat_dict}

    train_env = R2RBatch(feat_dict,
                         obj_d_feat=obj_d_feat,
                         obj_s_feat=obj_s_feat,
                         batch_size=args.batchSize,
                         splits=['train'],
                         tokenizer=tok)
    from collections import OrderedDict

    val_env_names = ['val_unseen', 'val_seen']
    if args.submit:
        val_env_names.append('test')
    if not args.beam:
        val_env_names.append('train')

    val_envs = OrderedDict(((split, (R2RBatch(feat_dict,
                                              obj_d_feat=obj_d_feat,
                                              obj_s_feat=obj_s_feat,
                                              batch_size=args.batchSize,
                                              splits=[split],
                                              tokenizer=tok),
                                     Evaluation([split], featurized_scans,
                                                tok)))
                            for split in val_env_names))

    if args.train == 'listener':
        train(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validlistener':
        if args.beam:
            beam_valid(train_env, tok, val_envs=val_envs)
        else:
            valid(train_env, tok, val_envs=val_envs)
    elif args.train == 'speaker':
        train_speaker(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validspeaker':
        valid_speaker(tok, val_envs)
    else:
        raise ValueError('unknown args.train mode: %s' % args.train)
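The allow_pickle=True / .item() pattern used for the object features above is the standard round trip for a dict stored with np.save, which wraps it in a 0-d object array. A small sketch (the file name here is hypothetical):

import numpy as np

feats = {"scan1_view0": np.zeros(4), "scan1_view1": np.ones(4)}
np.save("obj_feats.npy", feats)  # the dict is stored as a 0-d object array
loaded = np.load("obj_feats.npy", allow_pickle=True).item()  # .item() unwraps the dict
assert set(loaded) == set(feats)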
Example #30
0
def train():
    with open("data/data.csv", "r") as f:
        records = [json.loads(line.strip()) for line in f]
    lines = [(r["dream"], r["decode"]) for r in records]
    inputs = [" ".join(list(q)) for q, a in lines]
    outputs = [" ".join(list(a)) for q, a in lines]
    all_info = ' '.join(inputs + outputs).split()
    if os.path.exists(args.vocab_file):
        dictionary_input, rev_dictionary_input = read_vocab(args.vocab_file)
    else:
        dictionary_input, rev_dictionary_input = build_vocab(
            all_info, args.vocab_file)

    dictionary_output, rev_dictionary_output = dictionary_input, rev_dictionary_input

    min_line_length = 2
    max_line_length = 100

    data_filter = [(q, a) for q, a in zip(inputs, outputs)
                   if len_check(q, min_line_length, max_line_length)
                   and len_check(a, min_line_length, max_line_length)]
    random.shuffle(data_filter)
    inputs = [q for q, a in data_filter]
    outputs = [a + ' EOS' for q, a in data_filter]
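    # append an explicit EOS token so the decoder can learn where targets end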

    tf.logging.info("sample size: %s", len(inputs))
    inputs_dev = inputs[0:100]
    outputs_dev = outputs[0:100]
    inputs_train = inputs[100:]
    outputs_train = outputs[100:]

    inputs_train = str_idx(inputs_train, dictionary_input,
                           dictionary_input['UNK'])
    print(inputs_train[:2])
    outputs_train = str_idx(outputs_train, dictionary_output,
                            dictionary_output['UNK'])
    print(outputs_train[:2])
    inputs_dev = str_idx(inputs_dev, dictionary_input, dictionary_input['UNK'])
    outputs_dev = str_idx(outputs_dev, dictionary_output,
                          dictionary_output['UNK'])
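    # str_idx is defined elsewhere; from the calls above it maps each
    # whitespace-split token to its vocabulary id, falling back to the UNK id
    # for out-of-vocabulary tokens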

    model = Seq2Seq(args.size_layer, args.num_layers, args.embedded_size,
                    len(dictionary_input), len(dictionary_output),
                    args.learning_rate, dictionary_input)

    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            ckpt = tf.train.get_checkpoint_state(args.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                tf.logging.info("restoring model from path: %s",
                                ckpt.model_checkpoint_path)  # load the pretrained checkpoint
                saver = tf.train.Saver(max_to_keep=4)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                saver = tf.train.Saver(max_to_keep=4)
                sess.run(tf.global_variables_initializer())

            global_step = 0
            for epoch_index in range(args.epoch):
                total_loss, total_accuracy = 0, 0
                batch_num = 0
                for k in range(0, len(inputs_train), args.batch_size):
                    batch_num = batch_num + 1
                    index = min(k + args.batch_size, len(inputs_train))
                    batch_x, seq_x = pad_sentence_batch(
                        inputs_train[k:index], dictionary_input["PAD"])
                    batch_y, seq_y = pad_sentence_batch(
                        outputs_train[k:index], dictionary_input["PAD"])
                    predicted, accuracy, loss, _, global_step = sess.run(
                        fetches=[
                            model.predicting_ids, model.accuracy, model.cost,
                            model.optimizer, model.global_step
                        ],
                        feed_dict={
                            model.X: batch_x,
                            model.Y: batch_y
                        })
                    total_loss += loss
                    total_accuracy += accuracy

                    if global_step % 100 == 0:
                        print(
                            '%s epoch: %d, global_step: %d, loss: %f, accuracy: %f'
                            % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                               epoch_index + 1, global_step, loss, accuracy))
                        saver.save(sess,
                                   os.path.join(args.checkpoint_dir,
                                                "seq2seq.ckpt"),
                                   global_step=global_step)

                        print("+" * 20)
                        for i in range(min(4, len(batch_x))):  # guard: the last batch may hold fewer than 4 rows
                            print('row %d' % (i + 1))
                            print(
                                'dream:', ''.join([
                                    rev_dictionary_input[n] for n in batch_x[i]
                                    if n not in [0, 1, 2, 3]
                                ]))
                            print(
                                'real   meaning:', ''.join([
                                    rev_dictionary_output[n]
                                    for n in batch_y[i]
                                    if n not in [0, 1, 2, 3]
                                ]))
                            print(
                                'dream decoding:', ''.join([
                                    rev_dictionary_output[n]
                                    for n in predicted[i]
                                    if n not in [0, 1, 2, 3]
                                ]), '')

                        index = list(range(len(inputs_dev)))
                        random.shuffle(index)
                        batch_x, _ = pad_sentence_batch([
                            inputs_dev[i] for i in index
                        ][:args.batch_size], dictionary_input["PAD"])
                        batch_y, _ = pad_sentence_batch([
                            outputs_dev[i] for i in index
                        ][:args.batch_size], dictionary_input["PAD"])
                        predicted = sess.run(model.predicting_ids,
                                             feed_dict={model.X: batch_x})
                        print("-" * 20)
                        for i in range(min(4, len(batch_x))):  # guard: dev sample may hold fewer than 4 rows
                            print('row %d' % (i + 1))
                            # print(batch_x[i])
                            # print(predicted[i])
                            print(
                                'dream:', ''.join([
                                    rev_dictionary_input[n] for n in batch_x[i]
                                    if n not in [0, 1, 2, 3]
                                ]))
                            print(
                                'real   meaning:', ''.join([
                                    rev_dictionary_output[n]
                                    for n in batch_y[i]
                                    if n not in [0, 1, 2, 3]
                                ]))
                            print(
                                'dream decoding:', ''.join([
                                    rev_dictionary_output[n]
                                    for n in predicted[i]
                                    if n not in [0, 1, 2, 3]
                                ]), '')

                total_loss /= batch_num
                total_accuracy /= batch_num
                print(
                    '***%s epoch: %d, global_step: %d, avg loss: %f, avg accuracy: %f'
                    %
                    (datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                     epoch_index + 1, global_step, total_loss, total_accuracy))
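pad_sentence_batch is not shown in this example; a minimal sketch consistent with how it is called above (returning the right-padded batch together with the original lengths):

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence to the longest length in the batch."""
    max_len = max(len(sentence) for sentence in sentence_batch)
    padded = [sentence + [pad_int] * (max_len - len(sentence))
              for sentence in sentence_batch]
    lengths = [len(sentence) for sentence in sentence_batch]
    return padded, lengths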