Example #1
File: sample.py Project: wxmAndrew/deepAPI
def main(args):
    conf = getattr(configs, 'config_'+args.model)()
    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")
    
    # Load data
    test_set=APIDataset(args.data_path+'valid.h5', conf['maxlen'])
    test_loader=torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False, num_workers=1)
    vocab_api = load_dict(args.data_path+'vocab.apiseq.pkl')
    vocab_desc = load_dict(args.data_path+'vocab.desc.pkl')
    n_tokens = len(vocab_api)

    metrics=Metrics()
    
    # Load model checkpoints    
    model = getattr(models, args.model)(conf, n_tokens)  # assumes the model classes live in a 'models' module ('model' was self-shadowing in the snippet)
    ckpt='./output/{}/{}/models/model_epo{}.pkl'.format(args.model, args.expname, args.reload_from)
    model.load_state_dict(torch.load(ckpt))
    if torch.cuda.is_available():
        model=model.cuda()
    model.eval()
    
    f_eval = open("./output/{}/{}/results.txt".format(args.model, args.expname), "w")
    repeat = args.n_samples
    
    evaluate(model, metrics, test_loader, vocab_desc, vocab_api, f_eval, repeat)
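
Several of these examples (e.g. #1 and #20) read pickled vocabulary files such as vocab.apiseq.pkl. For orientation, a minimal sketch of what such a load_dict helper could look like, assuming the file simply holds a pickled token-to-id dict (only the call sites are taken from the examples; the body is an assumption):

import pickle

def load_dict(filename):
    # Sketch only: the examples call load_dict(path) and take len() of the result;
    # a plain pickled token->id dict is assumed here.
    with open(filename, 'rb') as f:
        return pickle.load(f)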
Example #2
    def __init__(self, conf):
        self.conf = conf
        self.path = conf['workdir']

        self.vocab_methname = load_dict(self.path + conf['vocab_name'])
        self.vocab_apiseq = load_dict(self.path + conf['vocab_api'])
        self.vocab_tokens = load_dict(self.path + conf['vocab_tokens'])
        self.vocab_desc = load_dict(self.path + conf['vocab_desc'])

        self.codevecs = []
        self.codebase = []
        self.codebase_chunksize = 2000000
Example #3
    def __init__(self, conf):
        self.model_params = conf
        self.path = conf['workdir']

        self.vocab_methname = load_dict(self.path + conf['vocab_name'])
        self.vocab_apiseq = load_dict(self.path + conf['vocab_api'])
        self.vocab_tokens = load_dict(self.path + conf['vocab_tokens'])
        self.vocab_desc = load_dict(self.path + conf['vocab_desc'])

        self.codevecs = []
        self.codebase = []
        self.codebase_chunksize = conf['chunk_size']

        self.validation_set = None
Example #4
File: train.py Project: martinhartt/HGfGT
def main():
    print("Loading dictionary...")
    print(opt.dictionary)
    dict = data.load_dict(opt.dictionary)

    DataLoader = data.HierDataLoader if opt.hier else data.AbsDataLoader

    print("Constructing train tensors...")
    train_data = DataLoader(opt.train,
                            dict,
                            opt,
                            window=opt.window,
                            max_size=opt.maxSize)

    print("Constructing validation tensors...")
    valid_data = DataLoader(opt.valid,
                            dict,
                            opt,
                            window=opt.window,
                            max_size=opt.maxSize)

    print("Setting up language model and training parameters...")
    t = trainer.Trainer(opt, dict)

    print("Training...")
    t.train(train_data, valid_data)
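
Examples #4 and #14 come from the same project and later index the loaded object as dict["w2i"] and dict["i2w"] (see example #14). A hedged sketch of a load_dict with that return shape, assuming a one-word-per-line dictionary file (only the returned keys come from the examples; the on-disk format is an assumption):

def load_dict(path):
    # Sketch under the assumption of one word per line; only the returned
    # "w2i"/"i2w" keys are taken from example #14.
    w2i, i2w = {}, {}
    with open(path) as f:
        for idx, line in enumerate(f):
            word = line.strip()
            w2i[word] = idx
            i2w[idx] = word
    return {"w2i": w2i, "i2w": i2w}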
Example #5
 def __init__(self, args, logger, hvd=False):
     self.args = args
     self.logger = logger
     self.hvd = hvd
     self.optimizer = args.optimizer
     self.num_hidden_layers = args.layer_depth
     self.embedding_dim = args.embedding_dim
     self.bucket = BUCKET
     self.ngram_dim = args.ngram_dim
     self.tag2id, self.id2tag = load_dict(args.tag_to_id_path,
                                          args.encoding)
     self.num_tags = 3  # len(self.tag2id)
     self.word2id, self.id2word = load_dict(args.word_to_id_path,
                                            args.encoding)
     self.logger.info("tag2id size: %d" % self.num_tags)
     self.logger.info("word2id size: %d" % len(self.word2id))
     self.lambda1 = args.lambda1
     self.lambda2 = args.lambda2
     self.lambda3 = args.lambda3
     self.lambda4 = args.lambda4
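
Examples #5 and #6 call a different load_dict variant that takes a path plus an encoding and unpacks two return values (e.g. tag2id, id2tag). A rough sketch with the same signature, assuming a one-token-per-line vocabulary file (only the signature and the pair of return values come from the examples); the single-argument variant used in examples #7, #12 and #13 looks the same minus the encoding parameter:

def load_dict(dict_path, encoding="utf-8"):
    # Sketch: build forward and inverse index maps; the file layout is assumed.
    token2id, id2token = {}, {}
    with open(dict_path, encoding=encoding) as f:
        for idx, line in enumerate(f):
            token = line.rstrip("\n")
            token2id[token] = idx
            id2token[idx] = token
    return token2id, id2token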
Example #6
 def __init__(self, seqmodel, paths, config):
     self.model = seqmodel
     self.args = seqmodel.args
     self.encoding = self.args.encoding
     self.logger = seqmodel.logger
     self.model_global_step = seqmodel.global_step
     self.global_step = 0
     self.hvd = seqmodel.hvd
     self.model_path = paths['model_path']
     if self.model.args.restore:
         self.restore_model_path = paths['restore_model_path']
     self.summary_path = paths['summary_path']
     self.result_path = paths['result_path']
     self.tag2id, self.id2tag = load_dict(self.args.tag_to_id_path,
                                          self.encoding)
     self.word2id, self.id2word = load_dict(self.args.word_to_id_path,
                                            self.encoding)
     self.dataset2flag, self.flag2dataset = load_dict_ano(
         self.args.dataset_to_flag_path, self.encoding)
     self.config = config
     self.batch_size = self.args.batch_size
     self.epoch_num = self.args.epoch
     self.min_epoch_num = self.args.min_epoch
     self.restore = self.args.restore
     self.dropout = self.args.dropout
     self.optimizer = self.args.optimizer
     self.lr = self.args.lr
     self.max_scores = 0.0
     self.unseccessful_step_num = 0
     self.eval_step = self.args.eval_step
     self.local_step_num = 0
     self.total_w_count = 0
     self.total_w_loss = 0
     self.total_w_loss1 = 0
     self.total_w_loss2 = 0
     self.total_w_loss3 = 0
     self.total_w_loss4 = 0
     self.train_sample_num = 0
     self.save_max = self.args.save_max
     self.logger.info("model path: %s " % self.model_path)
Example #7
def quant_post(args):
    place = paddle.set_device("gpu")
    exe = paddle.static.Executor(place)

    label2id, id2label = load_dict(args.label_path)
    train_ds = load_dataset(read, data_path=args.dev_path, lazy=False)

    tokenizer = PPMiniLMTokenizer.from_pretrained(args.base_model_name)
    trans_func = partial(convert_example_to_feature,
                         tokenizer=tokenizer,
                         label2id=label2id,
                         max_seq_len=args.max_seq_len)
    train_ds = train_ds.map(trans_func, lazy=True)

    def batch_generator_func():
        batch_data = [[], []]
        for data in train_ds:
            batch_data[0].append(data[0])
            batch_data[1].append(data[1])
            if len(batch_data[0]) == args.batch_size:
                input_ids = Pad(axis=0, pad_val=0,
                                dtype="int64")(batch_data[0])
                segment_ids = Pad(axis=0, pad_val=0,
                                  dtype="int64")(batch_data[1])
                yield [input_ids, segment_ids]
                batch_data = [[], []]

    paddleslim.quant.quant_post_static(
        exe,
        args.static_model_dir,
        args.quant_model_dir,
        save_model_filename=args.save_model_filename,
        save_params_filename=args.save_params_filename,
        algo=args.algorithm,
        hist_percent=0.9999,
        batch_generator=batch_generator_func,
        model_filename=args.input_model_filename,
        params_filename=args.input_param_filename,
        quantizable_op_type=['matmul', 'matmul_v2'],
        weight_bits=8,
        weight_quantize_type='channel_wise_abs_max',
        batch_nums=1)
Example #8
def dict_to_mongo():
    items = load_dict()
    for word, item in items.items():
        if Word.objects(word=word).count() == 0:
            dbitem = Word(**item)
            dbitem.save()
Example #9
def main(args):
    global anno, infer_y, h_pre, alpha_past, if_trainning, dictLen

    worddicts = load_dict(args.dictPath)
    dictLen = len(worddicts)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    test, test_uid_list = dataIterator(
        args.testPklPath,
        args.testCaptionPath,
        worddicts,
        batch_size=2,
        batch_Imagesize=400000,
        maxlen=100,
        maxImagesize=400000,
    )

    x = tf.placeholder(tf.float32, shape=[None, None, None, 1])

    y = tf.placeholder(tf.int32, shape=[None, None])

    x_mask = tf.placeholder(tf.float32, shape=[None, None, None])

    y_mask = tf.placeholder(tf.float32, shape=[None, None])

    lr = tf.placeholder(tf.float32, shape=())

    if_trainning = tf.placeholder(tf.bool, shape=())

    watcher_train = Watcher_train(blocks=3,
                                  level=16,
                                  growth_rate=24,
                                  training=if_trainning)

    annotation, anno_mask = watcher_train.dense_net(x, x_mask)

    # for initializing validation
    anno = tf.placeholder(
        tf.float32,
        shape=[
            None,
            annotation.shape.as_list()[1],
            annotation.shape.as_list()[2],
            annotation.shape.as_list()[3],
        ],
    )
    infer_y = tf.placeholder(tf.int64, shape=(None, ))
    h_pre = tf.placeholder(tf.float32, shape=[None, 256])
    alpha_past = tf.placeholder(
        tf.float32,
        shape=[
            None,
            annotation.shape.as_list()[1],
            annotation.shape.as_list()[2]
        ],
    )

    attender = Attender(annotation.shape.as_list()[3], 256, 512)

    parser = Parser(256, 256, attender, annotation.shape.as_list()[3])

    wap = WAP(
        watcher_train,
        attender,
        parser,
        256,
        256,
        annotation.shape.as_list()[3],
        dictLen,
        if_trainning,
    )

    hidden_state_0 = tf.tanh(
        tf.tensordot(tf.reduce_mean(anno, axis=[1, 2]), wap.Wa2h, axes=1) +
        wap.ba2h)  # [batch, hidden_dim]

    cost = wap.get_cost(annotation, y, anno_mask, y_mask)

    vs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    for vv in vs:
        if not vv.name.startswith("batch_normalization"):
            cost += 1e-4 * tf.reduce_sum(tf.pow(vv, 2))

    p, w, h, alpha = wap.get_word(infer_y, h_pre, alpha_past, anno)

    optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        trainer = optimizer.minimize(cost)

    max_epoch = 200

    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True

    init = tf.global_variables_initializer()

    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        sess.run(init)
        saver.restore(
            sess,
            os.path.join(args.modelPath, args.modelFileName) + ".ckpt")

        print("Start sampling...")
        _t = time.time()
        fpp_sample = open(
            os.path.join(args.resultPath,
                         str(args.resultFileName) + ".txt"),
            "w",
        )
        test_count_idx = 0
        for batch_x, batch_y in test:
            for xx in batch_x:
                xx = np.moveaxis(xx, 0, -1)
                xx_pad = np.zeros((xx.shape[0], xx.shape[1], xx.shape[2]),
                                  dtype="float32")
                xx_pad[:, :, :] = xx / 255.0
                xx_pad = xx_pad[None, :, :, :]
                annot = sess.run(annotation,
                                 feed_dict={
                                     x: xx_pad,
                                     if_trainning: False
                                 })
                h_state = sess.run(hidden_state_0, feed_dict={anno: annot})
                sample, score = wap.get_sample(
                    p,
                    w,
                    h,
                    alpha,
                    annot,
                    h_state,
                    10,
                    100,
                    False,
                    sess,
                    training=False,
                )
                score = score / np.array([len(s) for s in sample])
                ss = sample[score.argmin()]
                fpp_sample.write(test_uid_list[test_count_idx])
                test_count_idx = test_count_idx + 1
                if np.mod(test_count_idx, 100) == 0:
                    print("gen %d samples" % test_count_idx)
                    log.write("gen %d samples" % test_count_idx + "\n")
                    log.flush()
                for vv in ss:
                    if vv == 0:  # <eol>
                        break
                    fpp_sample.write(" " + worddicts_r[vv])
                fpp_sample.write("\n")
        fpp_sample.close()
        print("valid set decode done")
        log.write("valid set decode done\n")
        log.flush()
        print("Done sampling, took" + str(time.time() - _t))

        print("Start validating...")
        _t = time.time()
        probs = []
        for batch_x, batch_y in test:
            batch_x, batch_x_m, batch_y, batch_y_m = prepare_data(
                batch_x, batch_y)
            pprobs, annot = sess.run(
                [cost, annotation],
                feed_dict={
                    x: batch_x,
                    y: batch_y,
                    x_mask: batch_x_m,
                    y_mask: batch_y_m,
                    if_trainning: False,
                },
            )
            probs.append(pprobs)
        valid_errs = np.array(probs)
        valid_err_cost = valid_errs.mean()
        wer_process(
            os.path.join(args.resultPath, args.resultFileName + ".txt"),
            args.validCaptionPath,
            os.path.join(args.resultPath, args.resultFileName + ".wer"),
        )
        fpp = open(os.path.join(args.resultPath, f"{args.resultFileName}.wer"))
        stuff = fpp.readlines()
        fpp.close()
        m = re.search("WER (.*)\n", stuff[0])
        test_per = 100.0 * float(m.group(1))
        m = re.search("ExpRate (.*)\n", stuff[1])
        test_sacc = 100.0 * float(m.group(1))
        test_err = test_per

        print("Test WER: %.2f%%, ExpRate: %.2f%%, Cost: %f" %
              (test_per, test_sacc, test_err_cost))
        print(f"Done validating, took {time.time() - _t}.")
Example #10
        token_type_ids = paddle.to_tensor(token_type_ids)
        seq_len = paddle.to_tensor(seq_len)
        pred_tags = model(input_ids, token_type_ids, lengths=seq_len)
        all_pred_tags.extend(pred_tags.numpy().tolist())
    results = decode(data, all_pred_tags, summary_num, idx_to_tags)
    return results


if __name__ == "__main__":
    paddle.set_device(args.device)

    data = [
        '美人鱼是周星驰执导的一部电影',
    ]

    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))
    idx_to_tags = {tag_id: tag for tag, tag_id in tags_to_idx.items()}

    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    results = do_predict(data,
                         model,
                         tokenizer,
                         model.viterbi_decoder,
Example #11
        preds = [pred[1:] for pred in preds.numpy()]
        all_preds.append(preds)
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device('gpu')

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(datafiles=(
        './waybill_data/train.txt', './waybill_data/dev.txt', './waybill_data/test.txt'))

    label_vocab = load_dict('./conf/tag.dic')
    tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

    trans_func = partial(
        convert_to_features, tokenizer=tokenizer, label_vocab=label_vocab)

    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=label_vocab.get("O", 0), dtype='int64')  # labels
    ): fn(samples)
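
The batchify_fn above (the same pattern appears in examples #12, #13 and #17) composes collators from paddlenlp.data: Tuple applies one collator per field across the batch, Pad pads variable-length id lists to the longest sample, and Stack stacks scalar fields. A small standalone illustration with made-up sample values (for demonstration only):

from paddlenlp.data import Pad, Stack, Tuple

# each sample: (input_ids, token_type_ids, seq_len, labels) -- toy values
samples = [
    ([1, 2, 3], [0, 0, 0], 3, [5, 6, 7]),
    ([4, 5],    [0, 0],    2, [8, 9]),
]
collate = Tuple(
    Pad(axis=0, pad_val=0, dtype='int32'),  # pad input_ids
    Pad(axis=0, pad_val=0, dtype='int32'),  # pad token_type_ids
    Stack(dtype='int64'),                   # stack seq_len
    Pad(axis=0, pad_val=0, dtype='int64'),  # pad labels
)
input_ids, token_type_ids, seq_len, labels = collate(samples)
print(input_ids.shape, seq_len.tolist())    # (2, 3) [3, 2]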
Example #12

if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name", type=str, default=None, help="The name of base model.")
    parser.add_argument("--model_path", type=str, default=None, help="The path of saved model that you want to load.")
    parser.add_argument('--test_path', type=str, default=None, help="The path of test set.")
    parser.add_argument("--label_path", type=str, default=None, help="The path of label dict.")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size per GPU/CPU for training.")
    parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
    args = parser.parse_args()
    # yapf: enable

    # load dev data
    label2id, id2label = load_dict(args.label_path)
    test_ds = load_dataset(read, data_path=args.test_path, lazy=False)

    tokenizer = PPMiniLMTokenizer.from_pretrained(args.base_model_name)
    trans_func = partial(convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=args.max_seq_len)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
        Stack(dtype="int64")
    ): fn(samples)

    test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn)
Example #13
def train():
    # set running environment
    model_name = "skep_ernie_1.0_large_ch"

    paddle.set_device(args.device)
    set_seed(args.seed)

    if not os.path.exists(args.checkpoints):
        os.mkdir(args.checkpoints)

    # load and process data
    label2id, id2label = load_dict(args.label_path)
    train_ds = load_dataset(read, data_path=args.train_path, lazy=False)
    dev_ds = load_dataset(read, data_path=args.dev_path, lazy=False)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    trans_func = partial(
        convert_example_to_feature,
        tokenizer=tokenizer,
        label2id=label2id,
        max_seq_len=args.max_seq_len)
    train_ds = train_ds.map(trans_func, lazy=False)
    dev_ds = dev_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
        Pad(axis=0, pad_val= -1, dtype="int64")
    ): fn(samples)

    train_batch_sampler = paddle.io.BatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)
    train_loader = paddle.io.DataLoader(
        train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(
        dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn)

    # configure model training
    model = SkepForTokenClassification.from_pretrained(
        model_name, num_classes=len(label2id))

    num_training_steps = len(train_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(
        learning_rate=args.learning_rate,
        total_steps=num_training_steps,
        warmup=args.warmup_proportion)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=grad_clip)

    metric = ChunkEvaluator(label2id.keys())

    # start to train model
    global_step, best_f1 = 1, 0.
    model.train()
    for epoch in range(1, args.num_epochs + 1):
        for batch_data in train_loader():
            input_ids, token_type_ids, _, labels = batch_data
            # logits: [batch_size, seq_len, num_tags]
            logits = model(input_ids, token_type_ids=token_type_ids)
            loss = F.cross_entropy(
                logits.reshape([-1, len(label2id)]),
                labels.reshape([-1]),
                ignore_index=-1)

            loss.backward()
            lr_scheduler.step()
            optimizer.step()
            optimizer.clear_grad()

            if global_step > 0 and global_step % args.log_steps == 0:
                print(
                    f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss:{loss.numpy().item():.6f}"
                )
            if (global_step > 0 and global_step % args.eval_steps == 0
                ) or global_step == num_training_steps:
                precision, recall, f1 = evaluate(model, dev_loader, metric)
                model.train()
                if f1 > best_f1:
                    print(
                        f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}"
                    )
                    best_f1 = f1
                    paddle.save(model.state_dict(),
                                f"{args.checkpoints}/best.pdparams")
                print(
                    f'evaluation result: precision: {precision:.5f}, recall: {recall:.5f},  F1: {f1:.5f}'
                )

            global_step += 1

    paddle.save(model.state_dict(), f"{args.checkpoints}/final.pdparams")
Example #14
def main():
    state = torch.load(opt.model)

    if opt.hier:
        mlp, encoder = state
    else:
        mlp = state

    dict = data.load_dict(opt.dictionary)

    sent_file = open(opt.inputf).read().split("\n")
    length = opt.length
    if not opt.hier:
        W = mlp.window
        opt.window = mlp.window
    else:
        W = 1

    w2i = dict["w2i"]
    i2w = dict["i2w"]

    K = opt.beamSize

    actual = open(opt.outputf).read().split('\n')

    sent_num = 0

    with torch.no_grad():
        for line in sent_file:
            if line.strip() == "":
                continue

            # Add padding
            if opt.hier:
                summaries = extractive(line).split("\t")
                print("\n> {}...".format(summaries[0]))
                encoded_summaries = [
                    encode("<s> {} </s>".format(normalize(summary)), w2i)
                    for summary in summaries
                ]
                article = HierDataLoader.torchify(encoded_summaries,
                                                  variable=True,
                                                  revsort=True,
                                                  opt=opt)

                hidden_state = encoder.init_hidden()
                summ_hidden_state = encoder.init_hidden(n=opt.summLstmLayers,
                                                        K=opt.K)

                print(hidden_state[0].shape, summ_hidden_state[0].shape)
                print(article[0].shape)
                encoder_out, hidden_state, _ = encoder(article, hidden_state,
                                                       summ_hidden_state)

            else:
                print("\n> {}".format(line))
                true_line = "<s> <s> <s> {} </s> </s> </s>".format(
                    normalize(line))

                article = torch.tensor(encode(true_line, w2i))

            n = opt.length

            hyps = apply_cuda(torch.zeros(K, W + n).long().fill_(w2i["<s>"]))
            scores = apply_cuda(torch.zeros(K).float())

            if opt.hier:
                hidden_size = len(hidden_state[0][0][0])
                hidden = apply_cuda(torch.zeros(K, hidden_size).float())
                cell = apply_cuda(torch.zeros(K, hidden_size).float())

                for k in range(K):
                    hidden[k] = hidden_state[0][0]
                    cell[k] = hidden_state[1][0]

            for step in range(n):
                new_candidates = []

                start = step
                end = step + W
                context = hyps[:, start:end]  # context

                if opt.hier:
                    model_scores = torch.zeros(K, len(w2i))
                    for c in range(K):
                        ctx = context[c].view(1, -1)
                        ctx = article[0][0][step].view(1, -1)
                        model_scores[c], new_hidden, attn = mlp(
                            encoder_out, ctx,
                            (hidden[c].view(1, 1, -1), cell[c].view(1, 1, -1)))

                        hidden[c] = new_hidden[0]
                        cell[c] = new_hidden[1]
                else:
                    article_t, context_t = AbsDataLoader.make_input(
                        article, context, K)
                    model_scores, attn = mlp(article_t, context_t)

                out_scores = model_scores.data

                # Apply hard constraints
                finalized = (step == n - 1) and opt.fixedLength
                set_hard_constraints(out_scores, w2i, finalized)

                for sample in range(K):  # Per certain context
                    top_scores, top_indexes = torch.topk(out_scores[sample], K)

                    for ix, score in zip(top_indexes, top_scores):
                        repetition = opt.noRepeat and apply_cuda(
                            ix) in apply_cuda(hyps[sample])

                        combined = torch.cat((hyps[sample][:end],
                                              apply_cuda(torch.tensor([ix]))))
                        if opt.hier:
                            candidate = [
                                combined,
                                -INF if repetition else scores[sample] +
                                apply_cuda(score), hidden[c], cell[c]
                            ]
                        else:
                            candidate = [
                                combined,
                                -INF if repetition else scores[sample] +
                                apply_cuda(score), None, None
                            ]
                        new_candidates.append(candidate)

                ordered = list(
                    reversed(sorted(new_candidates, key=lambda cand: cand[1])))

                h, s, hidden_temp, cell_temp = zip(*ordered)

                for r in range(K):
                    hyps[r][0:end + 1] = h[r]
                    scores[r] = s[r]

                    if opt.hier:
                        hidden[r] = hidden_temp[r]
                        cell[r] = cell_temp[r]

            s, top_ixs = torch.topk(scores, 1)

            final = hyps[int(top_ixs)][W:-1]

            print("= {}".format(actual[sent_num]))
            print("< {}".format(decode(final, i2w)))
            print("")

            sent_num += 1
Example #15
    if use_cuda:
        embedder = embedder.cuda()
        encoder = encoder.cuda()
        hidvar = hidvar.cuda()  #!!!!!
        decoder = decoder.cuda()

    TRAIN_FILE = input_dir + 'train.h5'
    train_set = UbuntuDataset(TRAIN_FILE, max_seq_len=20)
    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=1  # multiple num_workers could introduce error (conflict?) 
    )
    vocab = load_dict(input_dir + 'vocab.json')

    train(embedder,
          encoder,
          hidvar,
          decoder,
          train_data_loader,
          vocab,
          n_iters,
          model_dir,
          p_teach_force,
          save_every=save_every,
          sample_every=sample_every,
          print_every=print_every,
          learning_rate=learning_rate)
Example #16
def main(args):
    global anno, infer_y, h_pre, alpha_past, if_trainning, dictLen

    worddicts = load_dict(args.dictPath)
    dictLen = len(worddicts)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    train, train_uid_list = dataIterator(
        args.trainPklPath,
        args.trainCaptionPath,
        worddicts,
        batch_size=args.batch_size,
        batch_Imagesize=500000,
        maxlen=150,
        maxImagesize=500000,
    )

    valid, valid_uid_list = dataIterator(
        args.validPklPath,
        args.validCaptionPath,
        worddicts,
        batch_size=args.batch_size,
        batch_Imagesize=500000,
        maxlen=150,
        maxImagesize=500000,
    )

    print("train lenth is ", len(train))
    print("valid lenth is ", len(valid))

    x = tf.placeholder(tf.float32, shape=[None, None, None, 1])

    y = tf.placeholder(tf.int32, shape=[None, None])

    x_mask = tf.placeholder(tf.float32, shape=[None, None, None])

    y_mask = tf.placeholder(tf.float32, shape=[None, None])

    lr = tf.placeholder(tf.float32, shape=())

    if_trainning = tf.placeholder(tf.bool, shape=())

    watcher_train = Watcher_train(blocks=3,
                                  level=16,
                                  growth_rate=24,
                                  training=if_trainning)

    annotation, anno_mask = watcher_train.dense_net(x, x_mask)

    # for initializing validation
    anno = tf.placeholder(
        tf.float32,
        shape=[
            None,
            annotation.shape.as_list()[1],
            annotation.shape.as_list()[2],
            annotation.shape.as_list()[3],
        ],
    )
    infer_y = tf.placeholder(tf.int64, shape=(None, ))
    h_pre = tf.placeholder(tf.float32, shape=[None, 256])
    alpha_past = tf.placeholder(
        tf.float32,
        shape=[
            None,
            annotation.shape.as_list()[1],
            annotation.shape.as_list()[2]
        ],
    )

    attender = Attender(annotation.shape.as_list()[3], 256, 512)

    parser = Parser(256, 256, attender, annotation.shape.as_list()[3])

    wap = WAP(
        watcher_train,
        attender,
        parser,
        256,
        256,
        annotation.shape.as_list()[3],
        dictLen,
        if_trainning,
    )

    hidden_state_0 = tf.tanh(
        tf.tensordot(tf.reduce_mean(anno, axis=[1, 2]), wap.Wa2h, axes=1) +
        wap.ba2h)  # [batch, hidden_dim]

    cost = wap.get_cost(annotation, y, anno_mask, y_mask)

    vs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    for vv in vs:
        if not vv.name.startswith("batch_normalization"):
            cost += 1e-4 * tf.reduce_sum(tf.pow(vv, 2))

    p, w, h, alpha = wap.get_word(infer_y, h_pre, alpha_past, anno)

    optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        trainer = optimizer.minimize(cost)

    max_epoch = 200

    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True

    init = tf.global_variables_initializer()

    uidx = 0
    cost_s = 0
    dispFreq = 100 if args.dispFreq is None else args.dispFreq
    saveFreq = (len(train) * args.epochDispRatio
                if args.saveFreq is None else args.saveFreq)
    sampleFreq = (len(train) * args.epochSampleRatio
                  if args.sampleFreq is None else args.sampleFreq)
    validFreq = (len(train) * args.epochValidRatio
                 if args.validFreq is None else args.validFreq)
    history_errs = []
    estop = False
    halfLrFlag = 0
    patience = 15 if args.patience is None else args.patience
    lrate = args.lr
    logPath = "./log.txt" if args.logPath is None else args.logPath
    log = open(logPath, "w")

    log.write(str(vars(args)))
    log.write(str(patience))
    log.write(str(lrate))

    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        sess.run(init)
        for epoch in range(max_epoch):
            n_samples = 0
            random.shuffle(train)
            for batch_x, batch_y in train:
                batch_x, batch_x_m, batch_y, batch_y_m = prepare_data(
                    batch_x, batch_y)
                n_samples += len(batch_x)
                uidx += 1

                cost_i, _ = sess.run(
                    [cost, trainer],
                    feed_dict={
                        x: batch_x,
                        y: batch_y,
                        x_mask: batch_x_m,
                        y_mask: batch_y_m,
                        if_trainning: True,
                        lr: lrate,
                    },
                )

                cost_s += cost_i

                if np.isnan(cost_i) or np.isinf(cost_i):
                    print("invalid cost value detected")
                    sys.exit(0)

                if np.mod(uidx, dispFreq) == 0:
                    cost_s /= dispFreq
                    print("Epoch ", epoch, "Update ", uidx, "Cost ", cost_s,
                          "Lr ", lrate)
                    log.write("Epoch " + str(epoch) + " Update " + str(uidx) +
                              " Cost " + str(cost_s) + " Lr " + str(lrate) +
                              "\n")
                    log.flush()
                    cost_s = 0

                if np.mod(uidx, sampleFreq) == 0:
                    print("Start sampling...")
                    _t = time.time()
                    fpp_sample = open(
                        os.path.join(args.resultPath,
                                     str(args.resultFileName) + ".txt"),
                        "w",
                    )
                    valid_count_idx = 0
                    for batch_x, batch_y in valid:
                        for xx in batch_x:
                            xx = np.moveaxis(xx, 0, -1)
                            xx_pad = np.zeros(
                                (xx.shape[0], xx.shape[1], xx.shape[2]),
                                dtype="float32")
                            xx_pad[:, :, :] = xx / 255.0
                            xx_pad = xx_pad[None, :, :, :]
                            annot = sess.run(annotation,
                                             feed_dict={
                                                 x: xx_pad,
                                                 if_trainning: False
                                             })
                            h_state = sess.run(hidden_state_0,
                                               feed_dict={anno: annot})
                            sample, score = wap.get_sample(
                                p,
                                w,
                                h,
                                alpha,
                                annot,
                                h_state,
                                10,
                                100,
                                False,
                                sess,
                                training=False,
                            )
                            score = score / np.array([len(s) for s in sample])
                            ss = sample[score.argmin()]
                            fpp_sample.write(valid_uid_list[valid_count_idx])
                            valid_count_idx = valid_count_idx + 1
                            if np.mod(valid_count_idx, 100) == 0:
                                print("gen %d samples" % valid_count_idx)
                                log.write("gen %d samples" % valid_count_idx +
                                          "\n")
                                log.flush()
                            for vv in ss:
                                if vv == 0:  # <eol>
                                    break
                                fpp_sample.write(" " + worddicts_r[vv])
                            fpp_sample.write("\n")
                    fpp_sample.close()
                    print("valid set decode done")
                    log.write("valid set decode done\n")
                    log.flush()
                    print("Done sampling, took" + str(time.time() - _t))

                if np.mod(uidx, validFreq) == 0:
                    print("Start validating...")
                    _t = time.time()
                    probs = []
                    for batch_x, batch_y in valid:
                        batch_x, batch_x_m, batch_y, batch_y_m = prepare_data(
                            batch_x, batch_y)
                        pprobs, annot = sess.run(
                            [cost, annotation],
                            feed_dict={
                                x: batch_x,
                                y: batch_y,
                                x_mask: batch_x_m,
                                y_mask: batch_y_m,
                                if_trainning: False,
                            },
                        )
                        probs.append(pprobs)
                    valid_errs = np.array(probs)
                    valid_err_cost = valid_errs.mean()
                    wer_process(
                        os.path.join(args.resultPath,
                                     args.resultFileName + ".txt"),
                        args.validCaptionPath,
                        os.path.join(args.resultPath,
                                     args.resultFileName + ".wer"),
                    )
                    fpp = open(
                        os.path.join(args.resultPath,
                                     args.resultFileName + ".wer"))
                    stuff = fpp.readlines()
                    fpp.close()
                    m = re.search("WER (.*)\n", stuff[0])
                    valid_per = 100.0 * float(m.group(1))
                    m = re.search("ExpRate (.*)\n", stuff[1])
                    valid_sacc = 100.0 * float(m.group(1))
                    valid_err = valid_per

                    history_errs.append(valid_err)

                    if (uidx / validFreq == 0
                            or valid_err <= np.array(history_errs).min()):
                        bad_counter = 0

                    if (uidx / validFreq != 0
                            and valid_err > np.array(history_errs).min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            if halfLrFlag == 2:
                                print("Early Stop!")
                                log.write("Early Stop!\n")
                                log.flush()
                                estop = True
                                break
                            else:
                                print("Lr decay and retrain!")
                                log.write("Lr decay and retrain!\n")
                                log.flush()
                                bad_counter = 0
                                lrate = lrate / 10
                                halfLrFlag += 1
                    print("bad_counter" + str(bad_counter))
                    print("Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f" %
                          (valid_per, valid_sacc, valid_err_cost))
                    log.write("Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f" %
                              (valid_per, valid_sacc, valid_err_cost) + "\n")
                    log.flush()
                    print("Done validating, took" + str(time.time() - _t))
            if estop:
                break

        save_path = saver.save(sess,
                               os.path.join(args.savePath + args.saveName))
Example #17
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0
    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
Example #18
def predict(text):
    """

    :return:
    """
    def load_model():
        output_graph_def = tf.GraphDef()

        with open('./ckpt/ner-1.pb', "rb") as f:
            output_graph_def.ParseFromString(f.read())
            _ = tf.import_graph_def(output_graph_def, name="")

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        return sess

    # read in the data
    char2id, type2id = load_dict(char_dict="train_data/char2id.json",
                                 type_dict="train_data/type2id.json")
    id2type = {value: key for key, value in type2id.items()}
    ids = list(id2type.keys())
    sess = load_model()

    input_ids = sess.graph.get_tensor_by_name("placeholder/input_ids:0")
    # is_training = sess.graph.get_tensor_by_name("placeholder/Placeholder:0")  # is_training
    viterbi_sequence = sess.graph.get_tensor_by_name(
        "doc_segment/ReverseSequence_1:0")

    t1 = time.time()
    x = sequence_padding([char2id.get(c, 1) for c in text], max_len=384)
    feed_dict = {
        input_ids: [x],
        # is_training: True,
    }
    predicts_d = sess.run([viterbi_sequence], feed_dict)[0]
    p = predicts_d.tolist()[0]
    # package the results for output
    IOS = []
    index = 0
    start = None
    for i in p:
        if i == 0:
            if start is None:
                pass
            else:
                IOS.append((start, index))
            break
        elif i == 1:
            if start is None:
                pass
            else:
                if index > 0:
                    IOS.append((start, index))
                start = None
        else:  # contains an entity
            if start is None:
                start = index
            else:
                if i == p[index - 1]:
                    pass
                else:
                    IOS.append((start, index))
                    start = index
        index += 1
    print(p)
    print(IOS)
    extract_dict = []
    for i in IOS:
        extract_id = p[i[0]]
        tag = id2type.get(extract_id)
        value = text[i[0]:i[1]]
        extract_dict.append({"type": tag, "value": value})
    return extract_dict
Example #19
def train():
    """
        模型训练
    :return:
    """
    char2id, type2id = load_dict(char_dict="train_data/char2id.json",
                                 type_dict="train_data/type2id.json")
    # tf.flags.DEFINE_string("data_dir", "data/data.dat", "data directory")
    tf.flags.DEFINE_integer("vocab_size", len(char2id), "vocabulary size")
    tf.flags.DEFINE_integer("num_classes", len(type2id), "number of classes")
    tf.flags.DEFINE_integer("max_num", 384, "max_sentence_num")
    tf.flags.DEFINE_integer(
        "embedding_size", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "hidden_size", 128, "Dimensionality of GRU hidden layer (default: 50)")
    tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 10,
                            "Number of training epochs (default: 50)")
    tf.flags.DEFINE_integer("checkpoint_every", 100,
                            "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 3,
                            "Number of checkpoints to store (default: 5)")
    tf.flags.DEFINE_integer("evaluate_every", 300,
                            "evaluate every this many batches")
    tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
    tf.flags.DEFINE_float("grad_clip", 5,
                          "grad clip to prevent gradient explode")
    FLAGS = tf.flags.FLAGS
    config = tf.ConfigProto()  # `config` was undefined in this snippet; defined here as in the other examples
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        ner = NER(vocab_size=FLAGS.vocab_size,
                  num_classes=FLAGS.num_classes,
                  embedding_size=FLAGS.embedding_size,
                  hidden_size=FLAGS.hidden_size,
                  max_num=FLAGS.max_num)

        # define the optimizer (outside the model)
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        # gradient clipping, commonly used with RNNs to prevent exploding gradients
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(ner.loss, tvars),
                                          FLAGS.grad_clip)
        grads_and_vars = tuple(zip(grads, tvars))
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=FLAGS.num_checkpoints)
        if not os.path.exists('./ckpt/'):
            os.makedirs("./ckpt/")

        # restore a saved model / reinitialize the parameters
        # model_file = tf.train.latest_checkpoint('./ckpt/')
        ckpt = tf.train.get_checkpoint_state('./ckpt/')
        if ckpt:
            print("load saved model:\t", ckpt.model_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("init model...")
            sess.run(tf.global_variables_initializer())

        def evaluate(viterbi_sequence, Y):
            '''
                Compute the accuracy metric over variable-length sequences
            :return:
            '''
            t_all = 0
            t_true = 0
            for p, y in zip(viterbi_sequence, Y):
                # length of the current sentence (count of non-padding labels)
                l = int(np.count_nonzero(y))
                # compare the two sequences to compute accuracy
                t_all += l
                t_true += np.sum(np.equal(p[:l], y[:l]))
            return float(t_true) / float(t_all), t_true, t_all

        def train_step(x, y):
            feed_dict = {
                ner.input_ids: x,
                ner.output_types: y,
                ner.is_training: True,
            }
            _, step, predicts_t, cost, accuracy = sess.run([
                train_op, global_step, ner.viterbi_sequence, ner.loss, ner.acc
            ], feed_dict)
            acc_t, count, total = evaluate(np.array(predicts_t), y)
            time_str = str(int(time.time()))
            print("{}: step {}, loss {}, f_acc {}, t_acc {}".format(
                time_str, step, cost, accuracy, acc_t))
            # train_summary_writer.add_summary(summaries, step)
            return step

        def dev_step(x, y, writer=None):
            feed_dict = {
                ner.input_ids: x,
                ner.output_types: y,
                ner.is_training: False,
            }
            step, predicts_d, cost, accuracy = sess.run(
                [global_step, ner.viterbi_sequence, ner.loss, ner.acc],
                feed_dict)

            acc_d, count, total = evaluate(np.array(predicts_d), y)
            time_str = str(int(time.time()))
            print("+dev+{}: step {}, loss {}, f_acc {}, t_acc {}".format(
                time_str, step, cost, accuracy, acc_d))
            return cost, accuracy, count, total

        best_accuracy, best_at_step = 0, 0
        train_example_len = 173109
        dev_example_len = 21639
        num_train_steps = int(train_example_len / FLAGS.batch_size *
                              FLAGS.num_epochs)
        num_dev_steps = int(dev_example_len / FLAGS.batch_size)
        max_acc = 0.0
        input_ids_train, output_types_train = get_input_data(
            "./train_data/train_ner.tf_record", FLAGS.batch_size)
        input_ids_dev, output_types_dev = get_input_data(
            "./train_data/dev_ner.tf_record", FLAGS.batch_size)
        for i in range(num_train_steps):
            # fetch a batch of data
            input_ids_train_, output_types_train_ = sess.run(
                [input_ids_train, output_types_train])
            step = train_step(input_ids_train_, output_types_train_)
            if step % FLAGS.evaluate_every == 0:
                # the dev set is large, so it is also evaluated in batches
                total_dev_correct = 0
                total_devs = 0
                for j in range(num_dev_steps):
                    input_ids_dev_, output_types_dev_ = sess.run(
                        [input_ids_dev, output_types_dev])
                    loss, acc, count, total = dev_step(input_ids_dev_,
                                                       output_types_dev_)
                    total_dev_correct += count
                    total_devs += total
                dev_accuracy = float(total_dev_correct) / total_devs
                print("预测:", total_dev_correct)
                print("长度为:", total_devs)
                print("最后预测结果:", dev_accuracy)
                if dev_accuracy > max_acc:
                    print("save model:\t%f\t>%f" % (dev_accuracy, max_acc))
                    max_acc = dev_accuracy
                    saver.save(sess, './ckpt/ner.ckpt', global_step=step)

        sess.close()
Example #20
def load_model(model, epoch):
    """Load parameters from checkpoint"""
    ckpt_path='./output/{}/{}/models/model_epo{}.pkl'.format(args.model, args.expname, epoch)
    print(f'Loading model parameters from {ckpt_path}')
    model.load_state_dict(torch.load(ckpt_path))

config = getattr(configs, 'config_'+args.model)()

###############################################################################
# Load data
###############################################################################
train_set=APIDataset(args.data_path+'train.desc.shuf.h5', args.data_path+'train.apiseq.shuf.h5', config['maxlen'])
valid_set=APIDataset(args.data_path+'test.desc.shuf.h5', args.data_path+'test.apiseq.shuf.h5', config['maxlen'])

vocab_api = load_dict(args.data_path+'vocab.apiseq.pkl')
vocab_desc = load_dict(args.data_path+'vocab.desc.pkl')
n_tokens = len(vocab_api)

metrics=Metrics()

print("Loaded data!")

###############################################################################
# Define the models
###############################################################################

model = getattr(models, args.model)(config, n_tokens)  # assumes a 'models' module holds the model classes ('model' was self-shadowing in the snippet)
if args.reload_from>=0:
    load_model(model, args.reload_from)
    
Example #21
def main(data_path):
    dataset_en, unknown_en = data.load_dataset(data_path + "/dataset_en.pkl")
    dataset_vi, unknown_vi = data.load_dataset(data_path + "/dataset_vi.pkl")
    dict_en, dict_vi = data.load_dict(data_path)
    print(dataset_en[1])
Example #22
def main(args):
    worddicts = load_dict(args.path + '/data/dictionary.txt')
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    train, train_uid_list = dataIterator(args.path + '/data/offline-train.pkl',
                                         args.path + '/data/train_caption.txt',
                                         worddicts,
                                         batch_size=args.batch_size,
                                         batch_Imagesize=400000,
                                         maxlen=100,
                                         maxImagesize=400000)

    valid, valid_uid_list = dataIterator(args.path + '/data/offline-test.pkl',
                                         args.path + '/data/test_caption.txt',
                                         worddicts,
                                         batch_size=args.batch_size,
                                         batch_Imagesize=400000,
                                         maxlen=100,
                                         maxImagesize=400000)

    print('train length is ', len(train))

    x = tf.placeholder(tf.float32, shape=[None, None, None, 1])

    y = tf.placeholder(tf.int32, shape=[None, None])

    x_mask = tf.placeholder(tf.float32, shape=[None, None, None])

    y_mask = tf.placeholder(tf.float32, shape=[None, None])

    lr = tf.placeholder(tf.float32, shape=())

    if_trainning = tf.placeholder(tf.bool, shape=())

    watcher_train = Watcher_train(blocks=3,
                                  level=16,
                                  growth_rate=24,
                                  training=if_trainning)

    annotation, anno_mask = watcher_train.dense_net(x, x_mask)

    # for initializing validation
    anno = tf.placeholder(tf.float32,
                          shape=[
                              None,
                              annotation.shape.as_list()[1],
                              annotation.shape.as_list()[2],
                              annotation.shape.as_list()[3]
                          ])
    infer_y = tf.placeholder(tf.int64, shape=(None, ))
    h_pre = tf.placeholder(tf.float32, shape=[None, 256])
    alpha_past = tf.placeholder(tf.float32,
                                shape=[
                                    None,
                                    annotation.shape.as_list()[1],
                                    annotation.shape.as_list()[2]
                                ])

    attender = Attender(annotation.shape.as_list()[3], 256, 512)

    parser = Parser(256, 256, attender, annotation.shape.as_list()[3])

    w = WAP(watcher_train, attender, parser, 256, 256,
            annotation.shape.as_list()[3], 111, if_trainning)

    hidden_state_0 = tf.tanh(
        tf.tensordot(tf.reduce_mean(anno, axis=[1, 2]), w.Wa2h, axes=1) +
        w.ba2h)  # [batch, hidden_dim]

    cost = w.get_cost(annotation, y, anno_mask, y_mask)

    vs = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    for vv in vs:
        if not vv.name.startswith('batch_normalization'):
            cost += 1e-4 * tf.reduce_sum(tf.pow(vv, 2))

    # The predicted-word output is named w_pred so that it does not shadow the
    # WAP instance w, which is still used below via w.get_sample(...).
    p, w_pred, h, alpha = w.get_word(infer_y, h_pre, alpha_past, anno)

    optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        trainer = optimizer.minimize(cost)

    max_epoch = 200

    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True

    init = tf.global_variables_initializer()

    uidx = 0
    cost_s = 0
    dispFreq = 100
    saveFreq = len(train)
    sampleFreq = len(train)
    validFreq = len(train)
    history_errs = []
    estop = False
    halfLrFlag = 0
    patience = 15
    lrate = 1.0
    log = open(args.path + '/log-bs-6.txt', 'w')

    with tf.Session(config=config) as sess:
        sess.run(init)
        for epoch in range(max_epoch):
            n_samples = 0
            random.shuffle(train)
            for batch_x, batch_y in train:
                batch_x, batch_x_m, batch_y, batch_y_m = prepare_data(
                    batch_x, batch_y)
                n_samples += len(batch_x)
                uidx += 1

                cost_i, _ = sess.run(
                    [cost, trainer],
                    feed_dict={
                        x: batch_x,
                        y: batch_y,
                        x_mask: batch_x_m,
                        y_mask: batch_y_m,
                        if_trainning: True,
                        lr: lrate
                    })

                cost_s += cost_i

                if np.isnan(cost_i) or np.isinf(cost_i):
                    print('invalid cost value detected')
                    sys.exit(0)

                if np.mod(uidx, dispFreq) == 0:
                    cost_s /= dispFreq
                    print('Epoch ', epoch, 'Update ', uidx, 'Cost ', cost_s,
                          'Lr ', lrate)
                    log.write('Epoch ' + str(epoch) + ' Update ' + str(uidx) +
                              ' Cost ' + str(cost_s) + ' Lr ' + str(lrate) +
                              '\n')
                    log.flush()
                    cost_s = 0

                if np.mod(uidx, sampleFreq) == 0:
                    fpp_sample = open(
                        args.path + '/result/valid_decode_result-bs-6.txt',
                        'w')
                    valid_count_idx = 0
                    for batch_x, batch_y in valid:
                        for xx in batch_x:
                            xx = np.moveaxis(xx, 0, -1)
                            xx_pad = np.zeros(
                                (xx.shape[0], xx.shape[1], xx.shape[2]),
                                dtype='float32')
                            xx_pad[:, :, :] = xx / 255.
                            xx_pad = xx_pad[None, :, :, :]
                            annot = sess.run(annotation,
                                             feed_dict={
                                                 x: xx_pad,
                                                 if_trainning: False
                                             })
                            h_state = sess.run(hidden_state_0,
                                               feed_dict={anno: annot})
                            sample, score = w.get_sample(p,
                                                         w_pred,
                                                         h,
                                                         alpha,
                                                         annot,
                                                         h_state,
                                                         10,
                                                         100,
                                                         False,
                                                         sess,
                                                         training=False)
                            score = score / np.array([len(s) for s in sample])
                            ss = sample[score.argmin()]
                            fpp_sample.write(valid_uid_list[valid_count_idx])
                            valid_count_idx = valid_count_idx + 1
                            if np.mod(valid_count_idx, 100) == 0:
                                print('gen %d samples' % valid_count_idx)
                                log.write('gen %d samples' % valid_count_idx +
                                          '\n')
                                log.flush()
                            for vv in ss:
                                if vv == 0:  # <eol>
                                    break
                                fpp_sample.write(' ' + worddicts_r[vv])
                            fpp_sample.write('\n')
                    fpp_sample.close()
                    print('valid set decode done')
                    log.write('valid set decode done\n')
                    log.flush()

                if np.mod(uidx, validFreq) == 0:
                    probs = []
                    for batch_x, batch_y in valid:
                        batch_x, batch_x_m, batch_y, batch_y_m = prepare_data(
                            batch_x, batch_y)
                        pprobs, annot = sess.run(
                            [cost, annotation],
                            feed_dict={
                                x: batch_x,
                                y: batch_y,
                                x_mask: batch_x_m,
                                y_mask: batch_y_m,
                                if_trainning: False
                            })
                        probs.append(pprobs)
                    valid_errs = np.array(probs)
                    valid_err_cost = valid_errs.mean()
                    os.system('python3.4 compute-wer.py ' + args.path +
                              '/result/valid_decode_result-bs-6.txt' + ' ' +
                              args.path + '/data/test_caption.txt' + ' ' +
                              args.path + '/result/valid-bs-6.wer')
                    fpp = open(args.path + '/result/valid-bs-6.wer')
                    stuff = fpp.readlines()
                    fpp.close()
                    m = re.search('WER (.*)\n', stuff[0])
                    valid_per = 100. * float(m.group(1))
                    m = re.search('ExpRate (.*)\n', stuff[1])
                    valid_sacc = 100. * float(m.group(1))
                    valid_err = valid_per

                    history_errs.append(valid_err)

                    if uidx / validFreq == 0 or valid_err <= np.array(
                            history_errs).min():
                        bad_counter = 0

                    if uidx / validFreq != 0 and valid_err > np.array(
                            history_errs).min():
                        bad_counter += 1
                        if bad_counter > patience:
                            if halfLrFlag == 2:
                                print('Early Stop!')
                                log.write('Early Stop!\n')
                                log.flush()
                                estop = True
                                break
                            else:
                                print('Lr decay and retrain!')
                                log.write('Lr decay and retrain!\n')
                                log.flush()
                                bad_counter = 0
                                lrate = lrate / 10
                                halfLrFlag += 1

                    print('Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' %
                          (valid_per, valid_sacc, valid_err_cost))
                    log.write('Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' %
                              (valid_per, valid_sacc, valid_err_cost) + '\n')
                    log.flush()
            if estop:
                break
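
The validation branch above implements patience-based early stopping with staged learning-rate decay: once the number of consecutive non-improving validations exceeds patience, the learning rate is divided by 10 and the counter is reset; after that decay has already happened twice (halfLrFlag == 2), training stops. A compact sketch of the same policy, pulled out of the session code (the function name and return convention here are illustrative, not from the project):

def early_stopping_step(valid_err, history_errs, bad_counter, lrate, half_lr_flag,
                        patience=15, max_decays=2):
    """Update the early-stopping state after one validation.
    Returns (bad_counter, lrate, half_lr_flag, stop)."""
    history_errs.append(valid_err)
    if valid_err <= min(history_errs):      # best (or tied-best) result: reset patience
        return 0, lrate, half_lr_flag, False
    bad_counter += 1
    if bad_counter <= patience:             # still within the patience budget
        return bad_counter, lrate, half_lr_flag, False
    if half_lr_flag >= max_decays:          # learning rate already decayed twice
        return bad_counter, lrate, half_lr_flag, True
    return 0, lrate / 10.0, half_lr_flag + 1, False  # decay LR, reset patience, keep training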
예제 #23
0
        all_preds.append(preds)
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device('gpu')

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(datafiles=('./data/train.txt',
                                                        './data/dev.txt',
                                                        './data/test.txt'))

    label_vocab = load_dict('./data/tag.dic')
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_to_features,
                         tokenizer=tokenizer,
                         label_vocab=label_vocab)

    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    ignore_label = -1
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
예제 #24
0
        all_lens.append(lens)
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(
        datafiles=(os.path.join(args.data_dir, 'train.txt'),
                   os.path.join(args.data_dir, 'dev.txt'),
                   os.path.join(args.data_dir, 'test.txt')))

    label_vocab = load_dict(os.path.join(args.data_dir, 'tag.dic'))
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_to_features,
                         tokenizer=tokenizer,
                         label_vocab=label_vocab)

    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'
            ),  # token_type_ids
예제 #25
0
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_ds = load_dataset(datafiles=('./data/train.json'))
    tags_to_idx = load_dict("./data/tags.txt")
    labels_to_idx = load_dict("./data/classifier_labels.txt")
    tokenizer = ErnieCtmTokenizer.from_pretrained(args.model_dir)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx,
                         labels_to_idx=labels_to_idx)
    train_ds.map(trans_func)

    ignore_label = tags_to_idx["O"]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=ignore_label, dtype='int64'),  # tags
        Stack(dtype='int64'),  # cls_label
    ): fn(samples)
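
    # Note on batchify_fn above: input_ids and token_type_ids are padded with the
    # tokenizer's pad ids, the scalar seq_len and cls_label fields are stacked, and
    # the tag sequence is padded with ignore_label (the "O" tag id), which the model
    # excludes from the tagging loss via ignore_index.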

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=False, drop_last=True)
    train_data_loader = DataLoader(train_ds,
                                   batch_sampler=train_batch_sampler,
                                   num_workers=0,
                                   collate_fn=batchify_fn,
                                   return_list=True)

    model = ErnieCtmWordtagModel.from_pretrained(
        args.model_dir,
        num_cls_label=len(labels_to_idx),
        num_tag=len(tags_to_idx),
        ignore_index=tags_to_idx["O"])

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    cls_acc = paddle.metric.Accuracy()
    seq_acc = SequenceAccuracy()
    total_loss = 0

    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags, cls_label = batch

            outputs = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags,
                            cls_label=cls_label)
            loss, seq_logits, cls_logits = outputs[0], outputs[1], outputs[2]
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            cls_correct = cls_acc.compute(pred=cls_logits.reshape(
                [-1, len(labels_to_idx)]),
                                          label=cls_label.reshape([-1]))
            cls_acc.update(cls_correct)
            seq_correct = seq_acc.compute(pred=seq_logits.reshape(
                [-1, len(tags_to_idx)]),
                                          label=tags.reshape([-1]),
                                          ignore_index=tags_to_idx["O"])
            seq_acc.update(seq_correct)

            if global_step % args.logging_steps == 0 and global_step != 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "[Training]["
                    "epoch: %s/%s][step: %s/%s] loss: %6f, Classification Accuracy: %6f, Sequence Labeling Accuracy: %6f, speed: %6f"
                    % (epoch, args.num_train_epochs, global_step,
                       num_training_steps, total_loss / args.logging_steps,
                       cls_acc.accumulate(), seq_acc.accumulate(), speed))
                start_time = time.time()
                cls_acc.reset()
                seq_acc.reset()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps
                ) and paddle.distributed.get_rank() == 0:
                output_dir = os.path.join(
                    args.output_dir,
                    "ernie_ctm_ft_model_%d.pdparams" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)