Example #1
    if args.tokenize_type != "bpe":  # guard assumed here, restored from the companion build_data() example below
        tokenizer = Tokenizer(s_paras, t_paras)
        tokenizer.build(data)
        nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                                   unk_words=True, sos=False, eos=False)
        tokenizer.tw2i = lb2id_dict
        tokenizer.i2tw = id2lb_dict
        tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                                   unk_words=False, sos=False, eos=False)
        pad_id = tokenizer.sw2i.get(PAD, 0)
        sw_size = len(tokenizer.sw2i)
        tw_size = len(tokenizer.tw2i)
        collate_fn = Tokenizer.collate_fn(pad_id, True)
    else:
        vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
        tokenizer = BPE.load(vocab_file)
        tokenizer.add_tokens(sys_tokens)
        nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
        tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)

        bpad_id = tokenizer.token_to_id(BPAD)
        pad_id = bpad_id if bpad_id is not None else tokenizer.token_to_id(PAD)
        sw_size = tokenizer.get_vocab_size()
        tw_size = tokenizer.get_vocab_size()
        collate_fn = BPE.collate_fn(pad_id, True)

    train_data, num_lines = Tokenizer.prepare_iter(filename, firstline=False, task=1)
    train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
    train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

    for i, batch in enumerate(train_dataloader):
        # inputs, outputs = batch[0], batch[1]
        nl_tensor, lb_tensor = batch
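
For orientation, a minimal sketch of how the loop above typically continues in a training step; device, model, and criterion are placeholders that do not appear in the original snippet.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for nl_tensor, lb_tensor in train_dataloader:
    # the collate_fn above was built with pad_id, so batches arrive padded to their longest sequence
    nl_tensor = nl_tensor.to(device)
    lb_tensor = lb_tensor.to(device)
    # hypothetical model and loss, defined elsewhere in the training script:
    # logits = model(nl_tensor)
    # loss = criterion(logits.view(-1, tw_size), lb_tensor.view(-1))
    # loss.backward()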
Example #2
    def build_data(args):
        if not args.tl:
            if not os.path.exists(args.model_dir):
                os.mkdir(args.model_dir)
            if args.timestamped_subdir:
                sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
            else:
                sub_folder = ''
            if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
                os.mkdir(os.path.join(args.model_dir, sub_folder))
            args.model_dir = os.path.join(args.model_dir, sub_folder)
            args.log_file = os.path.join(args.model_dir, args.log_file)
            if args.tokenize_type != "bpe":
                s_paras = [args.wl_th, args.wcutoff]
                t_paras = [args.wl_th, args.wcutoff]
                print("INFO: - Build vocabulary...")

                tokenizer = Tokenizer(s_paras, t_paras)
                files = [args.train_file]
                if args.train_file != args.dev_file:
                    files.append(args.dev_file)
                # Load datasets to build vocabulary
                data = Tokenizer.load_file(files, task=2)
                tokenizer.build(datasets=data)
                sw2i = tokenizer.sw2i
                tw2i = tokenizer.tw2i
                print("INFO: - Save vocabulary...")
                Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
            else:
                print("INFO: - Load vocabulary...")
                tokenizer = BPE.load(args.vocab_file)
                tokenizer.add_tokens(sys_tokens)
                sw2i = tokenizer.get_vocab()
                tw2i = tokenizer.get_vocab()

            # args.tokenizer = tokenizer
            # Source language
            args.swd_pretrained = None
            args.twd_pretrained = None
            if len(args.swd_embfile) != 0:
                scale = np.sqrt(3.0 / args.swd_dim)
                emb_reader = Embeddings(args.swd_embfile)
                args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
                if args.twd_embfile == args.swd_embfile:
                    scale = np.sqrt(3.0 / args.twd_dim)
                    args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

            # Target language
            if len(args.twd_embfile) != 0:
                scale = np.sqrt(3.0 / args.twd_dim)
                if args.twd_pretrained is None:
                    emb_reader = Embeddings(args.twd_embfile)  # target embeddings are read from their own file
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

            # transfer learning can be integrated directly when no new words need updating
            SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
            return args
        else:
            print("INFO: - Use transfer learning technique")
            assert os.path.exists(args.tlargs), "\t - There is no pre-trained argument file"
            # load pre-trained argument file from a previous training folder
            margs = SaveloadHP.load(args.tlargs)
            # margs.tl = args.tl
            # margs.log_file = args.log_file

            # TODO update new vocab and all other new arguments used for new training
            # 0. Read vocab
            # 1. Update schema
            # 2. Update vocab
            # args.tokenizer = margs.tokenizer
            # 3. Use all model file directory of previous train
            args.model_dir = margs.model_dir
            args.seq2seq_file = margs.seq2seq_file
            # 4. Keep the remaining current arguments
            # add a constraint at load time: if any model fails to load, just skip it
            args.swd_pretrained = margs.swd_pretrained
            args.twd_pretrained = margs.twd_pretrained
            return args
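
As a usage note, a minimal sketch of reloading the argument file saved by build_data(), mirroring the SaveloadHP.load call in the transfer-learning branch; here args stands in for the namespace returned above.

import os

# assumption: args is the namespace returned by build_data() above
margs = SaveloadHP.load(os.path.join(args.model_dir, args.model_args))
# the pretrained embedding matrices built above travel with the saved arguments
swd_pretrained = margs.swd_pretrained
twd_pretrained = margs.twd_pretrained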
Example #3
    if args.tokenize_type != "bpe":  # assumed guard, mirroring the other examples; the else branch falls back to BPE
        data = Tokenizer.load_file([filename], task=2)
        s_paras = [-1, 1]
        t_paras = [-1, 1]
        vocab = Tokenizer(s_paras, t_paras)
        vocab.build(data)
        nl2ids = Tokenizer.lst2idx(tokenizer=vocab.process_nl, vocab_words=vocab.sw2i,
                                   unk_words=True, eos=True)
        tg2ids = Tokenizer.lst2idx(tokenizer=vocab.process_target, vocab_words=vocab.tw2i,
                                   unk_words=False, sos=True, eos=True)
        pad_id = vocab.sw2i.get(PAD, 0)
        sw_size = len(vocab.sw2i)
        tw_size = len(vocab.tw2i)
    else:
        vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
        vocab = BPE.load(vocab_file)
        vocab.add_tokens([SOT, EOT, NULL])
        nl2ids = BPE.tokens2ids(vocab)
        tg2ids = BPE.tokens2ids(vocab)

        bpad_id = vocab.token_to_id(BPAD)
        pad_id = bpad_id if bpad_id is not None else 0
        sw_size = vocab.get_vocab_size()
        tw_size = vocab.get_vocab_size()

    collate_fn = BPE.collate_fn(pad_id, True)
    # load the dataset and map tokens into indices
    train_data = JSON.get_iterator(filename)
    num_lines = JSON._len(filename)
    # train_data = CSV.get_iterator(filename, firstline=True)
    # num_lines = CSV._len(filename)
    train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
    train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)
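
The commented lines above show the CSV alternative; a minimal sketch of toggling between the two readers, assuming both expose the same get_iterator/_len interface used in this example.

use_csv = False  # illustrative toggle, not part of the original code

if use_csv:
    train_data = CSV.get_iterator(filename, firstline=True)
    num_lines = CSV._len(filename)
else:
    train_data = JSON.get_iterator(filename)
    num_lines = JSON._len(filename)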