Example #1
 def __init__(self, args):
     # Run on GPU only when CUDA is available and not explicitly disabled.
     args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
     args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     self.args = args
     # Reload the training arguments saved next to the pretrained model and
     # override the fields that depend on the current run.
     margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
     margs.no_cuda = args.no_cuda
     margs.model_name_or_path = args.model_name_or_path
     margs.overwrite_output_dir = True
     # Build the language model from the restored arguments.
     self.lm = TransLanguageModel(margs)
     self.lm.model_init()
     # lm.load_model(args.model_name_or_path)
     Data2tensor.set_randseed(args.seed)
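The device selection above is plain torch usage; a minimal standalone sketch of the same logic, keeping the snippet's no_cuda flag (the helper name select_device is only illustrative, not from the source):

import torch

def select_device(no_cuda: bool = False):
    # Prefer CUDA unless it is unavailable or explicitly disabled.
    device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    n_gpu = 0 if no_cuda else torch.cuda.device_count()
    return device, n_gpu

device, n_gpu = select_device()
print(device, n_gpu)  # e.g. "cpu 0" on a machine without CUDA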
Example #2
 def __init__(self, args):
     args.device = torch.device("cuda" if torch.cuda.is_available()
                                and not args.no_cuda else "cpu")
     args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     margs = torch.load(
         os.path.join(args.model_name_or_path, "training_args.bin"))
     margs.no_cuda = args.no_cuda
     margs.label_file = args.label_file
     margs.model_name_or_path = args.model_name_or_path
     # Build the sequence labeller and initialise it from the pretrained checkpoint.
     self.tagger = TransLabelerModel(margs)
     self.tagger.model_init(args.model_name_or_path)
     Data2tensor.set_randseed(args.seed)
Example #3
 def __init__(self, args):
     args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
     args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
     self.args = args
     margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
     margs.no_cuda = args.no_cuda
     margs.model_name_or_path = args.model_name_or_path
     margs.overwrite_output_dir = True
     self.lm = TransSeq2SeqModel(margs)
     self.lm.model_init(args.model_name_or_path)
     # lm.load_model(args.model_name_or_path)
     Data2tensor.set_randseed(args.seed)
     # Look up the decoder start/end-of-sequence ids in the target-side vocabulary.
     self.bos_token_id = self.lm.tokenizer.tw2i[SOT]
     # self.pad_token_id = self.lm.pad_id
     self.eos_token_id = self.lm.tokenizer.tw2i[EOT]
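Examples #1-#3 follow the same initialisation pattern: reload the training_args.bin pickled at training time, override the run-specific fields, and hand the merged arguments to the model wrapper. A minimal sketch of that reload-and-override step, assuming the checkpoint directory holds a training_args.bin written with torch.save (the helper name reload_training_args is illustrative only):

import os
import torch

def reload_training_args(model_dir, **overrides):
    # training_args.bin holds the argument namespace pickled during training.
    # On newer torch versions you may need torch.load(..., weights_only=False)
    # to unpickle a plain namespace object.
    margs = torch.load(os.path.join(model_dir, "training_args.bin"))
    # Override run-specific fields (device flags, paths, ...) before reuse.
    for key, value in overrides.items():
        setattr(margs, key, value)
    return margs

# e.g. margs = reload_training_args("path/to/checkpoint", no_cuda=True,
#                                   model_name_or_path="path/to/checkpoint")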
Example #4
    @staticmethod
    def decode_batch(pad_ids, i2t, level=2):
        return Tokenizer.idx2text(pad_ids=pad_ids, i2t=i2t, level=level)


if __name__ == '__main__':
    import torch
    from mlmodels.utils.idx2tensor import Data2tensor, seqPAD
    from mlmodels.utils.dataset import IterDataset, collate_fn, tokens2ids
    from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler, TensorDataset
    from mlmodels.utils.BPEtonkenizer import BPE
    from mlmodels.utils.special_tokens import BPAD, PAD, NULL, sys_tokens  # sys_tokens is used below; assumed to be defined in special_tokens
    from mlmodels.utils.txtIO import TXT

    Data2tensor.set_randseed(12345)
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "../../data/reviews/processed_csv/train_res4.csv"
    label_file = "../../data/reviews/processed_csv/labels.txt"
    labels_list = TXT.read(label_file, firstline=False)
    lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
    id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
    lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target,
                               vocab_words=lb2id_dict,
                               unk_words=False,
                               sos=False,
                               eos=False)
    tokenize_type = "bpe"
    if tokenize_type != "bpe":
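decode_batch above only forwards to Tokenizer.idx2text, i.e. it maps padded id sequences back to tokens through a reversed vocabulary such as the one built with Tokenizer.reversed_dict. A standalone illustration of that kind of mapping with plain dicts; the vocabulary and padding id are made up here, and the real idx2text may additionally handle nesting levels and special tokens:

i2t = {0: "<pad>", 1: "great", 2: "food", 3: "slow", 4: "service"}
PAD_ID = 0  # assumed padding id for this illustration

def idx2text(pad_ids, i2t, pad_id=PAD_ID):
    # Drop padding positions and map every remaining id back to its token.
    return [[i2t[i] for i in seq if i != pad_id] for seq in pad_ids]

print(idx2text([[1, 2, 0, 0], [3, 4, 2, 0]], i2t))
# [['great', 'food'], ['slow', 'service', 'food']]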
Example #5
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    Data2tensor.set_randseed(args.seed, args.n_gpu)

    # Prepare CONLL-2003 task
    labels = TextDataset.get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
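Example #5 ends with the common HuggingFace-style registry lookup that maps a model-type string to its (config, model, tokenizer) classes. A minimal sketch with a single concrete entry, assuming the transformers library and BERT token classification as the example architecture (the original MODEL_CLASSES likely registers several model types):

from transformers import BertConfig, BertForTokenClassification, BertTokenizer

MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
}

config_class, model_class, tokenizer_class = MODEL_CLASSES["bert"]
# 9 labels matches the BIO tag set of the CoNLL-2003 task mentioned in the snippet.
config = config_class.from_pretrained("bert-base-cased", num_labels=9)
tokenizer = tokenizer_class.from_pretrained("bert-base-cased")
model = model_class.from_pretrained("bert-base-cased", config=config)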