Example #1
import logging
import os
import sys

from transformers import AutoTokenizer

# TaskDefs, load_data, and build_data are project-local helpers from the
# MT-DNN preprocessing code; their exact import paths depend on the repo layout.

logger = logging.getLogger(__name__)  # assumed module-level logger


def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model,
                                              cache_dir=args.transformer_cache)

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
                workers=args.workers,
            )
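
As a usage sketch, a minimal command-line entry point for this example could look like the block below. The flag names mirror the args.* attributes that main() reads; the defaults are illustrative assumptions rather than values from the original script.

import argparse

# Hypothetical driver: flag names match the attributes main() reads above;
# the default values are illustrative assumptions only.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--root_dir", type=str, default="data/canonical_data")
    parser.add_argument("--model", type=str, default="bert-base-uncased")
    parser.add_argument("--transformer_cache", type=str, default=".cache")
    parser.add_argument("--task_def", type=str, default="task_def.yml")
    parser.add_argument("--workers", type=int, default=1)
    main(parser.parse_args())
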
Example #2
import logging
import os
import sys

# EncoderModelType, MODEL_CLASSES, TaskDefs, load_data, and build_data are
# project-local names from the MT-DNN preprocessing code.

logger = logging.getLogger(__name__)  # assumed module-level logger


def main(args):
    # hyper param
    do_lower_case = args.do_lower_case
    root = "dl/" + args.root_dir
    assert os.path.exists(root)

    literal_model_type = args.model.split('-')[0].upper()

    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in args.model:
        mt_dnn_suffix += "_base"
    elif 'large' in args.model:
        mt_dnn_suffix += "_large"

    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained("dl/mt-dnn-models/vocab.txt",
                                                do_lower_case=do_lower_case)

    if 'uncased' in args.model:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)

    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in ['test']:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            print(file_path)
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       encoderModelType=encoder_model,
                       lab_dict=task_def.label_vocab)
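
To make the directory-naming branches in this variant concrete, the snippet below re-implements the suffix logic as a standalone function with two worked inputs; the function name is hypothetical and exists purely for illustration.

def mt_dnn_suffix_for(model: str, do_lower_case: bool) -> str:
    # Illustration only: mirrors the suffix construction in main() above.
    suffix = model.split('-')[0].lower()
    if 'base' in model:
        suffix += '_base'
    elif 'large' in model:
        suffix += '_large'
    suffix += '_uncased' if 'uncased' in model else '_cased'
    if do_lower_case:
        suffix += '_lower'
    return suffix

assert mt_dnn_suffix_for('bert-base-uncased', True) == 'bert_base_uncased_lower'
assert mt_dnn_suffix_for('roberta-large', False) == 'roberta_large_cased'
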
Example #3
import logging
import os
import sys

from transformers import AutoTokenizer

# TaskDefs, load_data, load_clue_data, load_qianyan_data, and build_data are
# project-local helpers from the MT-DNN preprocessing code.

logger = logging.getLogger(__name__)  # assumed module-level logger


def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model, mirror='tuna')

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            if args.task_type == "clue":
                file_path = os.path.join(root, task, f"{split_name}.json")
            else:
                file_path = os.path.join(root,
                                         "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            if args.task_type == "glue":
                rows = load_data(file_path, task_def)
            elif args.task_type == "clue":
                rows = load_clue_data(file_path, task_def)
            elif args.task_type == "qianyan":
                rows = load_qianyan_data(file_path, task_def)
            else:
                raise ValueError(f"{args.task_type} not implemented")
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       lab_dict=task_def.label_vocab)
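
A hedged driver sketch for this variant: the task_type choices are taken directly from the dispatch branches in main(), while the other defaults are assumptions.

import argparse

# Hypothetical driver for this example; choices follow the dispatch in main(),
# the default values are illustrative assumptions only.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--root_dir", type=str, default="data")
    parser.add_argument("--model", type=str, default="bert-base-chinese")
    parser.add_argument("--task_def", type=str, default="task_def.yml")
    parser.add_argument("--task_type", type=str, default="glue",
                        choices=["glue", "clue", "qianyan"])
    main(parser.parse_args())
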
Example #4
import logging
import os
import sys

from transformers import AutoTokenizer

# EncoderModelType, TaskDefs, flat_squad, prepare_train_feature, and
# prepare_validation_features are project-local names from the MT-DNN code.

logger = logging.getLogger(__name__)  # assumed module-level logger


def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)
    suffix = args.model.split("/")[-1]
    literal_model_type = suffix.split("-")[0].upper()

    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if "base" in args.model:
        mt_dnn_suffix += "_base"
    elif "large" in args.model:
        mt_dnn_suffix += "_large"

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        cache_dir=args.cache_dir,
        use_fast=True,
        from_slow=True,
        revision=args.model_revision,
    )
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if "uncased" in args.model:
        mt_dnn_suffix = "{}_uncased".format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = "{}_cased".format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            print(root)
            file_path = os.path.join(root, "%s_%s.json" % (task, split_name))
            print(file_path)

            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            logger.info("processing %s" % file_path)
            is_training = "train" in split_name

            rows = flat_squad(file_path, is_training)
            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            if is_training:
                prepare_train_feature(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
            else:
                prepare_validation_features(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
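
A sketch of a matching driver for this SQuAD-style variant; max_seq_length=384 and doc_stride=128 follow common reading-comprehension settings and, like the other defaults, are assumptions rather than the original script's values.

import argparse

# Hypothetical driver; the default values are illustrative assumptions only.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--root_dir", type=str, default="data/squad")
    parser.add_argument("--model", type=str, default="bert-base-uncased")
    parser.add_argument("--cache_dir", type=str, default=".cache")
    parser.add_argument("--model_revision", type=str, default="main")
    parser.add_argument("--task_def", type=str, default="task_def.yml")
    parser.add_argument("--max_seq_length", type=int, default=384)
    parser.add_argument("--doc_stride", type=int, default=128)
    main(parser.parse_args())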