# Imports assumed by the variants below: os/sys/logging and transformers are
# standard; TaskDefs, EncoderModelType, MODEL_CLASSES and the load_*/build_*/
# prepare_* helpers come from the surrounding project (module paths not shown).
import os
import sys
import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model, cache_dir=args.transformer_cache)

    # Mirror the data layout under a per-model output directory.
    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
                workers=args.workers,
            )
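# Example invocation for the variant above (script name and paths are
# hypothetical; the flags mirror the attributes read from `args`):
#
#   python prepro_std.py --root_dir data/canonical_data \
#       --model bert-base-uncased \
#       --task_def glue_task_def.yml \
#       --transformer_cache .cache --workers 8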
def main(args):
    # hyper param
    do_lower_case = args.do_lower_case
    root = "dl/" + args.root_dir
    assert os.path.exists(root)

    # Derive the encoder type (e.g. BERT, ROBERTA) from the model name prefix.
    literal_model_type = args.model.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()

    mt_dnn_suffix = literal_model_type
    if 'base' in args.model:
        mt_dnn_suffix += "_base"
    elif 'large' in args.model:
        mt_dnn_suffix += "_large"

    config_class, model_class, tokenizer_class = MODEL_CLASSES[literal_model_type]
    tokenizer = tokenizer_class.from_pretrained("dl/mt-dnn-models/vocab.txt",
                                                do_lower_case=do_lower_case)

    # Encode the casing/lowercasing choices into the output directory name.
    if 'uncased' in args.model:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)
    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        # This variant only preprocesses the test split.
        for split_name in ['test']:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            logger.info(file_path)
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       encoderModelType=encoder_model,
                       lab_dict=task_def.label_vocab)
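# A minimal sketch of the MODEL_CLASSES table the variant above indexes into;
# the real mapping lives elsewhere in the project, and the entries below are
# illustrative (the classes themselves are real transformers exports):
from transformers import (
    BertConfig, BertModel, BertTokenizer,
    RobertaConfig, RobertaModel, RobertaTokenizer,
)

MODEL_CLASSES = {
    "bert": (BertConfig, BertModel, BertTokenizer),
    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
}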
def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    # The 'tuna' mirror speeds up Hugging Face downloads from mainland China.
    tokenizer = AutoTokenizer.from_pretrained(args.model, mirror='tuna')

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            # CLUE ships one JSON file per split inside a per-task directory;
            # the other benchmarks use flat "<task>_<split>.tsv" files.
            if args.task_type == "clue":
                file_path = os.path.join(root, task, f"{split_name}.json")
            else:
                file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)

            # Dispatch to the benchmark-specific loader.
            if args.task_type == "glue":
                rows = load_data(file_path, task_def)
            elif args.task_type == "clue":
                rows = load_clue_data(file_path, task_def)
            elif args.task_type == "qianyan":
                rows = load_qianyan_data(file_path, task_def)
            else:
                raise ValueError(f"{args.task_type} not implemented")

            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       lab_dict=task_def.label_vocab)
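# Design note: the if/elif loader chain above can be collapsed into a dispatch
# table, which keeps the unsupported-benchmark error in one place. A minimal
# sketch, assuming the three loaders from the variant above:
def load_rows(task_type, file_path, task_def):
    loaders = {
        "glue": load_data,
        "clue": load_clue_data,
        "qianyan": load_qianyan_data,
    }
    if task_type not in loaders:
        raise ValueError(f"{task_type} not implemented")
    return loaders[task_type](file_path, task_def)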
def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    # Derive the encoder type from the model name, ignoring any hub namespace.
    suffix = args.model.split("/")[-1]
    literal_model_type = suffix.split("-")[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()

    mt_dnn_suffix = literal_model_type
    if "base" in args.model:
        mt_dnn_suffix += "_base"
    elif "large" in args.model:
        mt_dnn_suffix += "_large"

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        cache_dir=args.cache_dir,
        use_fast=True,
        from_slow=True,
        revision=args.model_revision,
    )
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if "uncased" in args.model:
        mt_dnn_suffix = "{}_uncased".format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = "{}_cased".format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.json" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s does not exist" % file_path)
                sys.exit(1)
            logger.info("processing %s" % file_path)

            # Flatten the SQuAD-style nested JSON into one row per QA pair.
            is_training = "train" in split_name
            rows = flat_squad(file_path, is_training)
            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            if is_training:
                prepare_train_feature(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
            else:
                prepare_validation_features(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
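# A minimal argparse sketch for driving the SQuAD-style variant above. The
# flag names follow the attributes read from `args`; every default value is an
# assumption for illustration, not taken from the original scripts.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Preprocess QA data for MT-DNN-style training.")
    parser.add_argument("--root_dir", type=str, default="data/canonical_data")
    parser.add_argument("--model", type=str, default="bert-base-uncased")
    parser.add_argument("--task_def", type=str, default="task_def.yml")
    parser.add_argument("--cache_dir", type=str, default=None)
    parser.add_argument("--model_revision", type=str, default="main")
    parser.add_argument("--max_seq_length", type=int, default=384)
    parser.add_argument("--doc_stride", type=int, default=128)
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())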