def full_chunk_and_save(task, phase, examples, feat_spec, tokenizer, args: RunConfiguration):
    """Convert Examples to ListDataset, optionally truncate sequences if possible, and save to disk.

    Args:
        task: Task object
        phase (str): string identifying the data subset (e.g., train, val or test).
        examples (list[Example]): list of task Examples.
        feat_spec (FeaturizationSpec): Tokenization-related metadata.
        tokenizer: TODO (issue #1188)
        args (RunConfiguration): run configuration object.
    """
    dataset = preprocessing.convert_examples_to_dataset(
        task=task,
        examples=examples,
        feat_spec=feat_spec,
        tokenizer=tokenizer,
        phase=phase,
        verbose=True,
    )
    if args.smart_truncate:
        dataset, length = preprocessing.smart_truncate(
            dataset=dataset,
            max_seq_length=args.max_seq_length,
            verbose=True,
        )
        # Record the truncated length next to the cache chunks so downstream
        # consumers can recover the effective max sequence length.
        os.makedirs(os.path.join(args.output_dir, phase), exist_ok=True)
        py_io.write_json(
            data={"truncated_to": int(length)},
            path=os.path.join(args.output_dir, phase, "smart_truncate.json"),
        )
    shared_caching.chunk_and_save(
        data=dataset.data,
        chunk_size=args.chunk_size,
        data_args=args.to_dict(),
        output_dir=os.path.join(args.output_dir, phase),
    )


# NOTE(review): the other functions in this file invoke this helper as a bare
# `chunk_and_save(task=..., phase=..., ...)` (the keyword signature matches this
# function, not shared_caching.chunk_and_save). Unless that name is imported
# elsewhere in the file (not visible here), those calls would raise NameError.
# Provide a backward-compatible alias so both names resolve to this function.
chunk_and_save = full_chunk_and_save
def do_tokenize(phase: str):
    """Tokenize and cache one data split, then cache its evaluation labels.

    Relies on enclosing-scope names: ``task``, ``args``, ``feat_spec``,
    ``tokenizer`` and ``paths_dict`` (updated in place with the cache paths).

    Args:
        phase (str): data split to process (PHASE.TRAIN, PHASE.VAL or PHASE.TEST).

    Raises:
        ValueError: if ``phase`` is not one of the known splits.
    """
    evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
    output_dir = os.path.join(args.output_dir, f"{phase}")
    labels_output_dir = os.path.join(args.output_dir, f"{phase}_labels")
    if phase == PHASE.TRAIN:
        get_examples_func = task.get_train_examples
    elif phase == PHASE.VAL:
        get_examples_func = task.get_val_examples
    elif phase == PHASE.TEST:
        def get_examples_func():
            # Prefer the labeled "test" split when the task provides one; fall
            # back to the unlabeled test examples (test metrics then cannot be
            # evaluated properly, as the warning says).
            try:
                return task.get_examples('test')
            except NotImplementedError:
                logger.warning('The labels for "test" split is not retrieved, so, metrics for the "test" split will not be evaluated properly.')
                return task.get_test_examples()
    else:
        # Fail fast instead of hitting an opaque NameError on get_examples_func below.
        raise ValueError(f"Unknown phase: {phase}")
    # Fetch the examples exactly once: they are needed both for tokenization and
    # for label extraction, fetching may be expensive, and the "test" fallback
    # above would otherwise log its warning twice.
    examples = get_examples_func()
    chunk_and_save(  # HONOKA
        task=task,
        phase=phase,
        examples=examples,
        feat_spec=feat_spec,
        tokenizer=tokenizer,
        args=args,
    )
    paths_dict[phase] = output_dir
    shared_caching.chunk_and_save(
        data=evaluation_scheme.get_labels_from_cache_and_examples(
            task=task,
            cache=shared_caching.ChunkedFilesDataCache(output_dir),
            examples=examples,
        ),
        chunk_size=args.chunk_size,
        data_args=args.to_dict(),
        output_dir=labels_output_dir,
    )
    paths_dict[f"{phase}_labels"] = labels_output_dir
def main(args: RunConfiguration):
    """Tokenize and cache the requested data splits for a single task.

    Builds the task, featurization spec and tokenizer from ``args``, converts
    each requested phase's examples into chunked caches under
    ``args.output_dir`` (plus cached val labels), and finally records the
    cache locations in ``paths.json`` unless ``args.skip_write_output_paths``.
    """
    the_task = tasks.create_task_from_config_path(
        config_path=args.task_config_path, verbose=True)
    spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tok = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )
    # Accept either a comma-separated string or an already-split sequence.
    phase_list = args.phases.split(",") if isinstance(args.phases, str) else args.phases
    assert set(phase_list) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}
    output_paths = {}
    os.makedirs(args.output_dir, exist_ok=True)
    if PHASE.TRAIN in phase_list:
        chunk_and_save(
            task=the_task,
            phase=PHASE.TRAIN,
            examples=the_task.get_train_examples(),
            feat_spec=spec,
            tokenizer=tok,
            args=args,
        )
        output_paths["train"] = os.path.join(args.output_dir, PHASE.TRAIN)
    if PHASE.VAL in phase_list:
        examples_for_val = the_task.get_val_examples()
        chunk_and_save(
            task=the_task,
            phase=PHASE.VAL,
            examples=examples_for_val,
            feat_spec=spec,
            tokenizer=tok,
            args=args,
        )
        # Val labels are extracted from the freshly written cache plus the raw
        # examples, and cached separately for evaluation.
        scheme = evaluate.get_evaluation_scheme_for_task(the_task)
        val_cache_dir = os.path.join(args.output_dir, PHASE.VAL)
        shared_caching.chunk_and_save(
            data=scheme.get_labels_from_cache_and_examples(
                task=the_task,
                cache=shared_caching.ChunkedFilesDataCache(val_cache_dir),
                examples=examples_for_val,
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=os.path.join(args.output_dir, "val_labels"),
        )
        output_paths[PHASE.VAL] = val_cache_dir
        output_paths["val_labels"] = os.path.join(args.output_dir, "val_labels")
    if PHASE.TEST in phase_list:
        chunk_and_save(
            task=the_task,
            phase=PHASE.TEST,
            examples=the_task.get_test_examples(),
            feat_spec=spec,
            tokenizer=tok,
            args=args,
        )
        output_paths[PHASE.TEST] = os.path.join(args.output_dir, PHASE.TEST)
    if not args.skip_write_output_paths:
        py_io.write_json(data=output_paths, path=os.path.join(args.output_dir, "paths.json"))