def create_task_dict(task_config_dict: dict, verbose: bool = True) -> Dict[str, tasks.Task]:
    """Make map of task name to task instances from map of task name to task config file paths.

    Args:
        task_config_dict (Dict): map from task name to task config filepath.
        verbose (bool): True to print task config info.

    Returns:
        Dict mapping from task name to task instance.
    """
    task_dict = {}
    for task_name, task_config_path in task_config_dict.items():
        task = tasks.create_task_from_config_path(config_path=task_config_path, verbose=False)
        if task.name != task_name:
            warnings.warn(
                "task {} from {} has conflicting names: {}/{}. Using {}".format(
                    task_name, task_config_path, task_name, task.name, task_name,
                )
            )
            task.name = task_name
        task_dict[task_name] = task
    if verbose:
        print("Creating Tasks:")
        for task_name, task_config_path in task_config_dict.items():
            task_class = task_dict[task_name].__class__.__name__
            print(f"    {task_name} ({task_class}): {task_config_path}")
    return task_dict
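# A minimal usage sketch for create_task_dict above. The config paths are
# hypothetical placeholders, not files shipped with the repo; each JSON is
# assumed to be a task config readable by tasks.create_task_from_config_path.
#
# task_dict = create_task_dict(
#     task_config_dict={
#         "mnli": "/path/to/task_configs/mnli.json",
#         "rte": "/path/to/task_configs/rte.json",
#     },
#     verbose=True,
# )
# # task_dict["mnli"] is a tasks.Task instance, renamed to "mnli" if the
# # config's own task name conflicts with the requested key.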
def test_featurization_of_task_data():
    # Test reading the task-specific toy dataset into examples.
    task = create_task_from_config_path(
        os.path.join(os.path.dirname(__file__), "resources/mnli.json"), verbose=False
    )
    # Test getting train, val, and test examples. Only the contents of train are checked.
    train_examples = task.get_train_examples()
    val_examples = task.get_val_examples()
    test_examples = task.get_test_examples()
    for train_example_dataclass, raw_example_dict in zip(train_examples, TRAIN_EXAMPLES):
        assert train_example_dataclass.to_dict() == raw_example_dict
    assert val_examples
    assert test_examples

    # Test conversion of examples into tokenized examples.
    # The dummy tokenizer requires a vocab; build that vocab from the data with a Counter:
    token_counter = Counter()
    for example in train_examples:
        token_counter.update(example.premise.split())
        token_counter.update(example.hypothesis.split())
    token_vocab = list(token_counter.keys())
    tokenizer = SimpleSpaceTokenizer(vocabulary=token_vocab)
    tokenized_examples = [example.tokenize(tokenizer) for example in train_examples]
    for tokenized_example, expected_tokenized_example in zip(
        tokenized_examples, TOKENIZED_TRAIN_EXAMPLES
    ):
        assert tokenized_example.to_dict() == expected_tokenized_example

    # Test conversion of a tokenized example to a featurized example.
    train_example_0_length = len(tokenized_examples[0].premise) + len(
        tokenized_examples[0].hypothesis
    )
    feat_spec = model_resolution.build_featurization_spec(
        model_type="bert-", max_seq_length=train_example_0_length
    )
    featurized_examples = [
        tokenized_example.featurize(tokenizer=tokenizer, feat_spec=feat_spec)
        for tokenized_example in tokenized_examples
    ]
    featurized_example_0_dict = featurized_examples[0].to_dict()
    # Not comparing input_ids because they were produced by a dummy tokenizer.
    assert "input_ids" in featurized_example_0_dict
    assert featurized_example_0_dict["guid"] == FEATURIZED_TRAIN_EXAMPLE_0["guid"]
    assert (
        featurized_example_0_dict["input_mask"] == FEATURIZED_TRAIN_EXAMPLE_0["input_mask"]
    ).all()
    assert (
        featurized_example_0_dict["segment_ids"] == FEATURIZED_TRAIN_EXAMPLE_0["segment_ids"]
    ).all()
    assert featurized_example_0_dict["label_id"] == FEATURIZED_TRAIN_EXAMPLE_0["label_id"]
    assert featurized_example_0_dict["tokens"] == FEATURIZED_TRAIN_EXAMPLE_0["tokens"]
def test_featurization_of_task_data():
    # Test reading the task-specific toy dataset into examples.
    task = create_task_from_config_path(
        os.path.join(os.path.dirname(__file__), "resources/sst.json"), verbose=True
    )
    examples = task.get_train_examples()
    for example_dataclass, raw_example_dict in zip(examples, TRAIN_EXAMPLES):
        assert example_dataclass.to_dict() == raw_example_dict

    # Test conversion of examples into tokenized examples.
    # The dummy tokenizer requires a vocab; build that vocab from the data with a Counter:
    token_counter = Counter()
    for example in examples:
        token_counter.update(example.text.split())
    token_vocab = list(token_counter.keys())
    tokenizer = SimpleSpaceTokenizer(vocabulary=token_vocab)
    tokenized_examples = [example.tokenize(tokenizer) for example in examples]
    for tokenized_example, expected_tokenized_example in zip(
        tokenized_examples, TOKENIZED_TRAIN_EXAMPLES
    ):
        assert tokenized_example.to_dict() == expected_tokenized_example

    # Test conversion of a tokenized example to a featurized example.
    feat_spec = tokenizer.get_feat_spec(max_seq_length=10)
    featurized_examples = [
        tokenized_example.featurize(tokenizer=tokenizer, feat_spec=feat_spec)
        for tokenized_example in tokenized_examples
    ]
    featurized_example_0_dict = featurized_examples[0].to_dict()
    # Not comparing input_ids because they were produced by a dummy tokenizer.
    assert "input_ids" in featurized_example_0_dict
    assert featurized_example_0_dict["guid"] == FEATURIZED_TRAIN_EXAMPLE_0["guid"]
    assert (
        featurized_example_0_dict["input_mask"] == FEATURIZED_TRAIN_EXAMPLE_0["input_mask"]
    ).all()
    assert (
        featurized_example_0_dict["segment_ids"] == FEATURIZED_TRAIN_EXAMPLE_0["segment_ids"]
    ).all()
    assert featurized_example_0_dict["label_id"] == FEATURIZED_TRAIN_EXAMPLE_0["label_id"]
    assert featurized_example_0_dict["tokens"] == FEATURIZED_TRAIN_EXAMPLE_0["tokens"]
def create_task_dict(task_config_dict: dict, verbose: bool = True) -> Dict[str, tasks.Task]:
    """Make map of task name to task instances from map of task name to task config file paths.

    Args:
        task_config_dict (Dict): map from task name to task config filepath.
        verbose (bool): True to print task config info.

    Returns:
        Dict mapping from task name to task instance.
    """
    task_dict = {
        task_name: tasks.create_task_from_config_path(config_path=task_config_path, verbose=False)
        for task_name, task_config_path in task_config_dict.items()
    }
    if verbose:
        print("Creating Tasks:")
        for task_name, task_config_path in task_config_dict.items():
            task_class = task_dict[task_name].__class__.__name__
            print(f"    {task_name} ({task_class}): {task_config_path}")
    return task_dict
def main(args: RunConfiguration):
    task = tasks.create_task_from_config_path(config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )
    if isinstance(args.phases, str):
        phases = args.phases.split(",")
    else:
        phases = args.phases
    assert set(phases) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}
    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    if PHASE.TRAIN in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TRAIN,
            examples=task.get_train_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[PHASE.TRAIN] = os.path.join(args.output_dir, PHASE.TRAIN)

    if PHASE.VAL in phases:
        val_examples = task.get_val_examples()
        chunk_and_save(
            task=task,
            phase=PHASE.VAL,
            examples=val_examples,
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        # Val labels are cached separately so evaluation can run without
        # re-reading the raw examples.
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(
                    os.path.join(args.output_dir, PHASE.VAL)
                ),
                examples=val_examples,
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=os.path.join(args.output_dir, "val_labels"),
        )
        paths_dict[PHASE.VAL] = os.path.join(args.output_dir, PHASE.VAL)
        paths_dict["val_labels"] = os.path.join(args.output_dir, "val_labels")

    if PHASE.TEST in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TEST,
            examples=task.get_test_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[PHASE.TEST] = os.path.join(args.output_dir, PHASE.TEST)

    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict, path=os.path.join(args.output_dir, "paths.json"))
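# A minimal invocation sketch for main above, assuming RunConfiguration exposes
# exactly the fields the function reads (all paths here are hypothetical):
#
# args = RunConfiguration(
#     task_config_path="/path/to/task_configs/mnli.json",
#     model_type="roberta-base",
#     max_seq_length=256,
#     model_tokenizer_path="/path/to/tokenizer",
#     phases="train,val",        # comma-separated string or a list of phases
#     output_dir="/path/to/cache/mnli",
#     chunk_size=10000,
#     skip_write_output_paths=False,
# )
# main(args)  # writes chunked caches plus paths.json under output_dir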
def test_featurization_of_task_data():
    # Test reading the task-specific toy dataset into examples.
    task = create_task_from_config_path(
        os.path.join(os.path.dirname(__file__), "resources/spr1.json"), verbose=True
    )
    # Test getting train and val examples. Only the contents of train are checked.
    train_examples = task.get_train_examples()
    val_examples = task.get_val_examples()
    for train_example_dataclass, raw_example_dict in zip(train_examples, TRAIN_EXAMPLES):
        assert train_example_dataclass.to_dict() == raw_example_dict
    assert val_examples

    # Test conversion of examples into tokenized examples.
    # The dummy tokenizer requires a vocab; build that vocab from the data with a Counter:
    token_counter = Counter()
    for example in train_examples:
        token_counter.update(example.text.split())
    token_vocab = list(token_counter.keys())
    space_tokenizer = SimpleSpaceTokenizer(vocabulary=token_vocab)
    # Mock to pass normalize_tokenizations's isinstance check inside example.tokenize(tokenizer).
    tokenizer = Mock(spec_set=transformers.RobertaTokenizer)
    tokenizer.tokenize.side_effect = space_tokenizer.tokenize
    tokenized_examples = [example.tokenize(tokenizer) for example in train_examples]
    for tokenized_example, expected_tokenized_example in zip(
        tokenized_examples, TOKENIZED_TRAIN_EXAMPLES
    ):
        assert tokenized_example.to_dict() == expected_tokenized_example
    # Drop the mock and continue the test with the space tokenizer.
    tokenizer = space_tokenizer

    # Test conversion of a tokenized example to a featurized example.
    train_example_0_length = len(tokenized_examples[0].tokens) + 4
    feat_spec = model_resolution.build_featurization_spec(
        model_type="bert-", max_seq_length=train_example_0_length
    )
    featurized_examples = [
        tokenized_example.featurize(tokenizer=tokenizer, feat_spec=feat_spec)
        for tokenized_example in tokenized_examples
    ]
    featurized_example_0_dict = featurized_examples[0].to_dict()
    # Not comparing input_ids because they were produced by a dummy tokenizer.
    assert "input_ids" in featurized_example_0_dict
    assert featurized_example_0_dict["guid"] == FEATURIZED_TRAIN_EXAMPLE_0["guid"]
    assert (
        featurized_example_0_dict["input_mask"] == FEATURIZED_TRAIN_EXAMPLE_0["input_mask"]
    ).all()
    assert (
        featurized_example_0_dict["segment_ids"] == FEATURIZED_TRAIN_EXAMPLE_0["segment_ids"]
    ).all()
    assert (
        featurized_example_0_dict["label_ids"] == FEATURIZED_TRAIN_EXAMPLE_0["label_ids"]
    ).all()
    assert featurized_example_0_dict["tokens"] == FEATURIZED_TRAIN_EXAMPLE_0["tokens"]
    assert featurized_example_0_dict["span1_text"] == FEATURIZED_TRAIN_EXAMPLE_0["span1_text"]
    assert featurized_example_0_dict["span2_text"] == FEATURIZED_TRAIN_EXAMPLE_0["span2_text"]
    assert (featurized_example_0_dict["spans"] == FEATURIZED_TRAIN_EXAMPLE_0["spans"]).all()
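# Side note on the Mock pattern used above, as a self-contained sketch (assumes
# Mock and transformers are imported as in the test; the helper name is
# illustrative): Mock(spec_set=Cls) sets the mock's __class__ to Cls, so
# isinstance checks inside library code pass, while side_effect lets any
# callable supply the tokenizer behavior.
def _sketch_mock_tokenizer_passes_isinstance_check():
    mock_tokenizer = Mock(spec_set=transformers.RobertaTokenizer)
    mock_tokenizer.tokenize.side_effect = lambda text: text.split()
    # A spec_set mock satisfies isinstance against the spec class.
    assert isinstance(mock_tokenizer, transformers.RobertaTokenizer)
    # Calls are routed through the side_effect callable.
    assert mock_tokenizer.tokenize("a b") == ["a", "b"]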
def main(args: RunConfiguration):
    task = tasks.create_task_from_config_path(config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )
    if isinstance(args.phases, str):
        phases = args.phases.split(",")
    else:
        phases = args.phases
    assert set(phases) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}
    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    def do_tokenize(phase: str):
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        output_dir = os.path.join(args.output_dir, phase)
        labels_output_dir = os.path.join(args.output_dir, f"{phase}_labels")
        if phase == PHASE.TRAIN:
            get_examples_func = task.get_train_examples
        elif phase == PHASE.VAL:
            get_examples_func = task.get_val_examples
        elif phase == PHASE.TEST:

            def get_examples_func():
                # Prefer labeled test examples when the task provides them;
                # fall back to the unlabeled test examples otherwise.
                try:
                    return task.get_examples("test")
                except NotImplementedError:
                    logger.warning(
                        'Labels for the "test" split could not be retrieved, so metrics '
                        'for the "test" split will not be evaluated properly.'
                    )
                    return task.get_test_examples()

        chunk_and_save(
            task=task,
            phase=phase,
            examples=get_examples_func(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[phase] = output_dir
        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(output_dir),
                examples=get_examples_func(),
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=labels_output_dir,
        )
        paths_dict[f"{phase}_labels"] = labels_output_dir

    if PHASE.TRAIN in phases:
        do_tokenize(PHASE.TRAIN)
    if PHASE.VAL in phases:
        do_tokenize(PHASE.VAL)
    if PHASE.TEST in phases:
        do_tokenize(PHASE.TEST)
    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict, path=os.path.join(args.output_dir, "paths.json"))
def main(args: RunConfiguration):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # === Shared model components setup === #
    model_type = "roberta-base"
    model_arch = ModelArchitectures.from_model_type(model_type=model_type)
    transformers_class_spec = model_setup.TRANSFORMERS_CLASS_SPEC_DICT[model_arch]
    ancestor_model = model_setup.get_ancestor_model(
        transformers_class_spec=transformers_class_spec,
        model_config_path=args.model_config_path,
    )
    encoder = model_setup.get_encoder(
        model_arch=model_arch,
        ancestor_model=ancestor_model,
    )
    tokenizer = shared_model_setup.get_tokenizer(
        model_type=model_type,
        tokenizer_path=args.model_tokenizer_path,
    )

    # === Taskmodels setup === #
    task_dict = {
        "mnli": tasks.create_task_from_config_path(
            os.path.join(args.task_config_base_path, "mnli.json")
        ),
        "qnli": tasks.create_task_from_config_path(
            os.path.join(args.task_config_base_path, "qnli.json")
        ),
        "rte": tasks.create_task_from_config_path(
            os.path.join(args.task_config_base_path, "rte.json")
        ),
    }
    # MNLI and QNLI share the "nli" taskmodel; RTE gets its own head.
    taskmodels_dict = {
        "nli": taskmodels.ClassificationModel(
            encoder=encoder,
            classification_head=heads.ClassificationHead(
                hidden_size=encoder.config.hidden_size,
                hidden_dropout_prob=encoder.config.hidden_dropout_prob,
                num_labels=len(task_dict["mnli"].LABELS),
            ),
        ),
        "rte": taskmodels.ClassificationModel(
            encoder=encoder,
            classification_head=heads.ClassificationHead(
                hidden_size=encoder.config.hidden_size,
                hidden_dropout_prob=encoder.config.hidden_dropout_prob,
                num_labels=len(task_dict["rte"].LABELS),
            ),
        ),
    }
    task_to_taskmodel_map = {
        "mnli": "nli",
        "qnli": "nli",
        "rte": "rte",
    }

    # === Final === #
    jiant_model = JiantModel(
        task_dict=task_dict,
        encoder=encoder,
        taskmodels_dict=taskmodels_dict,
        task_to_taskmodel_map=task_to_taskmodel_map,
        tokenizer=tokenizer,
    )
    jiant_model = jiant_model.to(device)

    # === Run === #
    task_dataloader_dict = {}
    for task_name, task in task_dict.items():
        train_cache = caching.ChunkedFilesDataCache(
            cache_fol_path=os.path.join(args.task_cache_base_path, task_name, "train"),
        )
        train_dataset = train_cache.get_iterable_dataset(buffer_size=10000, shuffle=True)
        train_dataloader = torch_utils.DataLoaderWithLength(
            dataset=train_dataset,
            batch_size=4,
            collate_fn=task.collate_fn,
        )
        task_dataloader_dict[task_name] = train_dataloader
    for task_name, task in task_dict.items():
        batch, batch_metadata = next(iter(task_dataloader_dict[task_name]))
        batch = batch.to(device)
        with torch.no_grad():
            model_output = wrap_jiant_forward(
                jiant_model=jiant_model,
                batch=batch,
                task=task,
                compute_loss=True,
            )
        print(task_name)
        print(model_output)
        print()