from collections import Counter
from typing import Any, Callable, Dict, List, Optional, Tuple


def generate(
    generate_task_examples: Callable[..., List[ProbingTaskExample]],
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
) -> List[ProbingTaskExample]:
    # Carve a validation split out of the training data unless one is supplied.
    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data, split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(validation_data, split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data, split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
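# All of the `generate` functions in this section (one per probing task)
# lean on the same module-level scaffolding: a `ProbingTaskExample` record,
# a `train_val_split` helper, and a module `logger`. Those definitions are
# not part of this section; what follows is a minimal sketch of what they
# could look like. Only the `label` field of `ProbingTaskExample` is
# confirmed by the code here (the class-distribution logging reads it);
# the remaining fields and the split strategy are assumptions.
import logging
import random
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class ProbingTaskExample:
    tokens: List[str]  # assumed: the tokenized input sentence
    label: Any         # read below when logging train class distributions
    split: str         # "tr", "va", or "te"


def train_val_split(
    data: List[Dict[str, Any]], validation_size: float
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Shuffle, then split off a `validation_size` fraction as validation."""
    data = data[:]  # do not mutate the caller's list
    random.shuffle(data)
    n_val = max(1, int(len(data) * validation_size))
    return data[n_val:], data[:n_val]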
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    argument: str = "head",
    roles: Optional[List[str]] = None,
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: ArgumentGrammaticalRole")

    if argument not in {"head", "tail"}:
        raise ValueError(f"Invalid argument [{argument}]")

    if roles is None:
        roles = DEFAULT_ROLES

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Using argument: {argument}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data, argument, roles, split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(
        validation_data, argument, roles, split="va"
    )
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data, argument, roles, split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    buckets: Optional[List[Tuple[int, int]]] = None,
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: SentLength")

    if buckets is None:
        buckets = DEFAULT_BUCKETS
    logger.info(f"Buckets: {buckets}")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data, buckets, split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(validation_data, buckets, split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data, buckets, split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
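# The task-specific `generate_task_examples` that SentLength's `generate`
# calls is not shown in this section. A minimal sketch under two
# assumptions: each raw example carries a "tokens" field, and the label is
# the index of the bucket whose (low, high) range contains the sentence
# length, with out-of-range sentences skipped.
def generate_task_examples(
    data: List[Dict[str, Any]],
    buckets: List[Tuple[int, int]],
    split: str,
) -> List[ProbingTaskExample]:
    task_examples = []
    for example in data:
        length = len(example["tokens"])
        for label, (low, high) in enumerate(buckets):
            if low <= length <= high:
                task_examples.append(
                    ProbingTaskExample(
                        tokens=example["tokens"], label=label, split=split
                    )
                )
                break  # a sentence falls into at most one bucket
    return task_examples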
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    ner_count_tag: str = "ORGANIZATION",
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: EntCountBetween")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Using NER tag: {ner_count_tag}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data, ner_count_tag, split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(
        validation_data, ner_count_tag, split="va"
    )
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data, ner_count_tag, split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    argument: str,
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    keep_types: Optional[List[str]] = None,
) -> List[ProbingTaskExample]:
    logger.info(
        f"Generating dataset for probing task: ArgumentType{argument.capitalize()}"
    )

    if argument not in ["head", "tail"]:
        raise ValueError(f"'{argument}' is not a valid argument.")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Argument: {argument}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    # Build the label vocabulary over all splits so every type gets an index.
    field = f"{argument}_type"
    all_arg_types = Counter()
    for data in [train_data, validation_data, test_data]:
        for example in data:
            all_arg_types.update([example[field]])
    logger.info(f"Label distribution: {all_arg_types.most_common()}")

    type2idx = {arg_type: i for i, arg_type in enumerate(all_arg_types)}

    task_examples = []

    train_task_examples = generate_task_examples(
        train_data, argument, type2idx, keep_types, split="tr"
    )
    task_examples.extend(train_task_examples)

    idx2type = {v: k for k, v in type2idx.items()}
    class_distribution = Counter(
        idx2type[example.label] for example in train_task_examples
    )
    logger.info(f"Train class distribution: {class_distribution}")

    validation_task_examples = generate_task_examples(
        validation_data, argument, type2idx, keep_types, split="va"
    )
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(
        test_data, argument, type2idx, keep_types, split="te"
    )
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    argument: str,
    position: str,
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    keep_tags: Optional[List[str]] = None,
) -> List[ProbingTaskExample]:
    logger.info(
        "Generating dataset for probing task: "
        f"PosTag{argument.capitalize()}{position.capitalize()}"
    )

    if argument not in ["head", "tail"]:
        raise ValueError(f"'{argument}' is not a valid argument.")

    if position not in ["left", "right"]:
        raise ValueError(f"'{position}' is not a valid position.")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Argument: {argument}")
    logger.info(f"Position: {position}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    # Build the tag vocabulary over all splits so every tag gets an index.
    all_pos_tags = Counter()
    for data in [train_data, validation_data, test_data]:
        for example in data:
            all_pos_tags.update(example["pos"])
    logger.info(f"Label distribution: {all_pos_tags.most_common()}")

    pos2idx = {pos_tag: i for i, pos_tag in enumerate(all_pos_tags)}

    task_examples = []

    train_task_examples = generate_task_examples(
        train_data, argument, position, pos2idx, keep_tags, split="tr"
    )
    task_examples.extend(train_task_examples)

    idx2pos = {v: k for k, v in pos2idx.items()}
    class_distribution = Counter(
        idx2pos[example.label] for example in train_task_examples
    )
    logger.info(f"Train class distribution: {class_distribution}")

    validation_task_examples = generate_task_examples(
        validation_data, argument, position, pos2idx, keep_tags, split="va"
    )
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(
        test_data, argument, position, pos2idx, keep_tags, split="te"
    )
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
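# End-to-end usage sketch. The JSON-lines loading, the file paths, and the
# layout of the raw examples are assumptions for illustration; only the
# `generate` signature comes from the code above.
import json


def load_jsonl(path: str) -> List[Dict[str, Any]]:
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f]


train_data = load_jsonl("data/train.jsonl")
test_data = load_jsonl("data/test.jsonl")

# e.g. the PosTag generator directly above: probe the POS tag to the left
# of the head argument, holding out 10% of train as validation by default.
task_examples = generate(train_data, test_data, argument="head", position="left")
logger.info(f"Generated {len(task_examples)} task examples across all splits")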