Example no. 1
def generate(
    generate_task_examples: Callable[..., List[ProbingTaskExample]],
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
) -> List[ProbingTaskExample]:

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data, validation_size)

    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data, split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(validation_data, split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data, split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
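
Every variant on this page relies on a train_val_split helper whose definition is not shown. A minimal sketch, assuming it simply shuffles a copy of the training list and slices off the requested fraction (the seed and the exact splitting logic are assumptions, not the library's implementation):

import random
from typing import Any, Dict, List, Tuple


def train_val_split(
    data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    seed: int = 42,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    # Hypothetical helper: shuffle a copy of the data and carve off the
    # final `validation_size` fraction as the validation split.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)
    n_val = max(1, int(len(shuffled) * validation_size))
    return shuffled[:-n_val], shuffled[-n_val:]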
Example no. 2
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    argument: str = "head",
    roles: Optional[List[str]] = None,
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: ArgumentGrammaticalRole")
    if argument not in {"head", "tail"}:
        raise (f"Invalid argument [{argument}]")
    if roles is None:
        roles = DEFAULT_ROLES
    if validation_data is None:
        train_data, validation_data = train_val_split(train_data,
                                                      validation_size)

    logger.info(f"Using argument: {argument}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data,
                                                 argument,
                                                 roles,
                                                 split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(validation_data,
                                                      argument,
                                                      roles,
                                                      split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data,
                                                argument,
                                                roles,
                                                split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(
        f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
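
The return type ProbingTaskExample is also not defined on this page; the snippets only reveal a label attribute (used for class distributions below) and a split tag ("tr"/"va"/"te"). A hypothetical container for illustration only:

from dataclasses import dataclass
from typing import List


@dataclass
class ProbingTaskExample:
    # Assumed fields: `label` and `split` are implied by the code on this
    # page; `tokens` is a guess at how the probing sentence might be stored.
    tokens: List[str]
    label: int
    split: str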
Example no. 3
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    buckets: Optional[List[Tuple[int, int]]] = None,
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: SentLength")

    if buckets is None:
        buckets = DEFAULT_BUCKETS

    logger.info(f"Buckets: {buckets}")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data,
                                                      validation_size)

    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data,
                                                 buckets,
                                                 split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(validation_data,
                                                      buckets,
                                                      split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data, buckets, split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(
        f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
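
For the SentLength task, buckets is a list of (low, high) token-count ranges per the signature; DEFAULT_BUCKETS itself is not shown. A hedged sketch of how a sentence length could be mapped to a bucket label (the boundaries below are made up for illustration):

from typing import List, Optional, Tuple

# Illustrative boundaries only; the real DEFAULT_BUCKETS are not shown here.
EXAMPLE_BUCKETS: List[Tuple[int, int]] = [(1, 10), (11, 20), (21, 30), (31, 40)]


def length_to_bucket(num_tokens: int,
                     buckets: List[Tuple[int, int]]) -> Optional[int]:
    # Return the index of the (low, high) bucket containing num_tokens,
    # or None when the sentence falls outside every bucket.
    for idx, (low, high) in enumerate(buckets):
        if low <= num_tokens <= high:
            return idx
    return None


assert length_to_bucket(15, EXAMPLE_BUCKETS) == 1
assert length_to_bucket(99, EXAMPLE_BUCKETS) is None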
Example no. 4
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    ner_count_tag: str = "ORGANIZATION",
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: EntCountBetween")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data,
                                                      validation_size)

    logger.info(f"Using NER tag: {ner_count_tag}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    task_examples = []

    train_task_examples = generate_task_examples(train_data,
                                                 ner_count_tag,
                                                 split="tr")
    task_examples.extend(train_task_examples)

    validation_task_examples = generate_task_examples(validation_data,
                                                      ner_count_tag,
                                                      split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data,
                                                ner_count_tag,
                                                split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(
        f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
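
A hedged usage sketch for the EntCountBetween variant above; the file paths are placeholders, and only the keyword arguments come from the signature shown:

import json

# Placeholder paths for the relation-extraction splits.
with open("train.json") as f:
    train_data = json.load(f)
with open("test.json") as f:
    test_data = json.load(f)

# Count PERSON mentions instead of the default ORGANIZATION ones; a 10%
# validation split is carved out of train_data by the function itself.
task_examples = generate(train_data, test_data, ner_count_tag="PERSON")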
Example no. 5
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    argument: str,
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    keep_types: Optional[List[str]] = None,
) -> List[ProbingTaskExample]:
    logger.info(
        f"Generating dataset for probing task: ArgumentType{argument.capitalize()}"
    )

    if argument not in ["head", "tail"]:
        raise ValueError(f"'{argument}' is not a valid argument.")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data,
                                                      validation_size)

    logger.info(f"Argument: {argument}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    all_arg_types = Counter()
    for data in [train_data, validation_data, test_data]:
        for example in data:
            field = f"{argument}_type"
            all_arg_types.update([example[field]])

    logger.info(f"Label distribution: {all_arg_types.most_common()}")

    type2idx = {arg_type: i for i, arg_type in enumerate(list(all_arg_types))}

    task_examples = []

    train_task_examples = generate_task_examples(train_data,
                                                 argument,
                                                 type2idx,
                                                 keep_types,
                                                 split="tr")
    task_examples.extend(train_task_examples)

    idx2type = {v: k for k, v in type2idx.items()}
    class_distribution = Counter(
        [idx2type[example.label] for example in train_task_examples])
    logger.info(f"CT: {class_distribution}")

    validation_task_examples = generate_task_examples(validation_data,
                                                      argument,
                                                      type2idx,
                                                      keep_types,
                                                      split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data,
                                                argument,
                                                type2idx,
                                                keep_types,
                                                split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(
        f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
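
generate_task_examples is not shown for the ArgumentType task either. A minimal sketch under the assumption that it only filters by keep_types and encodes the head/tail type through type2idx; the "tokens" key and the ProbingTaskExample fields follow the hypothetical dataclass sketched after Example no. 2:

from typing import Any, Dict, List, Optional


def generate_task_examples(data: List[Dict[str, Any]],
                           argument: str,
                           type2idx: Dict[str, int],
                           keep_types: Optional[List[str]],
                           split: str) -> List[ProbingTaskExample]:
    # Hypothetical reimplementation: keep an example when its head/tail
    # type is whitelisted (or no whitelist was given) and turn the type
    # string into an integer label via type2idx.
    examples = []
    for example in data:
        arg_type = example[f"{argument}_type"]  # key pattern taken from the code above
        if keep_types is not None and arg_type not in keep_types:
            continue
        examples.append(
            ProbingTaskExample(tokens=example["tokens"],  # assumed key
                               label=type2idx[arg_type],
                               split=split))
    return examples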
Example no. 6
def generate(
    train_data: List[Dict[str, Any]],
    test_data: List[Dict[str, Any]],
    argument: str,
    position: str,
    validation_size: float = 0.1,
    validation_data: Optional[List[Dict[str, Any]]] = None,
    keep_tags: Optional[List[str]] = None,
) -> List[ProbingTaskExample]:
    logger.info("Generating dataset for probing task: " +
                f"PosTag{argument.capitalize()}{position.capitalize()}")

    if argument not in ["head", "tail"]:
        raise ValueError(f"'{argument}' is not a valid argument.")

    if position not in ["left", "right"]:
        raise ValueError(f"'{position}' is not a valid position.")

    if validation_data is None:
        train_data, validation_data = train_val_split(train_data,
                                                      validation_size)

    logger.info(f"Argument: {argument}")
    logger.info(f"Position: {position}")
    logger.info(f"Num train examples: {len(train_data)}")
    logger.info(f"Num validation examples: {len(validation_data)}")
    logger.info(f"Num test examples: {len(test_data)}")

    all_pos_tags = Counter()
    for data in [train_data, validation_data, test_data]:
        for example in data:
            all_pos_tags.update(example["pos"])

    logger.info(f"Label distribution: {all_pos_tags}")

    pos2idx = {pos_tag: i for i, pos_tag in enumerate(list(all_pos_tags))}

    task_examples = []

    train_task_examples = generate_task_examples(train_data,
                                                 argument,
                                                 position,
                                                 pos2idx,
                                                 keep_tags,
                                                 split="tr")
    task_examples.extend(train_task_examples)

    idx2pos = {v: k for k, v in pos2idx.items()}
    class_distribution = Counter(
        [idx2pos[example.label] for example in train_task_examples])
    logger.info(f"CT: {class_distribution}")

    validation_task_examples = generate_task_examples(validation_data,
                                                      argument,
                                                      position,
                                                      pos2idx,
                                                      keep_tags,
                                                      split="va")
    task_examples.extend(validation_task_examples)

    test_task_examples = generate_task_examples(test_data,
                                                argument,
                                                position,
                                                pos2idx,
                                                keep_tags,
                                                split="te")
    task_examples.extend(test_task_examples)

    logger.info(f"Num train task examples: {len(train_task_examples)}")
    logger.info(
        f"Num validation task examples: {len(validation_task_examples)}")
    logger.info(f"Num test task examples: {len(test_task_examples)}")

    return task_examples
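
A hedged call for the PosTag variant above, continuing from the loading sketch after Example no. 4: argument and position are required by the signature, while keep_tags is an optional whitelist (the tag names here are placeholders):

# Build the PosTagHeadLeft probing dataset: the POS tag of the token
# immediately to the left of the head argument, restricted to three tags.
task_examples = generate(
    train_data,
    test_data,
    argument="head",
    position="left",
    keep_tags=["NOUN", "VERB", "ADJ"],
)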