Example #1
def samples_to_features_ner(
    sample,
    tasks,
    max_seq_len,
    tokenizer,
    non_initial_token="X",
    **kwargs
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    tokens = sample.tokenized["tokens"]

    if tokenizer.is_fast:
        text = sample.clear_text["text"]
        # Here, we tokenize the sample for the second time to get all relevant ids
        # This should change once we get rid of FARM's tokenize_with_metadata()
        inputs = tokenizer(text,
                           return_token_type_ids=True,
                           truncation=True,
                           truncation_strategy="longest_first",
                           max_length=max_seq_len,
                           return_special_tokens_mask=True)

        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata().\n"
                         f"Further processing is likely to be wrong!")
    else:
        inputs = tokenizer.encode_plus(text=tokens,
                                       text_pair=None,
                                       add_special_tokens=True,
                                       truncation=False,
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True,
                                       is_pretokenized=False
                                       )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # We construct a mask to identify the first token of a word. We will later only use them for predicting entities.
    # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens
    # For BERT we add a 0 in the start and end (for CLS and SEP)
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    initial_mask = insert_at_special_tokens_pos(initial_mask, special_tokens_mask, insert_element=0)
    assert len(initial_mask) == len(input_ids)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            label_ids = [label_list.index(lt) for lt in labels_token]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           "\nIf your are running in *inference* mode: Don't worry!"
                           "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")

        # This mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        padding_mask = [1] * len(input_ids)

        # Padding up to the sequence length.
        # Normal case: adding multiple 0 to the right
        # Special cases:
        # a) xlnet pads on the left and uses  "4" for padding token_type_ids
        if tokenizer.__class__.__name__ == "XLNetTokenizer":
            pad_on_left = True
            segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
        else:
            pad_on_left = False
            segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

        input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
        padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
        initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
        if label_ids:
            label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        if label_ids:
            feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
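Example #1 relies on three FARM helpers that are not shown on this page: pad, insert_at_special_tokens_pos and expand_labels. The sketch below is only an approximation inferred from how they are called above; the behaviour is assumed, not copied from the FARM source, and the real implementations may differ in details.

from typing import List

def pad(seq: List[int], max_seq_len: int, pad_item: int, pad_on_left: bool = False) -> List[int]:
    # Fill `seq` up to `max_seq_len` with `pad_item`, on the left (XLNet-style)
    # or on the right (the BERT-style default).
    padding = [pad_item] * (max_seq_len - len(seq))
    return padding + seq if pad_on_left else seq + padding

def insert_at_special_tokens_pos(seq: List[int], special_tokens_mask: List[int], insert_element) -> List[int]:
    # Insert `insert_element` wherever the tokenizer placed a special token (CLS, SEP, ...)
    # so that `seq` stays aligned with `input_ids`.
    new_seq = list(seq)
    for idx in [i for i, m in enumerate(special_tokens_mask) if m == 1]:
        new_seq.insert(idx, insert_element)
    return new_seq

def expand_labels(labels_word: List[str], initial_mask: List[int], non_initial_token: str) -> List[str]:
    # Spread word-level labels over word pieces: word-initial tokens keep the word's label,
    # all other positions (sub-word pieces and special tokens) get `non_initial_token`.
    labels_token = []
    word_idx = 0
    for is_initial in initial_mask:
        if is_initial:
            labels_token.append(labels_word[word_idx])
            word_idx += 1
        else:
            labels_token.append(non_initial_token)
    return labels_token

# Small alignment check: a two-word sentence wrapped in [CLS] ... [SEP]
assert insert_at_special_tokens_pos([1, 0, 1], [1, 0, 0, 0, 1], insert_element=0) == [0, 1, 0, 1, 0]
assert expand_labels(["B-PER", "O"], [0, 1, 0, 1, 0], "X") == ["X", "B-PER", "X", "O", "X"]
assert pad([101, 7592, 102], 5, 0) == [101, 7592, 102, 0, 0]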
Example #2
def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :param next_sent_pred: Whether the sample is a sentence pair for which a next-sentence label should be converted as well.
    :type next_sent_pred: bool
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """

    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])

        tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_b"]["start_of_word"])

        if tokenizer.is_fast:
            # Detokenize input as fast tokenizer can't handle tokenized input
            tokens_a = " ".join(tokens_a)
            tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a)
            tokens_b = " ".join(tokens_b)
            tokens_b = re.sub(r"(^|\s)(##)", "", tokens_b)

        # convert lm labels to ids
        t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label]
        t2_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t2_label]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        if tokenizer.is_fast:
            # Detokenize input as fast tokenizer can't handle tokenized input
            tokens_a = " ".join(tokens_a)
            tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a)

        # convert lm labels to ids
        lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label]

    if tokenizer.is_fast:
        inputs = tokenizer(text=tokens_a,
                           text_pair=tokens_b,
                           add_special_tokens=True,
                           return_special_tokens_mask=True,
                           return_token_type_ids=True)

        seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0
        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \
           (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata(). \n"
                         f"Further processing is likely to be wrong.")
    else:
        # encode string tokens to input_ids and add special tokens
        inputs = tokenizer.encode_plus(text=tokens_a,
                                       text_pair=tokens_b,
                                       add_special_tokens=True,
                                       truncation=False,
                                       truncation_strategy='do_not_truncate',
                                       # We've already truncated our tokens before
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True
                                       )

    input_ids = inputs["input_ids"]
    segment_ids = inputs["token_type_ids"]
    special_tokens_mask = inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses  "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "lm_label_ids": lm_label_ids,
    }

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
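mask_random_words is likewise not shown on this page. Below is a simplified, self-contained sketch of the standard BERT masking scheme it presumably follows; whole-word handling via token_groups is omitted, and the function name and signature are modelled on the call sites above rather than taken from the FARM source.

import random
from typing import Dict, List, Optional, Tuple

def mask_random_words_sketch(
    tokens: List[str],
    vocab: Dict[str, int],
    token_groups: Optional[List[bool]] = None,
    masked_lm_prob: float = 0.15,
) -> Tuple[List[str], List[str]]:
    # For each selected position: 80% -> [MASK], 10% -> random vocab token, 10% -> unchanged.
    # The returned label list holds the original token at masked positions and "" elsewhere,
    # which is what the `-1 if tok == '' else ...` conversion above relies on.
    output_tokens = list(tokens)
    labels = [""] * len(tokens)
    vocab_words = list(vocab.keys())
    for i, token in enumerate(tokens):
        if random.random() >= masked_lm_prob:
            continue
        labels[i] = token
        dice = random.random()
        if dice < 0.8:
            output_tokens[i] = "[MASK]"
        elif dice < 0.9:
            output_tokens[i] = random.choice(vocab_words)
        # else: keep the original token as its own label target
    return output_tokens, labels

# Toy usage
tokens = ["the", "quick", "brown", "fox"]
toy_vocab = {"[MASK]": 0, "the": 1, "quick": 2, "brown": 3, "fox": 4}
masked, labels = mask_random_words_sketch(tokens, toy_vocab)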
Example #3
def samples_to_features_bert_lm(sample,
                                max_seq_len,
                                tokenizer,
                                next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :param next_sent_pred: Whether the sample is a sentence pair for which a next-sentence label should be converted as well.
    :type next_sent_pred: bool
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """

    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(
            tokens_a,
            tokenizer.vocab,
            token_groups=sample.tokenized["text_a"]["start_of_word"])

        tokens_b, t2_label = mask_random_words(
            tokens_b,
            tokenizer.vocab,
            token_groups=sample.tokenized["text_b"]["start_of_word"])
        # convert lm labels to ids
        t1_label_ids = [
            -1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label
        ]
        t2_label_ids = [
            -1 if tok == '' else tokenizer.vocab[tok] for tok in t2_label
        ]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(
            tokens_a,
            tokenizer.vocab,
            token_groups=sample.tokenized["text_a"]["start_of_word"])
        # convert lm labels to ids
        lm_label_ids = [
            -1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label
        ]

    # encode string tokens to input_ids and add special tokens
    inputs = tokenizer.encode_plus(text=tokens_a,
                                   text_pair=tokens_b,
                                   add_special_tokens=True,
                                   max_length=max_seq_len,
                                   truncation_strategy='do_not_truncate',
                                   # We've already truncated our tokens before
                                   return_special_tokens_mask=True,
                                   return_token_type_ids=True
                                   )

    input_ids = inputs["input_ids"]
    segment_ids = inputs["token_type_ids"]
    special_tokens_mask = inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids,
                                                special_tokens_mask,
                                                insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses  "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids,
                    max_seq_len,
                    tokenizer.pad_token_id,
                    pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "lm_label_ids": lm_label_ids,
    }

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
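Example #3 appears to be an older variant of the function in Example #2 (no fast-tokenizer branch; label ids are looked up directly in tokenizer.vocab). One usage note: the -1 values padded into lm_label_ids only make sense if the language-model loss ignores that index. FARM presumably configures its LM head that way, while stock Hugging Face heads expect -100 instead. A toy check of that mechanism (variable names here are hypothetical):

import torch
import torch.nn.functional as F

vocab_size = 10
logits = torch.randn(1, 4, vocab_size)          # [batch, seq_len, vocab]
lm_label_ids = torch.tensor([[3, -1, 7, -1]])   # two real labels, two ignored positions
loss = F.cross_entropy(logits.view(-1, vocab_size), lm_label_ids.view(-1), ignore_index=-1)
print(loss)  # only positions 0 and 2 contribute to the loss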