def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """

    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        tokens_b, t2_label = mask_random_words(tokens_b, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_b"]["start_of_word"])

        if tokenizer.is_fast:
            # Detokenize input as fast tokenizer can't handle tokenized input
            tokens_a = " ".join(tokens_a)
            tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a)
            tokens_b = " ".join(tokens_b)
            tokens_b = re.sub(r"(^|\s)(##)", "", tokens_b)

        # convert lm labels to ids
        t1_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label]
        t2_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t2_label]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(tokens_a, tokenizer.vocab,
                                               token_groups=sample.tokenized["text_a"]["start_of_word"])
        if tokenizer.is_fast:
            # Detokenize input as fast tokenizer can't handle tokenized input
            tokens_a = " ".join(tokens_a)
            tokens_a = re.sub(r"(^|\s)(##)", "", tokens_a)

        # convert lm labels to ids
        lm_label_ids = [-1 if tok == '' else tokenizer.convert_tokens_to_ids(tok) for tok in t1_label]

    if tokenizer.is_fast:
        inputs = tokenizer(text=tokens_a,
                           text_pair=tokens_b,
                           add_special_tokens=True,
                           return_special_tokens_mask=True,
                           return_token_type_ids=True)

        seq_b_len = len(sample.tokenized["text_b"]["tokens"]) if "text_b" in sample.tokenized else 0
        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != \
                (len(sample.tokenized["text_a"]["tokens"]) + seq_b_len):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata(). \n"
                         f"Further processing is likely to be wrong.")
    else:
        # encode string tokens to input_ids and add special tokens
        inputs = tokenizer.encode_plus(text=tokens_a,
                                       text_pair=tokens_b,
                                       add_special_tokens=True,
                                       truncation=False,
                                       truncation_strategy='do_not_truncate',  # We've already truncated our tokens before
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True)

    input_ids, segment_ids, special_tokens_mask = \
        inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "lm_label_ids": lm_label_ids,
    }

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
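# The helper `pad()` used throughout this file is defined elsewhere in the package. Below is a
# minimal sketch of the padding behaviour these functions rely on (fill up to max_seq_len, on the
# right by default, or on the left for XLNet-style tokenizers). The name and implementation are
# illustrative assumptions, not necessarily the exact FARM helper.
def _pad_sketch(seq, max_seq_len, pad_value, pad_on_left=False):
    n_missing = max_seq_len - len(seq)
    if n_missing <= 0:
        return seq
    padding = [pad_value] * n_missing
    return padding + seq if pad_on_left else seq + padding

# Example: _pad_sketch([101, 2023, 102], 6, 0) -> [101, 2023, 102, 0, 0, 0]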
def samples_to_features_ner(
    sample,
    tasks,
    max_seq_len,
    tokenizer,
    non_initial_token="X",
    **kwargs
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task
                  (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word-initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids" and
             "initial_mask" (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    tokens = sample.tokenized["tokens"]

    if tokenizer.is_fast:
        text = sample.clear_text["text"]
        # Here, we tokenize the sample for the second time to get all relevant ids
        # This should change once we get rid of FARM's tokenize_with_metadata()
        inputs = tokenizer(text,
                           return_token_type_ids=True,
                           truncation=True,
                           truncation_strategy="longest_first",
                           max_length=max_seq_len,
                           return_special_tokens_mask=True)

        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata().\n"
                         f"Further processing is likely to be wrong!")
    else:
        inputs = tokenizer.encode_plus(text=tokens,
                                       text_pair=None,
                                       add_special_tokens=True,
                                       truncation=False,
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True,
                                       is_pretokenized=False)

    input_ids, segment_ids, special_tokens_mask = \
        inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # We construct a mask to identify the first token of a word. We will later only use them for predicting entities.
    # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens
    # For BERT we add a 0 in the start and end (for CLS and SEP)
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    initial_mask = insert_at_special_tokens_pos(initial_mask, special_tokens_mask, insert_element=0)
    assert len(initial_mask) == len(input_ids)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            label_ids = [label_list.index(lt) for lt in labels_token]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           "\nIf you are running in *inference* mode: Don't worry!"
                           "\nIf you are running in *training* mode: Verify you are supplying a proper label list to "
                           "your processor and check that labels in input data are correct.")

    # This mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    if label_ids:
        label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "initial_mask": initial_mask,
    }

    if label_ids:
        feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
def sample_to_features_text(
    sample, tasks, max_seq_len, tokenizer
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by a text classification model.

    :param sample: Sample object that contains human readable text and label fields from a single text classification
                   data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task
                  (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask" and "segment_ids"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    if tokenizer.is_fast:
        text = sample.clear_text["text"]
        # Here, we tokenize the sample for the second time to get all relevant ids
        # This should change once we get rid of FARM's tokenize_with_metadata()
        inputs = tokenizer(text,
                           return_token_type_ids=True,
                           truncation=True,
                           truncation_strategy="longest_first",
                           max_length=max_seq_len,
                           return_special_tokens_mask=True)

        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata(). \n"
                         f"Further processing is likely to be wrong.")
    else:
        # TODO It might be cleaner to adjust the data structure in sample.tokenized
        tokens_a = sample.tokenized["tokens"]
        tokens_b = sample.tokenized.get("tokens_b", None)

        inputs = tokenizer.encode_plus(
            tokens_a,
            tokens_b,
            add_special_tokens=True,
            truncation=False,  # truncation_strategy is deprecated
            return_token_type_ids=True,
            is_pretokenized=False,
        )

    input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len

    feat_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
    }

    # Add Labels for different tasks
    for task_name, task in tasks.items():
        try:
            label_name = task["label_name"]
            label_raw = sample.clear_text[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "classification":
                # id of label
                try:
                    label_ids = [label_list.index(label_raw)]
                except ValueError:
                    raise ValueError(f"[Task: {task_name}] Observed label {label_raw} not in defined label_list")
            elif task["task_type"] == "multilabel_classification":
                # multi-hot-format
                label_ids = [0] * len(label_list)
                for l in label_raw.split(","):
                    if l != "":
                        label_ids[label_list.index(l)] = 1
            elif task["task_type"] == "regression":
                label_ids = [float(label_raw)]
            else:
                raise ValueError(task["task_type"])
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
        if label_ids is not None:
            feat_dict[task["label_tensor_name"]] = label_ids

    return [feat_dict]
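# A hedged, self-contained sketch of the feature dict that the text-classification featurization
# above produces for one example. It calls the Hugging Face tokenizer directly instead of going
# through Sample/Processor; the model name, label values and max_seq_len below are illustrative
# assumptions, not part of the original pipeline.
def _example_text_classification_features():
    from transformers import AutoTokenizer

    max_seq_len = 16
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

    inputs = tokenizer("the movie was great",
                       max_length=max_seq_len,
                       truncation=True,
                       padding="max_length",
                       return_token_type_ids=True)

    label_list = ["negative", "positive"]
    feat_dict = {
        "input_ids": inputs["input_ids"],          # wordpiece ids incl. [CLS]/[SEP] and padding
        "padding_mask": inputs["attention_mask"],  # 1 for real tokens, 0 for padding
        "segment_ids": inputs["token_type_ids"],   # all 0 for a single sentence
        "label_ids": [label_list.index("positive")],
    }
    assert all(len(feat_dict[k]) == max_seq_len for k in ("input_ids", "padding_mask", "segment_ids"))
    return feat_dict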
def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=True):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """

    if next_sent_pred:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = sample.tokenized["text_b"]["tokens"]

        # mask random words
        tokens_a, t1_label = mask_random_words(
            tokens_a, tokenizer.vocab,
            token_groups=sample.tokenized["text_a"]["start_of_word"])
        tokens_b, t2_label = mask_random_words(
            tokens_b, tokenizer.vocab,
            token_groups=sample.tokenized["text_b"]["start_of_word"])

        # convert lm labels to ids
        t1_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]
        t2_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t2_label]
        lm_label_ids = t1_label_ids + t2_label_ids

        # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true!
        if sample.clear_text["nextsentence_label"]:
            is_next_label_id = [0]
        else:
            is_next_label_id = [1]
    else:
        tokens_a = sample.tokenized["text_a"]["tokens"]
        tokens_b = None
        tokens_a, t1_label = mask_random_words(
            tokens_a, tokenizer.vocab,
            token_groups=sample.tokenized["text_a"]["start_of_word"])

        # convert lm labels to ids
        lm_label_ids = [-1 if tok == '' else tokenizer.vocab[tok] for tok in t1_label]

    # encode string tokens to input_ids and add special tokens
    # Note: token_type_ids and special_tokens_mask are requested explicitly because they are
    # accessed below (older code relied on them being returned by default).
    inputs = tokenizer.encode_plus(text=tokens_a,
                                   text_pair=tokens_b,
                                   add_special_tokens=True,
                                   max_length=max_seq_len,
                                   truncation_strategy='do_not_truncate',  # We've already truncated our tokens before
                                   return_special_tokens_mask=True,
                                   return_token_type_ids=True)

    input_ids, segment_ids, special_tokens_mask = \
        inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # account for special tokens (CLS, SEP, SEP..) in lm_label_ids
    lm_label_ids = insert_at_special_tokens_pos(lm_label_ids, special_tokens_mask, insert_element=-1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
    lm_label_ids = pad(lm_label_ids, max_seq_len, -1, pad_on_left=pad_on_left)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "lm_label_ids": lm_label_ids,
    }

    if next_sent_pred:
        feature_dict["nextsentence_label_ids"] = is_next_label_id

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len
    assert len(lm_label_ids) == max_seq_len

    return [feature_dict]
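# `mask_random_words()` is defined elsewhere in the package. A simplified, hedged sketch of the
# contract assumed above: each token is masked with some probability (BERT uses 15%, with an
# 80/10/10 split of [MASK] / random token / unchanged), and the returned label list holds the
# original token at masked positions and "" everywhere else, so that "" can later be mapped to
# the ignore index -1. The real helper also uses token_groups to mask whole words at once; this
# sketch omits that detail and is an illustrative assumption only.
import random

def _mask_random_words_sketch(tokens, vocab, masking_prob=0.15):
    output_tokens, labels = [], []
    vocab_tokens = list(vocab.keys())
    for token in tokens:
        if random.random() < masking_prob:
            labels.append(token)          # remember original token for the LM loss
            dice = random.random()
            if dice < 0.8:
                output_tokens.append("[MASK]")
            elif dice < 0.9:
                output_tokens.append(random.choice(vocab_tokens))
            else:
                output_tokens.append(token)
        else:
            labels.append("")             # "" -> label id -1 (ignored by the loss)
            output_tokens.append(token)
    return output_tokens, labels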
def sample_to_features_text(sample, tasks, max_seq_len, tokenizer):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by a text classification model.

    :param sample: Sample object that contains human readable text and label fields from a single text classification
                   data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task
                  (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask" and "segment_ids"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    # TODO It might be cleaner to adjust the data structure in sample.tokenized
    # Verify if this current quickfix really works for pairs
    tokens_a = sample.tokenized["tokens"]
    tokens_b = sample.tokenized.get("tokens_b", None)

    inputs = tokenizer.encode_plus(
        tokens_a,
        tokens_b,
        add_special_tokens=True,
        max_length=max_seq_len,
        truncation_strategy='do_not_truncate',  # We've already truncated our tokens before
    )

    input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len

    feat_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
    }

    # Add Labels for different tasks
    for task_name, task in tasks.items():
        try:
            label_name = task["label_name"]
            label_raw = sample.clear_text[label_name]
            label_list = task["label_list"]
            if task["task_type"] == "classification":
                # id of label
                try:
                    label_ids = [label_list.index(label_raw)]
                except ValueError:
                    raise ValueError(f"[Task: {task_name}] Observed label {label_raw} not in defined label_list")
            elif task["task_type"] == "multilabel_classification":
                # multi-hot-format
                label_ids = [0] * len(label_list)
                for l in label_raw.split(","):
                    if l != "":
                        label_ids[label_list.index(l)] = 1
            elif task["task_type"] == "regression":
                label_ids = [float(label_raw)]
            else:
                raise ValueError(task["task_type"])
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
        if label_ids is not None:
            feat_dict[task["label_tensor_name"]] = label_ids

    return [feat_dict]
def samples_to_features_ner(sample,
                            tasks,
                            max_seq_len,
                            tokenizer,
                            cls_token="[CLS]",
                            sep_token="[SEP]",
                            non_initial_token="X",
                            **kwargs):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task
                  (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param cls_token: Token used to represent the beginning of the sequence
    :type cls_token: str
    :param sep_token: Token used to represent the border between two sequences
    :type sep_token: str
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word-initial tokens
    :return: A dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: dict
    """

    # Tokenize words and extend the labels so they are aligned with the tokens
    # words = sample.clear_text["text"].split(" ")
    # tokens, initial_mask = words_to_tokens(words, tokenizer, max_seq_len)
    tokens = sample.tokenized["tokens"]
    custom_data = sample.tokenized["custom_data"]
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]

    # Add CLS and SEP tokens
    tokens = add_cls_sep(tokens, cls_token, sep_token)
    custom_data = [5] + custom_data + [4]
    initial_mask = [0] + initial_mask + [0]  # CLS and SEP don't count as initial tokens
    padding_mask = [1] * len(tokens)

    # Convert input and labels to ids, generate masks
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            # label_ids = [label_list.index(lt) for lt in labels_token]
            label_ids = [label_list.index(lt) for lt in sample.tokenized['ner_label']]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           "\nIf you are running in *inference* mode: Don't worry!"
                           "\nIf you are running in *training* mode: Verify you are supplying a proper label list to "
                           "your processor and check that labels in input data are correct.")

    # Surround the label ids with the same special values used for custom_data.
    # Skip this in inference mode, where label_ids is None.
    if label_ids:
        label_ids = [5] + label_ids + [4]

    # Derive segment ids using the hard-coded BERT [SEP] id (102): [SEP] tokens and everything
    # before the first [SEP] get segment 0, other tokens after it get segment 1,
    # and the last position is forced to segment 1.
    segment_ids = []
    next_sent = False
    for x in input_ids:
        if x == 102:
            segment_ids.append(0)
            next_sent = True
        elif next_sent:
            segment_ids.append(1)
        else:
            segment_ids.append(0)
    segment_ids[-1] = 1

    # Pad
    input_ids = pad(input_ids, max_seq_len, 0)
    if label_ids:
        label_ids = pad(label_ids, max_seq_len, 0)
    initial_mask = pad(initial_mask, max_seq_len, 0)
    padding_mask = pad(padding_mask, max_seq_len, 0)
    custom_data = pad(custom_data, max_seq_len, 0)
    segment_ids = pad(segment_ids, max_seq_len, 0)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "initial_mask": initial_mask,
        "custom_data": custom_data,
    }

    if label_ids:
        feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
def samples_to_features_ner(sample,
                            label_list,
                            max_seq_len,
                            tokenizer,
                            cls_token="[CLS]",
                            pad_token="[PAD]",
                            sep_token="[SEP]",
                            non_initial_token="X",
                            **kwargs):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param label_list: A list of all unique labels
    :type label_list: list
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param cls_token: Token used to represent the beginning of the sequence
    :type cls_token: str
    :param pad_token: Token used to represent sequence padding
    :type pad_token: str
    :param sep_token: Token used to represent the border between two sequences
    :type sep_token: str
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word-initial tokens
    :return: A dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: dict
    """

    # Tokenize words and extend the labels so they are aligned with the tokens
    # words = sample.clear_text["text"].split(" ")
    # tokens, initial_mask = words_to_tokens(words, tokenizer, max_seq_len)
    tokens = sample.tokenized["tokens"]
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]

    # Add CLS and SEP tokens
    tokens = add_cls_sep(tokens, cls_token, sep_token)
    initial_mask = [0] + initial_mask + [0]  # CLS and SEP don't count as initial tokens
    padding_mask = [1] * len(tokens)

    # Convert input and labels to ids, generate masks
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    if "label" in sample.clear_text:
        labels_word = sample.clear_text["label"]
        labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
        # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
        label_ids = [label_list.index(lt) for lt in labels_token]
    # Inference mode
    else:
        label_ids = None

    segment_ids = [0] * max_seq_len

    # Pad
    input_ids = pad(input_ids, max_seq_len, 0)
    if label_ids:
        label_ids = pad(label_ids, max_seq_len, 0)
    initial_mask = pad(initial_mask, max_seq_len, 0)
    padding_mask = pad(padding_mask, max_seq_len, 0)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "initial_mask": initial_mask,
    }

    if label_ids:
        feature_dict["label_ids"] = label_ids

    return [feature_dict]
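# `add_cls_sep()` is another small helper assumed by the two NER variants above. A minimal sketch
# of the assumed behaviour: wrap the token sequence in the classification and separator tokens
# expected by BERT-style models. Illustrative only.
def _add_cls_sep_sketch(tokens, cls_token="[CLS]", sep_token="[SEP]"):
    return [cls_token] + list(tokens) + [sep_token]

# Example: ["john", "lives", "here"] -> ["[CLS]", "john", "lives", "here", "[SEP]"]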
def samples_to_features_admission_discharge_match(sample, max_seq_len, tokenizer):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, padding_mask, CLS and SEP tokens etc.

    :param sample: Sample, containing sentence input as strings and is_next label
    :type sample: Sample
    :param max_seq_len: Maximum length of sequence.
    :type max_seq_len: int
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """

    tokens_a = sample.tokenized["text_a"]["tokens"]
    tokens_b = sample.tokenized["text_b"]["tokens"]

    # Convert is_next_label: Note that in Bert, is_next_labelid = 0 is used for next_sentence=true!
    if sample.clear_text["nextsentence_label"]:
        is_next_label_id = [0]
    else:
        is_next_label_id = [1]

    # encode string tokens to input_ids and add special tokens
    inputs = tokenizer.encode_plus(
        text=tokens_a,
        text_pair=tokens_b,
        add_special_tokens=True,
        max_length=max_seq_len,
        truncation_strategy='do_not_truncate',  # We've already truncated our tokens before
        return_special_tokens_mask=True)

    input_ids, special_tokens_mask = inputs["input_ids"], inputs["special_tokens_mask"]

    # Use existing segment ids or set them according to a and b text length (with special tokens considered):
    # [CLS] + tokens_a + [SEP] belong to segment 0, tokens_b + [SEP] to segment 1.
    segment_ids = inputs["token_type_ids"] if "token_type_ids" in inputs \
        else [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    padding_mask = [1] * len(input_ids)

    # Padding up to the sequence length.
    # Normal case: adding multiple 0 to the right
    # Special cases:
    # a) xlnet pads on the left and uses "4" for padding token_type_ids
    if tokenizer.__class__.__name__ == "XLNetTokenizer":
        pad_on_left = True
        segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
    else:
        pad_on_left = False
        segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

    input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
    padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)

    sample_id = random.randint(0, 1000000)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "text_classification_ids": is_next_label_id,
        "sample_id": sample_id,
    }

    assert len(input_ids) == max_seq_len
    assert len(padding_mask) == max_seq_len
    assert len(segment_ids) == max_seq_len

    return [feature_dict]
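# Downstream, these per-sample feature dicts are typically stacked into a PyTorch dataset
# (FARM does this in convert_features_to_dataset). The sketch below shows the general idea;
# the function name and the uniform torch.long dtype are illustrative assumptions
# (regression labels, for instance, would need a float dtype).
def _features_to_dataset_sketch(feature_dicts, tensor_names=None):
    import torch
    from torch.utils.data import TensorDataset

    if tensor_names is None:
        tensor_names = list(feature_dicts[0].keys())
    tensors = [
        torch.tensor([f[name] for f in feature_dicts], dtype=torch.long)
        for name in tensor_names
    ]
    return TensorDataset(*tensors), tensor_names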