def __init__(
    self,
    labels: T.List[str],
    tokenizer: PreTrainedTokenizer,
    label_map: T.Dict[str, int],
    dset_filename: str,
    content_column: str,
    label_column: T.Optional[str],
):
    """
    labels: list of valid labels (can be strings/ints)
    tokenizer: AutoTokenizer object that can tokenize input text
    label_map: maps labels to ints for machine-readability
    dset_filename: full filepath of the dataset being loaded
    content_column: column name of the content to be read
    label_column: column name where the labels can be found
    """
    suffix = dset_filename.split(".")[-1]  # type: ignore
    if suffix in CSV_EXTENSIONS:
        doc_reader = lambda b: pd.read_csv(b, dtype=object)
    else:
        raise ValueError(
            f"The file {dset_filename} doesn't have a recognized extension.")

    self.labels = labels
    self.label_map = label_map
    self.tokenizer = tokenizer

    df = doc_reader(dset_filename)  # type: ignore
    self.len_dset = len(df)
    # Kept around in case we need to output predictions later.
    self.content_series = df[content_column]

    self.encoded_content = self.tokenizer.batch_encode_plus(
        df[content_column],
        max_length=None,
        pad_to_max_length=True,
    )
    if label_column is not None:
        self.encoded_labels: T.Optional[T.List[int]] = [
            self.label_map[label] for label in df[label_column]
        ]
    else:
        self.encoded_labels = None

    self.features = []
    for i in range(len(self.encoded_content["input_ids"])):
        inputs = {
            k: self.encoded_content[k][i]
            for k in self.encoded_content.keys()
        }
        if self.encoded_labels is not None:
            feature = InputFeatures(**inputs, label=self.encoded_labels[i])
        else:
            feature = InputFeatures(**inputs, label=None)
        self.features.append(feature)
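# A minimal usage sketch for the CSV-backed dataset above. The class name
# `CsvTextDataset` and the file `reviews.csv` are hypothetical; only the
# constructor arguments mirror the __init__ signature shown here.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
labels = ["neg", "pos"]
dataset = CsvTextDataset(
    labels=labels,
    tokenizer=tokenizer,
    label_map={label: i for i, label in enumerate(labels)},
    dset_filename="reviews.csv",
    content_column="text",
    label_column="sentiment",
)
print(dataset.len_dset, dataset.features[0].input_ids[:10])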
def convert_examples_to_features(examples, tokenizer, max_length, label_list):
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    logging.info('>>> converting {} examples to features'.format(len(examples)))
    for ex_index, example in enumerate(examples):
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       truncation=True)
        input_ids, token_type_ids, attention_mask = (
            inputs["input_ids"], inputs["token_type_ids"],
            inputs["attention_mask"])

        padding_length = max_length - len(input_ids)
        input_ids = input_ids + [0] * padding_length
        token_type_ids = token_type_ids + [0] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        assert len(input_ids) == max_length

        label = label_map[example.label]
        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            ))
        if ex_index < 5:
            logging.info(">>> writing example %d" % ex_index)
            logging.info('>>> text is {}'.format(example.text_a))
            logging.info('>>> input_ids is {}'.format(input_ids))
            logging.info('>>> label text is {} and label_ids is {}'.format(
                example.label, label))
    return features
def __init__(self, input_ids, attention_masks, labels):
    assert len(input_ids) == len(attention_masks) == len(labels)
    self.features = []
    for index in range(len(labels)):
        feature = InputFeatures(input_ids=input_ids[index],
                                attention_mask=attention_masks[index],
                                label=labels[index])
        self.features.append(feature)
def _text_to_features(self, texts: List[str]):
    batch_encoding = self.tokenizer.batch_encode_plus(
        texts, max_length=self.max_length, pad_to_max_length=True)
    features = []
    for i in range(len(texts)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs)
        features.append(feature)
    return features
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = []  # -> will hold InputFeatures to be converted later
    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True,  # pads to the right by default
            truncation=True)
        input_ids, token_type_ids, attention_mask = (
            input_dict["input_ids"], input_dict["token_type_ids"],
            input_dict["attention_mask"])
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=e.label))

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32,
            "token_type_ids": tf.int32
        }, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
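# A minimal training sketch using the converter above, assuming a binary
# classification setup. `train_examples` is a hypothetical list of
# InputExample objects; everything else is the standard transformers/Keras API.
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

train_dataset = convert_examples_to_tf_dataset(train_examples, tokenizer)
train_dataset = train_dataset.shuffle(100).batch(32)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(train_dataset, epochs=2)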
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float]:
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("features: %s" % features[i])

    return features
def examples2features(examples, tokenizer, label_list, max_length=128):
    label_map = {label: i for i, label in enumerate(label_list)}
    logger.info("Converting examples to features")
    features = []
    for ex_index, example in enumerate(tqdm(examples)):
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        (input_ids, token_type_ids) = itemgetter("input_ids",
                                                 "token_type_ids")(inputs)
        attention_mask = [1] * len(input_ids)

        # Pad everything up to max_length
        pad_token = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        # Assert that everything was padded correctly
        assert len(input_ids) == max_length
        assert len(token_type_ids) == max_length
        assert len(attention_mask) == max_length

        features.append(
            InputFeatures(
                input_ids,
                attention_mask,
                token_type_ids,
                label=label_map[example.label],
            ))

    # Log some examples to check
    for example, feature in islice(zip(examples, features), 5):
        logger.info("******** Example ********")
        logger.info(f"Guid: {example.guid}")
        logger.info(f"Sentence A: {example.text_a}")
        logger.info(f"Sentence B: {example.text_b}")
        logger.info(f"input_ids: {feature.input_ids}")
        logger.info(f"attention_mask: {feature.attention_mask}")
        logger.info(f"token_type_ids: {feature.token_type_ids}")
        logger.info(f"label: {example.label} (id = {feature.label})")

    return features
def classification_convert_example_to_feature(example,
                                              max_length=512,
                                              label_map=None,
                                              pad_on_left=False,
                                              pad_token=0,
                                              pad_token_segment_id=0,
                                              mask_padding_with_zero=True,
                                              set_type='train'):
    # NOTE: `tokenizer` is not a parameter of this function; it must already
    # be bound in the enclosing (module or closure) scope.
    inputs = tokenizer.encode_plus(example.text_a,
                                   example.text_b,
                                   add_special_tokens=True,
                                   max_length=max_length,
                                   return_token_type_ids=True)
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        attention_mask = ([0 if mask_padding_with_zero else 1] *
                          padding_length) + attention_mask
        token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] *
                                           padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_length, "Error with input length {} vs {}".format(
        len(input_ids), max_length)
    assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
        len(attention_mask), max_length)
    assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
        len(token_type_ids), max_length)

    if set_type != 'test':
        label = label_map[example.label]
    else:
        label = None

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)
def create_features(examples, tokenizer, max_len):
    features = []
    pad_on_left = False
    pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    pad_token_segment_id = 0
    mask_padding_with_zero = True
    for example in tqdm(examples, desc='convert examples to features'):
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_len,
                                       return_token_type_ids=True)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_len - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] *
                                               padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_len, "Error with input length {} vs {}".format(
            len(input_ids), max_len)
        assert len(attention_mask) == max_len, "Error with input length {} vs {}".format(
            len(attention_mask), max_len)
        assert len(token_type_ids) == max_len, "Error with input length {} vs {}".format(
            len(token_type_ids), max_len)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=None))
    return features
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    features = []
    for (ex_index, example) in tqdm(enumerate(examples)):
        inputs = tokenizer.encode_plus(
            example[COMPLAINT_TEXT],
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] *
                                               padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(
            len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=example[LABEL]))
    return features
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: AutoTokenizer,
    max_length: Optional[int] = None,
    label_list: List = None,
    output_mode="classification",
):
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("features: %s" % features[i])

    return features
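# A minimal usage sketch for the converter above. The sentence-pair examples
# and label names are made up; InputExample comes from
# transformers.data.processors.utils.
from transformers import AutoTokenizer
from transformers.data.processors.utils import InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = [
    InputExample(guid="0", text_a="The sky is blue.",
                 text_b="It is daytime.", label="entailment"),
    InputExample(guid="1", text_a="The sky is blue.",
                 text_b="It is night.", label="contradiction"),
]
features = convert_examples_to_features(
    examples,
    tokenizer,
    max_length=32,
    label_list=["entailment", "contradiction"],
    output_mode="classification",
)
print(features[0].input_ids[:10], features[0].label)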
def retrieval_examples_to_features(examples, tokenizer, max_length):
    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )
    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs)
        features.append(feature)
    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("features: %s" % features[i])
    return features
def _convert_examples_to_features(
    self,
    examples,
):
    labels = [float(example.label) for example in examples]
    batch_encoding = self.tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=self.max_length,
        pad_to_max_length=True,
    )
    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)
    return features
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    # NOTE: the task/label_list/output_mode arguments are effectively
    # ignored; the labels always come from TwitterProcessor.
    processor = TwitterProcessor()
    label_list = processor.get_labels()
    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        return label_map[example.label]

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("features: %s" % features[i])

    return features
def convert_examples_to_features(
    self,
    examples,
    tokenizer,
    max_length=None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample):
        if example.label is None:
            return None
        elif output_mode == 'classification':
            return label_map[example.label]
        elif output_mode == 'regression':
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)
    return features
def process_inputs(self, texts, labels=None, to_dataset=True):
    """Convert text to a tf dataset used as model input (e.g. for training)."""
    if labels is None:
        labels = repeat(0)
    # tokenize
    tokenized = []
    for text, label in zip(texts, labels):
        inputs = self.tokenizer.encode_plus(text,
                                            add_special_tokens=True,
                                            max_length=self.max_length,
                                            pad_to_max_length=True)
        tokenized.append(
            InputFeatures(input_ids=inputs['input_ids'],
                          attention_mask=inputs['attention_mask'],
                          token_type_ids=inputs['token_type_ids'],
                          label=label))
    if to_dataset:
        tokenized = self.to_dataset(tokenized)
    return tokenized
def __convert_examples_to_tf_dataset(self, data, max_length=128):
    """
    Performs the tokenization, capping each document at max_length tokens,
    and returns a tensorflow dataset.

    Every element of the dataset consists of:

    1. a dict with the tokenized text and the attention mask, used to
       specify which tokens are valid and which ones are used for padding
    2. the tweet label

    This format is known and used by Bert.

    :param data: input data. A list of InputExample objects.
    :type data: list
    :param max_length: fixed length of the tokenization
    :type max_length: int, optional
    :return: a tensorflow dataset as described before
    :rtype: tf.data.Dataset
    """
    # A list of InputFeatures, one per tweet. Every feature contains the
    # tweet's tokens, attention mask and label.
    # For more info: https://huggingface.co/transformers/main_classes/processors.html#transformers.data.processors.utils.InputFeatures
    features = []
    for sample in data:
        # For every tweet, build a dictionary containing the tweet's
        # tokens ('input_ids') and its attention mask ('attention_mask').
        input_dict = self.__tokenizer(
            # The tweet itself. Remember that the sample is an InputExample.
            sample.text_a,
            # Add the special tokens (e.g. [CLS] and [SEP]).
            add_special_tokens=True,
            # Fixed tweet vector length
            max_length=max_length,
            # Not needed because we are not comparing text_a to text_b,
            # since we don't have a text_b.
            # For more info: https://huggingface.co/transformers/glossary.html#token-type-ids
            return_token_type_ids=False,
            # Return a binary vector of length max_length. The vector takes
            # 1 when the corresponding token in the tweet representation is
            # valid, 0 if it is a special character used for padding.
            # For more info: https://huggingface.co/transformers/glossary.html#attention-mask
            return_attention_mask=True,
            # Padding added to the right
            padding='max_length',
            # Truncate the tweet if it is longer than max_length tokens
            truncation=True)
        input_ids, attention_mask = (input_dict['input_ids'],
                                     input_dict['attention_mask'])
        # Wrap every tweet in an InputFeatures object and add it to the list.
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          label=sample.label))

    # Generator used to convert the features list into a tensorflow dataset.
    def gen():
        for f in features:
            yield (
                {
                    'input_ids': f.input_ids,
                    'attention_mask': f.attention_mask,
                },
                f.label,
            )

    # Build the dataset from the generator.
    # For more info: https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
    return tf.data.Dataset.from_generator(
        gen,
        ({
            'input_ids': tf.int32,
            'attention_mask': tf.int32,
        }, tf.int64),
        (
            {
                'input_ids': tf.TensorShape([None]),
                'attention_mask': tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
def __init__(
    self,
    args: GlueMemDataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    mem_size=20,
    limit_length: Optional[int] = None,
    mode: Union[str, Split] = Split.train,
    cache_dir: Optional[str] = None,
):
    self.args = args
    self.processor = glue_processors[args.task_name]()
    self.output_mode = glue_output_modes[args.task_name]
    if isinstance(mode, str):
        try:
            mode = Split[mode]
        except KeyError:
            raise KeyError("mode is not a valid split name")

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        cache_dir if cache_dir is not None else args.data_dir,
        "cached_mem_{}_{}_{}_{}".format(
            mode.value,
            tokenizer.__class__.__name__,
            str(args.max_seq_length),
            args.task_name,
        ),
    )
    label_list = self.processor.get_labels()
    if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
            RobertaTokenizer,
            RobertaTokenizerFast,
            XLMRobertaTokenizer,
            BartTokenizer,
            BartTokenizerFast,
    ):
        # HACK: label indices are swapped in the RoBERTa pretrained model
        label_list[1], label_list[2] = label_list[2], label_list[1]
    self.label_list = label_list

    # Make sure only the first process in distributed training processes the
    # dataset; the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            start = time.time()
            self.features = torch.load(cached_features_file)
            logger.info(
                f"Loading features from cached file {cached_features_file} [took %.3f s]",
                time.time() - start)
        else:
            logger.info(f"Creating features from dataset file at {args.data_dir}")

            if mode == Split.dev:
                examples = self.processor.get_dev_examples(args.data_dir)
            elif mode == Split.test:
                examples = self.processor.get_test_examples(args.data_dir)
            else:
                examples = self.processor.get_train_examples(args.data_dir)
            if limit_length is not None:
                examples = examples[:limit_length]
            # Reserve room for the memory tokens inside max_seq_length.
            self.features = glue_convert_examples_to_features(
                examples,
                tokenizer,
                max_length=args.max_seq_length - args.mem_size,
                label_list=label_list,
                output_mode=self.output_mode,
            )

            # Insert `mem_size` copies of the '[mem]' token right after
            # [CLS], and extend the attention mask / token type ids to match.
            mem_id = tokenizer.added_tokens_encoder['[mem]']
            input_ids = [el.input_ids for el in self.features]
            input_ids = [[el[0]] + args.mem_size * [mem_id] + el[1:]
                         for el in input_ids]
            attention_mask = [el.attention_mask for el in self.features]
            attention_mask = [args.mem_size * [1] + el for el in attention_mask]
            token_type_ids = [el.token_type_ids for el in self.features]
            token_type_ids = [args.mem_size * [0] + el for el in token_type_ids]
            labels = [el.label for el in self.features]
            self.features = [
                InputFeatures(input_ids=el[0],
                              attention_mask=el[1],
                              token_type_ids=el[2],
                              label=el[3])
                for el in zip(input_ids, attention_mask, token_type_ids, labels)
            ]

            start = time.time()
            torch.save(self.features, cached_features_file)
            # ^ This seems to take a lot of time, so I want to investigate
            # why and how we can improve it.
            logger.info("Saving features into cached file %s [took %.3f s]",
                        cached_features_file,
                        time.time() - start)
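# Illustrative toy of the memory-token rewrite performed above, on made-up
# ids (assumed here: 101 = [CLS], 102 = [SEP], 99 = the added '[mem]' id).
mem_size = 3
mem_id = 99
input_ids = [101, 7592, 2088, 102]   # [CLS] hello world [SEP]
attention_mask = [1, 1, 1, 1]

# Keep [CLS] first, then splice in the memory tokens.
input_ids = [input_ids[0]] + mem_size * [mem_id] + input_ids[1:]
attention_mask = mem_size * [1] + attention_mask

print(input_ids)       # [101, 99, 99, 99, 7592, 2088, 102]
print(attention_mask)  # [1, 1, 1, 1, 1, 1, 1]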
def convert_examples_to_features(
    examples,
    tokenizer,
    processor,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Adapted from glue_convert_examples_to_features from transformers.
    Loads a data file into a list of ``InputFeatures``.

    Args:
        examples: List of ``InputExamples``.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using
            the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression``
            or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the
            left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is
            usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will
            be filled by ``1`` for actual values and by ``0`` for padded
            values. If set to ``False``, inverts it (``1`` for padded values,
            ``0`` for actual values)

    Returns:
        a list of task-specific ``InputFeatures`` which can be fed to the
        model.
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % ex_index)
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] *
                                               padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(
            len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            ))
    return features
def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing
            the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using
            the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression``
            or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the
            left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is
            usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will
            be filled by ``1`` for actual values and by ``0`` for padded
            values. If set to ``False``, inverts it (``1`` for padded values,
            ``0`` for actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a
        ``tf.data.Dataset`` containing the task-specific features. If the
        input is a list of ``InputExamples``, will return a list of
        task-specific ``InputFeatures`` which can be fed to the model.
    """
    logger.info("I am using the right script!")
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = 0
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
            len_examples = tf.data.experimental.cardinality(examples)
        else:
            len_examples = len(examples)
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d" % (ex_index, len_examples))

        # Encode the two texts separately; each gets its own ids, mask and
        # segment ids (siamese-style rather than a single packed pair).
        input_ids_1, attention_mask_1, token_type_ids_1 = encode_text(
            tokenizer, pad_token, example.text_a, [], [], [])
        input_ids_2, attention_mask_2, token_type_ids_2 = encode_text(
            tokenizer, pad_token, example.text_b, [], [], [])
        input_ids = (input_ids_1, input_ids_2)
        attention_mask = (attention_mask_1, attention_mask_2)
        token_type_ids = (token_type_ids_1, token_type_ids_2)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_len: %s" % " ".join([str(len(x)) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )
    return features
def convert_examples_to_features(
        examples: List[InputExample],
        max_seq_len: int,
        tokenizer: PreTrainedTokenizer,
        cls_token_segment_id=0,
        pad_token_segment_id=0,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True) -> List[InputFeatures]:
    """Loads a data file into a list of `InputFeatures`"""
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens = tokenizer.tokenize(example.text_a)

        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]

        # Add [SEP] token
        tokens += [sep_token]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token
        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] *
                                           padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(
            len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len)

        label_id = example.label
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label_id))
    return features
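# Toy walk-through of the manual [CLS]/[SEP] assembly above, assuming a
# BERT-style tokenizer; the printed ids are what bert-base-uncased produces,
# shown only as an illustration.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("hello world")          # ['hello', 'world']
tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
attention_mask = [1] * len(input_ids)

max_seq_len = 8
pad = max_seq_len - len(input_ids)
input_ids += [tokenizer.pad_token_id] * pad
attention_mask += [0] * pad

print(tokens)          # ['[CLS]', 'hello', 'world', '[SEP]']
print(input_ids)       # e.g. [101, 7592, 2088, 102, 0, 0, 0, 0]
print(attention_mask)  # [1, 1, 1, 1, 0, 0, 0, 0]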
def glue_convert_examples_to_features(examples,
                                      tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing
            the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using
            the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression``
            or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the
            left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is
            usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will
            be filled by ``1`` for actual values and by ``0`` for padded
            values. If set to ``False``, inverts it (``1`` for padded values,
            ``0`` for actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a
        ``tf.data.Dataset`` containing the task-specific features. If the
        input is a list of ``InputExamples``, will return a list of
        task-specific ``InputFeatures`` which can be fed to the model.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % ex_index)
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] *
                                               padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(
            len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield ({
                    'input_ids': ex.input_ids,
                    'attention_mask': ex.attention_mask,
                    'token_type_ids': ex.token_type_ids
                }, ex.label)

        return tf.data.Dataset.from_generator(
            gen, ({
                'input_ids': tf.int32,
                'attention_mask': tf.int32,
                'token_type_ids': tf.int32
            }, tf.int64), ({
                'input_ids': tf.TensorShape([None]),
                'attention_mask': tf.TensorShape([None]),
                'token_type_ids': tf.TensorShape([None])
            }, tf.TensorShape([])))

    return features
def _convert_examples_to_features(self,
                                  mode: str,
                                  tokenizer: PreTrainedTokenizer,
                                  return_dataset: str = "tf"):
    features = []
    for (ex_index, example) in enumerate(self.examples[mode]):
        if ex_index % 10000 == 0:
            logger.info("Tokenizing example %d", ex_index)

        # This can now be done in one batch (see transformers)
        # and will be sped up even further in the coming months.
        feature = tokenizer.encode_plus(example.text_a,
                                        example.text_b,
                                        add_special_tokens=True,
                                        max_length=self.max_seq_length,
                                        pad_to_max_length=True)
        label = self.labels.index(example.label)

        assert len(feature["input_ids"]) == self.max_seq_length
        assert len(feature["attention_mask"]) == self.max_seq_length
        assert len(feature["token_type_ids"]) == self.max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("input_ids: %s" %
                        " ".join([str(x) for x in feature["input_ids"]]))
            logger.info("attention_mask: %s" %
                        " ".join([str(x) for x in feature["attention_mask"]]))
            logger.info("token_type_ids: %s" %
                        " ".join([str(x) for x in feature["token_type_ids"]]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=feature["input_ids"],
                          attention_mask=feature["attention_mask"],
                          token_type_ids=feature["token_type_ids"],
                          label=label))

    if len(features) == 0:
        return None

    if return_dataset == "tf":
        if not is_tf_available():
            raise RuntimeError(
                "return_dataset set to 'tf' but TensorFlow 2.0 can't be imported")
        import tensorflow as tf

        def gen():
            for ex in features:
                yield ({
                    "input_ids": ex.input_ids,
                    "attention_mask": ex.attention_mask,
                    "token_type_ids": ex.token_type_ids
                }, ex.label)

        dataset = tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            ({
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None])
            }, tf.TensorShape([])),
        )
        return dataset
    elif return_dataset == "pt":
        if not is_torch_available():
            raise RuntimeError(
                "return_dataset set to 'pt' but PyTorch can't be imported")
        import torch
        from torch.utils.data import TensorDataset

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                          dtype=torch.long)
        token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                token_type_ids, all_labels)
        return dataset
    else:
        raise ValueError("return_dataset should be one of 'tf' or 'pt'")
def _convert_examples_to_features(self,
                                  mode: str,
                                  tokenizer: PreTrainedTokenizer,
                                  return_dataset: str = "tf"):
    features = []
    for (ex_index, example) in enumerate(self.examples[mode]):
        if ex_index % 10000 == 0:
            logger.info("Tokenizing example %d", ex_index)

        tokens = []
        label_ids = []
        for word, label in zip(example.text_a, example.label):
            word_tokens = tokenizer.tokenize(word)
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Only the first sub-token keeps the word's label index;
                # continuation sub-tokens get -1 so they can be ignored.
                label_ids.extend([self.labels.index(label)] +
                                 [-1] * (len(word_tokens) - 1))

        special_tokens_count = tokenizer.num_special_tokens_to_add()
        if len(tokens) > self.max_seq_length - special_tokens_count:
            tokens = tokens[:(self.max_seq_length - special_tokens_count)]
            label_ids = label_ids[:(self.max_seq_length - special_tokens_count)]

        tokens += [tokenizer.sep_token]
        label_ids += [-1]
        segment_ids = [0] * len(tokens)

        tokens = [tokenizer.cls_token] + tokens
        label_ids = [-1] + label_ids
        segment_ids = [0] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        padding_length = self.max_seq_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        input_mask += [0] * padding_length
        segment_ids += [tokenizer.pad_token_type_id] * padding_length
        label_ids += [-1] * padding_length

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        assert len(label_ids) == self.max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s" % label_ids)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=input_mask,
                          token_type_ids=segment_ids,
                          label=label_ids))

    if len(features) == 0:
        return None

    if return_dataset == "tf":
        if not is_tf_available():
            raise RuntimeError(
                "return_dataset set to 'tf' but TensorFlow 2.0 can't be imported")
        import tensorflow as tf

        def gen():
            for ex in features:
                yield ({
                    "input_ids": ex.input_ids,
                    "attention_mask": ex.attention_mask,
                    "token_type_ids": ex.token_type_ids
                }, ex.label)

        dataset = tf.data.Dataset.from_generator(
            gen,
            ({
                "input_ids": tf.int32,
                "attention_mask": tf.int32,
                "token_type_ids": tf.int32
            }, tf.int64),
            ({
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None])
            }, tf.TensorShape([None])),
        )
        return dataset
    elif return_dataset == "pt":
        if not is_torch_available():
            raise RuntimeError(
                "return_dataset set to 'pt' but PyTorch can't be imported")
        import torch
        from torch.utils.data import TensorDataset

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                          dtype=torch.long)
        token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                token_type_ids, all_labels)
        return dataset
    else:
        raise ValueError("return_dataset should be one of 'tf' or 'pt'")
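# Toy illustration of the word-to-subword label alignment above: the first
# subword keeps the word's label index, continuation subwords get -1 so the
# loss can skip them. The label set, words and subword splits are made up.
labels = ["O", "B-PER", "I-PER"]
words = ["John", "Smithson"]
word_labels = ["B-PER", "I-PER"]
# Suppose the tokenizer splits "Smithson" into ['smith', '##son'].
word_tokens = {"John": ["john"], "Smithson": ["smith", "##son"]}

tokens, label_ids = [], []
for word, label in zip(words, word_labels):
    pieces = word_tokens[word]
    tokens.extend(pieces)
    label_ids.extend([labels.index(label)] + [-1] * (len(pieces) - 1))

print(tokens)     # ['john', 'smith', '##son']
print(label_ids)  # [1, 2, -1]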
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = {}
    batch_encoding["input_ids"] = []
    batch_encoding["attention_mask"] = []
    batch_encoding["token_type_ids"] = []
    # Previously this used the batched tokenizer call:
    # batch_encoding = tokenizer(
    #     [(example.text_a, example.text_b) for example in examples],
    #     max_length=max_length,
    #     padding="max_length",
    #     truncation=True,
    # )

    # NOTE: the hard-coded ids assume a BERT-style vocabulary where
    # 101 = [CLS], 102 = [SEP] and 103 = [MASK]; padding here uses the
    # [MASK] id rather than the usual [PAD] id.
    def _encode(x, max_length, doc=False):
        input_ids = tokenizer.encode(x,
                                     add_special_tokens=False,
                                     max_length=max_length)
        padding_length = max_length - len(input_ids) - 2
        attention_mask = [1] * len(input_ids) + [0] * padding_length
        input_ids = input_ids + [103] * padding_length
        return input_ids, attention_mask

    for example in examples:
        x, y = example.text_a, example.text_b
        # Query segment: [CLS] + up to 20 tokens + [SEP], segment id 0.
        ids1, mask1 = _encode(x, max_length=20)
        ids1 = [101] + ids1 + [102]
        mask1 = [1] + mask1 + [1]
        tids1 = [0] * len(ids1)
        # Document segment: up to 489 tokens + [SEP], segment id 1.
        ids2, mask2 = _encode(y, max_length=489)
        ids2 = ids2 + [102]
        mask2 = mask2 + [1]
        tids2 = [1] * len(ids2)

        input_ids = ids1 + ids2
        attention_mask = mask1 + mask2
        token_type_ids = tids1 + tids2
        batch_encoding["input_ids"].append(input_ids)
        batch_encoding["attention_mask"].append(attention_mask)
        batch_encoding["token_type_ids"].append(token_type_ids)

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logging.info("*** Example ***")
        logging.info("guid: %s" % example.guid)
        logging.info("features: %s" % features[i])

    return features
# Fragment from inside a per-row feature-building loop: `full_length_encoded`
# and `row` come from the surrounding (not shown) iteration. The list was
# originally appended to as `feature_v` but saved as `features`; the name is
# unified to `features` here.
# if ex_index > 100:
#     break
# if len(full_length_encoded) < 5000:
#     continue

input_ids_list = []
attention_mask_list = []
token_type_ids_list = []
input_ids_list = full_length_encoded

features.append(
    InputFeatures(
        input_ids=input_ids_list,
        attention_mask=attention_mask_list,
        token_type_ids=token_type_ids_list,
        label=1 if row.scum else 0,
    ))

logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)

from torch.utils.data import Dataset


class MyDS(Dataset):
    def __init__(self, features):
        self.features = features

    def __getitem__(self, idx):
        return self.features[idx]

    def __len__(self):
        return len(self.features)
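# Minimal usage sketch for MyDS, assuming `features` is the list of
# InputFeatures built above. The identity collate_fn keeps each batch as a
# plain list of InputFeatures, since the default collate cannot stack them.
from torch.utils.data import DataLoader

ds = MyDS(features)
loader = DataLoader(ds, batch_size=8, shuffle=True, collate_fn=lambda b: b)
for batch in loader:
    # `batch` is a list of InputFeatures; convert fields to tensors as needed.
    print(len(batch))
    break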