def set_seed(seed: int):
    """
    Seed every available random number generator so that runs are reproducible.

    ``random`` and ``numpy`` are always seeded. ``torch`` and ``tf`` are only imported and
    seeded when ``is_torch_available()`` / ``is_tf_available()`` report them as installed.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    # The pure-Python and NumPy generators are unconditional dependencies.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        import torch

        torch.manual_seed(seed)
        # Safe to call even when CUDA is not available.
        torch.cuda.manual_seed_all(seed)

    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)
import unittest import sys, os import pdb sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../src/") from file_utils import is_tf_available from testing_utils import require_tf import json if is_tf_available(): import datetime, pickle, codecs, re, string from tqdm import tqdm import tensorflow as tf import pandas as pd import numpy as np import string from preprocess.utils import ( Params, get_dataset, fix_fn, _py_fn, load_subword_embedding, normalize, ) from models.hierarchical_attention.han import HAN_Model @require_tf class TestHANLoader(unittest.TestCase): @classmethod
def _pad_sequence(values, fill, pad_len, pad_on_left):
    """Return ``values`` extended with ``pad_len`` copies of ``fill`` on the chosen side."""
    padding = [fill] * pad_len
    return padding + values if pad_on_left else values + padding


def _build_align_mask(first_len, second_len, pad_len, row_len, pad_on_left, mask_padding_with_zero):
    """
    Build the 2-D alignment mask between the two segments of one padded sequence pair.

    The mask has one row per position of the padded sequence. Rows belonging to the first
    segment mark only the second segment's columns as attended; rows belonging to the second
    segment mark only the first segment's columns as attended; padding rows mark nothing.
    With ``pad_on_left`` the row/column layout is ``padding, first, second``; otherwise it is
    ``first, second, padding``.

    Args:
        first_len: Number of positions in the first segment.
        second_len: Number of positions in the second segment.
        pad_len: Number of padding positions.
        row_len: Length of a padding row (the length of the already padded ``input_ids``).
        pad_on_left: Whether padding precedes the real tokens.
        mask_padding_with_zero: If ``True`` attended cells are ``1`` and the others ``0``;
            if ``False`` the two values are swapped.

    Returns:
        A list of rows (lists of ints).
    """
    attend = 1 if mask_padding_with_zero else 0
    ignore = 0 if mask_padding_with_zero else 1

    # NOTE: ``[row] * n`` repeats references to one row object, exactly as the mask was
    # always built here; the rows must be treated as read-only by callers.
    pad_rows = [[ignore] * row_len] * pad_len
    if pad_on_left:
        first_rows = [[ignore] * (pad_len + first_len) + [attend] * second_len] * first_len
        second_rows = [[ignore] * pad_len + [attend] * first_len + [ignore] * second_len] * second_len
        return pad_rows + first_rows + second_rows

    first_rows = [[ignore] * first_len + [attend] * second_len + [ignore] * pad_len] * first_len
    second_rows = [[attend] * first_len + [ignore] * (second_len + pad_len)] * second_len
    return first_rows + second_rows + pad_rows


def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Convert examples into a list of ``InputFeatures`` holding two encoded sequence pairs each.

    Every example is encoded twice with ``tokenizer.encode_plus``: once as
    ``(text_a, text_b)`` and once as ``(text_a, text_c)``. For each pair the function produces
    padded ``input_ids``, ``attention_mask`` and ``token_type_ids`` plus a 2-D ``align_mask``
    between the two segments. The second pair's fields carry the suffix ``2``.

    Args:
        examples: List of examples or ``tf.data.Dataset`` containing the examples. Each example
            is read through its ``text_a``, ``text_b``, ``text_c``, ``label``, ``label2`` and
            ``guid`` attributes.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        A list of ``InputFeatures``, one per example. Unlike the single-pair GLUE converter, this
        function returns a list even when ``examples`` is a ``tf.data.Dataset``.

    Raises:
        ValueError: If ``examples`` is a non-empty ``tf.data.Dataset`` and no ``task`` was given
            (a processor is needed to decode the tensor dicts).
        KeyError: If an example has a label and ``output_mode`` is neither ``classification``
            nor ``regression``.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    processor = None
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s", label_list, task)
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s", output_mode, task)

    label_map = {label: i for i, label in enumerate(label_list)}

    def to_label(raw_label):
        # Map a raw example label to its model target; ``None`` stays ``None``.
        if raw_label is None:
            return None
        if output_mode == "classification":
            return label_map[raw_label]
        if output_mode == "regression":
            return float(raw_label)
        raise KeyError(output_mode)

    real_token = 1 if mask_padding_with_zero else 0
    padded_token = 0 if mask_padding_with_zero else 1
    length_error = "Error with input length {} vs {}"

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d", ex_index)
        if is_tf_dataset:
            if processor is None:
                raise ValueError("A task must be given to convert examples from a tf.data.Dataset")
            example = processor.get_example_from_tensor_dict(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        inputs2 = tokenizer.encode_plus(
            example.text_a,
            example.text_c,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        input_ids2, token_type_ids2 = inputs2["input_ids"], inputs2["token_type_ids"]

        # Segment lengths are derived from the token type ids: everything typed 0 is counted
        # as the first segment. The first segment's length from the (a, b) encoding is reused
        # for the (a, c) encoding - presumably both encodings keep text_a identical; confirm
        # for tokenizers that truncate the first segment.
        text_a_len = token_type_ids.count(0)
        text_b_len = len(token_type_ids) - text_a_len
        text_c_len = len(token_type_ids2) - text_a_len

        # The mask marks real tokens and padding tokens with different values. Only real
        # tokens are attended to.
        attention_mask = [real_token] * len(input_ids)
        attention_mask2 = [real_token] * len(input_ids2)

        # Pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        padding_length2 = max_length - len(input_ids2)

        input_ids = _pad_sequence(input_ids, pad_token, padding_length, pad_on_left)
        input_ids2 = _pad_sequence(input_ids2, pad_token, padding_length2, pad_on_left)
        attention_mask = _pad_sequence(attention_mask, padded_token, padding_length, pad_on_left)
        attention_mask2 = _pad_sequence(attention_mask2, padded_token, padding_length2, pad_on_left)
        token_type_ids = _pad_sequence(token_type_ids, pad_token_segment_id, padding_length, pad_on_left)
        token_type_ids2 = _pad_sequence(token_type_ids2, pad_token_segment_id, padding_length2, pad_on_left)

        align_mask = _build_align_mask(
            text_a_len, text_b_len, padding_length, len(input_ids), pad_on_left, mask_padding_with_zero
        )
        align_mask2 = _build_align_mask(
            text_a_len, text_c_len, padding_length2, len(input_ids2), pad_on_left, mask_padding_with_zero
        )

        # Each message reports the length of the sequence that was actually checked.
        for checked in (
            input_ids,
            attention_mask,
            align_mask[0],
            token_type_ids,
            input_ids2,
            attention_mask2,
            align_mask2[0],
            token_type_ids2,
        ):
            assert len(checked) == max_length, length_error.format(len(checked), max_length)

        label = to_label(example.label)
        label2 = to_label(example.label2)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s", " ".join([str(x) for x in token_type_ids]))
            logger.info("input_ids2: %s", " ".join([str(x) for x in input_ids2]))
            logger.info("attention_mask2: %s", " ".join([str(x) for x in attention_mask2]))
            logger.info("token_type_ids2: %s", " ".join([str(x) for x in token_type_ids2]))
            if label is not None:
                logger.info("label: %s (id = %d)", example.label, label)
            if label2 is not None:
                logger.info("label2: %s (id = %d)", example.label2, label2)

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                align_mask=align_mask,
                token_type_ids=token_type_ids,
                label=label,
                input_ids2=input_ids2,
                attention_mask2=attention_mask2,
                align_mask2=align_mask2,
                token_type_ids2=token_type_ids2,
                label2=label2,
            )
        )

    return features
def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Each example is encoded with ``tokenizer.encode_plus(text_a, text_b)`` and its
    ``input_ids``, ``attention_mask`` and ``token_type_ids`` are padded to ``max_length``.

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    Raises:
        ValueError: If ``examples`` is a non-empty ``tf.data.Dataset`` and no ``task`` was given
            (a processor is needed to decode the tensor dicts).
        KeyError: If ``output_mode`` is neither ``classification`` nor ``regression``.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    processor = None
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s", label_list, task)
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s", output_mode, task)

    label_map = {label: i for i, label in enumerate(label_list)}

    # The total only feeds the progress log and does not change while iterating, so it is
    # computed once here instead of on every example.
    if is_tf_dataset:
        len_examples = tf.data.experimental.cardinality(examples)
    else:
        len_examples = len(examples)

    real_token = 1 if mask_padding_with_zero else 0
    padded_token = 0 if mask_padding_with_zero else 1
    length_error = "Error with input length {} vs {}"

    features = []
    for ex_index, example in enumerate(examples):
        if is_tf_dataset:
            if processor is None:
                raise ValueError("A task must be given to convert examples from a tf.data.Dataset")
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)

        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d", ex_index, len_examples)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask marks real tokens and padding tokens with different values. Only real
        # tokens are attended to.
        attention_mask = [real_token] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        ids_padding = [pad_token] * padding_length
        mask_padding = [padded_token] * padding_length
        type_padding = [pad_token_segment_id] * padding_length
        if pad_on_left:
            input_ids = ids_padding + input_ids
            attention_mask = mask_padding + attention_mask
            token_type_ids = type_padding + token_type_ids
        else:
            input_ids = input_ids + ids_padding
            attention_mask = attention_mask + mask_padding
            token_type_ids = token_type_ids + type_padding

        for checked in (input_ids, attention_mask, token_type_ids):
            assert len(checked) == max_length, length_error.format(len(checked), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s", " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)", example.label, label)

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            )
        )

    if is_tf_available() and is_tf_dataset:

        def gen():
            # Yield (inputs, label) pairs in the structure declared to ``from_generator``.
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
def squad_convert_examples_to_features(
    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1
):
    """
    Turn SQuAD examples into model-ready features, optionally wrapped in a framework dataset.

    The per-example conversion is delegated to ``squad_convert_example_to_features`` and run in a
    process pool. Afterwards every feature is stamped with the index of the example it came
    from and with a running unique id starting at 1000000000.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset
        threads: number of worker processes to use (capped at ``cpu_count()``)

    Returns:
        The list of features when ``return_dataset`` is neither ``'pt'`` nor ``'tf'``;
        a ``(features, dataset)`` tuple for ``'pt'``; only the ``tf.data.Dataset`` for ``'tf'``.

    Raises:
        RuntimeError: If the requested framework is not installed.

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """
    # Fan the per-example conversion out over a pool of worker processes.
    worker_count = min(threads, cpu_count())
    with Pool(worker_count, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as pool:
        convert_one = partial(
            squad_convert_example_to_features,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=is_training,
        )
        converted = pool.imap(convert_one, examples, chunksize=32)
        per_example_features = list(
            tqdm(
                converted,
                total=len(examples),
                desc="convert squad examples to features",
                mininterval=5,
            )
        )

    # Flatten the per-example lists. Examples that produced no features are skipped and do
    # not consume an example index.
    features = []
    next_unique_id = 1000000000
    kept_examples = 0
    progress = tqdm(
        per_example_features,
        total=len(per_example_features),
        desc="add example index and unique id",
        mininterval=5,
    )
    for feature_group in progress:
        if not feature_group:
            continue
        for feature in feature_group:
            feature.example_index = kept_examples
            feature.unique_id = next_unique_id
            next_unique_id += 1
            features.append(feature)
        kept_examples += 1

    if return_dataset == "pt":
        if not is_torch_available():
            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)

        if is_training:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids,
                all_attention_masks,
                all_token_type_ids,
                all_start_positions,
                all_end_positions,
                all_cls_index,
                all_p_mask,
                all_is_impossible,
            )
        else:
            # Evaluation datasets carry the row index instead of answer positions.
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
            )

        return features, dataset

    if return_dataset == "tf":
        if not is_tf_available():
            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")

        def gen():
            # Yield (model inputs, targets) pairs matching the declared types and shapes.
            for ex in features:
                model_inputs = {
                    "input_ids": ex.input_ids,
                    "attention_mask": ex.attention_mask,
                    "token_type_ids": ex.token_type_ids,
                }
                targets = {
                    "start_position": ex.start_position,
                    "end_position": ex.end_position,
                    "cls_index": ex.cls_index,
                    "p_mask": ex.p_mask,
                    "is_impossible": ex.is_impossible,
                }
                yield (model_inputs, targets)

        output_types = (
            {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
            {
                "start_position": tf.int64,
                "end_position": tf.int64,
                "cls_index": tf.int64,
                "p_mask": tf.int32,
                "is_impossible": tf.int32,
            },
        )
        output_shapes = (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            {
                "start_position": tf.TensorShape([]),
                "end_position": tf.TensorShape([]),
                "cls_index": tf.TensorShape([]),
                "p_mask": tf.TensorShape([None]),
                "is_impossible": tf.TensorShape([]),
            },
        )
        return tf.data.Dataset.from_generator(gen, output_types, output_shapes)

    return features
def get_features(
    self,
    tokenizer,
    max_length=None,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
    return_tensors=None,
):
    """
    Convert examples in a list of ``InputFeatures``

    Every example's ``text_a`` is encoded with ``tokenizer.encode`` and all sequences are then
    padded to the length of the longest encoded example (not to ``max_length``). Labels come
    from ``self.labels`` / ``self.mode``. An empty ``self.examples`` yields an empty result
    instead of an error.

    Args:
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length. Defaults to ``tokenizer.max_len`` and is always
            capped by it.
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
        return_tensors: ``None`` to get a list of ``InputFeatures``, ``"tf"`` to get a
            ``tf.data.Dataset``, ``"pt"`` to get a ``torch.utils.data.TensorDataset``.

    Returns:
        A list of ``InputFeatures``, a ``tf.data.Dataset`` or a ``TensorDataset``, depending on
        ``return_tensors``.

    Raises:
        ValueError: If ``self.mode`` is neither ``classification`` nor ``regression``, or if
            ``return_tensors`` is not ``None``, ``"tf"`` or ``"pt"``.
        RuntimeError: If the framework requested through ``return_tensors`` is not installed.
    """
    if max_length is None:
        max_length = tokenizer.max_len

    label_map = {label: i for i, label in enumerate(self.labels)}

    # First pass: tokenize everything so the longest sequence is known before padding.
    all_input_ids = []
    for ex_index, example in enumerate(self.examples):
        if ex_index % 10000 == 0:
            logger.info("Tokenizing example %d", ex_index)

        input_ids = tokenizer.encode(
            example.text_a,
            add_special_tokens=True,
            max_length=min(max_length, tokenizer.max_len),
        )
        all_input_ids.append(input_ids)

    # ``default=0`` keeps an empty example list from raising inside ``max``.
    batch_length = max((len(input_ids) for input_ids in all_input_ids), default=0)

    real_token = 1 if mask_padding_with_zero else 0
    padded_token = 0 if mask_padding_with_zero else 1
    length_error = "Error with input length {} vs {}"

    # Second pass: pad, attach labels and build the features.
    features = []
    for ex_index, (input_ids, example) in enumerate(zip(all_input_ids, self.examples)):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d", ex_index, len(self.examples))

        # The mask marks real tokens and padding tokens with different values. Only real
        # tokens are attended to.
        attention_mask = [real_token] * len(input_ids)

        # Pad up to the longest sequence.
        padding_length = batch_length - len(input_ids)
        ids_padding = [pad_token] * padding_length
        mask_padding = [padded_token] * padding_length
        if pad_on_left:
            input_ids = ids_padding + input_ids
            attention_mask = mask_padding + attention_mask
        else:
            input_ids = input_ids + ids_padding
            attention_mask = attention_mask + mask_padding

        for checked in (input_ids, attention_mask):
            assert len(checked) == batch_length, length_error.format(len(checked), batch_length)

        if self.mode == "classification":
            label = label_map[example.label]
        elif self.mode == "regression":
            label = float(example.label)
        else:
            raise ValueError(self.mode)

        if ex_index < 5 and self.verbose:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
            logger.info("label: %s (id = %d)", example.label, label)

        features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))

    if return_tensors is None:
        return features

    if return_tensors == "tf":
        if not is_tf_available():
            raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
        import tensorflow as tf

        def gen():
            # Yield (inputs, label) pairs in the structure declared to ``from_generator``.
            for ex in features:
                yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)

        dataset = tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
            ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
        )
        return dataset

    if return_tensors == "pt":
        if not is_torch_available():
            raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
        import torch
        from torch.utils.data import TensorDataset

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        if self.mode == "classification":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif self.mode == "regression":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
        else:
            # Only reachable with no examples; otherwise the loop above already raised.
            raise ValueError(self.mode)

        dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
        return dataset

    raise ValueError("return_tensors should be one of 'tf' or 'pt'")