def __init__(self, dataframe, tokenizer, max_seq_length=256):
     """
     Args:
       dataframe: A Pandas dataframe containing the data.
       tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
         tokenize the data.
       max_seq_length: Maximum sequence length to either pad or truncate every
         input example to.
     """
     self.encoded_data = data_utils.encode_data(dataframe, tokenizer,
                                                max_seq_length)
     self.label_list = data_utils.extract_labels(dataframe)
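For reference, an `__init__` like the one above usually pairs with `__len__` and `__getitem__` so the class works as a `torch.utils.data.Dataset`. A minimal sketch, assuming `encoded_data` is the `(input_ids, attention_mask)` tuple and `label_list` holds integer labels; the class name and the dict keys are illustrative conventions, not taken from these examples:

import torch
from torch.utils.data import Dataset
import data_utils  # assumed to provide encode_data() and extract_labels()

class BoolQDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_seq_length=256):
        # Same construction pattern as the example above.
        self.encoded_data = data_utils.encode_data(dataframe, tokenizer,
                                                   max_seq_length)
        self.label_list = data_utils.extract_labels(dataframe)

    def __len__(self):
        # One example per row of the original dataframe.
        return len(self.label_list)

    def __getitem__(self, i):
        # Assumed layout: encoded_data == (input_ids, attention_mask).
        input_ids, attention_mask = self.encoded_data
        return {
            "input_ids": input_ids[i],
            "attention_mask": attention_mask[i],
            "labels": torch.tensor(self.label_list[i], dtype=torch.long),
        }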
Example #2
File: boolq.py  Project: amishavyas/hw3
    def __init__(self, dataframe, tokenizer, max_seq_length=256):
        """
        Args:
          dataframe: A Pandas dataframe containing the data.
          tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
            tokenize the data.
          max_seq_length: Maximum sequence length to either pad or truncate every
            input example to.
        """
        ## Use encode_data() from data_utils to store the input IDs and
        ## attention masks for the data.
        # encode_data() returns a tuple of two tensors: (input_ids, attention_mask).
        self.encoded_data = data_utils.encode_data(dataframe, tokenizer,
                                                   max_seq_length)

        ## Use extract_labels() from data_utils to store the labels.
        # extract_labels() returns a list of integer labels (0 or 1).
        self.label_list = data_utils.extract_labels(dataframe)
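None of the examples show `data_utils.encode_data` itself. With a `transformers.PreTrainedTokenizerFast`, a function matching this contract (a tuple of two `[num_examples, max_seq_length]` long tensors) could plausibly look like the sketch below; the `question`/`passage` column names are assumptions based on the BoolQ task:

def encode_data(dataframe, tokenizer, max_seq_length=256):
    """Sketch: tokenize question/passage pairs into fixed-length tensors."""
    encoded = tokenizer(
        dataframe["question"].tolist(),  # assumed column name
        dataframe["passage"].tolist(),   # assumed column name
        padding="max_length",            # pad every example to max_seq_length
        truncation=True,
        max_length=max_seq_length,
        return_tensors="pt",
    )
    # Both tensors are [len(dataframe), max_seq_length] with dtype torch.long.
    return encoded["input_ids"], encoded["attention_mask"]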
Example #3
 def __init__(self,
              dataframe,
              tokenizer,
              level,
              label_space_file='',
              max_seq_length=256):
     """
     Args:
       dataframe: A Pandas dataframe containing the data.
       tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
         tokenize the data.
       level: The level of the label hierarchy at which labels are extracted.
       label_space_file: Path to a file defining the label space, mapping each
         dialect to an integer ID.
       max_seq_length: Maximum sequence length to either pad or truncate every
         input example to.
     """
     self.encoded_data = data_utils.encode_data(dataframe, tokenizer,
                                                max_seq_length)
     self.label_list = data_utils.extract_labels(dataframe, level,
                                                 label_space_file)
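Example #3 passes a `level` and a `label_space_file` to `extract_labels`, which suggests a hierarchical dialect-classification setup. A loose sketch of one way such a function could work; the JSON layout and the `dialect` column name are pure assumptions:

import json

def extract_labels(dataframe, level, label_space_file=''):
    # Assumed file format: JSON mapping each level to a dialect -> int table,
    # e.g. {"region": {"gulf": 0, "levant": 1}, "country": {...}}.
    with open(label_space_file) as f:
        label_space = json.load(f)[level]
    # Assumed column name holding the raw dialect string.
    return [label_space[d] for d in dataframe["dialect"]]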
Example #4
    def test_encode_data(self):
        ## Unit test asserting that the dimensions and dtype of the output of
        ## encode_data() are correct:
        ##   input_ids should have shape [len(self.dataset), self.max_seq_len]
        ##   and dtype torch.long; attention_mask should match both.
        input_ids, attention_mask = data_utils.encode_data(
            self.dataset, self.tokenizer, self.max_seq_len)

        expected_shape = (len(self.dataset), self.max_seq_len)
        expected_dtype = torch.long

        # Both tensors must be [num_examples, max_seq_len] ...
        self.assertEqual(input_ids.shape, expected_shape)
        self.assertEqual(attention_mask.shape, expected_shape)
        # ... and integer-typed, as transformers models expect.
        self.assertEqual(input_ids.dtype, expected_dtype)
        self.assertEqual(attention_mask.dtype, expected_dtype)
Example #5
 def test_encode_data(self):
     # Same checks as Example #4, written more compactly.
     input_ids, attention_mask = data_utils.encode_data(
         self.dataset, self.tokenizer, self.max_seq_len)
     self.assertEqual(list(input_ids.shape), [len(self.dataset), self.max_seq_len])
     self.assertEqual(list(attention_mask.shape), [len(self.dataset), self.max_seq_len])
     self.assertEqual(input_ids.dtype, torch.long)
     self.assertEqual(attention_mask.dtype, torch.long)
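Because every example is padded or truncated to the same `max_seq_len`, the dataset batches cleanly with a standard PyTorch `DataLoader`. A usage sketch, assuming the `__getitem__` convention from the sketch after Example #1; the dataset name and batch size are illustrative:

from torch.utils.data import DataLoader

# train_dataset: an instance of the dataset class shown in Examples #1-#3.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
for batch in train_loader:
    # Each batch is a dict of tensors shaped [batch_size, max_seq_len].
    print(batch["input_ids"].shape, batch["labels"].shape)
    break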