def __init__(self, dataframe, tokenizer, max_seq_length=256):
    """
    Args:
      dataframe: A Pandas dataframe containing the data.
      tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
        tokenize the data.
      max_seq_length: Maximum sequence length to either pad or truncate every
        input example to.
    """
    self.encoded_data = data_utils.encode_data(dataframe, tokenizer, max_seq_length)
    self.label_list = data_utils.extract_labels(dataframe)
def __init__(self, dataframe, tokenizer, max_seq_length=256):
    """
    Args:
      dataframe: A Pandas dataframe containing the data.
      tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
        tokenize the data.
      max_seq_length: Maximum sequence length to either pad or truncate every
        input example to.
    """
    # encode_data() from data_utils returns a tuple of two tensors:
    # (input_ids, attention_mask).
    self.encoded_data = data_utils.encode_data(
        dataframe, tokenizer, max_seq_length)
    # extract_labels() from data_utils returns a list of integer labels (0, 1, ...).
    self.label_list = data_utils.extract_labels(dataframe)
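# For the __init__ above to back a usable torch.utils.data.Dataset, the class
# also needs __len__ and __getitem__. A minimal sketch, assuming encoded_data
# is the (input_ids, attention_mask) tuple stored above; the returned dict
# keys follow the common transformers convention but are an assumption here.

def __len__(self):
    # Number of examples equals the first dimension of input_ids.
    input_ids, _ = self.encoded_data
    return input_ids.shape[0]


def __getitem__(self, i):
    # Return one example as a dict of tensors plus its label.
    input_ids, attention_mask = self.encoded_data
    return {
        "input_ids": input_ids[i],
        "attention_mask": attention_mask[i],
        "labels": self.label_list[i],
    }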
def __init__(self, dataframe, tokenizer, level, label_space_file='', max_seq_length=256):
    """
    Args:
      dataframe: A Pandas dataframe containing the data.
      tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
        tokenize the data.
      level: The level at which the labels are extracted.
      label_space_file: A file defining the label space, where each dialect is
        converted to an integer.
      max_seq_length: Maximum sequence length to either pad or truncate every
        input example to.
    """
    self.encoded_data = data_utils.encode_data(dataframe, tokenizer, max_seq_length)
    self.label_list = data_utils.extract_labels(dataframe, level, label_space_file)
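# Both __init__ variants rely on data_utils.encode_data() and
# data_utils.extract_labels(), which are not shown here. A minimal sketch of
# what they might look like follows: the tokenizer call uses the standard
# transformers __call__ API, but the "text"/"label" column names are
# assumptions for illustration. The level/label_space_file variant would
# additionally map each dialect string to an integer via the label-space file.

def encode_data(dataframe, tokenizer, max_seq_length=256):
    """Tokenizes the text column and returns (input_ids, attention_mask)."""
    encoding = tokenizer(
        dataframe["text"].tolist(),   # hypothetical column name
        padding="max_length",         # pad every example to max_seq_length
        truncation=True,              # truncate longer examples
        max_length=max_seq_length,
        return_tensors="pt",          # tensors come back as torch.long
    )
    return encoding["input_ids"], encoding["attention_mask"]


def extract_labels(dataframe):
    """Returns the labels as a plain Python list of integers (0, 1, ...)."""
    return dataframe["label"].tolist()  # hypothetical column name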
def test_encode_data(self):
    # encode_data() should return input_ids and attention_mask, both with
    # shape [len(self.dataset), self.max_seq_len] and dtype torch.long.
    input_ids, attention_mask = data_utils.encode_data(
        self.dataset, self.tokenizer, self.max_seq_len)
    shape = (len(self.dataset), self.max_seq_len)
    tensor_type = torch.long
    self.assertEqual(input_ids.shape, shape)             # shape of input_ids
    self.assertEqual(attention_mask.shape, shape)        # shape of attention_mask
    self.assertEqual(input_ids.dtype, tensor_type)       # dtype of input_ids
    self.assertEqual(attention_mask.dtype, tensor_type)  # dtype of attention_mask
def test_encode_data(self):
    input_ids, attention_mask = data_utils.encode_data(
        self.dataset, self.tokenizer, self.max_seq_len)
    self.assertEqual(list(input_ids.shape), [len(self.dataset), self.max_seq_len])
    self.assertEqual(list(attention_mask.shape), [len(self.dataset), self.max_seq_len])
    self.assertEqual(input_ids.dtype, torch.long)
    self.assertEqual(attention_mask.dtype, torch.long)
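# A minimal harness sketch for the two tests above, showing how self.dataset,
# self.tokenizer, and self.max_seq_len might be wired up in setUp. The toy
# dataframe, its column names, and the "bert-base-uncased" checkpoint are
# assumptions for illustration; only the unittest and transformers APIs used
# here are standard.

import unittest

import pandas as pd
import torch
from transformers import AutoTokenizer

import data_utils


class TestDataUtils(unittest.TestCase):
    def setUp(self):
        # Tiny in-memory dataset standing in for the real dataframe.
        self.dataset = pd.DataFrame(
            {"text": ["a short example", "another example"], "label": [0, 1]})
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.max_seq_len = 128


if __name__ == "__main__":
    unittest.main()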