def make_dataset(language: Language, mapping: dict, num_positive: int, num_negative: int):
    """Build an unshuffled ``LanguageDataset`` from strings generated by *language*.

    ``language.generate_batch_data`` is called twice: once for ``num_positive``
    strings with ``truth_value=True`` and once for ``num_negative`` strings
    with ``truth_value=False``. Both batches are passed through
    ``strings_to_tensor`` with the same maximum length (the longest generated
    string across both batches) and the given ``mapping``.

    Args:
        language: Object providing ``generate_batch_data``.
        mapping: Forwarded unchanged to ``strings_to_tensor``
            (presumably a symbol-to-index table -- confirm against that helper).
        num_positive: Number of ``truth_value=True`` strings to request.
        num_negative: Number of ``truth_value=False`` strings to request.

    Returns:
        A ``LanguageDataset`` built from the concatenated tensors, labels and
        lengths, with the ``truth_value=True`` examples first. The result is
        not shuffled; callers must shuffle at use.
    """
    pos_strings = language.generate_batch_data(
        num_datapoints=num_positive, truth_value=True)
    neg_strings = language.generate_batch_data(
        num_datapoints=num_negative, truth_value=False)

    # TODO: fix language._max_length
    # Until then, derive the common length from the generated data itself.
    longest = max(len(s) for s in pos_strings + neg_strings)

    # NOTE(review): truth_value=True strings are labelled 0 and
    # truth_value=False strings are labelled 1 -- confirm this is the
    # intended class-index convention.
    pos_tensor, pos_lengths = strings_to_tensor(pos_strings, longest, mapping)
    pos_labels = torch.zeros(len(pos_strings), dtype=torch.long)

    neg_tensor, neg_lengths = strings_to_tensor(neg_strings, longest, mapping)
    neg_labels = torch.ones(len(neg_strings), dtype=torch.long)

    # Unshuffled (all positives, then all negatives); must be shuffled at use.
    return LanguageDataset(
        torch.cat((pos_tensor, neg_tensor)),
        torch.cat((pos_labels, neg_labels)),
        torch.cat((pos_lengths, neg_lengths)),
    )