def __init__(self, path='./data/', dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
             test_data_name='', full_meta_data_name='explanations_5panels.csv',
             label_size=5, fix_length=None, meta_data=None):
    """
    :param meta_data: MetaData class instance. Will be used for vocab building.
    """
    # we will add metalabel here and make iterators
    self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                fix_length=fix_length)
    self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                 tensor_type=torch.FloatTensor, fix_length=fix_length)

    # it's actually this step that will take 5 minutes
    self.train, self.val, self.test = data.TabularDataset.splits(
        path=path, train=dataset_prefix + 'train.csv',
        validation=dataset_prefix + 'valid.csv',
        test=dataset_prefix + 'test.csv', format='tsv',
        fields=[('Text', self.TEXT), ('Description', self.LABEL)])

    self.full_meta_data = data.TabularDataset(
        path=pjoin(path, full_meta_data_name), format='tsv',
        fields=[('Text', self.TEXT), ('Description', self.LABEL)])

    self.meta_data = meta_data

    self.is_vocab_bulit = False
    self.iterators = []

    if test_data_name != '':
        self.external_test = data.TabularDataset(
            path=path + test_data_name, format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])
    else:
        self.external_test = None
def __init__(self, path='./data/', weak_train_dataset="", acmg_weak_data_path="",
             dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
             test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
             multi_task_train_dataset="", label_size=5, fix_length=None):
    self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                fix_length=fix_length)
    self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                 tensor_type=torch.FloatTensor, fix_length=fix_length)

    if weak_train_dataset != "":
        self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                              fields=[('Text', self.TEXT),
                                                      ('Description', self.LABEL)])
        if acmg_weak_data_path != "":
            acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                 fields=[('Text', self.TEXT),
                                                         ('Description', self.LABEL)])
            # this should be enough!
            self.weak_train.examples.extend(acmg_weak_data.examples)
    else:
        self.weak_train = None

    if multi_task_train_dataset != "":
        self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                    fields=[('Text', self.TEXT),
                                                            ('Description', self.LABEL)])
    else:
        self.multi_task_train = None

    # it's actually this step that will take 5 minutes
    self.train, self.val, self.test = data.TabularDataset.splits(
        path=path, train=dataset_prefix + 'train.csv',
        validation=dataset_prefix + 'valid.csv',
        test=dataset_prefix + 'test.csv', format='tsv',
        fields=[('Text', self.TEXT), ('Description', self.LABEL)])

    if test_data_name != '':
        self.external_test = data.TabularDataset(path=path + test_data_name, format='tsv',
                                                 fields=[('Text', self.TEXT),
                                                         ('Description', self.LABEL)])
    else:
        self.external_test = None

    self.is_vocab_bulit = False
    self.iterators = []
    self.test_iterator = None
    self.weak_train_iterator = None
    self.multi_task_train_iterator = None
def __init__(self, data_path, batch_size=5, num_meta_labels=5, fix_length=None):
    """
    :param data_path: directory such as "./models/data/"; files are expected in TSV format,
        with the last label column acting as the grouping factor
    :param batch_size: number of explanations to draw, e.g. 5
    :param num_meta_labels: number of meta labels
    """
    self.num_meta_labels = num_meta_labels
    self.fix_length = fix_length
    self.batch_size = batch_size
    self.data_path = data_path

    self.TEXT_FIELD = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                      fix_length=self.fix_length)
    # the vocab will be shared with the main text field in the main dataset

    self.datasets = []
    self.data_iters = []
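# --- Illustrative note (not part of the original snippet) ---
# This loader deliberately builds no vocabulary of its own: the Dataset class
# below shares its vocab by assignment in build_vocab(), i.e.
#     self.meta_data.TEXT_FIELD.vocab = self.TEXT.vocab
# A minimal sketch of the assumed wiring, with `MetaData` as a hypothetical
# class name for the __init__ above and `config` as a stand-in config object:
#
#   meta = MetaData(data_path='./models/data/', batch_size=5, num_meta_labels=5)
#   dataset = Dataset(path='./data/', meta_data=meta)
#   dataset.build_vocab(config)   # also populates meta.TEXT_FIELD.vocab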
labels = {}
for true_label in range(1, 19):
    labels[str(true_label)] = true_label - 1  # actual label we see

# map labels to list
label_list = [None] * len(labels)
for k, v in labels.items():
    label_list[v] = k
labels = label_list

logger.info("available labels: ")
logger.info(labels)

TEXT = ReversibleField(sequential=True, tokenize=tokenizer, include_lengths=True, lower=False)
LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=18,
                        tensor_type=torch.FloatTensor)

if args.dataset == 'major':
    train, val, test = data.TabularDataset.splits(
        path='../../data/csu/', train='maj_label_train.tsv',
        validation='maj_label_valid.tsv',
        test='maj_label_test.tsv', format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])
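# --- Illustrative helper (assumed, not in the original script) ---
# MultiLabelField produces a (batch, 18) float multi-hot tensor; the inverse
# mapping built above (label_list[index] -> original label string) can turn one
# prediction row back into label names. `threshold` is a hypothetical cutoff
# applied to per-label sigmoid scores.
def decode_label_row(row, label_list, threshold=0.5):
    """Return the original label strings whose score in `row` is >= threshold."""
    return [label_list[i] for i, score in enumerate(row) if float(score) >= threshold]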
class Dataset(object):
    def __init__(self, path='./data/', dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='', full_meta_data_name='explanations_5panels.csv',
                 label_size=5, fix_length=None, meta_data=None):
        """
        :param meta_data: MetaData class instance. Will be used for vocab building.
        """
        # we will add metalabel here and make iterators
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        # it's actually this step that will take 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.full_meta_data = data.TabularDataset(
            path=pjoin(path, full_meta_data_name), format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.meta_data = meta_data

        self.is_vocab_bulit = False
        self.iterators = []

        if test_data_name != '':
            self.external_test = data.TabularDataset(
                path=path + test_data_name, format='tsv',
                fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return the cached iterators

        # only build them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test),
            sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size),
            device=device,
            sort_within_batch=True, repeat=False)

        self.iterators = (train_iter, val_iter, test_iter)  # cache so later calls reuse them
        return self.iterators

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # calculate uniform bounds from standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # we can try randn or glorot
        # mode="unk"|"all"; "all" means initialize everything
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0
        fan_in, fan_out = emb_vectors.size()
        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # std = 0.5 is based on the norm of average GloVe word vectors
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}".format(
                running_norm / num_non_zero, num_non_zero, total_words))  # printed directly into a Jupyter notebook

    def build_vocab(self, config, silent=False):
        if config.emb_corpus == 'common_crawl':
            self.TEXT.build_vocab(self.train, self.full_meta_data,
                                  vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            # add all datasets
            self.TEXT.build_vocab(self.train, self.full_meta_data,
                                  vectors="glove.6B.{}d".format(config.emb_dim))

        self.is_vocab_bulit = True
        self.vocab = self.TEXT.vocab

        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

        # synchronize vocab by making them the same object
        self.meta_data.TEXT_FIELD.vocab = self.TEXT.vocab
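# --- Usage sketch (illustrative; `config`, the device choice and the MetaData
# constructor are assumptions, not taken from the original repository) ---
from types import SimpleNamespace

import torch

# any emb_corpus other than 'common_crawl' selects the glove.6B vectors
config = SimpleNamespace(emb_corpus='6B', emb_dim=100, rand_unk=True)

meta = MetaData(data_path='./data/')               # hypothetical meta-data loader from the snippet above
dataset = Dataset(path='./data/', meta_data=meta)  # loads the train/valid/test TSV splits
dataset.build_vocab(config)                        # builds vocab over train + full_meta_data, loads GloVe

# note: very old torchtext versions expect an int device here (-1 for CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = dataset.get_iterators(device)

for batch in train_iter:
    (text, lengths), labels = batch.Text, batch.Description
    # text: (seq_len, batch) LongTensor of token ids; labels: (batch, label_size) multi-hot FloatTensor
    break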
# {0: [12, 34, 13],
meta_label_size = len(label_grouping)

with open('../../data/csu/snomed_label_to_meta_map.json', 'rb') as f:
    meta_label_mapping = json.load(f)  # {42: 14} maps snomed_indexed_label -> meta_labels

with open('../../data/csu/snomed_labels_to_name.json', 'r') as f:
    labels = json.load(f)

meta_category_groups = label_grouping.values()

logger.info("available labels are: ")
logger.info(labels)

TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False)
label_size = 42  # 18 if args.dataset != "multi_top_snomed_no_des" else 42
LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                        tensor_type=torch.FloatTensor)

# load in adobe
if args.abbr:
    adobe_test = data.TabularDataset(
        path='../../data/csu/adobe_abbr_matched_snomed_multi_label_no_des_test.tsv',
        format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])
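# --- Usage sketch (illustrative; batch size and omitted device are assumptions) ---
# Once TEXT.build_vocab(...) has been called elsewhere in the script, the adobe
# test set loaded above can be wrapped in a plain evaluation iterator:
if args.abbr:
    adobe_test_iter = data.Iterator(
        adobe_test, batch_size=32,
        sort_key=lambda x: len(x.Text),
        train=False, repeat=False, sort_within_batch=True)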
class Dataset(object):
    def __init__(self, path='./data/', weak_train_dataset="", acmg_weak_data_path="",
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
                 multi_task_train_dataset="", label_size=5, fix_length=None):
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        if weak_train_dataset != "":
            self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                                  fields=[('Text', self.TEXT),
                                                          ('Description', self.LABEL)])
            if acmg_weak_data_path != "":
                acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                     fields=[('Text', self.TEXT),
                                                             ('Description', self.LABEL)])
                # this should be enough!
                self.weak_train.examples.extend(acmg_weak_data.examples)
        else:
            self.weak_train = None

        if multi_task_train_dataset != "":
            self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                        fields=[('Text', self.TEXT),
                                                                ('Description', self.LABEL)])
        else:
            self.multi_task_train = None

        # it's actually this step that will take 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        if test_data_name != '':
            self.external_test = data.TabularDataset(path=path + test_data_name, format='tsv',
                                                     fields=[('Text', self.TEXT),
                                                             ('Description', self.LABEL)])
        else:
            self.external_test = None

        self.is_vocab_bulit = False
        self.iterators = []
        self.test_iterator = None
        self.weak_train_iterator = None
        self.multi_task_train_iterator = None

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # calculate uniform bounds from standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # we can try randn or glorot
        # mode="unk"|"all"; "all" means initialize everything
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0
        fan_in, fan_out = emb_vectors.size()  # e.g. 16870, 300
        # std = 0.01
        # a = 1.73 * 0.01
        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # std = 0.5 is based on the norm of average GloVe word vectors
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}".format(
                running_norm / num_non_zero, num_non_zero, total_words))  # printed directly into a Jupyter notebook

    def build_vocab(self, config, silent=False):
        datasets = [self.train]
        if self.weak_train is not None and args.weak_vocab:
            datasets.append(self.weak_train)
        if self.multi_task_train is not None:
            datasets.append(self.multi_task_train)  # we always build vocab for multi-task

        if config.emb_corpus == 'common_crawl':
            # self.TEXT.build_vocab(self.train, vectors="glove.840B.300d")
            self.TEXT.build_vocab(*datasets, vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            self.TEXT.build_vocab(*datasets, vectors="glove.6B.{}d".format(config.emb_dim))

        self.is_vocab_bulit = True
        self.vocab = self.TEXT.vocab

        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return the cached iterators

        # only build them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test),
            sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size),
            device=device,
            sort_within_batch=True, repeat=False)

        self.iterators = (train_iter, val_iter, test_iter)  # cache so later calls reuse them
        return self.iterators

    def get_test_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.test_iterator is not None:
            return self.test_iterator

        self.test_iterator = data.Iterator(self.external_test, 128,
                                           sort_key=lambda x: len(x.Text), device=device,
                                           train=False, repeat=False, sort_within_batch=True)
        return self.test_iterator

    def get_weak_train_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.weak_train_iterator is not None:
            return self.weak_train_iterator

        self.weak_train_iterator = data.Iterator(self.weak_train, 128,
                                                 sort_key=lambda x: len(x.Text), device=device,
                                                 train=True, repeat=False, sort_within_batch=True)
        return self.weak_train_iterator

    def get_multi_task_train_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.multi_task_train_iterator is not None:
            return self.multi_task_train_iterator

        self.multi_task_train_iterator = data.Iterator(self.multi_task_train, 128,
                                                       sort_key=lambda x: len(x.Text), device=device,
                                                       train=True, repeat=False, sort_within_batch=True)
        return self.multi_task_train_iterator
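# --- Usage sketch (illustrative; `model`, `optimizer`, `config` and the weak-data
# file name are placeholders, not taken from the original repository) ---
# One way the weak iterator could be consumed: alternate a supervised batch and
# a weakly-labelled batch each step, both trained with a multi-label BCE loss.
import torch

criterion = torch.nn.BCEWithLogitsLoss()

dataset = Dataset(path='./data/',
                  weak_train_dataset='./data/weak_train.tsv')  # hypothetical file name
dataset.build_vocab(config)                                    # `config` as expected by build_vocab above
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, val_iter, test_iter = dataset.get_iterators(device)
weak_iter = dataset.get_weak_train_iterator(device)

for sup_batch, weak_batch in zip(train_iter, weak_iter):
    for b in (sup_batch, weak_batch):
        (text, lengths), labels = b.Text, b.Description
        logits = model(text, lengths)      # placeholder model returning (batch, label_size) logits
        loss = criterion(logits, labels)
        optimizer.zero_grad()              # placeholder optimizer over the model's parameters
        loss.backward()
        optimizer.step()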