Example No. 1
    def __init__(self,
                 path='./data/',
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='',
                 full_meta_data_name='explanations_5panels.csv',
                 label_size=5,
                 fix_length=None,
                 meta_data=None):
        """
        :param meta_data: MetaData class instance. Will be used for vocab building.
        """
        # we will add the meta label here and make the iterators later
        self.TEXT = ReversibleField(sequential=True,
                                    include_lengths=True,
                                    lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True,
                                     use_vocab=False,
                                     label_size=label_size,
                                     tensor_type=torch.FloatTensor,
                                     fix_length=fix_length)

        # this is the step that actually takes ~5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path,
            train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv',
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.full_meta_data = data.TabularDataset(
            path=pjoin(path, full_meta_data_name),
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.meta_data = meta_data

        self.is_vocab_built = False
        self.iterators = []

        if test_data_name != '':
            self.external_test = data.TabularDataset(
                path=path + test_data_name,
                format='tsv',
                fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None
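
A quick construction sketch for this class (it is named Dataset in Example No. 5, which shows the full definition). The default file names are the ones above; the external test file name is a hypothetical placeholder:

ds = Dataset(path='./data/',
             test_data_name='my_external_test.csv',  # hypothetical placeholder file
             label_size=5)
# ds.train, ds.val and ds.test share the same TEXT/LABEL field objects,
# so one vocabulary can later be built across all of them
print(len(ds.train), len(ds.val), len(ds.test))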
Example No. 2
    def __init__(self, path='./data/',
                 weak_train_dataset="",
                 acmg_weak_data_path="",
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
                 multi_task_train_dataset="",
                 label_size=5, fix_length=None):
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False, fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        if weak_train_dataset != "":
            self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                                  fields=[('Text', self.TEXT), ('Description', self.LABEL)])
            if acmg_weak_data_path != "":
                acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
                # extending the example list in place merges the two weak datasets
                self.weak_train.examples.extend(acmg_weak_data.examples)
        else:
            self.weak_train = None

        if multi_task_train_dataset != "":
            self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                        fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.multi_task_train = None

        # this is the step that actually takes ~5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        if test_data_name != '':
            self.external_test = data.TabularDataset(path=path + test_data_name,
                                                     format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

        self.is_vocab_built = False
        self.iterators = []
        self.test_iterator = None
        self.weak_train_iterator = None
        self.multi_task_train_iterator = None
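
The examples.extend call above is the entire merging step: a torchtext TabularDataset is essentially a list of Example objects plus shared fields, so appending one dataset's examples onto another combines them, provided both were parsed with the same fields. A minimal sketch of the pattern, with placeholder file names:

fields = [('Text', TEXT), ('Description', LABEL)]  # the same field objects for both files
a = data.TabularDataset('weak_a.tsv', format='tsv', fields=fields)  # placeholder paths
b = data.TabularDataset('weak_b.tsv', format='tsv', fields=fields)
a.examples.extend(b.examples)
print(len(a))  # now the sum of both files' row counts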
Example No. 3
    def __init__(self,
                 data_path,
                 batch_size=5,
                 num_meta_labels=5,
                 fix_length=None):
        """
        :param data_path: directory containing the data, e.g. "./models/data/";
            the files should be in tsv format, and the last label is the grouping factor
        :param batch_size: number of explanations to draw per batch (e.g., 5)
        :param num_meta_labels: number of meta labels (label groups)
        """
        self.num_meta_labels = num_meta_labels
        self.fix_length = fix_length
        self.batch_size = batch_size
        self.data_path = data_path

        self.TEXT_FIELD = ReversibleField(sequential=True,
                                          include_lengths=True,
                                          lower=False,
                                          fix_length=self.fix_length)
        # the vocab will be shared with the main text field in the main dataset

        self.datasets = []
        self.data_iters = []
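
The vocabulary sharing promised by the comment happens outside this class: build_vocab in Example No. 5 assigns the main dataset's vocab object onto TEXT_FIELD, so both fields numericalize text identically. A sketch, assuming this class is the MetaData that Dataset's docstring refers to and that config is shaped as in Example No. 5:

meta = MetaData(data_path='./models/data/', batch_size=5)  # class name per Dataset's docstring
dataset = Dataset(path='./data/', meta_data=meta)
dataset.build_vocab(config)  # config is assumed; see the sketch after Example No. 5
assert meta.TEXT_FIELD.vocab is dataset.TEXT.vocab  # the very same Vocab object, not a copy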
Example No. 4
    labels = {}
    for true_label in range(1, 19):
        labels[str(true_label)] = true_label - 1  # map the 1-based label string to a 0-based index

    # map labels to a list
    label_list = [None] * len(labels)
    for k, v in labels.items():
        label_list[v] = k

    labels = label_list
    logger.info("available labels: ")
    logger.info(labels)

    TEXT = ReversibleField(sequential=True,
                           tokenize=tokenizer,
                           include_lengths=True,
                           lower=False)

    LABEL = MultiLabelField(sequential=True,
                            use_vocab=False,
                            label_size=18,
                            tensor_type=torch.FloatTensor)

    if args.dataset == 'major':
        train, val, test = data.TabularDataset.splits(
            path='../../data/csu/',
            train='maj_label_train.tsv',
            validation='maj_label_valid.tsv',
            test='maj_label_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
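
The snippet stops after loading the splits; with these torchtext 0.x APIs the usual next steps are building the vocabulary over the training split and creating the iterators, mirroring get_iterators in Example No. 5. A sketch in which the GloVe dimension and the device are assumptions:

TEXT.build_vocab(train, vectors="glove.6B.100d")  # 100d is an assumed dimension
train_iter, val_iter, test_iter = data.Iterator.splits(
    (train, val, test),
    sort_key=lambda x: len(x.Text),  # sort within each batch by text length
    batch_sizes=(32, 128, 128),
    sort_within_batch=True,
    repeat=False,
    device=torch.device('cpu'))  # assumed device; a CUDA device in training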
Example No. 5
class Dataset(object):
    def __init__(self,
                 path='./data/',
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='',
                 full_meta_data_name='explanations_5panels.csv',
                 label_size=5,
                 fix_length=None,
                 meta_data=None):
        """
        :param meta_data: MetaData class instance. Will be used for vocab building.
        """
        # we will add the meta label here and make the iterators later
        self.TEXT = ReversibleField(sequential=True,
                                    include_lengths=True,
                                    lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True,
                                     use_vocab=False,
                                     label_size=label_size,
                                     tensor_type=torch.FloatTensor,
                                     fix_length=fix_length)

        # this is the step that actually takes ~5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path,
            train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv',
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.full_meta_data = data.TabularDataset(
            path=pjoin(path, full_meta_data_name),
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.meta_data = meta_data

        self.is_vocab_built = False
        self.iterators = []

        if test_data_name != '':
            self.external_test = data.TabularDataset(
                path=path + test_data_name,
                format='tsv',
                fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_built:
            raise Exception(
                "Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return the stored iterators

        # only create them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test),
            sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size),
            device=device,
            sort_within_batch=True,
            repeat=False)
        self.iterators = (train_iter, val_iter, test_iter)  # cache for reuse

        return train_iter, val_iter, test_iter

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # calculate the uniform bound from the standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # re-initialize only all-zero rows (words missing from the pretrained vectors);
        # the `init` argument is currently unused: glorot/xavier init is always applied
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0

        fan_in, fan_out = emb_vectors.size()

        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # an all-zero row means the word was absent from GloVe: re-initialize it
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}"
                  .format(running_norm / num_non_zero, num_non_zero,
                          total_words))  # printed directly into the notebook

    def build_vocab(self, config, silent=False):
        if config.emb_corpus == 'common_crawl':
            self.TEXT.build_vocab(self.train,
                                  self.full_meta_data,
                                  vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            # add all datasets
            self.TEXT.build_vocab(self.train,
                                  self.full_meta_data,
                                  vectors="glove.6B.{}d".format(
                                      config.emb_dim))
        self.is_vocab_built = True
        self.vocab = self.TEXT.vocab
        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

        # synchronize vocab by making them the same object
        self.meta_data.TEXT_FIELD.vocab = self.TEXT.vocab
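
End to end, the class is used by building the vocabulary once (which also syncs the meta-data field) and then drawing the iterators. A sketch; the config object is an assumption shaped only by the attributes build_vocab reads (emb_corpus, emb_dim, rand_unk):

from types import SimpleNamespace

config = SimpleNamespace(emb_corpus='gigaword',  # anything but 'common_crawl' selects glove.6B
                         emb_dim=100,
                         rand_unk=True)
meta = MetaData(data_path='./data/')  # hypothetical call, per Example No. 3's signature
ds = Dataset(meta_data=meta)
ds.build_vocab(config)
train_iter, val_iter, test_iter = ds.get_iterators(device=torch.device('cpu'))
batch = next(iter(train_iter))
(text, lengths), labels = batch.Text, batch.Description  # include_lengths=True yields a pair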
Example No. 6
        # {0: [12, 34, 13], ...} maps each meta label to its member SNOMED label indices
        meta_label_size = len(label_grouping)

    with open('../../data/csu/snomed_label_to_meta_map.json', 'r') as f:
        meta_label_mapping = json.load(f)
        # {42: 14} maps snomed_indexed_label -> meta_labels

    with open('../../data/csu/snomed_labels_to_name.json', 'r') as f:
        labels = json.load(f)

    meta_category_groups = label_grouping.values()

    logger.info("available labels are: ")
    logger.info(labels)

    TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False)

    label_size = 42  # 18 if args.dataset != "multi_top_snomed_no_des" else 42

    LABEL = MultiLabelField(sequential=True,
                            use_vocab=False,
                            label_size=label_size,
                            tensor_type=torch.FloatTensor)

    # load the Adobe (abbreviation-matched) test set
    if args.abbr:
        adobe_test = data.TabularDataset(
            path='../../data/csu/adobe_abbr_matched_snomed_multi_label_no_des_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
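
The two JSON maps are inverses of each other: label_grouping maps a meta label to its member SNOMED label indices, while meta_label_mapping maps a SNOMED index back to its meta label (JSON object keys arrive as strings). A hypothetical helper for collapsing a 42-dim label vector into meta labels under that reading:

def to_meta_labels(y, meta_label_mapping, meta_label_size):
    # y: FloatTensor of shape (label_size,); a meta label fires if any member label fires
    meta = torch.zeros(meta_label_size)
    for idx in y.nonzero().flatten().tolist():
        meta[meta_label_mapping[str(idx)]] = 1.0
    return meta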
Example No. 7
class Dataset(object):
    def __init__(self, path='./data/',
                 weak_train_dataset="",
                 acmg_weak_data_path="",
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
                 multi_task_train_dataset="",
                 label_size=5, fix_length=None):
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False, fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        if weak_train_dataset != "":
            self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                                  fields=[('Text', self.TEXT), ('Description', self.LABEL)])
            if acmg_weak_data_path != "":
                acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
                # extending the example list in place merges the two weak datasets
                self.weak_train.examples.extend(acmg_weak_data.examples)
        else:
            self.weak_train = None

        if multi_task_train_dataset != "":
            self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                        fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.multi_task_train = None

        # this is the step that actually takes ~5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        if test_data_name != '':
            self.external_test = data.TabularDataset(path=path + test_data_name,
                                                     format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

        self.is_vocab_built = False
        self.iterators = []
        self.test_iterator = None
        self.weak_train_iterator = None
        self.multi_task_train_iterator = None

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # re-initialize only all-zero rows (words missing from the pretrained vectors);
        # the `init` argument is currently unused: glorot/xavier init is always applied
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0

        fan_in, fan_out = emb_vectors.size()  # e.g. (vocab_size, emb_dim)

        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # an all-zero row means the word was absent from GloVe: re-initialize it
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}".format(
                running_norm / num_non_zero, num_non_zero, total_words))  # printed directly into the notebook

    def build_vocab(self, config, silent=False):
        datasets = [self.train]
        if self.weak_train is not None and args.weak_vocab:
            datasets.append(self.weak_train)

        if self.multi_task_train is not None:
            datasets.append(self.multi_task_train)  # we always build vocab for multitask

        if config.emb_corpus == 'common_crawl':
            self.TEXT.build_vocab(*datasets, vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            self.TEXT.build_vocab(*datasets, vectors="glove.6B.{}d".format(config.emb_dim))

        self.is_vocab_built = True
        self.vocab = self.TEXT.vocab
        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_built:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return the stored iterators

        # only create them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test), sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size), device=device,
            sort_within_batch=True, repeat=False)
        self.iterators = (train_iter, val_iter, test_iter)  # cache for reuse

        return train_iter, val_iter, test_iter

    def get_test_iterator(self, device):
        if not self.is_vocab_built:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.test_iterator is not None:
            return self.test_iterator

        self.test_iterator = data.Iterator(self.external_test, 128, sort_key=lambda x: len(x.Text),
                                           device=device, train=False, repeat=False, sort_within_batch=True)
        return self.test_iterator

    def get_weak_train_iterator(self, device):
        if not self.is_vocab_built:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.weak_train_iterator is not None:
            return self.weak_train_iterator

        self.weak_train_iterator = data.Iterator(self.weak_train, 128, sort_key=lambda x: len(x.Text),
                                                 device=device, train=True, repeat=False, sort_within_batch=True)

        return self.weak_train_iterator

    def get_multi_task_train_iterator(self, device):
        if not self.is_vocab_built:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.multi_task_train_iterator is not None:
            return self.multi_task_train_iterator

        self.multi_task_train_iterator = data.Iterator(self.multi_task_train, 128, sort_key=lambda x: len(x.Text),
                                                       device=device, train=True, repeat=False, sort_within_batch=True)

        return self.multi_task_train_iterator
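
A sketch of how the extra iterators slot into a training loop: weak-supervision batches come from their own shuffling iterator, while the external test iterator keeps order. The paths and config are placeholders, and note that build_vocab also expects a module-level args with a weak_vocab flag when a weak dataset is present:

device = torch.device('cpu')  # or a CUDA device inside the trainer
ds = Dataset(weak_train_dataset='./data/weak.tsv',  # placeholder paths
             multi_task_train_dataset='./data/multi.tsv')
ds.build_vocab(config)  # config shaped as in build_vocab above
train_iter, val_iter, _ = ds.get_iterators(device)
weak_iter = ds.get_weak_train_iterator(device)
for batch, weak_batch in zip(train_iter, weak_iter):
    (text, lengths), labels = batch.Text, batch.Description
    # ... compute the supervised loss, plus a weak-supervision term from weak_batch ...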