示例#1
0
    def __init__(self,
                 path='./data/',
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='',
                 full_meta_data_name='explanations_5panels.csv',
                 label_size=5,
                 fix_length=None,
                 meta_data=None):
        """
        :param meta_data: MetaData class instance. Will be used for vocab building.
        """
        # we will add metalabel here and make iterators
        self.TEXT = ReversibleField(sequential=True,
                                    include_lengths=True,
                                    lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True,
                                     use_vocab=False,
                                     label_size=label_size,
                                     tensor_type=torch.FloatTensor,
                                     fix_length=fix_length)

        # it's actually this step that will take 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path,
            train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv',
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.full_meta_data = data.TabularDataset(
            path=pjoin(path, full_meta_data_name),
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.meta_data = meta_data

        self.is_vocab_bulit = False
        self.iterators = []

        if test_data_name != '':
            self.external_test = data.TabularDataset(
                path=path + test_data_name,
                format='tsv',
                fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None
示例#2
0
    def __init__(self, path='./data/',
                 weak_train_dataset="",
                 acmg_weak_data_path="",
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
                 multi_task_train_dataset="",
                 label_size=5, fix_length=None):
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False, fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        if weak_train_dataset != "":
            self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                                  fields=[('Text', self.TEXT), ('Description', self.LABEL)])
            if acmg_weak_data_path != "":
                acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                  fields=[('Text', self.TEXT), ('Description', self.LABEL)])
                # this should be enough!
                self.weak_train.examples.extend(acmg_weak_data.examples)
        else:
            self.weak_train = None

        if multi_task_train_dataset != "":
            self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                        fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.multi_task_train = None

        # it's actually this step that will take 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        if test_data_name != '':
            self.external_test = data.TabularDataset(path=path + test_data_name,
                                                     format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

        self.is_vocab_bulit = False
        self.iterators = []
        self.test_iterator = None
        self.weak_train_iterator = None
        self.multi_task_train_iterator = None
示例#3
0
    # # map labels to list
    label_list = [None] * len(labels)
    for k, v in labels.items():
        label_list[v] = k

    labels = label_list
    logger.info("available labels: ")
    logger.info(labels)

    TEXT = ReversibleField(sequential=True,
                           tokenize=tokenizer,
                           include_lengths=True,
                           lower=False)

    LABEL = MultiLabelField(sequential=True,
                            use_vocab=False,
                            label_size=18,
                            tensor_type=torch.FloatTensor)

    if args.dataset == 'major':
        train, val, test = data.TabularDataset.splits(
            path='../../data/csu/',
            train='maj_label_train.tsv',
            validation='maj_label_valid.tsv',
            test='maj_label_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
    elif args.dataset == 'multi':
        train, val, test = data.TabularDataset.splits(
            path='../../data/csu/',
            train='multi_label_train.tsv',
            validation='multi_label_valid.tsv',