Example #1
    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
        assert not self.cfg.include_src or len(src_tokens[0]) == 2
        input_src = None
        if self.cfg.include_src:
            input_src = TokenBlockDataset(
                [t[0] for t in src_tokens],
                [l[0] for l in src_lengths],
                block_size=None,  # ignored for "eos" break mode
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            )
            input_src = PrependTokenDataset(input_src, self.dictionary.bos())
            input_src = TruncateDataset(input_src, self.cfg.max_positions)

        input_tgt = TokenBlockDataset(
            [t[-1] for t in src_tokens],
            [l[-1] for l in src_lengths],
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode="eos",
        )
        input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)
        if self.cfg.include_src:
            src_tokens = ConcatSentencesDataset(input_src, input_tgt)
            src_lengths = NumelDataset(input_src, reduce=False)
        else:
            input_tgt = PrependTokenDataset(input_tgt, self.dictionary.bos())
            src_tokens = input_tgt
            src_lengths = NumelDataset(src_tokens, reduce=False)

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens":
                RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths":
                src_lengths,
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        return NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )
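The wrappers above compose lazily: each target sequence is BOS-prepended, truncated, and only right-padded at batch time. A minimal self-contained sketch of the same transformations (plain torch stand-ins for PrependTokenDataset, TruncateDataset, and RightPadDataset; the token ids and helper names are illustrative assumptions, not fairseq's API):

import torch

BOS, EOS, PAD = 0, 2, 1  # assumed special-token ids

def prepend(token_ids, tok):  # cf. PrependTokenDataset
    return torch.cat([torch.tensor([tok]), token_ids])

def truncate(token_ids, max_len):  # cf. TruncateDataset
    return token_ids[:max_len]

def right_pad(batch, pad_idx):  # cf. RightPadDataset collation
    max_len = max(len(t) for t in batch)
    return torch.stack([
        torch.cat([t, torch.full((max_len - len(t),), pad_idx, dtype=t.dtype)])
        for t in batch
    ])

src = [torch.tensor([5, 6, 7, EOS]), torch.tensor([8, 9, EOS])]
src = [truncate(prepend(t, BOS), max_len=5) for t in src]
print(right_pad(src, PAD))
# tensor([[0, 5, 6, 7, 2],
#         [0, 8, 9, 2, 1]])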
Example #2
File: MLMetKE.py  Project: yydxhn/KEPLER
 def desc_dataset(type, dictionary, relation_desc=None):
     # nested helper: `self` and `combine` come from the enclosing scope
     now_path = get_path(type)
     #print(now_path)
     dataset=data_utils.load_indexed_dataset(
         now_path,
         dictionary,
         self.args.dataset_impl,
         combine=combine,
     )
     if self.args.init_token is not None:
         dataset = PrependTokenDataset(dataset, self.args.init_token)
     if relation_desc is not None:
         dataset = ConcatSentencesDataset(dataset, relation_desc)
     dataset = TruncateDataset(dataset, self.args.tokens_per_sample) #???
     dataset = RightPadDataset(dataset, pad_idx=self.source_dictionary.pad())
     return dataset
Example #3
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(key, split):
            return os.path.join(self.cfg.data, key, split)

        def make_dataset(key, dictionary):
            split_path = get_path(key, split)

            try:
                dataset = data_utils.load_indexed_dataset(
                    split_path,
                    dictionary,
                    combine=combine,
                )
            except Exception as e:
                if "StorageException: [404] Path not found" in str(e):
                    logger.warning(f"dataset {split_path} not found")
                    dataset = None
                else:
                    raise e
            return dataset

        input0 = make_dataset("input0", self.source_dictionary)
        assert input0 is not None, "could not find dataset: {}".format(
            get_path("input0", split))
        input1 = make_dataset("input1", self.source_dictionary)

        if self.cfg.init_token is not None:
            input0 = PrependTokenDataset(input0, self.cfg.init_token)

        if input1 is None:
            src_tokens = input0
        else:
            if self.cfg.separator_token is not None:
                input1 = PrependTokenDataset(input1, self.cfg.separator_token)

            src_tokens = ConcatSentencesDataset(input0, input1)

        with data_utils.numpy_seed(self.cfg.seed):
            shuffle = np.random.permutation(len(src_tokens))

        src_tokens = maybe_shorten_dataset(
            src_tokens,
            split,
            self.cfg.shorten_data_split_list,
            self.cfg.shorten_method,
            self.max_positions(),
            self.cfg.seed,
        )

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens":
                RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths":
                NumelDataset(src_tokens, reduce=False),
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        if self.cfg.add_prev_output_tokens:
            prev_tokens_dataset = RightPadDataset(
                RollDataset(src_tokens, 1),
                pad_idx=self.dictionary.pad(),
            )
            dataset["net_input"].update(
                prev_output_tokens=prev_tokens_dataset, )

        if not self.cfg.regression_target:
            label_dataset = make_dataset("label", self.label_dictionary)
            if label_dataset is not None:
                dataset.update(target=OffsetTokensDataset(
                    StripTokenDataset(
                        label_dataset,
                        id_to_strip=self.label_dictionary.eos(),
                    ),
                    offset=-self.label_dictionary.nspecial,
                ))
        else:
            label_path = "{0}.label".format(get_path("label", split))
            if os.path.exists(label_path):

                def parse_regression_target(i, line):
                    values = line.split()
                    assert (
                        len(values) == self.cfg.num_classes
                    ), f'expected num_classes={self.cfg.num_classes} regression target values on line {i}, found: "{line}"'
                    return [float(x) for x in values]

                with open(label_path) as h:
                    dataset.update(target=RawLabelDataset([
                        parse_regression_target(i, line.strip())
                        for i, line in enumerate(h.readlines())
                    ]))

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        if self.cfg.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        logger.info("Loaded {0} with #samples: {1}".format(
            split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
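The classification branch above maps label-dictionary ids to class indices: StripTokenDataset drops the eos that binarization appends to each label line, and OffsetTokensDataset shifts ids down by label_dictionary.nspecial. A toy illustration, assuming the usual four special symbols (<s>, <pad>, </s>, <unk>) so nspecial == 4:

import torch

NSPECIAL, EOS = 4, 2  # assumed fairseq Dictionary defaults

def strip_token(item, id_to_strip):  # cf. StripTokenDataset
    return item[item != id_to_strip]

def offset_tokens(item, offset):  # cf. OffsetTokensDataset
    return item + offset

# a binarized label line might come back as [6, 2]: the label id plus eos
raw = torch.tensor([6, EOS])
target = offset_tokens(strip_token(raw, EOS), offset=-NSPECIAL)
print(target)  # tensor([2]) -> class index 2 in [0, num_classes)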
Example #4
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(type, split):
            return os.path.join(self.args.data, type, split)

        def make_dataset(type, dictionary):
            split_path = get_path(type, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        input0 = make_dataset('input0', self.source_dictionary)
        assert input0 is not None, 'could not find dataset: {}'.format(get_path('input0', split))
        input1 = make_dataset('input1', self.source_dictionary)

        if self.args.init_token is not None:
            input0 = PrependTokenDataset(input0, self.args.init_token)

        if input1 is None:
            src_tokens = input0
        else:
            if self.args.separator_token is not None:
                input1 = PrependTokenDataset(input1, self.args.separator_token)

            src_tokens = ConcatSentencesDataset(input0, input1)

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(src_tokens))

        src_tokens = maybe_shorten_dataset(
            src_tokens,
            split,
            self.args.shorten_data_split_whitelist,
            self.args.shorten_method,
            self.args.max_positions,
            self.args.seed,
        )

        dataset = {
            'id': IdDataset(),
            'net_input': {
                'src_tokens': RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': NumelDataset(src_tokens, reduce=False),
            },
            'nsentences': NumSamplesDataset(),
            'ntokens': NumelDataset(src_tokens, reduce=True),
        }

        if self.args.add_prev_output_tokens:
            prev_tokens_dataset = RightPadDataset(
                RollDataset(src_tokens, 1),
                pad_idx=self.dictionary.pad(),
            )
            dataset['net_input'].update(
                prev_output_tokens=prev_tokens_dataset,
            )

        if not self.args.regression_target:
            label_dataset = make_dataset('label', self.label_dictionary)
            if label_dataset is not None:
                dataset.update(
                    target=OffsetTokensDataset(
                        StripTokenDataset(
                            label_dataset,
                            id_to_strip=self.label_dictionary.eos(),
                        ),
                        offset=-self.label_dictionary.nspecial,
                    )
                )
        else:
            label_path = "{0}.label".format(get_path('label', split))
            if os.path.exists(label_path):
                def parse_regression_target(i, line):
                    values = line.split()
                    assert len(values) == self.args.num_classes, \
                        f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
                    return [float(x) for x in values]
                dataset.update(
                    target=RawLabelDataset([
                        parse_regression_target(i, line.strip()) for i, line in enumerate(open(label_path).readlines())
                    ])
                )

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        if self.args.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
Example #5
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(type, split):
            return os.path.join(self.args.data, type, split)

        def make_dataset(type, dictionary):
            split_path = get_path(type, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        input0 = make_dataset('input0', self.source_dictionary)
        input_options = [
            make_dataset('input{idx}'.format(idx=idx + 1),
                         self.source_dictionary)
            for idx in range(self.args.num_classes)
        ]

        if self.args.separator_token is not None:
            input0 = PrependTokenDataset(input0, self.args.separator_token)

        src_tokens = []
        for input_option in input_options:
            if self.args.init_token is not None:
                input_option = PrependTokenDataset(input_option,
                                                   self.args.init_token)
            if self.args.max_option_length is not None:
                input_option = TruncateDataset(input_option,
                                               self.args.max_option_length)
            src_token = ConcatSentencesDataset(input_option, input0)
            if self.args.truncate_sequence:
                src_token = TruncateDataset(src_token, self.args.max_positions)
            src_tokens.append(src_token)

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(src_tokens[0]))

        dataset = {
            'id': IdDataset(),
            'nsentences': NumSamplesDataset(),
            'ntokens': NumelDataset(src_tokens[0], reduce=True),
        }

        for src_token_idx in range(len(src_tokens)):
            dataset.update({
                'net_input{idx}'.format(idx=src_token_idx + 1): {
                    'src_tokens': RightPadDataset(
                        src_tokens[src_token_idx],
                        pad_idx=self.source_dictionary.pad(),
                    ),
                    'src_lengths': NumelDataset(src_tokens[src_token_idx], reduce=False),
                }
            })

        label_path = '{}.label'.format(get_path('label', split))
        if os.path.exists(label_path):
            with open(label_path) as h:
                dataset.update(target=RawLabelDataset(
                    [int(x.strip()) for x in h.readlines()]))

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[
                np.maximum.reduce(
                    [src_token.sizes for src_token in src_tokens])
            ],
        )

        if self.args.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        logger.info("Loaded {0} with #samples: {1}".format(
            split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
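Since every sample has num_classes candidate inputs, the example keys each option under net_input{idx} and sizes the sample by its longest option via np.maximum.reduce. A small numpy sketch of that sizes computation (the lengths are made up):

import numpy as np

# rows = options, columns = samples (toy lengths)
option_sizes = [
    np.array([7, 12, 5]),  # lengths under option 1
    np.array([9, 4, 6]),   # lengths under option 2
]
sizes = np.maximum.reduce(option_sizes)  # element-wise max per sample
print(sizes)  # [ 9 12  6]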
Example #6
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(type, split):
            return os.path.join(self.args.data, type, split)

        def make_dataset(type, dictionary):
            split_path = get_path(type, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        input0 = make_dataset('input0', self.source_dictionary)
        assert input0 is not None, 'could not find dataset: {}'.format(
            get_path('input0', split))
        input1 = make_dataset('input1', self.source_dictionary)

        if self.args.init_token is not None:
            input0 = PrependTokenDataset(input0, self.args.init_token)

        if input1 is None:
            src_tokens = input0
        else:
            if self.args.separator_token is not None:
                input1 = PrependTokenDataset(input1, self.args.separator_token)

            src_tokens = ConcatSentencesDataset(input0, input1)

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(src_tokens))

        if self.args.truncate_sequence:
            src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

        dataset = {
            'id': IdDataset(),
            'net_input': {
                'src_tokens': RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': NumelDataset(src_tokens, reduce=False),
            },
            'nsentences': NumSamplesDataset(),
            'ntokens': NumelDataset(src_tokens, reduce=True),
        }

        if not self.args.regression_target:
            label_dataset = make_dataset('label', self.target_dictionary)
            if label_dataset is not None:
                dataset.update(target=OffsetTokensDataset(
                    StripTokenDataset(
                        label_dataset,
                        id_to_strip=self.target_dictionary.eos(),
                    ),
                    offset=-self.target_dictionary.nspecial,
                ))
        else:
            label_path = f"{get_path('label', split)}.label"
            if os.path.exists(label_path):
                dataset.update(target=RawLabelDataset(
                    [float(x.strip()) for x in open(label_path).readlines()]))

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        if self.args.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        print(f"| Loaded {split} with #samples: {len(dataset)}")

        self.datasets[split] = dataset
        return self.datasets[split]
Example #7
    def load_dataset(self, split, epoch=0, combine=False):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        def get_path(type, split):
            return os.path.join(self.args.data, type, split)

        def make_dataset(type, dictionary):
            split_path = get_path(type, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        input0 = make_dataset('input0', self.source_dictionary)
        assert input0 is not None, 'could not find dataset: {}'.format(
            get_path('input0', split))
        input1 = make_dataset('input1', self.source_dictionary)

        input0 = PrependTokenDataset(input0, self.source_dictionary.bos())

        if input1 is None:
            src_tokens = input0
        else:
            input1 = PrependTokenDataset(input1, self.source_dictionary.eos())
            src_tokens = ConcatSentencesDataset(input0, input1)

        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

        assert not self.args.mask_whole_words

        src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
            src_tokens,
            self.source_dictionary,
            pad_idx=self.source_dictionary.pad(),
            mask_idx=self.mask_idx,
            seed=self.args.seed,
            mask_prob=self.args.mask_prob,
            leave_unmasked_prob=self.args.leave_unmasked_prob,
            random_token_prob=self.args.random_token_prob,
            freq_weighted_replacement=self.args.freq_weighted_replacement,
            mask_whole_words=None,
        )

        with data_utils.numpy_seed(self.args.seed + epoch):
            shuffle = np.random.permutation(len(src_dataset))

        self.datasets[split] = SortDataset(
            NestedDictionaryDataset(
                {
                    'id': IdDataset(),
                    'net_input': {
                        'src_tokens': PadToLenDataset(
                            src_dataset,
                            pad_idx=self.source_dictionary.pad(),
                            left_pad=False,
                            pad_len=self.args.max_positions,
                        ),
                        'src_lengths': NumelDataset(src_dataset, reduce=False),
                    },
                    'target': PadToLenDataset(
                        tgt_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                        pad_len=self.args.max_positions,
                    ),
                    'nsentences': NumSamplesDataset(),
                    'ntokens': NumelDataset(src_dataset, reduce=True),
                },
                sizes=[src_dataset.sizes],
            ),
            sort_order=[
                shuffle,
                src_dataset.sizes,
            ],
        )
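MaskTokensDataset.apply_mask returns a corrupted input plus a target that is pad everywhere except at masked positions, with BERT-style corruption: most selected tokens become the mask symbol, some are replaced by random tokens, and some are left unchanged. A simplified numpy sketch of that contract (not fairseq's implementation; the ids and defaults below are assumptions):

import numpy as np

PAD, MASK, VOCAB = 1, 50000, 50001  # assumed ids / vocab size

def apply_mask(tokens, mask_prob=0.15, leave_unmasked_prob=0.1,
               random_token_prob=0.1, seed=0):
    rng = np.random.default_rng(seed)
    tokens = np.asarray(tokens)
    src, tgt = tokens.copy(), np.full_like(tokens, PAD)
    masked = rng.random(len(tokens)) < mask_prob
    tgt[masked] = tokens[masked]  # loss is computed only at masked positions
    u = rng.random(len(tokens))
    use_rand = masked & (u < random_token_prob)
    use_mask = masked & (u >= random_token_prob + leave_unmasked_prob)
    src[use_rand] = rng.integers(0, VOCAB, use_rand.sum())
    src[use_mask] = MASK
    # positions that are masked but fall in neither bucket are "left
    # unmasked": src keeps the original token, yet it must still be predicted
    return src, tgt

src, tgt = apply_mask(list(range(100, 120)), seed=3)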
Example #8
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        if self.cfg.data.endswith("1"):
            data_shard = (epoch - 1) % self.cfg.num_data_splits + 1
            data_path = self.cfg.data[:-1] + str(data_shard)
        else:
            data_path = self.cfg.data

        def get_path(type, data_split):
            return os.path.join(data_path, str(type), data_split)

        def make_dataset(type, dictionary, data_split, combine):
            split_path = get_path(type, data_split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                combine=combine,
            )
            return dataset

        def load_split(data_split, metric):
            input_src = None
            if self.cfg.include_src:
                input_src = make_dataset("input_src",
                                         self.dictionary,
                                         data_split,
                                         combine=False)
                assert input_src is not None, "could not find dataset: {}".format(
                    get_path("input_src", data_split))

            input_tgt = make_dataset("input_tgt",
                                     self.dictionary,
                                     data_split,
                                     combine=False)
            assert input_tgt is not None, "could not find dataset: {}".format(
                get_path("input_tgt", data_split))

            label_path = f"{get_path(metric, data_split)}.{metric}"
            assert os.path.exists(
                label_path), f"could not find dataset: {label_path}"

            np_labels = np.loadtxt(label_path)
            if self.cfg.target_metric == "ter":
                np_labels = -np_labels
            label = RawLabelDataset(np_labels)

            return input_src, input_tgt, label

        src_datasets = []
        tgt_datasets = []
        label_datasets = []

        if split == self.cfg.train_subset:
            for k in itertools.count():
                split_k = "train" + (str(k) if k > 0 else "")
                prefix = os.path.join(data_path, "input_tgt", split_k)
                if not indexed_dataset.dataset_exists(prefix, impl=None):
                    if k > 0:
                        break
                    else:
                        raise FileNotFoundError(f"Dataset not found: {prefix}")
                input_src, input_tgt, label = load_split(
                    split_k, self.cfg.target_metric)
                src_datasets.append(input_src)
                tgt_datasets.append(input_tgt)
                label_datasets.append(label)
        else:
            input_src, input_tgt, label = load_split(split,
                                                     self.cfg.target_metric)
            src_datasets.append(input_src)
            tgt_datasets.append(input_tgt)
            label_datasets.append(label)

        if len(tgt_datasets) == 1:
            input_tgt, label = tgt_datasets[0], label_datasets[0]
            if self.cfg.include_src:
                input_src = src_datasets[0]
        else:
            input_tgt = ConcatDataset(tgt_datasets)
            label = ConcatDataset(label_datasets)
            if self.cfg.include_src:
                input_src = ConcatDataset(src_datasets)

        input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)
        if self.cfg.include_src:
            input_src = PrependTokenDataset(input_src, self.dictionary.bos())
            input_src = TruncateDataset(input_src, self.cfg.max_positions)
            src_lengths = NumelDataset(input_src, reduce=False)
            src_tokens = ConcatSentencesDataset(input_src, input_tgt)
        else:
            src_tokens = PrependTokenDataset(input_tgt, self.dictionary.bos())
            src_lengths = NumelDataset(src_tokens, reduce=False)

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens":
                RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths":
                src_lengths,
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
            "target": label,
        }

        dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        assert len(dataset) % self.cfg.mt_beam == 0, (
            "dataset size (%d) is not a multiple of beam size (%d)" %
            (len(dataset), self.cfg.mt_beam))

        # no need to shuffle valid/test sets
        if not self.cfg.no_shuffle and split == self.cfg.train_subset:

            # need to keep all hypotheses together
            start_idx = np.arange(0, len(dataset), self.cfg.mt_beam)
            with data_utils.numpy_seed(self.cfg.seed + epoch):
                np.random.shuffle(start_idx)

            idx = np.arange(0, self.cfg.mt_beam)
            shuffle = np.tile(idx, (len(start_idx), 1)).reshape(-1) + np.tile(
                start_idx, (self.cfg.mt_beam, 1)).transpose().reshape(-1)

            dataset = SortDataset(
                dataset,
                sort_order=[shuffle],
            )

        logger.info(f"Loaded {split} with #samples: {len(dataset)}")

        self.datasets[split] = dataset
        return self.datasets[split]
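The train-time shuffle above permutes whole beams rather than rows: the starts of mt_beam-sized groups are shuffled, then within-beam offsets are re-attached, so all hypotheses for one source sentence stay contiguous. A small numpy sketch of the index construction (mt_beam and the toy length are assumptions):

import numpy as np

mt_beam, n = 3, 9  # 3 hypotheses per source sentence, 9 rows total
start_idx = np.arange(0, n, mt_beam)  # one group start per sentence: [0, 3, 6]
np.random.default_rng(0).shuffle(start_idx)

idx = np.arange(0, mt_beam)
shuffle = (np.tile(idx, (len(start_idx), 1)).reshape(-1)
           + np.tile(start_idx, (mt_beam, 1)).transpose().reshape(-1))
print(shuffle)  # e.g. [3 4 5 0 1 2 6 7 8] -- each group of 3 stays contiguous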
Example #9
def load_glue_data(
        task_path,
        mydict,
        mode='train'
):  # a big list; each item is a document matrix, and each item in the matrix is a node's value, for token_id and ...
    # dataset = data_utils.load_indexed_dataset(path,mydict,'mmap',combine=False,)
    # dataset = TokenBlockDataset(dataset,dataset.sizes,512 - 1,pad=mydict.pad(),eos=mydict.eos(), break_mode='complete',)
    # dataset = PrependTokenDataset(dataset, mydict.bos())
    #dataset=[]
    #input1=open(input_path1,'r').readlines()#[:10000]

    #label=open(label_path,'r').readlines()

    input0 = data_utils.load_indexed_dataset(
        os.path.join(task_path, 'input0', mode),
        mydict,
        'mmap',
        combine=False,
    )
    assert input0 is not None, 'could not find dataset: {}'.format(
        os.path.join(task_path, 'input0', mode))

    input1 = data_utils.load_indexed_dataset(
        os.path.join(task_path, 'input1', mode),
        mydict,
        'mmap',
        combine=False,
    )
    input0 = PrependTokenDataset(input0, mydict.bos())
    if input1 is None:
        src_tokens = input0
    else:
        input1 = PrependTokenDataset(input1, mydict.eos())
        src_tokens = ConcatSentencesDataset(input0, input1)

    if not 'STS-B' in task_path:
        label_dictionary = Dictionary.load(
            os.path.join(task_path, 'label', 'dict.txt'))
        label_dictionary.add_symbol('<mask>')
        #label_dataset = make_dataset('label', label_dictionary)
        label_dataset = data_utils.load_indexed_dataset(
            os.path.join(task_path, 'label', mode),
            label_dictionary,
            'mmap',
            combine=False,
        )
        if label_dataset is not None:

            label = OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=label_dictionary.eos(),
                ),
                offset=-label_dictionary.nspecial,
            )
    else:
        label_path = "{0}.label".format(os.path.join(task_path, 'label', mode))
        if os.path.exists(label_path):

            def parse_regression_target(i, line):
                values = line.split()
                assert len(values) == 1, \
                    f'expected a single regression target value on line {i}, found: "{line}"'
                return [float(x) for x in values]

            with open(label_path) as h:

                label = RawLabelDataset([
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(h.readlines())
                ])
    print('data size: ', len(src_tokens), len(label))
    assert len(src_tokens) == len(label)
    # with data_utils.numpy_seed(self.args.seed):
    #     shuffle = np.random.permutation(len(src_tokens))

    # src_tokens = maybe_shorten_dataset(
    #     src_tokens,
    #     split,
    #     self.args.shorten_data_split_list,
    #     self.args.shorten_method,
    #     self.args.max_positions,
    #     self.args.seed,
    # )
    # input_data1=[]
    # input_data2=[]
    #label_list=[]
    # for line in input1:
    #     if len(line.strip())==0:
    #         input_data1.append([])
    #     else:
    #         line = line.strip().split(' ')
    #         input_data1.append([int(x) for x in line])
    # if input_path2:
    #     input2=open(input_path2,'r').readlines()
    #     for line in input2:
    #         if len(line.strip())==0:
    #             input_data2.append([])
    #         else:
    #             line = line.strip().split(' ')
    #             input_data2.append([int(x) for x in line])
    # if task=='QNLI':
    #     for line in label:
    #         line = line.strip()
    #         if line=='entailment':
    #             label_list.append(int(1))
    #         else:
    #             assert line=='not_entailment'
    #             label_list.append(int(0))
    # else:
    #     for line in label:
    #         line = line.strip()
    #         label_list.append(int(line))
    # print('data length: ',len(input_data1),len(input_data2))
    # assert len(input_data1)==len(label_list)
    # if len(input_data2)!=0:
    #     assert len(input_data1)==len(input_data2)

    return src_tokens, label
Example #10
        def lang_dataset(lang):
            input0 = make_dataset('input0', lang, self.source_dictionary)
            assert input0 is not None, 'could not find dataset: {}'.format(
                get_path('input0', lang, split))
            input1 = make_dataset('input1', lang, self.source_dictionary)

            if self.args.init_token is not None:
                input0 = PrependTokenDataset(input0, self.args.init_token)

            if input1 is None:
                src_tokens = input0
            else:
                if self.args.separator_token is not None:
                    input1 = PrependTokenDataset(input1,
                                                 self.args.separator_token)

                src_tokens = ConcatSentencesDataset(input0, input1)

            with data_utils.numpy_seed(self.args.seed):
                shuffle = np.random.permutation(len(src_tokens))

            if self.args.truncate_sequence:
                src_tokens = TruncateDataset(src_tokens,
                                             self.args.max_positions)

            dataset = {
                'id': IdDataset(),
                'net_input': {
                    'src_tokens': RightPadDataset(
                        src_tokens,
                        pad_idx=self.source_dictionary.pad(),
                    ),
                    'src_lengths': NumelDataset(src_tokens, reduce=False),
                },
                'nsentences': NumSamplesDataset(),
                'ntokens': NumelDataset(src_tokens, reduce=True),
            }

            if not self.args.regression_target:
                label_dataset = make_dataset('label', lang,
                                             self.target_dictionary)
                if label_dataset is not None:
                    dataset.update(target=OffsetTokensDataset(
                        StripTokenDataset(
                            label_dataset,
                            id_to_strip=self.target_dictionary.eos(),
                        ),
                        offset=-self.target_dictionary.nspecial,
                    ))
            else:
                label_path = "{0}.label".format(get_path('label', lang, split))
                if os.path.exists(label_path):
                    dataset.update(target=RawLabelDataset([
                        float(x.strip()) for x in open(label_path).readlines()
                    ]))

            nested_dataset = NestedDictionaryDataset(
                dataset,
                sizes=[src_tokens.sizes],
            )

            if self.args.no_shuffle:
                dataset = nested_dataset
            else:
                dataset = SortDataset(
                    nested_dataset,
                    # shuffle
                    sort_order=[shuffle],
                )

            print("| Loaded {0} with #samples: {1}".format(
                split, len(dataset)))
            return dataset
Example #11
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(key, split):
            return os.path.join(self.args.data, key, split)

        def make_dataset(key, dictionary):
            split_path = get_path(key, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        input0 = make_dataset("input0", self.source_dictionary)
        assert input0 is not None, "could not find dataset: {}".format(
            get_path("input0", split))
        input1 = make_dataset("input1", self.source_dictionary)

        if self.args.init_token is not None:
            input0 = PrependTokenDataset(input0, self.args.init_token)

        if input1 is None:
            src_tokens = input0
        else:
            if self.args.separator_token is not None:
                input1 = PrependTokenDataset(input1, self.args.separator_token)

            src_tokens = ConcatSentencesDataset(input0, input1)

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(src_tokens))

        src_tokens = maybe_shorten_dataset(
            src_tokens,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            self.args.max_positions,
            self.args.seed,
        )

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens":
                RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths":
                NumelDataset(src_tokens, reduce=False),
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        if self.args.add_prev_output_tokens:
            prev_tokens_dataset = RightPadDataset(
                RollDataset(src_tokens, 1),
                pad_idx=self.dictionary.pad(),
            )
            dataset["net_input"].update(
                prev_output_tokens=prev_tokens_dataset, )

        label_path = "{0}.npz".format(get_path("label", split))
        if os.path.exists(label_path):
            csr_matrix = load_npz(label_path)
            dataset.update(target=CSRLabelDataset(csr_matrix))

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        if self.args.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        logger.info("Loaded {0} with #samples: {1}".format(
            split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
Example #12
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(type, split):
            return os.path.join(self.args.data, type, split)

        def make_dataset(type, dictionary):
            split_path = get_path(type, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        # inputs are loaded similarly to sentence_prediction
        input0 = make_dataset("input0", self.source_dictionary)  # question
        input1 = make_dataset("input1", self.source_dictionary)  # context

        # src_tokens: <init_token> input0 <separator_token> input1 <eos_token>
        if self.args.init_token is not None:
            input0 = PrependTokenDataset(input0, self.args.init_token)
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        if self.args.max_context_length is not None:
            input1 = TruncateDataset(input1, self.args.max_context_length)
        src_tokens = ConcatSentencesDataset(input0, input1)
        if self.args.truncate_sequence:
            src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(src_tokens))

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens":
                RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths":
                NumelDataset(src_tokens, reduce=False),
                "input0_lengths":
                NumelDataset(
                    input0, reduce=False
                ),  # question length (init_token possibly included)
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        # labels (spans) are loaded similarly to sentence_ranking
        label_path = "{}.label".format(get_path("label", split))

        def _process_label(positions, input0_length, truncate_sequence,
                           max_positions):
            """Process a span [start:end] to the input range.
            After processing, tokens can be accessed by tokens[start:end+1].
            TODO: change inputs to reflect this change in the first place.
            """
            start, end = [
                pos + input0_length + (self.args.separator_token is not None)
                for pos in positions
            ]
            end -= 1  # [0, 511]
            if truncate_sequence:
                if start >= max_positions:
                    start, end = max_positions - 1, max_positions - 1  # not predictable
                elif end >= max_positions:
                    end = max_positions - 1
            return start, end

        if os.path.exists(label_path):
            with open(label_path) as h:
                dataset.update(target=RawLabelDataset([
                    _process_label(
                        tuple(int(pos) for pos in x.split()),
                        dataset["net_input"]["input0_lengths"][i],
                        self.args.truncate_sequence,
                        self.max_positions(),
                    ) for i, x in enumerate(
                        h.readlines())  # (start_position, end_position)
                ]))

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        if self.args.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        logger.info("Loaded {0} with #samples: {1}".format(
            split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
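_process_label converts a context-relative span into full-input coordinates: the question length (plus one for the separator token, when set) is added to both endpoints, the end is made inclusive, and both are clamped when truncation cuts the span. A worked toy case (all numbers are illustrative):

# assumed: question of length 12 (init token included), separator present,
# answer span [4, 8) inside the context, max_positions large enough
input0_length, has_sep = 12, True
start, end = (pos + input0_length + has_sep for pos in (4, 8))
end -= 1  # make the span inclusive, so tokens[start:end + 1] is the answer
print(start, end)  # 17 20 -> context positions 4..7 map to input tokens 17..20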
Example #13
    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        def get_path(type, split):
            return os.path.join(self.args.data, type, split)

        def make_dataset(type, dictionary):
            split_path = get_path(type, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            return dataset

        # input0 is source, input1 is synthetic target, input2 is reference
        input0 = make_dataset(self.args.input0, self.source_dictionary)
        assert input0 is not None, 'could not find dataset: {}'.format(
            get_path(self.args.input0, split))
        input1 = make_dataset(self.args.input1, self.source_dictionary)

        if self.args.init_token is not None:
            input0 = PrependTokenDataset(input0, self.args.init_token)

        if self.args.input2 is not None:
            input2 = make_dataset(self.args.input2, self.source_dictionary)

        if self.args.input2 is not None and self.add_ref_prob > 0 and split != 'valid':
            input3 = PrependTokenDataset(input2, self.args.separator_token)
        else:
            input3 = None

        if input1 is None:
            src_tokens = input0
        else:
            if self.args.separator_token is not None:
                input1 = PrependTokenDataset(input1, self.args.separator_token)

            if self.args.input2 is not None and self.add_ref_prob > 0. and split != 'valid':
                src_tokens = ConcatSentencesDataset(
                    input0,
                    input3,
                    input1,
                    add_ref_prob=self.add_ref_prob,
                    drop_ref_rate=self.args.dropout_ref,
                    pad_idx=self.source_dictionary.pad(),
                    eos_idx=self.source_dictionary.eos(),
                    bos_idx=self.source_dictionary.bos())
            else:
                src_tokens = ConcatSentencesDataset(input0, input1)

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(src_tokens))

        if self.args.truncate_sequence:
            src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

        if self.args.input2 is not None and self.args.add_tran_loss:
            # create masked input and targets
            mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary) \
                if self.args.mask_whole_words else None
            ref_dataset, ref_target_dataset = MaskTokensDataset.apply_mask(
                input2,
                self.source_dictionary,
                pad_idx=self.source_dictionary.pad(),
                mask_idx=self.mask_idx,
                seed=self.args.seed,
                mask_prob=self.args.mask_prob,
                leave_unmasked_prob=self.args.leave_unmasked_prob,
                random_token_prob=self.args.random_token_prob,
                freq_weighted_replacement=self.args.freq_weighted_replacement,
                mask_whole_words=mask_whole_words,
            )

            if self.args.separator_token is not None:
                input2 = PrependTokenDataset(ref_dataset,
                                             self.args.separator_token)
            parallel_src_tokens = ConcatSentencesDataset(input0, input2)
            if self.args.truncate_sequence:
                parallel_src_tokens = TruncateDataset(parallel_src_tokens,
                                                      self.args.max_positions)

        dataset = {
            'id': IdDataset(),
            'net_input': {
                'src_tokens': RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                'src_lengths': NumelDataset(src_tokens, reduce=False),
            },
            'nsentences': NumSamplesDataset(),
            'ntokens': NumelDataset(src_tokens, reduce=True),
        }

        if self.args.input2 is not None and self.args.add_tran_loss:
            dataset['net_input']['parallel_src_tokens'] = RightPadDataset(
                parallel_src_tokens,
                pad_idx=self.source_dictionary.pad(),
            )

        if self.args.add_prev_output_tokens:
            prev_tokens_dataset = RightPadDataset(
                RollDataset(src_tokens, 1),
                pad_idx=self.dictionary.pad(),
            )
            dataset['net_input'].update(prev_output_tokens=prev_tokens_dataset)

        if not self.args.regression_target:
            label_dataset = make_dataset('label', self.label_dictionary)
            if label_dataset is not None:
                dataset.update(target=OffsetTokensDataset(
                    StripTokenDataset(
                        label_dataset,
                        id_to_strip=self.label_dictionary.eos(),
                    ),
                    offset=-self.label_dictionary.nspecial,
                ))
            if self.args.input2 is not None and self.args.add_tran_loss:
                # used as translation target when calculating loss
                dataset.update(parallel_target=RightPadDataset(
                    ref_target_dataset,
                    pad_idx=self.source_dictionary.pad(),
                ))
        else:
            label_path = "{0}.label".format(get_path('label', split))
            if os.path.exists(label_path):

                def parse_regression_target(i, line):
                    values = line.split()
                    assert len(values) == self.args.num_classes, \
                        f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"'
                    return [float(x) for x in values]

                dataset.update(target=RawLabelDataset([
                    parse_regression_target(i, line.strip())
                    for i, line in enumerate(open(label_path).readlines())
                ]))

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
            all_sizes=src_tokens.all_sizes
            if self.args.add_target_num_tokens else None,
            padding_idx=self.source_dictionary.pad(),
            add_ref_prob=self.add_ref_prob if split != 'valid' else 0.,
        )

        if self.args.no_shuffle:
            dataset = nested_dataset
        else:
            dataset = SortDataset(
                nested_dataset,
                # shuffle
                sort_order=[shuffle],
            )

        logger.info("Loaded {0} with #samples: {1}".format(
            split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]
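Unlike the stock class, this ConcatSentencesDataset takes add_ref_prob and drop_ref_rate, so the reference segment is spliced in only probabilistically during training. A rough torch sketch of that sampling decision (pure illustration of the idea, not the project's code):

import torch

def maybe_concat_with_ref(src, ref, hyp, add_ref_prob, drop_ref_rate):
    # with prob add_ref_prob consider including the reference, and even then
    # drop it with prob drop_ref_rate (a form of reference dropout)
    use_ref = (torch.rand(()).item() < add_ref_prob
               and torch.rand(()).item() >= drop_ref_rate)
    parts = [src, ref, hyp] if use_ref else [src, hyp]
    return torch.cat(parts)

sample = maybe_concat_with_ref(torch.tensor([0, 5, 6]), torch.tensor([4, 7]),
                               torch.tensor([4, 8, 9]), 0.5, 0.1)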