Example #1
def train_valid_split(dataset, valid_ratio=0.05):
    """Split the dataset into training and validation sets.

    Parameters
    ----------
    dataset : list
        A list of training samples.
    valid_ratio : float, default 0.05
        Proportion of training samples to use for validation set
        range: [0, 1]

    Returns
    -------
    train : SimpleDataset
    valid : SimpleDataset
    """
    if not 0.0 <= valid_ratio <= 1.0:
        raise ValueError('valid_ratio should be in [0, 1]')

    num_train = len(dataset)
    num_valid = np.ceil(num_train * valid_ratio).astype('int')
    indices = np.arange(num_train)

    np.random.shuffle(indices)
    valid = SimpleDataset([dataset[indices[i]] for i in range(num_valid)])
    train = SimpleDataset([dataset[indices[i + num_valid]] for i in range(num_train - num_valid)])
    return train, valid
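A minimal usage sketch, assuming numpy is imported as np and SimpleDataset comes from mxnet.gluon.data (both already required by the function above):

from mxnet.gluon.data import SimpleDataset
import numpy as np

# Toy dataset of 100 (feature, label) pairs; hold out 10% for validation.
toy = SimpleDataset([(i, i % 2) for i in range(100)])
train, valid = train_valid_split(toy, valid_ratio=0.1)
print(len(train), len(valid))  # 90 10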
Example #2
def test_concatenation():
    datasets = [
        SimpleDataset([1, 2, 3, 4]),
        SimpleDataset([5, 6]),
        SimpleDataset([8, 0, 9]),
    ]
    dataset = nlp.data.ConcatDataset(datasets)
    assert len(dataset) == 9
    assert dataset[0] == 1
    assert dataset[5] == 6
Example #3
    def __call__(self, corpus):
        """Batchify a dataset.

        Parameters
        ----------
        corpus : mxnet.gluon.data.Dataset
            A flat dataset to be batchified.

        Returns
        -------
        mxnet.gluon.data.Dataset
            Batches of numericalized samples such that the recurrent states
            from the last batch connect with the current batch for each sample.
            Each element of the Dataset is a tuple of data and label arrays
            for BPTT, each of shape (seq_len, batch_size).
        """
        if self._last_batch == 'keep':
            coded = self._vocab[list(corpus)]
            sample_len = math.ceil(float(len(coded)) / self._batch_size)
            padding_size = _slice_pad_length(sample_len, self._seq_len + 1, 1) * \
                self._batch_size + sample_len * self._batch_size - len(coded)
            coded.extend([self._vocab[self._vocab.padding_token]] *
                         int(padding_size))
            assert len(coded) % self._batch_size == 0
            assert not _slice_pad_length(
                len(coded) / self._batch_size, self._seq_len + 1, 1)
        else:
            sample_len = len(corpus) // self._batch_size
            coded = self._vocab[corpus[:sample_len * self._batch_size]]
        data = mx.nd.array(coded).reshape((self._batch_size, -1)).T
        batches = slice_sequence(data, self._seq_len + 1, overlap=1)

        return SimpleDataset(batches).transform(_split_data_label, lazy=False)
Example #4
def preprocess_dataset(dataset, transform, num_workers=8):
    """Use multiprocessing to perform transform for dataset.

    Parameters
    ----------
    dataset: dataset-like object
        Source dataset.
    transform: callable
        Transformer function.
    num_workers: int, default 8
        The number of multiprocessing workers to use for data preprocessing.

    Returns
    -------
    dataset : SimpleDataset
        The transformed dataset; each sample is a 6-tuple of features.
    dataset_len : list of int
        The final element (a length) recorded for each transformed sample.
    """
    worker_fn = partial(_worker_fn, transform=transform)
    start = time.time()

    pool = mp.Pool(num_workers)
    dataset_transform = []
    dataset_len = []

    for data in pool.map(worker_fn, dataset):
        if data:
            for _data in data:
                dataset_transform.append(_data[:-1])
                dataset_len.append(_data[-1])

    dataset = SimpleDataset(dataset_transform).transform(
        lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))

    end = time.time()
    pool.close()

    print("Done! Transform dataset costs %.2f seconds." % (end - start))
    return dataset, dataset_len
Example #5
    def bptt_batchify(self, vocab, seq_len, batch_size, last_batch='keep'):
        """Transform the dataset into batches of numericalized samples, in the way that the
        recurrent states from last batch connects with the current batch for each sample.

        Each sample is of shape `(seq_len, batch_size)`. When `last_batch='keep'`, the first
        dimension of last sample may be shorter than `seq_len`.

        Parameters
        ----------
        vocab : gluonnlp.Vocab
            The vocabulary to use for numericalizing the dataset. Each token will be mapped to the
            index according to the vocabulary.
        seq_len : int
            The length of each of the samples for truncated back-propagation-through-time (TBPTT).
        batch_size : int
            The number of samples in each batch.
        last_batch : {'keep', 'discard'}
            How to handle the last batch if the remaining length is less than `seq_len`.

            keep - A batch with fewer samples than previous batches is returned.
            discard - The last batch is discarded if it is incomplete.
        """
        data = self.batchify(vocab, batch_size)
        batches = slice_sequence(data, seq_len + 1, overlap=1)
        if last_batch == 'keep':
            sample_len = len(self._data[0]) // batch_size
            has_short_batch = _slice_pad_length(sample_len * batch_size,
                                                seq_len + 1, 1) > 0
            if has_short_batch:
                batches.append(data[seq_len * len(batches):, :])
        return SimpleDataset(batches).transform(
            lambda x: (x[:min(len(x) - 1, seq_len), :], x[1:, :]))
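A minimal usage sketch in the spirit of the gluonnlp language-model tutorial (WikiText-2 is downloaded on first use; the sequence length and batch size are illustrative):

import gluonnlp as nlp

train_dataset = nlp.data.WikiText2(segment='train', bos=None, eos='<eos>',
                                   skip_empty=False)
vocab = nlp.Vocab(nlp.data.Counter(train_dataset),
                  padding_token=None, bos_token=None)
train_data = train_dataset.bptt_batchify(vocab, seq_len=35, batch_size=20,
                                         last_batch='discard')
data, target = train_data[0]  # each of shape (35, 20)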
Example #6
def test_sorted_sampler():
    N = 1000
    dataset = SimpleDataset([np.random.normal(0, 1, (np.random.randint(10, 100), 1, 1))
                             for _ in range(N)])
    gt_sample_id = sorted(range(len(dataset)), key=lambda i: dataset[i].shape, reverse=True)
    sample_ret = list(SortedSampler([ele.shape[0] for ele in dataset]))
    for lhs, rhs in zip(gt_sample_id, sample_ret):
        assert lhs == rhs
Example #7
    def preprocess(self, data):
        if data[0].get('data') is not None:
            data = ast.literal_eval(data[0].get('data').decode('utf-8'))

        paragraph = data[0].get('paragraph')
        question = data[0].get('question')

        # format is: example_id, qas_id, question_text, paragraph_text,
        #            orig_answer_text, answer_offset, is_impossible
        examples, features = self._data_transformer(
            (0, 0, question, paragraph, [''], [0], 0))

        # features are: example_id, input_ids, segment_ids,
        #               valid_length, start_position, end_position
        self._dev_dataset = SimpleDataset(examples)
        self._features = SimpleDataset(features)

        return self._features
Example #8
def test_bert_sentencepiece_sentences_transform():
    url = 'http://repo.mxnet.io/gluon/dataset/vocab/test-682b5d15.bpe'
    with warnings.catch_warnings():
        # UserWarning: File test-682b5d15.bpe exists in file system so the downloaded file is deleted
        warnings.simplefilter("ignore")
        f = download(url, overwrite=True)
    bert_vocab = BERTVocab.from_sentencepiece(f)
    bert_tokenizer = t.BERTSPTokenizer(f, bert_vocab, lower=True)
    assert bert_tokenizer.is_first_subword(u'▁this')
    assert not bert_tokenizer.is_first_subword(u'this')
    max_len = 36
    data_train_raw = SimpleDataset(
        [[u'This is a very awesome, life-changing sentence.']])
    transform = t.BERTSentenceTransform(bert_tokenizer,
                                        max_len,
                                        pad=True,
                                        pair=False)
    try:
        data_train = data_train_raw.transform(transform)
    except ImportError:
        warnings.warn(
            "Sentencepiece not installed, skip test_bert_sentencepiece_sentences_transform()."
        )
        return
    processed = list(data_train)[0]

    tokens = [
        u'▁this', u'▁is', u'▁a', u'▁very', u'▁a', u'w', u'es', u'om', u'e',
        u'▁', u',', u'▁life', u'▁', u'-', u'▁c', u'hang', u'ing', u'▁sentence',
        u'▁', u'.'
    ]
    token_ids = [bert_vocab[bert_vocab.cls_token]
                 ] + bert_tokenizer.convert_tokens_to_ids(tokens) + [
                     bert_vocab[bert_vocab.sep_token]
                 ]
    token_ids += [bert_vocab[bert_vocab.padding_token]
                  ] * (max_len - len(token_ids))

    # token ids
    assert all(processed[0] == np.array(token_ids, dtype='int32'))
    # sequence length
    assert processed[1].item() == len(tokens) + 2
    # segment id
    assert all(processed[2] == np.array([0] * max_len, dtype='int32'))
Example #9
def test_data_loader_able_to_read(squad_dev_and_vocab_provider):
    dataset, vocab_provider = squad_dev_and_vocab_provider
    transformer = SQuADTransform(vocab_provider, question_max_length,
                                 context_max_length)
    record = dataset[0]

    processed_dataset = SimpleDataset([transformer(*record)])
    loadable_data = SimpleDataset([(r[0], r[2], r[3], r[4], r[5], r[6])
                                   for r in processed_dataset])
    dataloader = DataLoader(loadable_data, batch_size=1)

    for data in dataloader:
        record_index, question_words, context_words, question_chars, context_chars, answers = data

        assert record_index is not None
        assert question_words is not None
        assert context_words is not None
        assert question_chars is not None
        assert context_chars is not None
        assert answers is not None
Example #10
    def corpus_to_dataset(self, corpus: ScoredCorpus) -> SimpleDataset:

        sents_expanded = []

        for sent_idx, sent_dict in enumerate(corpus.values()):
            sent = self._apply_tokenizer_opts(sent_dict['text'])
            tokens_original = [self._vocab.cls_token] + self._tokenizer(sent) + [self._vocab.eos_token]
            ids_original = np.array(self._tokenizer.convert_tokens_to_ids(tokens_original))

            # Enforce max length
            if len(ids_original) > self._max_length:
                logging.error("Line #{} is too long; will output score of 0 and omit in token counts (but not yet in word counts!)".format(sent_idx+1))
            else:
                sents_expanded += [(sent_idx, ids_original, len(ids_original), sent_dict['score'])]

        return SimpleDataset(sents_expanded)
Example #11
    def bptt_batchify(self, vocab, seq_len, batch_size, last_batch='keep'):
        """Transform the dataset into batches of numericalized samples, in the way that the
        recurrent states from last batch connects with the current batch for each sample.

        Each sample is of shape `(seq_len, batch_size)`. When `last_batch='keep'`, the first
        dimension of last sample may be shorter than `seq_len`.

        Parameters
        ----------
        vocab : gluonnlp.Vocab
            The vocabulary to use for numericalizing the dataset. Each token will be mapped to the
            index according to the vocabulary.
        seq_len : int
            The length of each of the samples for truncated back-propagation-through-time (TBPTT).
        batch_size : int
            The number of samples in each batch.
        last_batch : {'keep', 'discard'}
            How to handle the last batch if the remaining length is less than `seq_len`.

            - keep: A batch with fewer samples than previous batches is returned. vocab.padding_token
              is used to pad the last batch based on the batch size.

            - discard: The last batch is discarded if it's smaller than `(seq_len, batch_size)`.
        """
        if last_batch not in ['keep', 'discard']:
            raise ValueError(
                'Got invalid last_batch: "{}". Must be "keep" or "discard".'.
                format(last_batch))

        if last_batch == 'keep':
            if not vocab.padding_token:
                raise ValueError('vocab.padding_token must be specified '
                                 'in vocab when last_batch="keep".')
            coded = vocab[self._data[0]]
            sample_len = math.ceil(float(len(coded)) / batch_size)
            padding_size = _slice_pad_length(sample_len, seq_len + 1, 1) * batch_size + \
                sample_len * batch_size - len(coded)
            coded.extend([vocab[vocab.padding_token]] * int(padding_size))
            assert len(coded) % batch_size == 0
            assert not _slice_pad_length(len(coded) / batch_size, seq_len + 1, 1)
        else:
            sample_len = len(self._data[0]) // batch_size
            coded = vocab[self._data[0][:sample_len * batch_size]]
        data = mx.nd.array(coded).reshape((batch_size, -1)).T
        batches = slice_sequence(data, seq_len + 1, overlap=1)

        return SimpleDataset(batches).transform(_split_data_label)
Example #12
    def bptt_batchify(self, vocab, seq_len, batch_size, last_batch='keep'):
        """Transform the dataset into batches of numericalized samples, in the way that the
        recurrent states from last batch connects with the current batch for each sample.

        Each sample is of shape `(seq_len, batch_size)`. When `last_batch='keep'`, the first
        dimension of last sample may be shorter than `seq_len`.

        Parameters
        ----------
        vocab : gluonnlp.Vocab
            The vocabulary to use for numericalizing the dataset. Each token will be mapped to the
            index according to the vocabulary.
        seq_len : int
            The length of each of the samples for truncated back-propagation-through-time (TBPTT).
        batch_size : int
            The number of samples in each batch.
        last_batch : {'keep', 'discard'}
            How to handle the last batch if the remaining length is less than `seq_len`.

            - keep: A batch with fewer samples than previous batches is returned. vocab.padding_token
              is used to pad the last batch based on the batch size.

            - discard: The last batch is discarded if it's smaller than `(seq_len, batch_size)`.
        """
        data = self.batchify(vocab, batch_size)
        batches = slice_sequence(data, seq_len + 1, overlap=1)
        if last_batch == 'keep':
            sample_len = len(self._data[0]) // batch_size
            has_short_batch = _slice_pad_length(sample_len * batch_size,
                                                seq_len + 1, 1) > 0
            if has_short_batch:
                ctx = data[0].context if len(data) else None
                last_batch = self._data[0][seq_len * batch_size *
                                           len(batches):]
                excess_size = len(last_batch) % batch_size
                if excess_size:
                    assert vocab.padding_token, 'Padding token must be specified in vocab when ' \
                                                'last_batch="keep".'
                    padding_size = batch_size - excess_size
                    last_batch.extend([vocab.padding_token] * padding_size)
                batches.append(
                    mx.nd.array(vocab[last_batch],
                                ctx=ctx).reshape(batch_size, -1).T)
        return SimpleDataset(batches).transform(
            lambda x: (x[:min(len(x) - 1, seq_len), :], x[1:, :]))
Example #13
def preprocess_dataset(dataset, question_max_length, context_max_length):
    """Process SQuAD dataset by creating NDArray version of data

    :param Dataset dataset: SQuAD dataset
    :param int question_max_length: Maximum length of question (padded or trimmed to that size)
    :param int context_max_length: Maximum length of context (padded or trimmed to that size)

    Returns
    -------
    SimpleDataset
        Dataset of preprocessed records
    """
    vocab_provider = VocabProvider(dataset)
    transformer = SQuADTransform(vocab_provider, question_max_length,
                                 context_max_length)
    processed_dataset = SimpleDataset(
        dataset.transform(transformer, lazy=False))
    return processed_dataset
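A hedged usage sketch, assuming gluonnlp's bundled SQuAD dataset and illustrative maximum lengths (VocabProvider and SQuADTransform come from the same question-answering script as the function above):

import gluonnlp as nlp

# Download the SQuAD 1.1 dev set and run the full preprocessing pipeline.
squad_dev = nlp.data.SQuAD(segment='dev', version='1.1')
processed = preprocess_dataset(squad_dev, question_max_length=30,
                               context_max_length=400)
print(len(processed))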
Example #14
    def __call__(self, data):
        """Batchify a dataset.

        Parameters
        ----------
        data : mxnet.gluon.data.Dataset
            A flat dataset to be batchified.

        Returns
        -------
        mxnet.gluon.data.Dataset
            NDArray of shape (len(data) // N, N), where N is the batch_size,
            wrapped in a mxnet.gluon.data.SimpleDataset. Excess tokens that
            do not fit evenly into the batches are discarded.
        """
        batch_num = len(data) // self._batch_size
        return SimpleDataset(
            mx.nd.array(data[:batch_num * self._batch_size]).reshape(
                self._batch_size, -1).T)
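The reshape and truncation behaviour is easy to check in isolation; a standalone sketch with a toy corpus of 10 token ids:

import mxnet as mx

data = list(range(10))           # toy "corpus" of 10 token ids
batch_size = 3
batch_num = len(data) // batch_size
out = mx.nd.array(data[:batch_num * batch_size]).reshape(batch_size, -1).T
print(out.shape)  # (3, 3): each column is one contiguous stream; token 9 is discarded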
Example #15
    def corpus_to_dataset(self, corpus: Corpus) -> SimpleDataset:

        sents_expanded = []

        for sent_idx, sent in enumerate(corpus.values()):
            sent = self._apply_tokenizer_opts(sent)
            if self._add_special:
                tokens_original = [self._vocab.cls_token] + self._tokenizer(sent) + [self._vocab.sep_token]
            else:
                tokens_original = [self._vocab.cls_token] + self._tokenizer(sent)
            ids_original = np.array(self._tokenizer.convert_tokens_to_ids(tokens_original))

            # Enforce max length
            if len(ids_original) > self._max_length:
                logging.error("Line #{} is too long; will output score of 0 and omit in token counts (but not yet in word counts!)".format(sent_idx+1))
            else:
                ids_masked = self._ids_to_masked(ids_original)

                if self._wwm:
                    # TODO: Wasteful, but for now "deserialize" the mask set into individual positions
                    # The masks are already applied in ids
                    for ids, mask_set in ids_masked:
                        for mask_el, id_original in zip(mask_set, ids_original[mask_set]):
                            sents_expanded.append((
                                    sent_idx,
                                    ids,
                                    len(ids_original),
                                    mask_el,
                                    [id_original],
                                1))
                else:
                    sents_expanded += [(
                            sent_idx,
                            ids,
                            len(ids_original),
                            mask_set,
                            ids_original[mask_set],
                            1)
                        for ids, mask_set in ids_masked]

        return SimpleDataset(sents_expanded)
Example #16
    def corpus_to_dataset(self, corpus: Corpus) -> SimpleDataset:

        sents_expanded = []

        for sent_idx, sent in enumerate(corpus.values()):
            sent = self._apply_tokenizer_opts(sent)
            ids_original = np.array(
                self._tokenizer.encode(sent, add_special_tokens=True))

            # Enforce max length
            if len(ids_original) > self._max_length:
                logging.error(
                    "Line #{} is too long; will output score of 0 and omit in token counts (but not yet in word counts!)"
                    .format(sent_idx + 1))
            else:
                ids_masked = self._ids_to_masked(ids_original)
                sents_expanded += [(sent_idx, ids, len(ids_original), mask_set,
                                    ids_original[mask_set], 1)
                                   for ids, mask_set in ids_masked]
                # print([self._tokenizer.convert_ids_to_tokens(sent[1]) for sent in sents_expanded[:3] + sents_expanded[-3:]])

        return SimpleDataset(sents_expanded)
Example #17
    def __call__(self, data):
        batched_data = super(LanguageModelBPTT, self).__call__(data)
        batches = slice_sequence(batched_data, self.seq_len + 1, overlap=1)
        if self.last_batch == 'keep':
            sample_len = len(data) // self.batch_size
            has_short_batch = _slice_pad_length(sample_len * self.batch_size,
                                                self.seq_len + 1, 1) > 0
            if has_short_batch:
                ctx = batched_data.context if len(batched_data) else None
                last_batch = data[self.seq_len * self.batch_size *
                                  len(batches):]
                excess_size = len(last_batch) % self.batch_size
                if excess_size:
                    assert self.vocab.padding_token, \
                        'Padding token must be specified in vocab when ' \
                        'last_batch="keep".'
                    padding_size = self.batch_size - excess_size
                    last_batch.extend([self.vocab.padding_token] *
                                      padding_size)
                batches.append(
                    mx.nd.array(self.vocab[last_batch],
                                ctx=ctx).reshape(self.batch_size, -1).T)
        return SimpleDataset(batches).transform(
            lambda x: (x[:min(len(x) - 1, self.seq_len), :], x[1:, :]))
Example #18
    def get_loader(self):
        def batchify_fn(list_data):

            input_words = []
            input_valid_lens = []
            input_segments = []

            target_words = []
            target_valid_lens = []
            target_segments = []

            target_actions = []
            target_pms = []
            list_input_texts = []

            target_idxs = []

            error_embs = []
            start_embs = []
            end_embs = []
            list_ids = []

            _list_target_texts = []

            if not self.config['csc_fixed']:

                if self.mode == 'train':

                    for train_data in list_data:

                        # print(train_data)
                        list_ids.append(train_data['id'])

                        str_input_text = train_data['input']
                        # no target means no error
                        if 'target' not in train_data:
                            str_target_text = str_input_text
                        else:
                            str_target_text = train_data['target']

                        input_data = self.transformer([str_input_text])

                        if len(str_input_text) > self.max_seq_len:  # exceeds max length
                            continue
                        target_data = self._transform_target(str_target_text)
                        input_word, input_valid_len, input_segment = nd.array([
                            input_data[0]
                        ]), nd.array([input_data[1]
                                      ]), nd.array([input_data[2]])
                        target_word, target_valid_len = nd.array(
                            [target_data[0]]), nd.array([target_data[1]])
                        # input_word, input_valid_len, input_segment = nd.array([input_data[0]]), nd.array([input_data[1]]), nd.array([input_data[2]])
                        target_segment = input_segment

                        _list_target_texts.append(str_target_text)
                        input_words.append(input_word.astype(np.float32))
                        input_valid_lens.append(
                            input_valid_len.astype(np.float32))
                        input_segments.append(input_segment.astype(np.float32))
                        target_words.append(target_word.astype(np.float32))
                        target_valid_lens.append(
                            target_valid_len.astype(np.float32))
                        target_segments.append(
                            target_segment.astype(np.float32))
                        # target_actions.append(target_action.astype(np.float32)); target_pms.append(target_pm.astype(np.float32));
                        list_input_texts.append(str_input_text)

                        _list_target_texts.append(str_target_text)
                        # error_embs.append(error_emb)#; start_embs.append(start_emb); end_embs.append(end_emb)

                    return nd.concat(*input_words, dim=0), nd.concat(
                        *input_valid_lens,
                        dim=0), nd.concat(*input_segments, dim=0), nd.concat(
                            *target_words, dim=0
                        ), nd.concat(*target_valid_lens, dim=0), nd.concat(
                            *target_segments,
                            dim=0), list_input_texts, _list_target_texts

                elif self.mode == 'test':

                    for test_data in list_data:

                        list_ids.append(test_data['id'])
                        str_input_text = test_data['text']
                        str_target_text = test_data['target']

                        # print('input => ', str_input_text)

                        # print('target => ', str_target_text)

                        input_data = self.transformer([str_input_text])
                        input_word, input_valid_len, input_segment = nd.array([
                            input_data[0]
                        ]), nd.array([input_data[1]
                                      ]), nd.array([input_data[2]])
                        input_words.append(input_word.astype(np.float32))
                        input_valid_lens.append(
                            input_valid_len.astype(np.float32))
                        input_segments.append(input_segment.astype(np.float32))
                        list_input_texts.append(str_input_text)
                        _list_target_texts.append(str_target_text)

                    return nd.concat(*input_words, dim=0), nd.concat(
                        *input_valid_lens, dim=0), nd.concat(
                            *input_segments, dim=0
                        ), list_input_texts, _list_target_texts, list_ids

            else:  # fixed length mode

                if self.mode == 'train':
                    for train_data in list_data:
                        list_ids.append(train_data['id'])
                        str_input_text = train_data['input']
                        # no target means no error
                        if 'target' not in train_data:
                            str_target_text = str_input_text
                        else:
                            str_target_text = train_data['target']
                        # print(str_target_text)
                        # print(train_data['correction'])

                        input_data = self.transformer([str_input_text])
                        input_word, input_valid_len, input_segment = nd.array([
                            input_data[0]
                        ]), nd.array([input_data[1]
                                      ]), nd.array([input_data[2]])

                        if input_word.shape[1] > self.max_seq_len:  # exceeds max length
                            continue

                        if 'correction' in train_data:
                            target_idx = self.gen_fixed_length_correction_emb(
                                input_word, train_data['correction'])
                        else:
                            target_idx = np.zeros([1, input_word.shape[1]])

                        # target_idx = self.gen_fixed_length_correction_emb(input_word, train_data['correction'])

                        # print(target_idx.shape)

                        # raise

                        # raise

                        _list_target_texts.append(str_target_text)
                        input_words.append(input_word.astype(np.float32))
                        input_valid_lens.append(
                            input_valid_len.astype(np.float32))
                        input_segments.append(input_segment.astype(np.float32))
                        # target_actions.append(target_action.astype(np.float32)); target_pms.append(target_pm.astype(np.float32));
                        list_input_texts.append(str_input_text)

                        target_idxs.append(target_idx)

                        _list_target_texts.append(str_target_text)

                    return nd.concat(*input_words, dim=0), nd.concat(
                        *input_valid_lens,
                        dim=0), nd.concat(*input_segments, dim=0), nd.concat(
                            *target_idxs,
                            dim=0), list_input_texts, _list_target_texts

            # return nd.concat(*input_words, dim = 0), nd.concat(*input_valid_lens, dim = 0), nd.concat(*input_segments, dim = 0), nd.concat(*target_words, dim = 0), nd.concat(*target_valid_lens, dim = 0), nd.concat(*target_segments, dim = 0)#, nd.concat(*target_actions, dim = 0), nd.concat(*target_pms, dim = 0), list_input_texts, list_target_texts

        self.dataset = SimpleDataset(self.data)
        if self.mode == 'test':
            shuffle = False
            last_batch = 'keep'
        else:
            shuffle = True
            last_batch = 'rollover'

        self.loader = DataLoader(self.dataset,
                                 batch_size=self.batch_size,
                                 batchify_fn=batchify_fn,
                                 shuffle=shuffle,
                                 last_batch=last_batch)

        return self.loader
Example #19
data_test_lengths = get_data_lengths(data_test)

with io.open(os.path.join(args.save_dir, 'val_gt.txt'), 'w',
             encoding='utf-8') as of:
    for ele in val_tgt_sentences:
        of.write(' '.join(ele) + '\n')

with io.open(os.path.join(args.save_dir, 'test_gt.txt'), 'w',
             encoding='utf-8') as of:
    for ele in test_tgt_sentences:
        of.write(' '.join(ele) + '\n')

data_train = data_train.transform(lambda src, tgt:
                                  (src, tgt, len(src), len(tgt)),
                                  lazy=False)
data_val = SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
                          for i, ele in enumerate(data_val)])
data_test = SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
                           for i, ele in enumerate(data_test)])

ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
    [mx.gpu(int(x)) for x in args.gpus.split(',')]

if args.src_max_len <= 0 or args.tgt_max_len <= 0:
    max_len = np.max([
        np.max(data_train_lengths, axis=0),
        np.max(data_val_lengths, axis=0),
        np.max(data_test_lengths, axis=0)
    ],
                     axis=0)
if args.src_max_len > 0:
    src_max_len = args.src_max_len
Example #20
imdb_tok_train = [tokenizer(t.lower()) for t in imdb_review_train]
counter = gluonnlp.data.count_tokens(
    itertools.chain.from_iterable(imdb_tok_train))
vocab = gluonnlp.Vocab(counter, bos_token="<s>", eos_token="</s>", min_freq=10)


def encode(toks):
    return [vocab[tok] for tok in toks]


imdb_x_train = [encode(toks) for toks in imdb_tok_train]

# Build data pipeline.
# TODO: Wrap x and y before making a dataset?
maxlen = max([len(x) for x in imdb_x_train])
dataset = SimpleDataset(imdb_x_train)
dataset = dataset.transform(PadSequence(maxlen))
dataset = dataset.transform(mxnet.nd.array)

# Build the model.
model_ctx = mxnet.cpu()
model = mxnet.gluon.nn.Sequential()
with model.name_scope():
    model.add(mxnet.gluon.nn.Embedding(len(vocab), embedding_size))
    model.add(mxnet.gluon.rnn.GRU(64, dropout=.2))
    model.add(mxnet.gluon.nn.Dense(1))
model.initialize(ctx=model_ctx)
loss = mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss()
opt = mxnet.gluon.Trainer(model.collect_params(), "sgd",
                          {"learning_rate": .01})
Example #21
    def get_loader(self):
        def batchify_fn(list_target_texts):

            input_words = []
            input_valid_lens = []
            input_segments = []
            target_words = []
            target_valid_lens = []
            target_segments = []
            target_actions = []
            target_pms = []
            list_input_texts = []
            _list_target_texts = []
            pm_error_idxs = []
            pm_add_idxs = []
            pm_remove_idxs = []

            if self.mode == 'train':  # temporary: 'test' mode should really go through the else branch

                for str_target_text in list_target_texts:

                    # if np.random.ranf() > 0.5:
                    #   str_input_text = self.structure.randomize_word_order(str_target_text)
                    # else:
                    # str_input_text = self.pinyin_sampler.errorize_sentence(str_target_text)
                    str_input_text, pm_error_idx, pm_add_idx, pm_remove_idx = self.errorize_pm(
                        str_target_text)
                    input_data = self.transformer([str_input_text])
                    target_data = self.transform_target(str_target_text)
                    if len(target_data[0]) > self.max_seq_len or input_data[
                            0].shape[0] > self.max_seq_len:  # exceeds max length
                        continue
                    pm_error_idx, pm_add_idx, pm_remove_idx = self.transform_pm_error(
                        pm_error_idx, pm_add_idx, pm_remove_idx)

                    input_word, input_valid_len, input_segment = nd.array([
                        input_data[0]
                    ]), nd.array([input_data[1]]), nd.array([input_data[2]])
                    target_word, target_valid_len = nd.array(
                        [target_data[0]]), nd.array([target_data[1]])
                    target_segment = input_segment

                    _list_target_texts.append(str_target_text)
                    input_words.append(input_word.astype(np.float32))
                    input_valid_lens.append(input_valid_len.astype(np.float32))
                    input_segments.append(input_segment.astype(np.float32))
                    target_words.append(target_word.astype(np.float32))
                    target_valid_lens.append(
                        target_valid_len.astype(np.float32))
                    target_segments.append(target_segment.astype(np.float32))
                    pm_add_idxs.append(pm_add_idx)
                    pm_error_idxs.append(pm_error_idx)
                    pm_remove_idxs.append(pm_remove_idx)
                    # target_actions.append(target_action.astype(np.float32)); target_pms.append(target_pm.astype(np.float32));

                    list_input_texts.append(str_input_text)

                return nd.concat(*input_words, dim=0), nd.concat(
                    *input_valid_lens,
                    dim=0), nd.concat(*input_segments, dim=0), nd.concat(
                        *target_words, dim=0
                    ), nd.concat(*target_valid_lens, dim=0), nd.concat(
                        *target_segments,
                        dim=0), nd.concat(*pm_error_idxs, dim=0), nd.concat(
                            *pm_add_idxs, dim=0), nd.concat(
                                *pm_remove_idxs,
                                dim=0), list_input_texts, _list_target_texts
            # return nd.concat(*input_words, dim = 0), nd.concat(*input_valid_lens, dim = 0), nd.concat(*input_segments, dim = 0), nd.concat(*target_words, dim = 0), nd.concat(*target_valid_lens, dim = 0), nd.concat(*target_segments, dim = 0)#, nd.concat(*target_actions, dim = 0), nd.concat(*target_pms, dim = 0), list_input_texts, list_target_texts

            else:

                # print(list_target_texts)
                # print(len(list_target_texts))
                assert (len(list_target_texts) == 1)
                # for test_pair in list_target_texts:
                str_input_text = list_target_texts[0][0]
                str_target_text = list_target_texts[0][1]
                return str_input_text, str_target_text

        self.dataset = SimpleDataset(self.data)
        shuffle = True if self.mode == 'train' else False
        self.loader = DataLoader(self.dataset,
                                 batch_size=self.batch_size,
                                 batchify_fn=batchify_fn,
                                 shuffle=shuffle,
                                 last_batch='rollover')

        return self.loader
Example #22
#!/usr/bin/env python

import itertools

from mxnet import gluon
import sentencepiece as spm
from gluonnlp.data import IMDB, SentencepieceTokenizer, PadSequence
from mxnet.gluon.data import SimpleDataset


# Load IMDB movie review dataset.
imdb_train = IMDB("train")
imdb_test = IMDB("test")
imdb_review_train, imdb_s_train = zip(*imdb_train)
imdb_y_train = SimpleDataset([1 if s > 5 else 0 for s in imdb_s_train])


# Train a sentencepiece tokenizer.
train_text_file = "train.txt"
model_prefix = "spm"
with open(train_text_file, "w") as f:
    for review in imdb_review_train:
        f.write(review.lower() + "\n")
spm_args = "--input={}".format(train_text_file)
spm_args += " --model_prefix={}".format(model_prefix)
spm_args += " --vocab_size=20000"
spm_args += " --pad_id=0"
spm_args += " --unk_id=1"
spm_args += " --bos_id=2"
spm_args += " --eos_id=3"
spm_args += " --model_type=unigram"
Example #23
def train_valid_split(dataset, valid_ratio=0.05, stratify=None):
    """Split the dataset into training and validation sets.

    Parameters
    ----------
    dataset : list
        A list of training samples.
    valid_ratio : float, default 0.05
        Proportion of training samples to use for validation set
        range: [0, 1]
    stratify : list, default None
        If not None, data is split in a stratified fashion,
        using the contents of stratify as class labels.

    Returns
    -------
    train : SimpleDataset
    valid : SimpleDataset
    """
    if not 0.0 <= valid_ratio <= 1.0:
        raise ValueError('valid_ratio should be in [0, 1]')

    if not stratify:
        num_train = len(dataset)
        num_valid = np.ceil(num_train * valid_ratio).astype('int')
        indices = np.arange(num_train)

        np.random.shuffle(indices)
        valid = SimpleDataset([dataset[indices[i]] for i in range(num_valid)])
        train = SimpleDataset([
            dataset[indices[i + num_valid]]
            for i in range(num_train - num_valid)
        ])

        return train, valid
    else:
        if not isinstance(stratify, list):
            raise TypeError('stratify should be a list')
        if not len(stratify) == len(dataset):
            raise ValueError('stratify should be the same length as num_train')

        classes, digitized = np.unique(stratify, return_inverse=True)
        n_classes = len(classes)
        num_class = np.bincount(digitized)
        num_valid = np.ceil(valid_ratio * num_class).astype('int')

        valid = []
        train = []

        for idx in range(n_classes):
            indices = np.nonzero(stratify == classes[idx])[0]
            np.random.shuffle(indices)
            valid += [dataset[indices[i]] for i in range(num_valid[idx])]
            train += [
                dataset[indices[i + num_valid[idx]]]
                for i in range(num_class[idx] - num_valid[idx])
            ]

        np.random.shuffle(valid)
        np.random.shuffle(train)

        train = SimpleDataset(train)
        valid = SimpleDataset(valid)

        return train, valid
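A minimal usage sketch for the stratified branch, assuming the same numpy and SimpleDataset imports as the function itself:

from mxnet.gluon.data import SimpleDataset

# 10 samples with a 50/50 label split; hold out 20% of each class.
samples = SimpleDataset([('sample %d' % i, i % 2) for i in range(10)])
labels = [label for _, label in samples]
train, valid = train_valid_split(samples, valid_ratio=0.2, stratify=labels)
print(len(train), len(valid))  # 8 2, one validation sample per class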
Example #24
# context = mx.cpu()  # Enable this to run on CPU
context = mx.gpu(0)  # Enable this to run on GPU

with open(os.path.join('data', 'annotations', 'captions.txt')) as f:
    lines = f.readlines()
    lines = [line.rstrip().split()[1:] for line in lines]

if ADD_EXTRA:
    with open(os.path.join('data', 'annotations',
                           'captions_extra_001-045.txt')) as f:
        lines_extra = f.readlines()
        lines_extra = [line.rstrip().split()[1:] for line in lines_extra]
    lines += lines_extra

tennis_caps = SimpleDataset(lines)

counter = nlp.data.count_tokens(itertools.chain.from_iterable(tennis_caps))
vocab = nlp.Vocab(counter,
                  unknown_token=None,
                  padding_token=None,
                  bos_token=None,
                  eos_token=None,
                  min_freq=1)
idx_to_counts = [counter[w] for w in vocab.idx_to_token]


def code(cap):
    return [vocab[token] for token in cap if token in vocab]

Example #25
    transforms.ToTensor(),
])

# %%

emnist_train_data, emnist_train_labels = extract_training_samples('balanced')
emnist_test_data, emnist_test_labels = extract_test_samples('balanced')

emnist_train_data = nd.array(255 - emnist_train_data[:, :, :, None])
emnist_test_data = nd.array(255 - emnist_test_data[:, :, :, None])

# %%
BS = 64

emnist_train_dataset = ArrayDataset(
    SimpleDataset(emnist_train_data).transform(transform_train_emnist),
    emnist_train_labels)
emnist_train_loader = DataLoader(emnist_train_dataset,
                                 shuffle=True,
                                 batch_size=BS)

emnist_test_dataset = ArrayDataset(
    SimpleDataset(emnist_test_data).transform(transform_test),
    emnist_test_labels)
emnist_test_loader = DataLoader(emnist_test_dataset, batch_size=BS)

# with SummaryWriter(logdir='./logs') as sw:
#    sw.add_histogram('emnist_classes', mx.nd.array([c for (f,c) in emnist_train_dataset]), bins=np.arange(-0.5, len(classes)+1))
#    sw.add_histogram('emnist_classes', mx.nd.array([c for (f,c) in emnist_test_dataset]), bins=np.arange(-0.5, len(classes)+1))

# %%
Example #26
def split_train_valid(data, n=0.8):
    data.shuffle()
    num = int(len(data) * n)
    return SimpleDataset(data[:num]), SimpleDataset(data[num:])
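If `data` is a plain mxnet SimpleDataset without an in-place shuffle, an index-shuffling variant could look like this sketch:

import numpy as np
from mxnet.gluon.data import SimpleDataset

def split_train_valid_indexed(data, n=0.8):
    # Shuffle indices rather than mutating the dataset in place.
    indices = np.random.permutation(len(data))
    num = int(len(data) * n)
    return (SimpleDataset([data[i] for i in indices[:num]]),
            SimpleDataset([data[i] for i in indices[num:]]))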
Example #27
    def ___get_loader(self):
        def batchify_fn(list_data):

            input_words = []
            input_valid_lens = []
            input_segments = []

            target_words = []
            target_valid_lens = []
            target_segments = []

            target_actions = []
            target_pms = []
            list_input_texts = []

            error_embs = []
            start_embs = []
            end_embs = []
            list_ids = []

            _list_target_texts = []

            if self.mode == 'train':

                for train_data in list_data:

                    # print(train_data)

                    list_ids.append(train_data['id'])

                    str_input_text = train_data['input']
                    if 'target' not in train_data:
                        str_target_text = str_input_text
                    else:
                        str_target_text = train_data['target']

                    # str_target_text = train_data['target']

                    # print('input text => ', str_input_text)

                    # print('target text => ', str_target_text)

                    input_data = self.transformer([str_input_text])

                    if len(str_input_text) > self.max_seq_len:  # exceeds max length
                        continue

                    target_data = self._transform_target(str_target_text)

                    input_word, input_valid_len, input_segment = nd.array([
                        input_data[0]
                    ]), nd.array([input_data[1]]), nd.array([input_data[2]])
                    target_word, target_valid_len = nd.array(
                        [target_data[0]]), nd.array([target_data[1]])

                    # input_word, input_valid_len, input_segment = nd.array([input_data[0]]), nd.array([input_data[1]]), nd.array([input_data[2]])
                    target_segment = input_segment

                    _list_target_texts.append(str_target_text)
                    input_words.append(input_word.astype(np.float32))
                    input_valid_lens.append(input_valid_len.astype(np.float32))
                    input_segments.append(input_segment.astype(np.float32))
                    target_words.append(target_word.astype(np.float32))
                    target_valid_lens.append(
                        target_valid_len.astype(np.float32))
                    target_segments.append(target_segment.astype(np.float32))
                    # target_actions.append(target_action.astype(np.float32)); target_pms.append(target_pm.astype(np.float32));

                    list_input_texts.append(str_input_text)

                    _list_target_texts.append(str_target_text)

                    # error_embs.append(error_emb)#; start_embs.append(start_emb); end_embs.append(end_emb)

                return nd.concat(*input_words, dim=0), nd.concat(
                    *input_valid_lens,
                    dim=0), nd.concat(*input_segments, dim=0), nd.concat(
                        *target_words, dim=0), nd.concat(
                            *target_valid_lens, dim=0), nd.concat(
                                *target_segments,
                                dim=0), list_input_texts, _list_target_texts

            elif self.mode == 'test':

                for test_data in list_data:

                    list_ids.append(test_data['id'])
                    str_input_text = test_data['text']
                    str_target_text = test_data['target']

                    # print('input => ', str_input_text)

                    # print('target => ', str_target_text)

                    input_data = self.transformer([str_input_text])
                    input_word, input_valid_len, input_segment = nd.array([
                        input_data[0]
                    ]), nd.array([input_data[1]]), nd.array([input_data[2]])
                    input_words.append(input_word.astype(np.float32))
                    input_valid_lens.append(input_valid_len.astype(np.float32))
                    input_segments.append(input_segment.astype(np.float32))
                    list_input_texts.append(str_input_text)
                    _list_target_texts.append(str_target_text)

                return nd.concat(*input_words, dim=0), nd.concat(
                    *input_valid_lens, dim=0), nd.concat(
                        *input_segments,
                        dim=0), list_input_texts, _list_target_texts, list_ids
            # return nd.concat(*input_words, dim = 0), nd.concat(*input_valid_lens, dim = 0), nd.concat(*input_segments, dim = 0), nd.concat(*target_words, dim = 0), nd.concat(*target_valid_lens, dim = 0), nd.concat(*target_segments, dim = 0)#, nd.concat(*target_actions, dim = 0), nd.concat(*target_pms, dim = 0), list_input_texts, list_target_texts

        self.dataset = SimpleDataset(self.data)
        if self.mode == 'test':
            shuffle = False
            last_batch = 'keep'
        else:
            shuffle = True
            last_batch = 'rollover'

        self.loader = DataLoader(self.dataset,
                                 batch_size=self.batch_size,
                                 batchify_fn=batchify_fn,
                                 shuffle=shuffle,
                                 last_batch=last_batch)

        return self.loader
