예제 #1
0
    def _load_shard(self, shard_name):
        """Read one shard file and encode every line with the vocabulary.

        Args:
            shard_name: Path of the text file to read; each line is treated
                as one sentence.

        Returns:
            A list of ``(ids, chars_ids)`` tuples, one per line of the file.
            ``ids`` is the result of ``self.vocab.encode``; ``chars_ids`` is
            the result of ``self.vocab.encode_chars`` when
            ``self._use_char_inputs`` is set, otherwise ``None``.
        """
        logging.info('Loading data from: %s', shard_name)
        # NOTE(review): no explicit encoding is passed, so the platform
        # default is used -- confirm this matches the corpus files.
        with open(shard_name) as f:
            sentences = f.readlines()

        if self._reverse:
            # Reverse the token order of every line. split()/join() also
            # collapses whitespace runs and drops the trailing newline.
            sentences = [
                ' '.join(reversed(sentence.split()))
                for sentence in sentences
            ]

        if self._shuffle_on_load:
            random.shuffle(sentences)

        ids = [
            self.vocab.encode(sentence, self._reverse)
            for sentence in sentences
        ]
        if self._use_char_inputs:
            chars_ids = [
                self.vocab.encode_chars(sentence, self._reverse)
                for sentence in sentences
            ]
        else:
            # Keep the pairing below one-to-one even without char inputs.
            chars_ids = [None] * len(ids)
        logging.info('Loaded %d sentences.', len(ids))
        return list(zip(ids, chars_ids))
예제 #2
0
    def file_based_convert_examples_to_features(self, input_file, output_file):
        """Convert a set of `InputExample`s to a MindDataset file.

        Examples are read from ``input_file`` with ``self._read_tsv``, each
        one is converted with ``self._convert_example_to_record``, and the
        resulting int64 arrays are written to ``output_file`` through a
        single-shard ``FileWriter``.

        Args:
            input_file: Path handed to ``self._read_tsv``.
            output_file: File name handed to ``FileWriter``.
        """
        examples = self._read_tsv(input_file)

        writer = FileWriter(file_name=output_file, shard_num=1)
        nlp_schema = {
            "input_ids": {"type": "int64", "shape": [-1]},
            "input_mask": {"type": "int64", "shape": [-1]},
            "segment_ids": {"type": "int64", "shape": [-1]},
            "label_ids": {"type": "int64", "shape": [-1]},
        }
        writer.add_schema(nlp_schema, "proprocessed classification dataset")

        total = len(examples)
        # All samples are collected in memory and written in a single call.
        data = []
        for index, example in enumerate(examples):
            if index % 10000 == 0:
                logging.info("Writing example %d of %d", index, total)
            record = self._convert_example_to_record(
                example, self.max_seq_len, self.tokenizer)
            data.append({
                "input_ids": np.array(record.input_ids, dtype=np.int64),
                "input_mask": np.array(record.input_mask, dtype=np.int64),
                "segment_ids": np.array(record.segment_ids, dtype=np.int64),
                # The single label is wrapped in a list so it fits the
                # variable-length ([-1]) shape declared in the schema.
                "label_ids": np.array([record.label_id], dtype=np.int64),
            })
        writer.write_raw_data(data)
        writer.commit()
예제 #3
0
    def __init__(self,
                 filepattern,
                 vocab,
                 test=False,
                 shuffle_on_load=False,
                 reverse=False):
        """Collect the shard files matching ``filepattern`` and load one.

        Args:
            filepattern: Glob pattern passed to ``glob.glob`` to find shards.
            vocab: Vocabulary object; character inputs are enabled when it
                exposes an ``encode_chars`` attribute.
            test: Stored as ``self._test``.
            shuffle_on_load: Stored as ``self._shuffle_on_load``.
            reverse: Stored as ``self._reverse``.
        """
        self._vocab = vocab
        self._all_shards = glob.glob(filepattern)
        logging.info('Found %d shards at %s',
                     len(self._all_shards), filepattern)
        self._shards_to_choose = []
        self._reverse = reverse

        self._test = test
        self._shuffle_on_load = shuffle_on_load
        self._use_char_inputs = hasattr(vocab, 'encode_chars')

        # Must run last: every attribute above is already set by the time
        # the first shard is loaded.
        self._ids = self._load_random_shard()