def convert(self, data):
    """Build a DataSet of premise/hypothesis/label instances.

    :param data: A 3D list. Every example must unpack into exactly three
        items (premise, hypothesis, label)::

            [
                [ [premise_word_11, premise_word_12, ...],
                  [hypothesis_word_11, hypothesis_word_12, ...],
                  [label_1] ],
                [ [premise_word_21, premise_word_22, ...],
                  [hypothesis_word_21, hypothesis_word_22, ...],
                  [label_2] ],
                ...
            ]

        NOTE(review): the original inline comment describes the label as a
        ``str`` although the layout above shows it wrapped in a list --
        confirm against the caller.
    :return: data_set: A DataSet object with one Instance per example.
    """
    data_set = DataSet()
    for premise, hypothesis, label in data:
        # Fields are built in a fixed order; the two "*_len" fields are
        # lists of ones with the same length as their word sequence.
        named_fields = [
            ("premise", TextField(premise, is_target=False)),
            ("hypothesis", TextField(hypothesis, is_target=False)),
            ("premise_len", TextField([1] * len(premise), is_target=False)),
            ("hypothesis_len",
             TextField([1] * len(hypothesis), is_target=False)),
            ("truth", LabelField(label, is_target=True)),
        ]
        sample = Instance()
        for field_name, field in named_fields:
            sample.add_field(field_name, field)
        data_set.append(sample)
    return data_set
def convert_for_infer(self, data, vocabs):
    """Append one unlabeled Instance per word sequence, then index the words.

    :param data: iterable of word sequences (the original inline comment
        marks each entry as a list).
    :param vocabs: mapping that must contain the key ``"word_vocab"``; its
        value is handed to ``self.index_field``.
    """
    for tokens in data:
        text_field = TextField(tokens, is_target=False)
        sample = Instance()
        sample.add_field("word_seq", text_field)
        self.append(sample)
    # Indexing happens once, after every instance has been appended.
    word_vocab = vocabs["word_vocab"]
    self.index_field("word_seq", word_vocab)
def convert_with_vocabs(self, data, vocabs):
    """Append labeled instances and index them with the given vocabularies.

    :param data: iterable of examples; ``example[0]`` is used as the word
        sequence and ``example[1]`` as the label (the original inline
        comment marks them as list and str respectively).
    :param vocabs: mapping that must contain the keys ``"word_vocab"`` and
        ``"label_vocab"``.
    """
    for example in data:
        tokens = example[0]
        tag = example[1]
        text_field = TextField(tokens, is_target=False)
        label_field = LabelField(tag, is_target=True)
        sample = Instance()
        sample.add_field("word_seq", text_field)
        sample.add_field("label", label_field)
        self.append(sample)
    # Index words first, then labels, each with its own vocabulary.
    for field_name, vocab_key in (("word_seq", "word_vocab"),
                                  ("label", "label_vocab")):
        self.index_field(field_name, vocabs[vocab_key])
def convert(self, data):
    """Append labeled instances while growing this object's vocabularies.

    For every example the word sequence is passed to
    ``self.word_vocab.update`` and the label to ``self.label_vocab.update``
    before the instance is appended. After all examples are consumed, the
    ``"word_seq"`` and ``"label"`` fields are indexed with those
    vocabularies.

    :param data: iterable of examples; ``example[0]`` is used as the word
        sequence and ``example[1]`` as the label (the original inline
        comment marks them as list and str respectively).
    """
    for example in data:
        tokens = example[0]
        tag = example[1]
        # Update the vocabularies before wrapping the raw values in fields.
        self.word_vocab.update(tokens)
        self.label_vocab.update(tag)
        text_field = TextField(tokens, is_target=False)
        label_field = LabelField(tag, is_target=True)
        sample = Instance()
        for field_name, field in (("word_seq", text_field),
                                  ("label", label_field)):
            sample.add_field(field_name, field)
        self.append(sample)
    self.index_field("word_seq", self.word_vocab)
    self.index_field("label", self.label_vocab)
def convert_seq2seq_dataset(data):
    """Convert list of data into DataSet.

    :param data: list of list of strings, [num_examples, *].
        ::

            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]

        Only ``sample[0]`` (words) and ``sample[1]`` (labels) are read.
    :return: a DataSet with one Instance per sample.
    """
    dataset = DataSet()
    for sample in data:
        words = sample[0]
        labels = sample[1]
        instance = Instance()
        # The chained call is kept as in the original: the second field is
        # added to whatever the first ``add_field`` call returns.
        (instance
         .add_field("word_seq", TextField(words, is_target=False))
         .add_field("label_seq", TextField(labels, is_target=True)))
        dataset.append(instance)
    return dataset
def convert(self, data):
    """Build a DataSet of raw premise/hypothesis/label instances.

    The premise, hypothesis and label are stored on each Instance as given.
    Length fields are then derived with ``DataSet.apply`` and the input and
    target fields are declared.

    :param data: A 3D list. Every example must unpack into exactly three
        items (premise, hypothesis, label). Example::

            [
                [ [premise_word_11, premise_word_12, ...],
                  [hypothesis_word_11, hypothesis_word_12, ...],
                  [label_1] ],
                [ [premise_word_21, premise_word_22, ...],
                  [hypothesis_word_21, hypothesis_word_22, ...],
                  [label_2] ],
                ...
            ]

        NOTE(review): the original inline comment describes the label as a
        ``str`` although the layout above shows it wrapped in a list --
        confirm against the caller.
    :return: A DataSet object.
    """

    def _length_of(field_name):
        # Bind the field name now so each callback measures its own field.
        return lambda ins: len(ins[field_name])

    data_set = DataSet()
    for premise, hypothesis, label in data:
        sample = Instance()
        sample.add_field("premise", premise)
        sample.add_field("hypothesis", hypothesis)
        sample.add_field("truth", label)
        data_set.append(sample)

    for source, target in (("premise", "premise_len"),
                           ("hypothesis", "hypothesis_len")):
        data_set.apply(_length_of(source), new_field_name=target)

    data_set.set_input(
        "premise", "hypothesis", "premise_len", "hypothesis_len")
    data_set.set_target("truth")
    return data_set
def convert_with_vocabs(self, data, vocabs):
    """Append sequence-labelling instances and index them.

    Each instance gets three fields: ``"word_seq"`` (the words),
    ``"truth"`` (the label sequence) and ``"word_seq_origin_len"`` (the
    number of words). All three are created with ``is_target=False``.

    :param data: iterable of examples; ``example[0]`` is used as the word
        sequence and ``example[1]`` as the label sequence (the original
        inline comment marks both as lists).
    :param vocabs: mapping that must contain the keys ``"word_vocab"`` and
        ``"label_vocab"``.
    """
    for example in data:
        tokens = example[0]
        tags = example[1]
        words_field = TextField(tokens, is_target=False)
        length_field = LabelField(len(tokens), is_target=False)
        tags_field = TextField(tags, is_target=False)
        sample = Instance()
        for field_name, field in (("word_seq", words_field),
                                  ("truth", tags_field),
                                  ("word_seq_origin_len", length_field)):
            sample.add_field(field_name, field)
        self.append(sample)
    # Only the two sequence fields are indexed; the length field is not.
    for field_name, vocab_key in (("word_seq", "word_vocab"),
                                  ("truth", "label_vocab")):
        self.index_field(field_name, vocabs[vocab_key])
def convert_to_dataset(self, data, vocab, label_vocab):
    """Convert a list of examples into an indexed DataSet object.

    :param data: list. Entries are strings. ``example[0]`` must be a list
        of words; ``example[1]`` is either a list (stored as
        ``"label_seq"``) or a str (stored as ``"label"``).
    :param vocab: a dict, mapping string (token) to index (int).
    :param label_vocab: a dict, mapping string (label) to index (int).
    :return data_set: a DataSet object
    :raises NotImplementedError: if the words are not a list, or the label
        is neither a list nor a str.
    """
    # Names of the fields that were actually created, so that only those
    # are indexed afterwards.
    seen_fields = set()

    # construct a DataSet object and fill it with Instances
    data_set = DataSet()
    for example in data:
        words = example[0]
        label = example[1]
        instance = Instance()

        if not isinstance(words, list):
            raise NotImplementedError("words is a {}".format(type(words)))
        instance.add_field("word_seq", TextField(words, is_target=False))
        seen_fields.add("word_seq")

        if isinstance(label, list):
            instance.add_field("label_seq", TextField(label, is_target=True))
            seen_fields.add("label_seq")
        elif isinstance(label, str):
            instance.add_field("label", LabelField(label, is_target=True))
            seen_fields.add("label")
        else:
            raise NotImplementedError("label is a {}".format(type(label)))

        data_set.append(instance)

    # convert strings to indices
    for field_name, mapping in (("word_seq", vocab),
                                ("label_seq", label_vocab),
                                ("label", label_vocab)):
        if field_name in seen_fields:
            data_set.index_field(field_name, mapping)
    return data_set
def convert(self, data):
    """Convert lists of strings into Instances with Fields.

    Each example's words and labels are passed to ``self.word_vocab.update``
    and ``self.label_vocab.update``, an Instance with the fields
    ``"word_seq"``, ``"truth"`` and ``"word_seq_origin_len"`` is appended,
    and the progress bar is advanced. The two sequence fields are indexed
    once all examples have been appended.

    :param data: 3-level lists. Entries are strings. ``example[0]`` is used
        as the word sequence and ``example[1]`` as the label sequence.
    """
    progress = ProgressBar(total=len(data))
    for example in data:
        tokens = example[0]
        tags = example[1]
        self.word_vocab.update(tokens)
        self.label_vocab.update(tags)
        words_field = TextField(tokens, is_target=False)
        length_field = LabelField(len(tokens), is_target=False)
        tags_field = TextField(tags, is_target=False)
        sample = Instance()
        for field_name, field in (("word_seq", words_field),
                                  ("truth", tags_field),
                                  ("word_seq_origin_len", length_field)):
            sample.add_field(field_name, field)
        self.append(sample)
        # One tick per processed example.
        progress.move()
    self.index_field("word_seq", self.word_vocab)
    self.index_field("truth", self.label_vocab)
def test_add_field(self):
    """Check that ``add_field`` adds a new entry to ``Instance.fields``."""
    expected = {"x": [1, 2, 3], "y": [4, 5, 6]}
    ins = Instance(**expected)

    ins.add_field("z", [1, 1, 1])
    # Mirror the addition in the plain dict used as the reference.
    expected["z"] = [1, 1, 1]

    self.assertEqual(ins.fields, expected)