def __init__(self, dump_name, batch_size, pad_len, shuffle=True, subsample=1, unk_prob=0, sample_neg=1.0):
    self.dataset = load_from_dump(dump_name)
    if shuffle:
        self.dataset = self.dataset.shuffle()
    if subsample < 1:
        n = int(subsample * len(self.dataset))
        self.dataset = Dataset(self.dataset[:n])
    if sample_neg != 1.0:
        if sample_neg <= 0 or sample_neg >= 2:
            raise Exception("Invalid negative resampling rate: " + str(sample_neg))
        self.dataset = self.create_new_by_resample_neg(self.dataset, sample_neg)
        if shuffle:
            self.dataset = self.dataset.shuffle()
    self.batch_size = batch_size
    self.num_examples = len(self.dataset)
    self.num_batches = self.num_examples // self.batch_size
    self.num_residual = self.num_examples - self.batch_size * self.num_batches
    self.pad_len = pad_len
    self._unk_prob = unk_prob
    self._pointer = 0
def test_write_conll(self):
    f = NamedTemporaryFile(delete=False)
    f.close()
    d = Dataset(self.CONLL_MOCK)
    d.write_conll(f.name)
    with open(f.name) as fin:
        self.assertEqual(self.CONLL, fin.read())
    os.remove(f.name)
def preprocess():
    print "Filtering long seq"
    filter_seqlen([TRAIN_DEP_FILE, TEST_DEP_FILE], MAX_SEQ_LEN)

    print "Loading dependency path data from files..."
    d_train, d_test = load_datasets([TRAIN_DEP_FILE, TEST_DEP_FILE])

    print "Build vocab from training set..."
    word2id = build_vocab([d_train], USE_COUNT)
    VOCAB_SIZE = len(word2id)
    vocab_file = DATA_ROOT + "dependency/%d.vocab" % VOCAB_SIZE
    dump_to_file(vocab_file, word2id)
    print "Vocab with %d words saved to file %s" % (len(word2id), vocab_file)

    # print "Collecting labels, pos tags, ner tags, and deprel tags..."
    # label2id = build_vocab_for_field([d_train], LABEL_FIELD)
    # dump_to_file(LABEL2ID_FILE, label2id)

    print "Converting data to ids..."
    d_train, d_test = convert_words_to_ids([d_train, d_test], word2id)
    d_train, d_test = convert_fields_to_ids([d_train, d_test],
                                            {LABEL_FIELD: LABEL_TO_ID, POS_FIELD: POS_TO_ID,
                                             NER_FIELD: NER_TO_ID, DEPREL_FIELD: DEPREL_TO_ID})

    # generate file names
    TRAIN_ID_FILE = DATA_ROOT + 'dependency/train.vocab%d.id' % VOCAB_SIZE
    TEST_ID_FILE = DATA_ROOT + 'dependency/test.vocab%d.id' % VOCAB_SIZE
    dump_to_file(TRAIN_ID_FILE, d_train)
    dump_to_file(TEST_ID_FILE, d_test)
    print "Datasets saved to files."

    max_length = 0
    for d in [d_train, d_test]:
        for row in d:
            l = len(row['word'])
            if l > max_length:
                max_length = l
    print "Datasets maximum sequence length is %d." % max_length

    print "Generating CV dataset on test set"
    total_len = len(d_test)
    dev_len = int(total_len * 0.1)
    dev_list = []
    test_list = []
    for i in range(100):
        d_test.shuffle()
        dev_list.append(Dataset(d_test[:dev_len]))
        test_list.append(Dataset(d_test[dev_len:]))
    idx = 0
    for dev, test in zip(dev_list, test_list):
        test_cv_file = DATA_ROOT + 'dependency/cv/test.vocab%d.id.%d' % (VOCAB_SIZE, idx)
        dev_cv_file = DATA_ROOT + 'dependency/cv/dev.vocab%d.id.%d' % (VOCAB_SIZE, idx)
        dump_to_file(test_cv_file, test)
        dump_to_file(dev_cv_file, dev)
        idx += 1
def convert_to_dependency_path(dataset):
    # prepare new dataset
    dep_dataset = OrderedDict()
    for k in dataset.fields.keys():
        dep_dataset[k] = []
    # add in an ancestor field at the end
    dep_dataset[ROOT_FIELD] = []

    count = 0
    fail = 0
    for i, row in enumerate(dataset):
        t = tree.Tree(row)
        if t.root is None:
            # fail to parse the conll data
            fail += 1
            continue
        # build dependency tree dataset
        if SHORTEST_PATH_MODE == 'ancestor':
            path, ancestor_idx = t.get_shortest_path_through_ancestor()
        else:
            path, ancestor_idx = t.get_shortest_path_through_root()
        add_dep_path_to_dataset(path, ancestor_idx, row, dep_dataset)
        count += 1
        # if count % 1000 == 0:
        #     print "%d trees constructed." % count
    print "Total trees constructed: %d" % count
    print "Total trees failed: %d" % fail
    dep_dataset = Dataset(dep_dataset)
    return dep_dataset
def load_datasets(fname, labelfname, maxlen=MAX_SEQ_LEN, lowercase=True):
    with open(fname, 'r') as f:
        instances = json.load(f)
    with open(labelfname, 'r') as f:
        labels = json.load(f)
    # d = Dataset.load_conll(fn)
    print "Filtering long seq"
    dataset = OrderedDict()
    dataset[WORD_FIELD] = []
    dataset[POS_FIELD] = []
    dataset[DEPREL_FIELD] = []
    dataset[LABEL_FIELD] = []
    dataset[ROOT_FIELD] = []
    for idx, instance in enumerate(instances):
        if len(instance[0]) < maxlen:  # filter length
            words = instance[0]
            pos = [tok.upper() for tok in instance[1]]  # POS to upper case
            deprel = instance[2]
            dirs = instance[3]
            label = labels[idx]
            length = len(dirs)
            roots = ['_'] * length
            # find root in path
            for j in range(length - 1, -1, -1):
                if dirs[j] != '2':  # compare the direction tag at position j, not the index itself
                    roots[j] = 'ROOT'
                    break
            dataset[WORD_FIELD].append(words)
            dataset[POS_FIELD].append(pos)
            dataset[DEPREL_FIELD].append(deprel)
            dataset[ROOT_FIELD].append(roots)
            dataset[LABEL_FIELD].append(label)
    dataset = Dataset(dataset)
    print "\t%d examples in %s" % (len(dataset), fname)
    return dataset
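# A minimal sketch of the JSON layout that load_datasets above appears to expect,
# inferred from how instance[0..3] and labels[idx] are indexed; the tokens, tags,
# and label values below are made up purely for illustration.
example_instances = [
    [
        ["he", "was", "born", "in", "Chicago"],            # instance[0]: words on the path
        ["prp", "vbd", "vbn", "in", "nnp"],                # instance[1]: POS tags (upper-cased on load)
        ["nsubjpass", "auxpass", "root", "case", "nmod"],  # instance[2]: dependency relations
        ["1", "1", "2", "2", "2"],                         # instance[3]: path direction tags
    ],
]
example_labels = ["per:city_of_birth"]                     # labels[idx] pairs with example_instances[idx]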
def load_datasets(fnames, lowercase=True):
    datasets = []
    for fn in fnames:
        d = Dataset.load_conll(fn)
        print "\t%d examples in %s" % (len(d), fn)
        if lowercase:
            converters = {'word': lambda word_list: [x.lower() if x is not None else None for x in word_list]}
            d.convert(converters, in_place=True)
        datasets.append(d)
    return datasets
def get_counter_for_field(filelist, field):
    c = Counter()
    for fin_name in filelist:
        print('loading {}'.format(fin_name))
        d = Dataset.load_conll(fin_name)
        for i, row in enumerate(d):
            for j in range(len(row['word'])):
                t = row[field][j]
                if t == SKIP_TOKEN or t is None:
                    continue
                else:
                    c[t] += 1
    return c
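# A minimal usage sketch for get_counter_for_field, with hypothetical CoNLL file
# names; Counter.most_common() returns tags sorted by frequency, which is one way
# to derive an id mapping for a tag field.
pos_counter = get_counter_for_field(['train.conll', 'dev.conll'], 'pos')
pos2id = {tag: i for i, (tag, _) in enumerate(pos_counter.most_common())}
print('{} distinct POS tags'.format(len(pos2id)))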
def __init__(self, dump_name, batch_size, pad_len, shuffle=True, subsample=1, unk_prob=0):
    self.dataset = load_from_dump(dump_name)
    if shuffle:
        self.dataset = self.dataset.shuffle()
    if subsample < 1:
        n = int(subsample * len(self.dataset))
        self.dataset = Dataset(self.dataset[:n])
    self.batch_size = batch_size
    self.num_examples = len(self.dataset)
    self.num_batches = self.num_examples // self.batch_size
    self.num_residual = self.num_examples - self.batch_size * self.num_batches
    self.pad_len = pad_len
    self._unk_prob = unk_prob
    self._pointer = 0
def create_new_by_resample_neg(self, dataset, neg_sample_rate):
    new_dataset = OrderedDict()
    for k in dataset.fields.keys():
        new_dataset[k] = []
    # start resampling
    for i, row in enumerate(dataset):
        if row[LABLE_FIELD] == LABEL_TO_ID['no_relation']:
            if neg_sample_rate < 1.0 and random.random() <= neg_sample_rate:
                # keep this negative example
                self.add_to_new_dataset(new_dataset, row)
            elif neg_sample_rate >= 1.0:
                # first keep
                self.add_to_new_dataset(new_dataset, row)
                if random.random() <= (neg_sample_rate - 1.0):
                    # then decide whether to repeat
                    self.add_to_new_dataset(new_dataset, row)
        else:
            # keep all non-negative examples
            self.add_to_new_dataset(new_dataset, row)
    new_dataset = Dataset(new_dataset)
    print >> sys.stderr, "New dataset created by resampling negative: %d examples before, %d (%g) examples after." % \
        (len(dataset), len(new_dataset), float(len(new_dataset)) / len(dataset))
    return new_dataset
from stanza.text.dataset import Dataset

# for fin_name in ['train.conll', 'dev.conll', 'test.conll']:
#     fout_name = fin_name.replace('.conll', '.anon.conll')
#     print('loading {}'.format(fin_name))
#     d = Dataset.load_conll(fin_name)
#     print(d)
#     for i, row in enumerate(d):
#         if row['subj'] == 'SUBJECT':
#             d.fields['word'][i] = row['subj_ner']
#         if row['obj'] == 'OBJECT':
#             d.fields['word'][i] = row['obj_ner']
#     d.write_conll(fout_name)

for fin_name in ['train.conll', 'dev.conll', 'test.conll']:
    fout_name = fin_name.replace('.conll', '.anon.conll')
    print('loading {}'.format(fin_name))
    d = Dataset.load_conll(fin_name)
    print(d)
    for i, row in enumerate(d):
        for j in range(len(row['word'])):
            if row['subj'][j] == 'SUBJECT':
                d.fields['word'][i][j] = 'NER-' + row['subj_ner'][j]
            if row['obj'][j] == 'OBJECT':
                d.fields['word'][i][j] = 'NER-' + row['obj_ner'][j]
    d.write_conll(fout_name)
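# A small self-contained illustration (made-up row values) of the substitution the
# script above performs: tokens aligned with SUBJECT/OBJECT markers are replaced in
# the 'word' column by their NER tag prefixed with 'NER-'.
row = {
    'word': ['Obama', 'visited', 'Paris'],
    'subj': ['SUBJECT', '_', '_'],
    'subj_ner': ['PERSON', '_', '_'],
    'obj': ['_', '_', 'OBJECT'],
    'obj_ner': ['_', '_', 'CITY'],
}
for j in range(len(row['word'])):
    if row['subj'][j] == 'SUBJECT':
        row['word'][j] = 'NER-' + row['subj_ner'][j]
    if row['obj'][j] == 'OBJECT':
        row['word'][j] = 'NER-' + row['obj_ner'][j]
# row['word'] is now ['NER-PERSON', 'visited', 'NER-CITY']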
def test_load_conll(self):
    with NamedTemporaryFile() as f:
        f.write(self.CONLL)
        f.flush()
        d = Dataset.load_conll(f.name)
        self.assertDictEqual(self.CONLL_MOCK, d.fields)
def test_length(self):
    self.assertEqual(0, len(Dataset({})))
    self.assertEqual(2, len(Dataset({'name': ['foo', 'bar']})))
def test_init(self):
    self.assertRaises(InvalidFieldsException,
                      lambda: Dataset({'name': ['alice', 'bob'], 'ssn': ['1']}))
def setUp(self):
    random.seed(1)
    self.mock = Dataset(OrderedDict([(name, d[:]) for name, d in self.MOCK.items()]))
    self.conll = Dataset(OrderedDict([(name, d[:]) for name, d in self.CONLL_MOCK.items()]))
class DataLoader():
    def __init__(self, dump_name, batch_size, pad_len, shuffle=True, subsample=1, unk_prob=0, sample_neg=1.0):
        self.dataset = load_from_dump(dump_name)
        if shuffle:
            self.dataset = self.dataset.shuffle()
        if subsample < 1:
            n = int(subsample * len(self.dataset))
            self.dataset = Dataset(self.dataset[:n])
        if sample_neg != 1.0:
            if sample_neg <= 0 or sample_neg >= 2:
                raise Exception("Invalid negative resampling rate: " + str(sample_neg))
            self.dataset = self.create_new_by_resample_neg(self.dataset, sample_neg)
            if shuffle:
                self.dataset = self.dataset.shuffle()
        self.batch_size = batch_size
        self.num_examples = len(self.dataset)
        self.num_batches = self.num_examples // self.batch_size
        self.num_residual = self.num_examples - self.batch_size * self.num_batches
        self.pad_len = pad_len
        self._unk_prob = unk_prob
        self._pointer = 0

    def next_batch(self):
        """
        Generate the most simple batch. x_batch is sentences, y_batch is labels,
        and x_lens is the unpadded length of sentences in x_batch.
        """
        x_batch = {
            WORD_FIELD: [],
            POS_FIELD: [],
            NER_FIELD: [],
            SUBJ_FIELD: [],
            OBJ_FIELD: [],
            SUBJ_NER_FIELD: [],
            OBJ_NER_FIELD: []
        }
        x_lens = []
        for field in x_batch.keys():
            for tokens in self.dataset.fields[field][self._pointer:self._pointer + self.batch_size]:
                if field == WORD_FIELD:
                    # we need to 1) corrupt 2) count sent len with word field
                    if self._unk_prob > 0:
                        tokens = self.corrupt_sentence(tokens)
                    x_lens.append(len(tokens))
                # apply padding to the left
                assert self.pad_len >= len(tokens), "Padding length is shorter than original sentence length."
                tokens = tokens + [PAD_ID] * (self.pad_len - len(tokens))
                x_batch[field].append(tokens)
        y_batch = self.dataset.fields[LABLE_FIELD][self._pointer:self._pointer + self.batch_size]
        self._pointer += self.batch_size
        return x_batch, y_batch, x_lens

    def get_residual(self):
        x_batch = {
            WORD_FIELD: [],
            POS_FIELD: [],
            NER_FIELD: [],
            SUBJ_FIELD: [],
            OBJ_FIELD: []
        }
        x_lens = []
        for field in x_batch.keys():
            for tokens in self.dataset.fields[field][self._pointer:]:
                if field == WORD_FIELD:
                    # we need to 1) corrupt 2) count sent len with word field
                    if self._unk_prob > 0:
                        tokens = self.corrupt_sentence(tokens)
                    x_lens.append(len(tokens))
                tokens = tokens + [PAD_ID] * (self.pad_len - len(tokens))
                x_batch[field].append(tokens)
        y_batch = self.dataset.fields[LABLE_FIELD][self._pointer:]
        return x_batch, y_batch, x_lens

    def reset_pointer(self):
        self._pointer = 0

    def corrupt_sentence(self, tokens):
        new_tokens = []
        for x in tokens:
            if x != UNK_ID and np.random.random() < self._unk_prob:
                new_tokens.append(UNK_ID)
            else:
                new_tokens.append(x)
        return new_tokens

    def add_to_new_dataset(self, new_dataset, row):
        for k in row.keys():
            new_dataset[k].append(row[k])

    def create_new_by_resample_neg(self, dataset, neg_sample_rate):
        new_dataset = OrderedDict()
        for k in dataset.fields.keys():
            new_dataset[k] = []
        # start resampling
        for i, row in enumerate(dataset):
            if row[LABLE_FIELD] == LABEL_TO_ID['no_relation']:
                if neg_sample_rate < 1.0 and random.random() <= neg_sample_rate:
                    # keep this negative example
                    self.add_to_new_dataset(new_dataset, row)
                elif neg_sample_rate >= 1.0:
                    # first keep
                    self.add_to_new_dataset(new_dataset, row)
                    if random.random() <= (neg_sample_rate - 1.0):
                        # then decide whether to repeat
                        self.add_to_new_dataset(new_dataset, row)
            else:
                # keep all non-negative examples
                self.add_to_new_dataset(new_dataset, row)
        new_dataset = Dataset(new_dataset)
        print >> sys.stderr, "New dataset created by resampling negative: %d examples before, %d (%g) examples after." % \
            (len(dataset), len(new_dataset), float(len(new_dataset)) / len(dataset))
        return new_dataset

    def write_keys(self, key_file, id2label=None, include_residual=False):
        if include_residual:
            end_index = self.num_examples
        else:
            end_index = self.num_batches * self.batch_size
        labels = self.dataset.fields[LABLE_FIELD][:end_index]
        if id2label is not None:
            # map label ids back to label strings; otherwise keep the ids as-is
            labels = [id2label[l] for l in labels]
        # write to file
        with open(key_file, 'w') as outfile:
            for l in labels:
                outfile.write(str(l) + '\n')
        return
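# A minimal usage sketch for the DataLoader class above, assuming a hypothetical
# dump file name and that the *_FIELD constants, PAD_ID, and UNK_ID are defined in
# this module as in the snippets above; it iterates once over all full batches and
# then the residual examples.
loader = DataLoader('train.vocab36000.id', batch_size=50, pad_len=100,
                    shuffle=True, unk_prob=0.04, sample_neg=1.0)
for _ in range(loader.num_batches):
    x_batch, y_batch, x_lens = loader.next_batch()
    # feed x_batch[WORD_FIELD], x_lens, and y_batch to the model here
if loader.num_residual > 0:
    x_res, y_res, res_lens = loader.get_residual()
loader.reset_pointer()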