Example #1
 def __init__(self,
              dump_name,
              batch_size,
              pad_len,
              shuffle=True,
              subsample=1,
              unk_prob=0,
              sample_neg=1.0):
     self.dataset = load_from_dump(dump_name)
     if shuffle:
         self.dataset = self.dataset.shuffle()
     if subsample < 1:
         n = int(subsample * len(self.dataset))
         self.dataset = Dataset(self.dataset[:n])
     if sample_neg != 1.0:
         if sample_neg <= 0 or sample_neg >= 2:
             raise Exception("Invalid negative resampling rate: " +
                             str(sample_neg))
         self.dataset = self.create_new_by_resample_neg(
             self.dataset, sample_neg)
     if shuffle:
         self.dataset = self.dataset.shuffle()
     self.batch_size = batch_size
     self.num_examples = len(self.dataset)
     self.num_batches = self.num_examples // self.batch_size
     self.num_residual = self.num_examples - self.batch_size * self.num_batches
     self.pad_len = pad_len
     self._unk_prob = unk_prob
     self._pointer = 0
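Example #2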
 def test_write_conll(self):
     f = NamedTemporaryFile(delete=False)
     f.close()
     d = Dataset(self.CONLL_MOCK)
     d.write_conll(f.name)
     with open(f.name) as fin:
         self.assertEqual(self.CONLL, fin.read())
     os.remove(f.name)
Example #3
def preprocess():
    print "Filtering long seq"
    filter_seqlen([TRAIN_DEP_FILE, TEST_DEP_FILE], MAX_SEQ_LEN)
    print "Loading dependency path data from files..."
    d_train, d_test = load_datasets([TRAIN_DEP_FILE, TEST_DEP_FILE])
    print "Build vocab from training set..."
    word2id = build_vocab([d_train], USE_COUNT)
    VOCAB_SIZE = len(word2id)
    vocab_file = DATA_ROOT + "dependency/%d.vocab" % VOCAB_SIZE
    dump_to_file(vocab_file, word2id)
    print "Vocab with %d words saved to file %s" % (len(word2id), vocab_file)

    # print "Collecting labels, pos tags, ner tags, and deprel tags..."
    # label2id = build_vocab_for_field([d_train], LABEL_FIELD)
    # dump_to_file(LABEL2ID_FILE, label2id)

    print "Converting data to ids..."
    d_train, d_test = convert_words_to_ids([d_train, d_test], word2id)
    d_train, d_test = convert_fields_to_ids([d_train, d_test],
                                            {LABEL_FIELD: LABEL_TO_ID, POS_FIELD: POS_TO_ID,
                                             NER_FIELD: NER_TO_ID, DEPREL_FIELD: DEPREL_TO_ID})

    # generate file names
    TRAIN_ID_FILE = DATA_ROOT + 'dependency/train.vocab%d.id' % VOCAB_SIZE
    TEST_ID_FILE = DATA_ROOT + 'dependency/test.vocab%d.id' % VOCAB_SIZE
    dump_to_file(TRAIN_ID_FILE, d_train)
    dump_to_file(TEST_ID_FILE, d_test)
    print "Datasets saved to files."
    max_length = 0
    for d in [d_train, d_test]:
        for row in d:
            l = len(row['word'])
            if l > max_length:
                max_length = l
    print "Datasets maximum sequence length is %d." % max_length
    print "Generating CV dataset on test set"
    total_len = len(d_test)
    dev_len = int(total_len*0.1)
    dev_list = []
    test_list = []
    for i in range(100):
        d_test.shuffle()
        dev_list.append(Dataset(d_test[:dev_len]))
        test_list.append(Dataset(d_test[dev_len:]))
    idx = 0
    for dev, test in zip(dev_list, test_list):
        test_cv_file = DATA_ROOT + 'dependency/cv/test.vocab%d.id.%d' % (VOCAB_SIZE, idx)
        dev_cv_file = DATA_ROOT + 'dependency/cv/dev.vocab%d.id.%d' % (VOCAB_SIZE, idx)
        dump_to_file(test_cv_file, test)
        dump_to_file(dev_cv_file, dev)
        idx += 1
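A minimal follow-up sketch, not from the source: it assumes load_from_dump (used by the loaders in the other examples) is the reading counterpart of dump_to_file, and the vocabulary size and fold index in the file names are placeholders.

# Sketch only: vocab size 10000 and fold index 0 are placeholder values.
dev0 = load_from_dump(DATA_ROOT + 'dependency/cv/dev.vocab10000.id.0')
test0 = load_from_dump(DATA_ROOT + 'dependency/cv/test.vocab10000.id.0')
print "CV fold 0: %d dev / %d test examples." % (len(dev0), len(test0))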
Example #4
def convert_to_dependency_path(dataset):
    # prepare new dataset
    dep_dataset = OrderedDict()
    for k in dataset.fields.keys():
        dep_dataset[k] = []
    # add in an ancestor field at the end
    dep_dataset[ROOT_FIELD] = []

    count = 0
    fail = 0
    for i, row in enumerate(dataset):
        t = tree.Tree(row)
        if t.root is None:  # fail to parse the conll data
            fail += 1
            continue
        # build dependency tree dataset
        if SHORTEST_PATH_MODE == 'ancestor':
            path, ancestor_idx = t.get_shortest_path_through_ancestor()
        else:
            path, ancestor_idx = t.get_shortest_path_through_root()
        add_dep_path_to_dataset(path, ancestor_idx, row, dep_dataset)
        count += 1
        # if count % 1000 == 0:
        # print "%d trees constructed." % count
    print "Total trees constructed: %d" % count
    print "Total trees failed: %d" % fail
    dep_dataset = Dataset(dep_dataset)
    return dep_dataset
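Example #5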
def load_datasets(fname, labelfname, maxlen=MAX_SEQ_LEN, lowercase=True):
    with open(fname, 'r') as f:
        instances = json.load(f)
    with open(labelfname, 'r') as f:
        labels = json.load(f)
    # d = Dataset.load_conll(fn)
    print "Filtering long seq"
    dataset = OrderedDict()
    dataset[WORD_FIELD] = []
    dataset[POS_FIELD] = []
    dataset[DEPREL_FIELD] = []
    dataset[LABEL_FIELD] = []
    dataset[ROOT_FIELD] = []
    for idx, instance in enumerate(instances):
        if len(instance[0]) < maxlen:  # filter length
            words = instance[0]
            pos = [tok.upper() for tok in instance[1]]  # POS to upper case
            deprel = instance[2]
            dirs = instance[3]
            label = labels[idx]
            length = len(dirs)
            roots = ['_'] * length
            # find root in path: mark the last token whose direction entry is not '2'
            # (assumed intent: compare dirs[j], not the loop index, against '2')
            for j in range(length - 1, -1, -1):
                if dirs[j] != '2':
                    roots[j] = 'ROOT'
                    break
            dataset[WORD_FIELD].append(words)
            dataset[POS_FIELD].append(pos)
            dataset[DEPREL_FIELD].append(deprel)
            dataset[ROOT_FIELD].append(roots)
            dataset[LABEL_FIELD].append(label)
    dataset = Dataset(dataset)
    print "\t%d examples in %s" % (len(dataset), fname)
    return dataset
Example #6
def load_datasets(fnames, lowercase=True):
    datasets = []
    for fn in fnames:
        d = Dataset.load_conll(fn)
        print "\t%d examples in %s" % (len(d), fn)
        if lowercase:
            converters = {'word': lambda word_list: [x.lower() if x is not None else None for x in word_list]}
            d.convert(converters, in_place=True)
        datasets.append(d)
    return datasets
Example #7
def get_counter_for_field(filelist, field):
    c = Counter()
    for fin_name in filelist:
        print('loading {}'.format(fin_name))
        d = Dataset.load_conll(fin_name)
        for i, row in enumerate(d):
            for j in range(len(row['word'])):
                t = row[field][j]
                if t == SKIP_TOKEN or t is None:
                    continue
                else:
                    c[t] += 1
    return c
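A minimal usage sketch, not from the source: the CoNLL file names and the 'pos' field name are assumptions; it turns the counts into a tag-to-id mapping ordered by frequency, analogous to what build_vocab does for words in Example #3.

# Sketch only: file names and the 'pos' field name are assumptions.
pos_counts = get_counter_for_field(['train.conll', 'dev.conll'], 'pos')
pos2id = {tag: idx for idx, tag in
          enumerate(sorted(pos_counts, key=pos_counts.get, reverse=True))}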
Example #8
 def __init__(self, dump_name, batch_size, pad_len, shuffle=True, subsample=1, unk_prob=0):
     self.dataset = load_from_dump(dump_name)
     if shuffle:
         self.dataset = self.dataset.shuffle()
     if subsample < 1:
         n = int(subsample * len(self.dataset))
         self.dataset = Dataset(self.dataset[:n])
     self.batch_size = batch_size
     self.num_examples = len(self.dataset)
     self.num_batches = self.num_examples // self.batch_size
     self.num_residual = self.num_examples - self.batch_size * self.num_batches
     self.pad_len = pad_len
     self._unk_prob = unk_prob
     self._pointer = 0
Example #9
 def create_new_by_resample_neg(self, dataset, neg_sample_rate):
     new_dataset = OrderedDict()
     for k in dataset.fields.keys():
         new_dataset[k] = []
     # start resampling
     for i, row in enumerate(dataset):
         if row[LABLE_FIELD] == LABEL_TO_ID['no_relation']:
             if neg_sample_rate < 1.0 and random.random() <= neg_sample_rate:
                 # keep this negative example
                 self.add_to_new_dataset(new_dataset, row)
             elif neg_sample_rate >= 1.0:
                 # first keep
                 self.add_to_new_dataset(new_dataset, row)
                 if random.random() <= (neg_sample_rate - 1.0):
                     # then decide whether to repeat
                     self.add_to_new_dataset(new_dataset, row)
         else:  # keep all non-negative examples
             self.add_to_new_dataset(new_dataset, row)
     new_dataset = Dataset(new_dataset)
     print >> sys.stderr, "New dataset created by resampling negative: %d examples before, %d (%g) examples after." % (
         len(dataset), len(new_dataset), float(len(new_dataset)) / len(dataset))
     return new_dataset
Example #10
from stanza.text.dataset import Dataset

# for fin_name in ['train.conll', 'dev.conll', 'test.conll']:
#     fout_name = fin_name.replace('.conll', '.anon.conll')
#     print('loading {}'.format(fin_name))
#     d = Dataset.load_conll(fin_name)
#     print(d)
#     for i, row in enumerate(d):
#         if row['subj'] == 'SUBJECT':
#             d.fields['word'][i] = row['subj_ner']
#         if row['obj'] == 'OBJECT':
#             d.fields['word'][i] = row['obj_ner']
#     d.write_conll(fout_name)

for fin_name in ['train.conll', 'dev.conll', 'test.conll']:
    fout_name = fin_name.replace('.conll', '.anon.conll')
    print('loading {}'.format(fin_name))
    d = Dataset.load_conll(fin_name)
    print(d)
    for i, row in enumerate(d):
        for j in range(len(row['word'])):
            if row['subj'][j] == 'SUBJECT':
                d.fields['word'][i][j] = 'NER-' + row['subj_ner'][j]
            if row['obj'][j] == 'OBJECT':
                d.fields['word'][i][j] = 'NER-' + row['obj_ner'][j]
    d.write_conll(fout_name)
Example #11
 def test_load_conll(self):
     with NamedTemporaryFile() as f:
         f.write(self.CONLL)
         f.flush()
         d = Dataset.load_conll(f.name)
         self.assertDictEqual(self.CONLL_MOCK, d.fields)
Example #12
 def test_length(self):
     self.assertEqual(0, len(Dataset({})))
     self.assertEqual(2, len(Dataset({'name': ['foo', 'bar']})))
Example #13
 def test_init(self):
     self.assertRaises(InvalidFieldsException, lambda: Dataset({'name': ['alice', 'bob'], 'ssn': ['1']}))
Example #14
 def setUp(self):
     random.seed(1)
     self.mock = Dataset(OrderedDict([(name, d[:]) for name, d in self.MOCK.items()]))
     self.conll = Dataset(OrderedDict([(name, d[:]) for name, d in self.CONLL_MOCK.items()]))
Example #15
class DataLoader():
    def __init__(self,
                 dump_name,
                 batch_size,
                 pad_len,
                 shuffle=True,
                 subsample=1,
                 unk_prob=0,
                 sample_neg=1.0):
        self.dataset = load_from_dump(dump_name)
        if shuffle:
            self.dataset = self.dataset.shuffle()
        if subsample < 1:
            n = int(subsample * len(self.dataset))
            self.dataset = Dataset(self.dataset[:n])
        if sample_neg != 1.0:
            if sample_neg <= 0 or sample_neg >= 2:
                raise Exception("Invalid negative resampling rate: " +
                                str(sample_neg))
            self.dataset = self.create_new_by_resample_neg(
                self.dataset, sample_neg)
        if shuffle:
            self.dataset = self.dataset.shuffle()
        self.batch_size = batch_size
        self.num_examples = len(self.dataset)
        self.num_batches = self.num_examples // self.batch_size
        self.num_residual = self.num_examples - self.batch_size * self.num_batches
        self.pad_len = pad_len
        self._unk_prob = unk_prob
        self._pointer = 0

    def next_batch(self):
        """
        Generate the most simple batch. x_batch is sentences, y_batch is labels, and x_lens is the unpadded length of sentences in x_batch.
        """
        x_batch = {
            WORD_FIELD: [],
            POS_FIELD: [],
            NER_FIELD: [],
            SUBJ_FIELD: [],
            OBJ_FIELD: [],
            SUBJ_NER_FIELD: [],
            OBJ_NER_FIELD: []
        }
        x_lens = []
        for field in x_batch.keys():
            for tokens in self.dataset.fields[field][self._pointer:self._pointer + self.batch_size]:
                if field == WORD_FIELD:  # we need to 1) corrupt 2) count sent len with word field
                    if self._unk_prob > 0:
                        tokens = self.corrupt_sentence(tokens)
                    x_lens.append(len(tokens))
                # apply padding to the left
                assert self.pad_len >= len(tokens), \
                    "Padding length is shorter than original sentence length."
                tokens = tokens + [PAD_ID] * (self.pad_len - len(tokens))
                x_batch[field].append(tokens)
        y_batch = self.dataset.fields[LABLE_FIELD][self._pointer:self._pointer + self.batch_size]
        self._pointer += self.batch_size
        return x_batch, y_batch, x_lens

    def get_residual(self):
        x_batch = {
            WORD_FIELD: [],
            POS_FIELD: [],
            NER_FIELD: [],
            SUBJ_FIELD: [],
            OBJ_FIELD: []
        }
        x_lens = []
        for field in x_batch.keys():
            for tokens in self.dataset.fields[field][self._pointer:]:
                if field == WORD_FIELD:  # we need to 1) corrupt 2) count sent len with word field
                    if self._unk_prob > 0:
                        tokens = self.corrupt_sentence(tokens)
                    x_lens.append(len(tokens))
                tokens = tokens + [PAD_ID] * (self.pad_len - len(tokens))
                x_batch[field].append(tokens)
        y_batch = self.dataset.fields[LABLE_FIELD][self._pointer:]
        return x_batch, y_batch, x_lens

    def reset_pointer(self):
        self._pointer = 0

    def corrupt_sentence(self, tokens):
        new_tokens = []
        for x in tokens:
            if x != UNK_ID and np.random.random() < self._unk_prob:
                new_tokens.append(UNK_ID)
            else:
                new_tokens.append(x)
        return new_tokens

    def add_to_new_dataset(self, new_dataset, row):
        for k in row.keys():
            new_dataset[k].append(row[k])

    def create_new_by_resample_neg(self, dataset, neg_sample_rate):
        new_dataset = OrderedDict()
        for k in dataset.fields.keys():
            new_dataset[k] = []
        # start resampling
        for i, row in enumerate(dataset):
            if row[LABLE_FIELD] == LABEL_TO_ID['no_relation']:
                if neg_sample_rate < 1.0 and random.random() <= neg_sample_rate:
                    # keep this negative example
                    self.add_to_new_dataset(new_dataset, row)
                elif neg_sample_rate >= 1.0:
                    # first keep
                    self.add_to_new_dataset(new_dataset, row)
                    if random.random() <= (neg_sample_rate - 1.0):
                        # then decide whether to repeat
                        self.add_to_new_dataset(new_dataset, row)
            else:  # keep all non-negative examples
                self.add_to_new_dataset(new_dataset, row)
        new_dataset = Dataset(new_dataset)
        print >> sys.stderr, "New dataset created by resampling negative: %d examples before, %d (%g) examples after." % (
            len(dataset), len(new_dataset), float(len(new_dataset)) / len(dataset))
        return new_dataset

    def write_keys(self, key_file, id2label=None, include_residual=False):
        if include_residual:
            end_index = self.num_examples
        else:
            end_index = self.num_batches * self.batch_size
        # map label ids back to label strings when an id2label mapping is given;
        # otherwise keep the raw ids
        labels = [id2label[l] if id2label is not None else l
                  for l in self.dataset.fields[LABLE_FIELD][:end_index]]
        # write to file
        with open(key_file, 'w') as outfile:
            for l in labels:
                outfile.write(str(l) + '\n')
        return
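A minimal training-loop sketch for the loader above, not from the source: the dump file name, batch size, pad length, and the train_step function are placeholders.

# Sketch only: file name, hyperparameters, and train_step are placeholders.
loader = DataLoader('dependency/train.vocab10000.id', batch_size=50, pad_len=100)
for epoch in range(10):
    loader.reset_pointer()
    for _ in range(loader.num_batches):
        x_batch, y_batch, x_lens = loader.next_batch()
        train_step(x_batch, y_batch, x_lens)  # placeholder training call
    if loader.num_residual > 0:
        # the final chunk that is smaller than batch_size
        x_res, y_res, res_lens = loader.get_residual()
        train_step(x_res, y_res, res_lens)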