def read_batch(self, read_batch=True):
     """Read the next batch of k-best groups together with their gold trees.

     The method resumes from ``self.index`` in ``self.data`` (k-best lines)
     and from ``self.gindex`` in ``self.gdata`` (gold lines) and advances
     both cursors, so consecutive calls walk through the data.

     K-best lines are interpreted as follows:

     * a line equal to ``'PTB_KBEST'`` (after stripping) closes a group;
     * a blank line ends the tree whose lines were collected so far, which
       is then parsed with ``read_tree``;
     * a line without ``'_'`` is parsed as a float score;
     * any other line is collected as a line of the current tree.

     A group is emitted only when it holds more than two trees; otherwise
     its trees, scores and lines stay in the accumulators and are carried
     over into the following group.

     Args:
         read_batch: when true, stop once ``self.batch`` groups (and
             ``self.batch`` gold trees) have been read; when false, read
             until the end of the data.

     Returns:
         ``None`` if ``self.index`` already equals ``self.length`` on entry;
         otherwise a list of ``data_util.instance`` objects, one for each
         (group, gold tree) pair whose gold root has at least one child.
     """
     if self.index == self.length:
         return None

     # Accumulators for the group currently being read.
     cur_trees, cur_scores, cur_lines = [], [], []
     # Raw lines of the tree currently being read.
     pending = []
     # Completed groups gathered for this batch.
     batch_trees, batch_scores, batch_lines = [], [], []

     while self.index < self.length:
         raw = self.data[self.index]
         stripped = raw.strip()
         if stripped == 'PTB_KBEST':
             if len(cur_trees) > 2:
                 batch_trees.append(list(cur_trees))
                 batch_scores.append(list(cur_scores))
                 batch_lines.append(list(cur_lines))
                 if read_batch and len(batch_trees) == self.batch:
                     # Step past the marker so the next call starts after it.
                     self.index += 1
                     break
                 cur_trees, cur_scores, cur_lines = [], [], []
         elif stripped == '':
             cur_trees.append(read_tree(pending, self.vocab))
             cur_lines.append(list(pending))
             pending = []
         elif '_' in raw:
             pending.append(raw)
         else:
             cur_scores.append(float(raw))
         self.index += 1

     # Gold trees: one tree per blank-line-terminated run of lines.
     gold_pending = []
     gold_roots = []
     gold_lines = []
     while self.gindex < self.glength:
         raw = self.gdata[self.gindex]
         if raw.strip() != '':
             gold_pending.append(raw)
         else:
             gold_roots.append(read_tree(gold_pending, self.vocab))
             gold_lines.append(list(gold_pending))
             if read_batch and len(gold_roots) == self.batch:
                 self.gindex += 1
                 break
             gold_pending = []
         self.gindex += 1

     train_batch = []
     for trees, scores, gold_root, lines, gold_src in zip(
             batch_trees, batch_scores, gold_roots, batch_lines, gold_lines):
         # The counter advances for every pair, including skipped ones.
         self.kbest_id += 1
         if len(gold_root.children) > 0:
             train_batch.append(
                 data_util.instance(trees, scores, gold_root, lines, gold_src))
     return train_batch
Exemplo n.º 2
0
def read_dev(kbest_filename, gold_filename, vocab):
    """Load a development set of k-best groups paired with gold trees.

    K-best file lines are interpreted as follows:

    * a line equal to ``'PTB_KBEST'`` (after stripping) closes a group;
    * a blank line ends the tree whose lines were collected so far, which
      is then parsed with ``train_iterator.read_tree``;
    * a line without ``'_'`` is parsed as a float score;
    * any other line is collected as a line of the current tree.

    In the gold file every blank line ends one tree.

    Args:
        kbest_filename: path of the k-best file.
        gold_filename: path of the gold file.
        vocab: passed through unchanged to ``train_iterator.read_tree``.

    Returns:
        A list of ``data_util.instance`` objects, one for each
        (group, gold tree) pair whose gold root has at least one child.
        Pairing is positional and stops at the shorter of the two sequences.
    """
    with open(kbest_filename, 'r') as reader:
        kbest_data = reader.readlines()
    # Sentinel marker so the last group in the file is flushed as well.
    kbest_data.append('PTB_KBEST')

    kbest, scores, lines = [], [], []
    one_best, one_scores, one_lines = [], [], []
    tree_lines = []
    for line in kbest_data:
        stripped = line.strip()
        if stripped == 'PTB_KBEST':
            # NOTE(review): a group with fewer than two trees is not
            # flushed, so its content is carried into the next group --
            # confirm this is intended.
            if len(one_best) > 1:
                kbest.append(one_best)
                scores.append(one_scores)
                lines.append(one_lines)
                one_best, one_scores, one_lines = [], [], []
        elif stripped == '':
            # Snapshot the raw lines before handing them to the parser.
            one_lines.append(tree_lines[:])
            one_best.append(train_iterator.read_tree(tree_lines, vocab))
            tree_lines = []
        elif '_' not in line:
            one_scores.append(float(line))
        else:
            tree_lines.append(line)

    with open(gold_filename, 'r') as reader:
        gold_data = reader.readlines()

    gold, gold_lines = [], []
    tree_lines = []
    for line in gold_data:
        if line.strip() == '':
            gold.append(train_iterator.read_tree(tree_lines, vocab))
            gold_lines.append(tree_lines[:])
            tree_lines = []
        else:
            tree_lines.append(line)

    dev_data = []
    for a, b, c, d, e in zip(kbest, scores, gold, lines, gold_lines):
        # Skip pairs whose gold root has no children.
        if len(c.children) == 0:
            continue
        dev_data.append(data_util.instance(a, b, c, d, e))
    return dev_data
 def read_all(self):
     """Read every k-best group and every gold tree in one pass.

     Unlike the batched reader this walks ``self.data`` and ``self.gdata``
     with local positions and does not touch ``self.index`` or
     ``self.gindex``.

     K-best lines are interpreted as follows:

     * a line equal to ``'PTB_KBEST'`` (after stripping) closes a group;
     * a blank line ends the tree whose lines were collected so far, which
       is then parsed with ``read_tree``;
     * a line without ``'_'`` is parsed as a float score;
     * any other line is collected as a line of the current tree.

     A group is emitted only when it holds more than two trees; otherwise
     its content stays in the accumulators and is carried into the next
     group.

     Returns:
         A list of ``data_util.instance`` objects, one for each
         (group, gold tree) pair whose gold root has at least one child.
     """
     # Accumulators for the group currently being read.
     cur_trees, cur_scores, cur_lines = [], [], []
     # Raw lines of the tree currently being read.
     pending = []
     # Every completed group.
     all_trees, all_scores, all_lines = [], [], []

     for pos in range(self.length):
         raw = self.data[pos]
         stripped = raw.strip()
         if stripped == 'PTB_KBEST':
             if len(cur_trees) > 2:
                 all_trees.append(list(cur_trees))
                 all_scores.append(list(cur_scores))
                 all_lines.append(list(cur_lines))
                 cur_lines, cur_trees, cur_scores = [], [], []
         elif stripped == '':
             cur_trees.append(read_tree(pending, self.vocab))
             cur_lines.append(list(pending))
             pending = []
         elif '_' in raw:
             pending.append(raw)
         else:
             cur_scores.append(float(raw))

     # Gold trees: one tree per blank-line-terminated run of lines.
     gold_pending = []
     gold_roots = []
     gold_lines = []
     for gpos in range(self.glength):
         raw = self.gdata[gpos]
         if raw.strip() != '':
             gold_pending.append(raw)
             continue
         root = read_tree(gold_pending, self.vocab)
         gold_lines.append(list(gold_pending))
         gold_roots.append(root)
         gold_pending = []

     train_batch = []
     for trees, scores, gold_root, lines, gold_src in zip(
             all_trees, all_scores, gold_roots, all_lines, gold_lines):
         # The counter advances for every pair, including skipped ones.
         self.kbest_id += 1
         if len(gold_root.children) > 0:
             train_batch.append(
                 data_util.instance(trees, scores, gold_root, lines, gold_src))
     return train_batch
    def read_give_tree(self, tree_index):
        """Build one instance from a single k-best group and one gold tree.

        The k-best group is the run of lines that follows the
        ``tree_index``-th ``'PTB_KBEST'`` marker in ``self.data`` (for
        ``tree_index == 0`` that is whatever precedes the first marker).
        Within it, a blank line ends a tree (parsed with ``read_tree``), a
        line without ``'_'`` is parsed as a float score, and any other line
        is a tree line.

        Gold trees in ``self.gdata`` are counted from 1, each blank line
        ending one tree; the tree numbered ``tree_index`` is parsed.

        Args:
            tree_index: number of the group / gold tree to read.

        Returns:
            The result of ``data_util.instance`` called with the candidate
            trees, their scores, a list holding the gold root (empty if no
            matching gold tree was completed), the gold tree's raw lines as
            ``gold_lines`` and the candidates' raw lines as ``lines``.
        """
        cand_trees = []
        cand_scores = []
        cand_lines = []
        # Raw lines of the candidate tree currently being read.
        pending = []
        markers_seen = 0
        for pos in range(self.length):
            raw = self.data[pos]
            is_marker = raw.strip() == 'PTB_KBEST'
            if markers_seen == tree_index:
                if is_marker:
                    # The requested group ends at the next marker.
                    break
                if raw.strip() == '':
                    cand_trees.append(read_tree(pending, self.vocab))
                    cand_lines.append(list(pending))
                    pending = []
                elif '_' in raw:
                    pending.append(raw)
                else:
                    cand_scores.append(float(raw))
            if is_marker:
                markers_seen += 1

        # Gold side: trees are numbered from 1 by counting blank lines.
        gold_src = []
        gold_roots = []
        current = 1
        for gpos in range(self.glength):
            raw = self.gdata[gpos]
            is_blank = raw.strip() == ''
            if current == tree_index:
                if is_blank:
                    gold_roots.append(read_tree(gold_src, self.vocab))
                    break
                gold_src.append(raw)
            if is_blank:
                current += 1

        return data_util.instance(cand_trees,
                                  cand_scores,
                                  gold_roots,
                                  gold_lines=gold_src,
                                  lines=cand_lines)
 def read_random_batch(self, batch_size=400):
     """Return a random sample of up to ``batch_size`` training instances.

     All k-best groups in ``self.data`` and all gold trees in ``self.gdata``
     are parsed and paired positionally; ``batch_size`` of those pairs are
     then drawn at random without replacement and processed in their
     original order. The number of pairs available is taken from the parsed
     data rather than from a fixed corpus size, and the sequential cursors
     ``self.index`` / ``self.gindex`` are left untouched.

     K-best lines are interpreted as follows:

     * a line equal to ``'PTB_KBEST'`` (after stripping) closes a group;
     * a blank line ends the tree whose lines were collected so far, which
       is then parsed with ``read_tree``;
     * a line without ``'_'`` is parsed as a float score;
     * any other line is collected as a line of the current tree.

     A group is emitted only when it holds more than two trees; otherwise
     its content is carried into the next group.

     Args:
         batch_size: maximum number of pairs to sample; values larger than
             the number of available pairs select all of them.

     Returns:
         A list of ``data_util.instance`` objects for the sampled pairs
         whose gold root has at least one child.
     """
     import random

     # Parse every k-best group.
     scores = []
     kscores = []
     tree = []
     ktrees = []
     kbest = []
     lines = []
     klines = []
     for pos in range(self.length):
         line = self.data[pos]
         stripped = line.strip()
         if stripped == 'PTB_KBEST':
             if len(ktrees) > 2:
                 kbest.append(ktrees)
                 kscores.append(scores)
                 klines.append(lines)
                 ktrees, scores, lines = [], [], []
         elif stripped == '':
             ktrees.append(read_tree(tree, self.vocab))
             lines.append(tree[:])
             tree = []
         elif '_' not in line:
             scores.append(float(line))
         else:
             tree.append(line)

     # Parse every gold tree (one per blank-line-terminated run of lines).
     gold_tree = []
     gold = []
     goldlines = []
     for gpos in range(self.glength):
         line = self.gdata[gpos]
         if line.strip() == '':
             gold.append(read_tree(gold_tree, self.vocab))
             goldlines.append(gold_tree[:])
             gold_tree = []
         else:
             gold_tree.append(line)

     # Draw the sample over the aligned pairs and keep corpus order.
     pairs = list(zip(kbest, kscores, gold, klines, goldlines))
     sample_size = max(0, min(batch_size, len(pairs)))
     chosen = sorted(random.sample(range(len(pairs)), sample_size))

     train_batch = []
     for pos in chosen:
         a, b, c, d, e = pairs[pos]
         # The counter advances for every sampled pair, including skipped ones.
         self.kbest_id += 1
         if len(c.children) == 0:
             continue
         train_batch.append(data_util.instance(a, b, c, d, e))
     return train_batch