def read_batch(self, read_batch=True):
    """Read the next batch of k-best groups together with their gold trees.

    Scanning resumes at ``self.index`` (k-best data) and ``self.gindex``
    (gold data); both cursors are advanced past everything consumed.

    In the k-best data a line whose stripped text is ``PTB_KBEST`` closes the
    current group, a blank line closes the current tree, a non-blank line
    without ``_`` is parsed as a float score, and every other line is
    collected as one line of the current tree.  In the gold data a blank
    line closes the current tree.

    Args:
        read_batch: When true, stop after ``self.batch`` k-best groups and
            after ``self.batch`` gold trees; when false, read to the end of
            both inputs.

    Returns:
        ``None`` if the k-best data was already exhausted on entry.
        Otherwise a list of ``data_util.instance`` objects, one per
        (group, gold tree) pair whose gold tree has at least one child.
    """
    if self.index == self.length:
        return None

    # ---- k-best parses -------------------------------------------------
    kbest = []         # parsed trees, one list per kept group
    kscores = []       # scores, one list per kept group
    klines = []        # raw tree lines, one list (of lists) per kept group
    group_trees = []
    group_scores = []
    group_lines = []
    tree_lines = []
    while self.index < self.length:
        line = self.data[self.index]
        stripped = line.strip()
        if stripped == 'PTB_KBEST':
            # Only groups holding more than two trees are kept.
            # NOTE(review): a dropped group does not drop a gold tree, and
            # pairing below is purely positional -- confirm the inputs never
            # contain such small groups.
            if len(group_trees) > 2:
                kbest.append(group_trees)
                kscores.append(group_scores)
                klines.append(group_lines)
                if read_batch and len(kbest) == self.batch:
                    # Step over the separator so the next call starts on
                    # the first line of the following group.
                    self.index += 1
                    break
            group_trees = []
            group_scores = []
            group_lines = []
        elif stripped == '':
            group_trees.append(read_tree(tree_lines, self.vocab))
            group_lines.append(tree_lines[:])
            tree_lines = []
        elif '_' not in line:
            group_scores.append(float(line))
        else:
            tree_lines.append(line)
        self.index += 1

    # ---- gold trees ----------------------------------------------------
    gold = []
    goldlines = []
    gold_tree_lines = []
    while self.gindex < self.glength:
        line = self.gdata[self.gindex]
        if line.strip() == '':
            gold.append(read_tree(gold_tree_lines, self.vocab))
            goldlines.append(gold_tree_lines[:])
            if read_batch and len(gold) == self.batch:
                self.gindex += 1
                break
            gold_tree_lines = []
        else:
            gold_tree_lines.append(line)
        self.gindex += 1

    # ---- pair them up --------------------------------------------------
    train_batch = []
    for trees, scores, gold_root, lines, gold_lines in zip(
            kbest, kscores, gold, klines, goldlines):
        self.kbest_id += 1
        if len(gold_root.children) == 0:
            continue
        train_batch.append(
            data_util.instance(trees, scores, gold_root, lines, gold_lines))
    return train_batch
def read_dev(kbest_filename, gold_filename, vocab):
    """Load a development set from a k-best file and a gold-tree file.

    In the k-best file a line whose stripped text is ``PTB_KBEST`` closes the
    current group, a blank line closes the current tree, a non-blank line
    without ``_`` is parsed as a float score, and every other line is
    collected as one line of the current tree.  In the gold file a blank
    line closes the current tree.

    Args:
        kbest_filename: Path of the k-best file.
        gold_filename: Path of the gold-tree file.
        vocab: Passed through unchanged to ``train_iterator.read_tree``.

    Returns:
        A list of ``data_util.instance`` objects, one per (group, gold tree)
        pair whose gold tree has at least one child.  Groups and gold trees
        are paired by position.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(kbest_filename, 'r') as reader:
        kbest_data = reader.readlines()
    # A trailing separator guarantees the last group gets flushed.
    kbest_data.append('PTB_KBEST')

    kbest = []
    scores = []
    lines = []
    group_trees = []
    group_scores = []
    group_lines = []
    tree_lines = []
    for line in kbest_data:
        stripped = line.strip()
        if stripped == 'PTB_KBEST':
            # Only groups holding more than one tree are kept.
            if len(group_trees) > 1:
                kbest.append(group_trees)
                scores.append(group_scores)
                lines.append(group_lines)
            group_trees = []
            group_scores = []
            group_lines = []
        elif stripped == '':
            group_lines.append(tree_lines[:])
            group_trees.append(train_iterator.read_tree(tree_lines, vocab))
            tree_lines = []
        elif '_' not in line:
            group_scores.append(float(line))
        else:
            tree_lines.append(line)

    with open(gold_filename, 'r') as reader:
        gold_data = reader.readlines()

    gold = []
    gold_lines = []
    gold_tree_lines = []
    for line in gold_data:
        if line.strip() == '':
            gold.append(train_iterator.read_tree(gold_tree_lines, vocab))
            gold_lines.append(gold_tree_lines[:])
            gold_tree_lines = []
        else:
            gold_tree_lines.append(line)

    dev_data = []
    for trees, tree_scores, gold_root, raw_lines, raw_gold in zip(
            kbest, scores, gold, lines, gold_lines):
        if len(gold_root.children) == 0:
            continue
        dev_data.append(
            data_util.instance(trees, tree_scores, gold_root, raw_lines,
                               raw_gold))
    return dev_data
def read_all(self):
    """Read every k-best group and every gold tree in one pass.

    Unlike the batched reader this scans ``self.data`` and ``self.gdata``
    from the start using local cursors, so ``self.index`` and ``self.gindex``
    are left untouched.  ``self.kbest_id`` is incremented once per
    (group, gold tree) pair.

    In the k-best data a line whose stripped text is ``PTB_KBEST`` closes the
    current group, a blank line closes the current tree, a non-blank line
    without ``_`` is parsed as a float score, and every other line is
    collected as one line of the current tree.  In the gold data a blank
    line closes the current tree.

    Returns:
        A list of ``data_util.instance`` objects, one per (group, gold tree)
        pair whose gold tree has at least one child.  Pairing is positional.
    """
    # ---- k-best parses -------------------------------------------------
    kbest = []
    kscores = []
    klines = []
    group_trees = []
    group_scores = []
    group_lines = []
    tree_lines = []
    for index in range(self.length):
        line = self.data[index]
        stripped = line.strip()
        if stripped == 'PTB_KBEST':
            # Only groups holding more than two trees are kept.
            # NOTE(review): a dropped group does not drop a gold tree --
            # confirm the inputs never contain such small groups.
            if len(group_trees) > 2:
                kbest.append(group_trees)
                kscores.append(group_scores)
                klines.append(group_lines)
            group_trees = []
            group_scores = []
            group_lines = []
        elif stripped == '':
            group_trees.append(read_tree(tree_lines, self.vocab))
            group_lines.append(tree_lines[:])
            tree_lines = []
        elif '_' not in line:
            group_scores.append(float(line))
        else:
            tree_lines.append(line)

    # ---- gold trees ----------------------------------------------------
    gold = []
    goldlines = []
    gold_tree_lines = []
    for gindex in range(self.glength):
        line = self.gdata[gindex]
        if line.strip() == '':
            root = read_tree(gold_tree_lines, self.vocab)
            goldlines.append(gold_tree_lines[:])
            gold.append(root)
            gold_tree_lines = []
        else:
            gold_tree_lines.append(line)

    # ---- pair them up --------------------------------------------------
    train_batch = []
    for trees, scores, gold_root, lines, gold_lines in zip(
            kbest, kscores, gold, klines, goldlines):
        self.kbest_id += 1
        if len(gold_root.children) == 0:
            continue
        train_batch.append(
            data_util.instance(trees, scores, gold_root, lines, gold_lines))
    return train_batch
def read_give_tree(self, tree_index):
    """Build the instance for one k-best group and its gold tree.

    The k-best group used is the one read while exactly ``tree_index``
    ``PTB_KBEST`` separator lines have been seen; the gold tree used is the
    ``tree_index``-th blank-line-terminated tree, counting from 1.  Both
    inputs are scanned from the start with local cursors.

    Inside the selected group a blank line closes the current tree, a
    non-blank line without ``_`` is parsed as a float score, and every other
    line is collected as one line of the current tree.

    Args:
        tree_index: Which group / gold tree to load (see above).

    Returns:
        The ``data_util.instance`` built from the selected group's trees,
        scores and raw lines plus the gold tree and its raw lines.
    """
    scores = []
    ktrees = []
    kbestlines = []
    tree_lines = []

    # ---- k-best parses -------------------------------------------------
    separators_seen = 0
    for index in range(self.length):
        line = self.data[index]
        stripped = line.strip()
        is_separator = stripped == 'PTB_KBEST'
        if separators_seen == tree_index:
            if is_separator:
                # The separator that ends the requested group.
                break
            if stripped == '':
                ktrees.append(read_tree(tree_lines, self.vocab))
                kbestlines.append(tree_lines[:])
                tree_lines = []
            elif '_' not in line:
                scores.append(float(line))
            else:
                tree_lines.append(line)
        elif is_separator:
            separators_seen += 1

    # ---- gold tree -----------------------------------------------------
    # NOTE(review): ``gold`` is handed over as a list holding at most one
    # root, whereas the other readers pass a single root -- confirm that
    # data_util.instance accepts both.
    gold = []
    gold_tree_lines = []
    current_tree = 1
    for index in range(self.glength):
        line = self.gdata[index]
        is_blank = line.strip() == ''
        if current_tree == tree_index:
            if is_blank:
                gold.append(read_tree(gold_tree_lines, self.vocab))
                break
            gold_tree_lines.append(line)
        elif is_blank:
            current_tree += 1

    retval = data_util.instance(ktrees, scores, gold,
                                gold_lines=gold_tree_lines,
                                lines=kbestlines)
    return retval
def read_random_batch(self, batch_size=400):
    """Return instances for a random sample of (k-best group, gold tree) pairs.

    Both inputs are scanned from the start with local cursors, so the
    sequential cursors ``self.index`` and ``self.gindex`` are not modified.
    Up to ``batch_size`` pair positions are drawn at random from the pairs
    actually available, and only the sampled pairs are parsed with
    ``read_tree``.  ``self.kbest_id`` is incremented once per sampled pair.

    In the k-best data a line whose stripped text is ``PTB_KBEST`` closes the
    current group, a blank line closes the current tree, a non-blank line
    without ``_`` is parsed as a float score, and every other line is
    collected as one line of the current tree.  In the gold data a blank
    line closes the current tree.  Groups and gold trees are paired by
    position.

    Args:
        batch_size: Maximum number of pairs to sample.

    Returns:
        A list of ``data_util.instance`` objects for the sampled pairs, in
        file order, omitting pairs whose gold tree has no children.
    """
    import random

    # ---- split the k-best data into raw groups (no parsing yet) --------
    groups = []              # (list of tree-line lists, list of scores)
    group_tree_lines = []
    group_scores = []
    tree_lines = []
    for i in range(self.length):
        line = self.data[i]
        stripped = line.strip()
        if stripped == 'PTB_KBEST':
            # Only groups holding more than two trees are kept.
            # NOTE(review): a dropped group does not drop a gold tree --
            # confirm the inputs never contain such small groups.
            if len(group_tree_lines) > 2:
                groups.append((group_tree_lines, group_scores))
            group_tree_lines = []
            group_scores = []
        elif stripped == '':
            group_tree_lines.append(tree_lines)
            tree_lines = []
        elif '_' not in line:
            group_scores.append(float(line))
        else:
            tree_lines.append(line)

    # ---- split the gold data into raw trees ----------------------------
    gold_line_lists = []
    gold_tree_lines = []
    for i in range(self.glength):
        line = self.gdata[i]
        if line.strip() == '':
            gold_line_lists.append(gold_tree_lines)
            gold_tree_lines = []
        else:
            gold_tree_lines.append(line)

    # ---- choose which pairs to materialise -----------------------------
    # Sample from the pairs that really exist rather than a fixed count.
    pair_count = min(len(groups), len(gold_line_lists))
    positions = list(range(pair_count))
    random.shuffle(positions)
    chosen = sorted(positions[:batch_size])

    train_batch = []
    for pos in chosen:
        self.kbest_id += 1
        gold_lines = gold_line_lists[pos]
        gold_root = read_tree(gold_lines, self.vocab)
        if len(gold_root.children) == 0:
            continue
        tree_line_lists, scores = groups[pos]
        trees = [read_tree(one_tree, self.vocab)
                 for one_tree in tree_line_lists]
        train_batch.append(
            data_util.instance(trees, scores, gold_root,
                               [one_tree[:] for one_tree in tree_line_lists],
                               gold_lines[:]))
    return train_batch