def decode(self, mode, write_fp, decode_fn):
    """Greedily decode every instance of the `mode` split.

    Writes one TSV row per instance (prediction, target, loss, edit
    distance) to `<write_fp>.<mode>.tsv` and logs the instance count.

    Args:
        mode: dataset split name (e.g. 'dev' or 'test').
        write_fp: path prefix for the output TSV file.
        decode_fn: callable `(model, src) -> (pred, extra)`; also exposes
            `reset()`, called before and after the loop.
    """
    self.model.eval()
    cnt = 0
    sampler, nb_instance = self.iterate_instance(mode)
    decode_fn.reset()
    # str.format (not f-strings) keeps this path usable on older Pythons.
    with open('{0}.{1}.tsv'.format(write_fp, mode), 'w') as fp:
        fp.write('prediction\ttarget\tloss\tdist\n')
        for src, trg in tqdm(sampler(), total=nb_instance):
            pred, _ = decode_fn(self.model, src)
            # trg carries BOS/EOS tokens; strip them before measuring distance.
            dist = util.edit_distance(pred, trg.view(-1).tolist()[1:-1])
            src_mask = dummy_mask(src)
            trg_mask = dummy_mask(trg)
            data = (src, src_mask, trg, trg_mask)
            loss = self.model.get_loss(data).item()
            trg = self.data.decode_target(trg)[1:-1]
            pred = self.data.decode_target(pred)
            fp.write('{0}\t{1}\t{2}\t{3}\n'.format(
                " ".join(pred), " ".join(trg), loss, dist))
            cnt += 1
    decode_fn.reset()
    self.logger.info('finished decoding {0} {1} instance'.format(cnt, mode))
def decode(self, mode, batch_size, write_fp, decode_fn):
    """Batch-decode the `mode` split, dump per-instance TSV rows
    (prediction, target, loss, edit distance), and return the
    evaluator's aggregate results.
    """
    self.model.eval()
    cnt = 0
    sampler, nb_batch = self.iterate_batch(mode, batch_size)
    with open(f"{write_fp}.{mode}.tsv", "w") as fp:
        fp.write("prediction\ttarget\tloss\tdist\n")
        for src, src_mask, trg, trg_mask in tqdm(
                sampler(batch_size), total=nb_batch):
            pred, _ = decode_fn(self.model, src, src_mask)
            self.evaluator.add(src, pred, trg)
            batch = (src, src_mask, trg, trg_mask)
            # Per-instance (unreduced) losses, moved off the device.
            losses = self.model.get_loss(batch, reduction=False).cpu()
            unpacked_pred = util.unpack_batch(pred)
            unpacked_trg = util.unpack_batch(trg)
            for p, t, loss in zip(unpacked_pred, unpacked_trg, losses):
                dist = util.edit_distance(p, t)
                p = self.data.decode_target(p)
                t = self.data.decode_target(t)
                fp.write(
                    f'{" ".join(p)}\t{" ".join(t)}\t{loss.item()}\t{dist}\n'
                )
                cnt += 1
    self.logger.info(f"finished decoding {cnt} {mode} instance")
    return self.evaluator.compute(reset=True)
def __add(self, tree, root, word):
    """Add a word."""
    dist = edit_distance(root[0], word)
    for child in tree[root]:
        # child is a (word, distance) pair keyed under `root`.
        if child[1] == dist:
            # Same edge distance already taken: recurse into that subtree.
            self.__add(tree[root], child, word)
            return
    # No child at this distance yet: attach the word as a fresh leaf.
    tree[root][(word, dist)] = {}
def distance(self, other_state):
    """
    Returns the distance between two WorldStates.

    Inputs:
        other_state (AlchemyState): The other alchemy state to compute the
            distance from.

    Returns:
        float representing the distance.
    """
    # Total edit distance over beaker pairs, compared position-wise.
    return sum(
        edit_distance(mine, theirs)
        for mine, theirs in zip(self._beakers, other_state.beakers()))
def subcluster_by_editdistance(self, center, item_list, threshold=2):
    """Greedily group items into clusters keyed by a representative item.

    `center` seeds the first cluster; each item joins the first existing
    cluster whose stemmed token sequence is within `threshold` edit
    distance, otherwise it starts its own cluster.
    """
    clusters = defaultdict(list)
    clusters[center].append(center)
    for item in item_list:
        if item in clusters:
            # Already a cluster representative; nothing to do.
            continue
        item_tokens = self.stemmer.stem(item.encode('utf-8')).split()
        for goal in list(clusters.keys()):
            goal_tokens = self.stemmer.stem(goal.encode('utf-8')).split()
            if edit_distance(goal_tokens, item_tokens) < threshold:
                clusters[goal].append(item)
                break
        else:
            # No existing cluster is close enough: item seeds its own.
            clusters[item].append(item)
    return clusters
def autocomplete(suggest_tree, bktree, prefix, count=5):
    """Suggest top completions for a prefix given a SuggestTree and BKTree.

    Completions for a given prefix are weighted primarily by their weight
    in the suggest tree, and secondarily by their Levenshtein distance to
    words in the BK-tree (where nearby words are weighted higher)."""
    weights = suggest_tree.completion_weights(prefix)
    if not weights:
        # Prefix unknown to the suggest tree: fall back to fuzzy BK-tree
        # matches, closest first.
        candidates = bktree.search(prefix)
        return heapq.nsmallest(
            count, candidates,
            key=lambda word: edit_distance(prefix, word))

    def rank(word):
        # Primary key: suggest-tree weight; tiebreak: proximity score.
        return (weights[word], completion_proximity_score(prefix, word))

    return heapq.nlargest(count, weights.keys(), key=rank)
def search(self, prefix, tolerance=2, tree=None, root=None, matches=None):
    """Search for words within a given edit distance of prefix."""
    # TODO: Number of arguments can be reduced by defining BKTree
    # recursively (i.e. root and tree args shouldn't be necessary).
    root = self.root if root is None else root
    tree = self.tree[self.root] if tree is None else tree
    matches = set() if matches is None else matches
    d = edit_distance(prefix, root[0])
    if d <= tolerance:
        matches.add(root[0])
    # Triangle inequality: only subtrees whose edge distance lies within
    # `tolerance` of `d` can contain words close enough to the prefix.
    for node in tree:
        if abs(d - node[1]) <= tolerance:
            self.search(prefix, tolerance, tree[node], node, matches)
    return matches
def decode(self, mode, write_fp, decode_fn):
    """Decode the `mode` split one instance at a time, writing a TSV of
    prediction, target, loss and edit distance, and log how many
    instances were processed.
    """
    self.model.eval()
    n_done = 0
    sampler, nb_instance = self.iterate_instance(mode)
    decode_fn.reset()
    with open(f"{write_fp}.{mode}.tsv", "w") as fp:
        fp.write("prediction\ttarget\tloss\tdist\n")
        for src, trg in tqdm(sampler(), total=nb_instance):
            pred, _ = decode_fn(self.model, src)
            # The raw target carries BOS/EOS; strip them for the distance.
            dist = util.edit_distance(pred, trg.view(-1).tolist()[1:-1])
            src_mask = dummy_mask(src)
            trg_mask = dummy_mask(trg)
            loss = self.model.get_loss(
                (src, src_mask, trg, trg_mask)).item()
            trg = self.data.decode_target(trg)[1:-1]
            pred = self.data.decode_target(pred)
            fp.write(f'{" ".join(pred)}\t{" ".join(trg)}\t{loss}\t{dist}\n')
            n_done += 1
    decode_fn.reset()
    self.logger.info(f"finished decoding {n_done} {mode} instance")
### 5. Step: Determine mean edit distance
# test_error[j, b] holds the edit distance for sample b of test batch j,
# comparing the cleaned network argmax output against the cleaned target
# (class 10 appears to be a blank/padding label that is removed first —
# TODO confirm against the dataset definition).
test_error = np.zeros([rn.batch_quantity('test'), rn.batch_size()])
for j in xrange(rn.batch_quantity('test')):
    net_out = rn.forward_fn(mb_test_x[j], mb_test_m[j])
    for b in xrange(rn.batch_size()):
        true_out = mb_test_y[j][b, :]
        cln_true_out = np.delete(true_out, np.where(true_out == 10))
        # BUG FIX: the original reassigned `net_out = net_out[:, b, :]`,
        # clobbering the 3-D batch output so every iteration with b > 0
        # indexed an already-sliced 2-D array. Slice into a fresh name.
        sample_out = net_out[:, b, :]
        arg_net_out = np.argmax(sample_out, axis=1)
        cln_net_out = np.delete(arg_net_out, np.where(arg_net_out == 10))
        test_error[j, b] = edit_distance(cln_true_out, cln_net_out)

print("Test set mean edit distance: " +
      "{0:.4f}".format(np.mean(test_error)))

# Plot results
sample_no = 1
batch = 0
net_out = rn.forward_fn(mb_test_x[sample_no], mb_test_m[sample_no])
sample = mb_test_x[sample_no][:, batch, :]
mask = mb_test_m[sample_no][:, batch, :]
signal = net_out[:, batch, :]

fig = plt.figure()
fig.suptitle('Numbers recognition - Sample')
plt.subplot(2, 1, 1)
plt.xlabel('Image of numbers')