def run(self):
    """Decode ``self.line`` and write the result, statistics and log output.

    Steps, in order:

    1. Call ``update(self.id)`` on every grammar in ``self.grammars``.
    2. Open the per-job log file ``<FLAGS.run_dir>/log_<self.suffix>`` and
       keep it as ``self.flog`` for the duration of the call.
    3. Build a sentence-local grammar from ``self.oov_idx`` and
       ``self.ner_items`` (see ``_build_local_grammar``).
    4. Run the decoder selected by ``FLAGS.decoding_method`` and store the
       elapsed time in ``self.time`` (see ``_run_decoder``).
    5. If a goal item was built, set ``self.kbest``, ``self.hg`` and the
       single-line ``self.out``; otherwise ``self.out`` is a fixed failure
       message.
    6. Write decoder statistics to the log, then call
       ``write_output_file`` and, if ``FLAGS.output_kbest`` is set,
       ``write_kbest_to_file``.

    The log file is closed even when one of the steps raises.

    Raises:
        AssertionError: if ``FLAGS.decoding_method`` is not one of
            ``'agenda'``, ``'cyk'`` or ``'earley'``.
    """
    # Update per-sentence grammars, if there are any.
    for grammar in self.grammars:
        grammar.update(self.id)

    self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix), 'w')
    try:
        if FLAGS.show_time:
            self.flog.write('running on %s\n\n' % socket.gethostname())
            self.flog.flush()

        fwords = self.line.strip().split()
        local_gr = self._build_local_grammar(fwords)

        if FLAGS.preprocess:
            # Map the position of every placeholder token to the second
            # field of the matching entry in self.special, consumed in
            # order of appearance.
            # NOTE: this mapping is not read anywhere in this method; the
            # substitution of placeholders in the output is disabled.
            self.fidx2replacement = {}
            j = 0
            for i, token in enumerate(fwords):
                if token in ('$number', '$date'):
                    self.fidx2replacement[i] = self.special[j][1]
                    j += 1

        self.flog.write('[%s][%s words] %s\n' %
                        (self.id, len(fwords), self.line))

        decoder = Decoder(fwords, self.grammars, self.features, local_gr)
        item = self._run_decoder(decoder)

        if item is None:
            self.out = '[decoder failed to build a goal item]'
        else:
            # The decoder returns a pair; the second element (a success
            # flag, judging by the disabled code it used to guard) is
            # currently unused.
            goal, _succ = item
            hg = Hypergraph(goal)
            hg.set_semiring(hypergraph.SHORTEST_PATH)
            hg.set_functions(lambda x: x.cost, None, None)
            hg.topo_sort()
            self.kbest = hg.root.best_paths()

            # The misspelled header is reproduced as-is because it is part
            # of the emitted log format.
            self.flog.write('Decuction Tree:\n%s\n' %
                            self.kbest[0].tree_str())

            # Collapse the multi-line AMR string into a single line.
            amr = self.kbest[0].translation.to_amr_format()[0]
            self.out = "".join(part.strip() for part in amr.split('\n'))

            self.hg = hg
            if FLAGS.output_hypergraph:
                self.write_hypergraph()

        self.flog.write('%s\n' % self.out)
        self.flog.write('\n')
        self.flog.write(decoder.agenda_stats())
        self.flog.write('\n')
        self.flog.write(decoder.chart.stats())
        self.flog.write('\n')
        for dotchart in decoder.dotcharts:
            self.flog.write(dotchart.stats())
            self.flog.write('\n')

        if FLAGS.show_time:
            timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
            self.flog.write(timeline)
        self.write_output_file()
        if FLAGS.output_kbest:
            self.write_kbest_to_file()
    finally:
        # Always release the log file handle, including on decoder errors.
        self.flog.close()


def _build_local_grammar(self, fwords):
    """Build the sentence-local grammar for OOV words and NER items.

    Args:
        fwords: list of tokens of the input sentence.

    Returns:
        A ``Grammar`` built from the generated rules using the features of
        ``self.grammars[0]``, or ``None`` when no rule was generated.
    """
    # A weight of 8.0 was tried before for OOV rules (per the original
    # in-line note).
    oov_weight = 0.0001
    # Scaling this weight by nonterminal type, span length or number of
    # edges was experimented with and is currently disabled.
    concept_weight = 10.0

    rules = []

    # Every out-of-vocabulary word gets a rule rewriting it to ".".
    if self.oov_idx is not None:
        for idx in self.oov_idx:
            rule = Rule()
            rule.fromstr("[A0-0] ||| %s ||| %s ||| %lf %lf %lf" %
                         (fwords[idx], ".",
                          oov_weight, oov_weight, oov_weight))
            rules.append(rule)

    # Each NER item contributes its rule text (item[1]) followed by three
    # identical weights. item[0] presumably holds the (start, end) token
    # span; it is not needed to build the rule.
    if self.ner_items is not None:
        for item in self.ner_items:
            ew = item[1]
            # NOTE(review): the parsed value is only consumed by the
            # disabled weighting features, but the conversion still rejects
            # items whose third character is not a digit, so it is kept.
            int(ew[2])
            rule = Rule()
            rule.fromstr("%s ||| %lf %lf %lf" %
                         (ew, concept_weight, concept_weight,
                          concept_weight))
            rules.append(rule)

    if not rules:
        return None
    local_gr = Grammar(FLAGS.rule_bin_size)
    local_gr.build(rules, self.grammars[0].features)
    return local_gr


def _run_decoder(self, decoder):
    """Run the decoding method chosen by ``FLAGS.decoding_method``.

    Stores the elapsed time, as measured by ``time()``, in ``self.time``.

    Args:
        decoder: the ``Decoder`` instance to run.

    Returns:
        Whatever the selected decode method returns; ``run`` treats
        ``None`` as failure and anything else as a 2-element pair.

    Raises:
        AssertionError: if ``FLAGS.decoding_method`` is not supported.
            Raised explicitly (instead of via ``assert``) so the check is
            not stripped when Python runs with ``-O``.
    """
    method = FLAGS.decoding_method
    begin_time = time()
    if method == 'agenda':
        item = decoder.decode()
    elif method == 'cyk':
        item = decoder.decode_cyk()
    elif method == 'earley':
        item = decoder.decode_earley()
    else:
        raise AssertionError('"%s" not valid decoding option' % method)
    self.time = time() - begin_time
    return item