def execute(treebank, dev): print "reading treebank..." parses = utils.read_parses_no_indent(treebank) parse_lists = [] for parse in parses: parse_lists.append(utils.make_parse_list(parse)) print "learning pcfg..." nonterms, terms, start, prob = grammar.learn(parse_lists) print "learning hmm..." emission, transition = sequnece_labeler.learn(parse_lists) print "reading dev data..." dev_sentences = utils.get_sentences(dev) print dev_sentences[100] for sentence in dev_sentences: parse = cky.run(sentence, nonterms, start, prob) sequnece = viterbi.run(sentence, emission, transition)
def quick_execute(dev): print "loading learnt parameters..." pcfg_prob, nonterms, start = cky.get_pcfg() hmm, tagset = viterbi.get_hmm_tagset() print "reading dev data..." parses = utils.read_parses_no_indent(dev) i = 0 for parse in parses: if len(parse) > 100: parse_list = utils.make_parse_list(parse) sentence, truetags = utils.get_terminals_tags(parse_list) print '\n', sentence, '\n' #print dev_sentences.index(sentence) print "running dual decomposition..." num_iterations = dd_parser_tagger.run(sentence, pcfg_prob, nonterms, start, tagset, hmm) print "\n", truetags, " :true tags" if num_iterations != -1: print "converges in ", num_iterations ," iterations \n" else: print "does not converge :(\n"
# NOTE(review): the next three statements are the tail of an enclosing
# writer function whose `def` line is above this chunk (not visible here).
# `trans`, `count`, and the open file object `counts` are bound there.
# The indentation below is a best-effort reconstruction -- confirm against
# the full file.
        # A transition key looks like "PREV~>CURR"; split it back apart
        # and emit one "count 2-GRAM prev curr" line per transition.
        prev_tag, current_tag = trans.split("~>")
        counts.write(str(count)+ " 2-GRAM "+ prev_tag+ " "+ current_tag+ "\n")
    counts.close()


def learn(parses):
    # Learn HMM parameters (emission and transition counts, normalized in
    # place into probabilities) from a list of bracketed treebank parses.
    #
    # parses -- iterable of raw parse strings (as produced by
    #           utils.read_parses_no_indent)
    # Returns (emission_counts, transition_counts) after set_hmm_params has
    # converted them to probability distributions; also writes them to disk
    # via write_hmm_params as a side effect.
    #
    # NOTE(review): defaultdict() with no default_factory behaves exactly
    # like a plain dict (missing keys raise KeyError) -- presumably
    # update_counts inserts keys explicitly; confirm before relying on
    # default behavior.
    emission_counts = defaultdict()
    transition_counts = defaultdict()
    tag_counts = defaultdict()
    for parse in parses:
        parse_list = utils.make_parse_list(parse)
        update_counts(parse_list, emission_counts, transition_counts, tag_counts)
    # Smoothing deliberately disabled: with smoothing, every -RARE- word is
    # assigned the FW tag and then all following tags collapse to FW,
    # because FW->-RARE- and FW->FW end up with high probabilities.
    # emission_counts = smooth_emission(emission_counts, tag_counts)
    # set_hmm_params mutates the count dicts in place into probabilities.
    set_hmm_params(emission_counts, transition_counts, tag_counts)
    # Sanity checks: each conditional distribution should sum to ~1.
    check_if_prob_dist(emission_counts)
    check_if_prob_dist(transition_counts)
    write_hmm_params(emission_counts, transition_counts, tag_counts)
    #write_for_java(emission_counts, transition_counts, tag_counts)
    return emission_counts, transition_counts


if __name__ == "__main__":
    # CLI entry point: learn an HMM from the treebank file given as argv[1].
    treebank = sys.argv[1]
    parses = utils.read_parses_no_indent(treebank)
    emission, transition = learn(parses)