elif word.istitle(): word = "_CF_" elif all(c in string.punctuation or c.isdigit() for c in word): word = "_NP_" else: word = "_RARE_" # Iterate over u and v for u in K[k - 1]: for v in K[k]: # Find max over w in K[k-2] w_candidates = defaultdict(float) for w in K[k - 2]: w_candidates[w] = pi[k - 1][(w, u)] * counter.calc_mle( [w, u, v]) * counter.calc_emissions(word, v) final_w = max(w_candidates.iteritems(), key=operator.itemgetter(1)) # Assign pi value pi[k][(u, v)] = final_w[1] # Get the (tag, probability) of v in max(pi[k](u,v)) final_k_idx = max(pi[k].iteritems(), key=operator.itemgetter(1)) prob = final_k_idx[1] # Log probability log_prob = math.log(prob) # Ouput format: word, tag, log probability sys.stdout.write("%s %s %s\n" %
sys.exit(2) try: counts_file = file(sys.argv[1], "r") trigram_file = sys.argv[2] except IOError: sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % arg) sys.exit(1) # Initialize a trigram counter counter = Hmm(3) # Read in counts counter.read_counts(count_file) # Iterate through trigrams in trigram_file and calculate the log probability of each trigram. for line in test_file: trigram = line.strip().split(" ") if trigram: # Nonempty line prob = counter.calc_mle(trigram) # Get the log of the probability log_prob = math.log(prob) # Write log probability to output file sys.stdout.write( "%s %s %s %s\n" % (trigram[0], trigram[1], trigram[2], str(log_prob))) else: print ""