class Tagger:
    def __init__(self, common_file, counts_file):
        self.common_words = get_common_words(common_file)
        self.hmm = Hmm(3)
        self.hmm.read_counts(counts_file)
def replace_rare(raw_data_file, raw_count_file, output_file, rare_counts=5):
    # read in the raw counts from hmm
    fp = open(raw_count_file, 'r')
    hmm = Hmm(3)
    hmm.read_counts(fp)
    fp.close()

    # accumulate per-word counts from emission_counts
    word_count = defaultdict(int)
    for word_tag in hmm.emission_counts:
        word_count[word_tag[0]] += hmm.emission_counts[word_tag]
    rare_words = set(word for word in word_count if word_count[word] < rare_counts)

    # replace rare words with their word class
    in_file = open(raw_data_file, 'r')
    out_file = open(output_file, 'w')
    for line in in_file:
        line = line.strip()
        if line:
            word, tag = line.split(" ")
            if word in rare_words:
                word_class = get_word_class(word)
                out_file.write(" ".join([word_class, tag]))
                # simpler variant: out_file.write(" ".join(['_RARE_', tag]))
            else:
                out_file.write(line)
        out_file.write("\n")
    in_file.close()
    out_file.close()
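# get_word_class is defined elsewhere; a minimal sketch of what it might
# look like, assuming the same buckets used by the add_class.py snippet
# below (_DIGIT_, _UPPER_, _NOTALPHA_, with _RARE_ as the fallback). The
# exact classes are an assumption, not the original implementation:
def get_word_class(word):
    if word.isdigit():
        return '_DIGIT_'      # all-numeric tokens
    elif word.isupper():
        return '_UPPER_'      # all-capital tokens
    elif not word.isalpha():
        return '_NOTALPHA_'   # tokens mixing letters with digits/punctuation
    else:
        return '_RARE_'       # any other infrequent token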
def save_transition_probs(input_file):
    """
    Computes and stores trigrams and their respective transition
    probabilities from an input file containing the trigrams.
    """
    # read counts file
    counter = Hmm(3)
    counter.read_counts(open('ner_rare.counts'))

    out_lines_list = []
    l = input_file.readline()
    while l:
        line = l.strip()
        if line:  # nonempty line
            trigram = tuple(line.split())
            # get transition probability of the trigram
            prob = compute_transition_prob(
                counter.ngram_counts[1][(trigram[0], trigram[1])],
                counter.ngram_counts[2][trigram])
            # get log probability
            log_prob = math.log(prob)
            l = line + " " + str(log_prob)
            out_lines_list.append(l)
        l = input_file.readline()
    out_lines = "\n".join(out_lines_list)

    # write trigrams and their log probs to file
    with open('5_1.txt', 'w') as out_file:
        out_file.write(out_lines)
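# compute_transition_prob is not shown in this snippet; given the call
# site above, a minimal sketch is the maximum-likelihood estimate
# q(s | u, v) = count(u, v, s) / count(u, v). The one-liner below is
# inferred from the arguments, not the original code (counts parsed by
# Hmm.read_counts are floats, so the division is not truncated):
def compute_transition_prob(bigram_count, trigram_count):
    return trigram_count / bigram_count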
class HmmTagger(object):
    """Thin wrapper around the counts-based Hmm from count_freqs (naming
    this class Hmm as well would shadow the import and recurse)."""

    def __init__(self, counts_file="gene.counts"):
        self.hmm = Hmm(3)
        self.hmm.read_counts(open(counts_file))

    def emission(self, x, y):
        # MLE emission estimate: e(x|y) = count(y -> x) / count(y)
        return self.hmm.emission_counts[(x, y)] / self.hmm.ngram_counts[0][(y,)]
def problem4(count_file, dev_file):
    """Implement a simple named entity tagger and output predictions."""
    try:
        infile = open(count_file, "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % count_file)
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read counts
    counter.read_counts(infile)
    # Write the predictions
    counter.write_predicts(dev_file, sys.stdout)
def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
    """
    Implements a baseline tagger that uses only the emission probabilities
    to assign tags, and stores the result in a file.
    """
    # get frequently occurring words
    word_count_dict = get_word_counts(open('ner_train.dat'))
    freq_words = set(word for word in word_count_dict if word_count_dict[word] >= 5)

    # compute emission probs
    counter = Hmm(3)
    counter.read_counts(counts_file)
    emission_probs = compute_emission_probs(counter.emission_counts,
                                            counter.ngram_counts[0])

    out_lines_list = []
    l = dev_file.readline()
    while l:
        word = l.strip()
        if word:  # nonempty line
            # for rare or unseen words, use the emission probabilities of
            # rare_symbol to assign a tag and its probability
            if word not in freq_words:
                tag = sorted(emission_probs[rare_symbol],
                             key=emission_probs[rare_symbol].get, reverse=True)[0]
                prob = emission_probs[rare_symbol][tag]
            # for frequently occurring words, use the emission
            # probabilities of the word itself
            else:
                tag = sorted(emission_probs[word],
                             key=emission_probs[word].get, reverse=True)[0]
                prob = emission_probs[word][tag]
            log_prob = math.log(prob, 2)
            l = word + " " + tag + " " + str(log_prob)
        else:
            l = ""
        out_lines_list.append(l)
        l = dev_file.readline()
    out_lines = "\n".join(out_lines_list) + "\n"

    # write words, corresponding tags and log probs to file
    with open('4_2.txt', 'w') as out_file:
        out_file.write(out_lines)
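# compute_emission_probs is defined elsewhere; a plausible sketch,
# assuming it returns a nested dict word -> tag -> e(word|tag) as the
# lookups above imply (the structure is inferred, not the original code):
from collections import defaultdict

def compute_emission_probs(emission_counts, tag_counts):
    emission_probs = defaultdict(dict)
    for (word, tag), count in emission_counts.items():
        # e(word|tag) = count(tag -> word) / count(tag); tag_counts is
        # ngram_counts[0], keyed by 1-gram tuples such as (tag,)
        emission_probs[word][tag] = count / tag_counts[(tag,)]
    return emission_probs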
import sys
from collections import defaultdict
from count_freqs import Hmm

def p2_1emission(word, tag, hmm, countTag):
    # e(word|tag) = count(tag -> word) / count(tag)
    if (word, tag) in hmm.emission_counts:
        return hmm.emission_counts[(word, tag)] / countTag[tag]
    else:
        return 0

if __name__ == "__main__":
    input = open(sys.argv[1], "r")
    model = Hmm(3)
    model.read_counts(input)

    testFile = open(sys.argv[2], "r")
    tagsNum = len(model.all_states)

    # total count of each tag, accumulated from the emission counts
    countTag = dict.fromkeys(model.all_states, 0)
    for (word, tag) in model.emission_counts:
        countTag[tag] += model.emission_counts[(word, tag)]

    for line in testFile:
        word = line.strip()
#! /usr/bin/python

# __author__ = "Xiaochen Wei <*****@*****.**>"
__date__ = "$Sep 20, 2014"

from dataClean import *
from count_freqs import Hmm
import math

# the file of train data
trainingDataFilePath = "ner.counts"

hmm = Hmm(3)
inputFile = open(trainingDataFilePath, "r")
hmm.read_counts(inputFile)

class SimpleNamedEntityTagger:

    def GetEmissionParameters(self, target, targetType):
        '''
        get the emission parameter

        INPUT: the target word, and the tag of the target word
        ======================
        RETURN: the emission probability of the target for the given targetType
        '''
        sumCount = 0
        count = 0
        if target not in [key[0] for key in hmm.emission_counts.keys()]:
import sys
from collections import defaultdict
import math
from count_freqs import Hmm

"""
Implement the Viterbi algorithm to compute
    argmax_(y1...yn) p(x1...xn, y1...yn).
Your tagger should have the same basic functionality as the baseline
tagger. Instead of emission probabilities, the third column should
contain the log-probability of the tagged sequence up to this word.
"""

if __name__ == "__main__":

    if len(sys.argv) != 3:  # expect exactly two arguments: the counts file and dev file
        usage()
        sys.exit(2)

    try:
        counts_file = open(sys.argv[1], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % sys.argv[1])
        sys.exit(1)

    counter = Hmm(3)
    # Read counts
    counter.read_counts(counts_file)
    counter.viterbi_read(sys.argv[2])
        if word in infreq_words:
            if word.isupper():
                f2.write("_UPPER_" + " " + parts[1] + "\n")
            elif word.isdigit():
                f2.write("_DIGIT_" + " " + parts[1] + "\n")
            elif not word.isalpha():
                f2.write("_NOTALPHA_" + " " + parts[1] + "\n")
            else:
                f2.write("_RARE_" + " " + parts[1] + "\n")
        else:
            f2.write(line)
    f2.close()

def usage():
    print """
    python add_class.py [count_file] [training_data]
    """

if __name__ == "__main__":

    if len(sys.argv) != 3:  # expects two arguments: original count file and training data file
        usage()
        sys.exit(2)

    counter = Hmm(3)
    # find count information for words in the file
    (em_count, ngram_count, infreq_word_set, all_tags, all_words) = counter.read_counts(sys.argv[1])
    # produce a new file with word classes substituted for infrequent words
    replace_class(sys.argv[2], infreq_word_set)
word = " ".join(fields[:-1]) # replace word with its category if frequency < count_thresh if word_count_dict[word] < count_thresh: line = " ".join([get_category(word), fields[-1]]) out_lines_list.append(line) l = in_file.readline() out_lines = "\n".join(out_lines_list) out_file.write(out_lines) if __name__ == "__main__": # replace infrequent words with categories and write to file replace_infrequent_words_with_categories(file('ner_train.dat'), file('ner_train_cats.dat', 'w')) # generate counts file os.system('python count_freqs.py ner_train_cats.dat > ner_cats.counts') # get frequent words word_count_dict = get_word_counts(file('ner_train.dat')) freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5] # get transition and emission probabilities counter = Hmm(3) counter.read_counts(file('ner_cats.counts')) transition_probs = compute_transition_probs(counter.ngram_counts[1], counter.ngram_counts[2]) emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0]) # store tagged data with the log probs to file tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words) os.system('python eval_ne_tagger.py ner_dev.key 6.txt')
with open(file, "r") as f: f2 = open("ner_train_rare.dat", "w") for line in f: parts = line.strip().split(" ") word = parts[0] if word in infreq_words: f2.write("_RARE_" + " " + parts[1] + "\n") else: f2.write(line) f2.close() def usage(): print """ python add_rare.py [count_file] [training_data] """ if __name__ == "__main__": if len(sys.argv)!=3: # Expects two argument: original count file and training data file usage() sys.exit(2) counter = Hmm(3) # finds count information for words in file (em_count, ngram_count, infreq_word_set, all_tags, all_words) = counter.read_counts(sys.argv[1]) #produces new file with _RARE_ replace_rare(sys.argv[2], infreq_word_set)
    sys.stdout.write(line)

if __name__ == "__main__":

    if len(sys.argv) != 3:  # expect exactly two arguments: the counts and corresponding training data file
        usage()
        sys.exit(2)

    try:
        input = open(sys.argv[1], "r")
        output = sys.argv[2]
    except IOError:
        sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % sys.argv[1])
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read in counts
    counter.read_counts(input)

    # Partition words by frequency threshold 5 (>= 5 is frequent,
    # matching the threshold used elsewhere in this repo)
    low_words = dict((k, v) for k, v in counter.word_counts.iteritems() if v < 5)
    high_words = dict((k, v) for k, v in counter.word_counts.iteritems() if v >= 5)

    # Replace each instance of a word in low_words with _RARE_ in the training set
    replace_all(output, low_words, '_RARE_')
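# replace_all is defined elsewhere in this script; a minimal sketch of
# what it might look like, assuming it writes a _RARE_-substituted copy
# of the two-column training data (the output naming and the format
# handling are assumptions, not the original code):
def replace_all(train_path, target_words, symbol):
    out = open(train_path + ".rare", "w")
    for line in open(train_path, "r"):
        parts = line.strip().split(" ")
        # substitute the symbol for any word found in the target dict/set
        if len(parts) == 2 and parts[0] in target_words:
            out.write(symbol + " " + parts[1] + "\n")
        else:
            out.write(line)
    out.close()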
        word = line[:-1]
        if len(word) != 0:
            # tag known words directly; map unseen words to their class first
            if word in keys:
                tag = viterbi(d[0], d[1], word)
            else:
                tag = viterbi(d[0], d[1], classify(word))
            output.write("%s %s\n" % (word, tag))
            d.append(tag)
        else:
            # sentence boundary: reset the tag history to the start symbols
            output.write("\n")
            d = deque(["*", "*"], maxlen=2)

if __name__ == "__main__":
    counter = Hmm(3)
    counter.read_counts(open("outputs/p3_count.txt", "r"))
    bigram_counts = counter.ngram_counts[1]
    trigram_counts = counter.ngram_counts[2]

    # the set of words observed in training
    keys = set()
    for k in counter.emission_counts.keys():
        keys.add(k[0])

    # FOR THE DEVELOPMENT FILE
    write_tags("data/gene.dev", keys, open("outputs/gene_dev.p3.out", "w"))

    """
    TO EVALUATE, RUN:
    >>> python eval_gene_tagger.py data/gene.key outputs/gene_dev.p3.out
    AND THE OUTPUT WILL BE:
    Found 404 GENEs. Expected 642 GENEs; Correct: 214.
#!/usr/bin/python
import sys
from count_freqs import Hmm

countInput = open(sys.argv[1], "r")
hmm = Hmm(3)
hmm.read_counts(countInput)

# make sure every word-class pseudo-word exists for every tag
for tag in hmm.all_states:
    hmm.emission_counts[("_RARE_", tag)] = 0
    hmm.emission_counts[("_Numeric_", tag)] = 0
    hmm.emission_counts[("_AllCapitals_", tag)] = 0
    hmm.emission_counts[("_LastCapital_", tag)] = 0

# fold counts of infrequent (word, tag) pairs into their word-class pseudo-words
for key, value in hmm.emission_counts.items():
    if key[0] == "_RARE_":
        continue
    if value < 5:
        if key[0].isdigit():
            hmm.emission_counts[("_Numeric_", key[1])] += value
        elif key[0].isalpha() and key[0].isupper():
            hmm.emission_counts[("_AllCapitals_", key[1])] += value
        elif key[0].isalpha() and key[0][-1].isupper():
            hmm.emission_counts[("_LastCapital_", key[1])] += value
        else:
    def __read_counts(self, count_file):
        fp = open(count_file, 'r')
        hmm = Hmm(3)
        hmm.read_counts(fp)
        fp.close()
        return hmm
out_lines = "\n".join(out_lines_list) out_lines = out_lines + "\n" # write to file with open('5_2.txt', 'w') as out_file: out_file.write(out_lines) if __name__ == "__main__": os.system('python 4_1.py') os.system('python count_freqs.py ner_train_rare.dat > ner_rare.counts') # get frequent words word_count_dict = get_word_counts(file('ner_train.dat')) freq_words = [ word for word in word_count_dict if word_count_dict[word] >= 5 ] # get transition and emission probs counter = Hmm(3) counter.read_counts(file('ner_rare.counts')) transition_probs = compute_transition_probs(counter.ngram_counts[1], counter.ngram_counts[2]) emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0]) # store tagged data with the log probs to file tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words) os.system('python eval_ne_tagger.py ner_dev.key 5_2.txt')
from collections import defaultdict
from count_freqs import Hmm
import math
import sys

def emission_probability(word, tag, emission_counts, ngram_counts):
    # e(word|tag) = count(tag -> word) / count(tag)
    return emission_counts[(word, tag)] / ngram_counts[0][(tag,)]

if __name__ == "__main__":
    counts_file = open(sys.argv[1])
    sentences_file = open(sys.argv[2])

    hmm = Hmm()
    hmm.read_counts(counts_file)
    emission_counts = hmm.emission_counts
    ngram_counts = hmm.ngram_counts
    entity_tags = hmm.all_states

    # total count of each word across all tags
    trained_words = defaultdict(int)
    infrequent_words = defaultdict(int)
    for word, tag in emission_counts:
        trained_words[word] += hmm.emission_counts[(word, tag)]

    # words seen fewer than 5 times are treated as infrequent
    for word in trained_words:
        if trained_words[word] < 5:
            infrequent_words[word] = 1

    for word in infrequent_words:
    if len(sys.argv) != 3:  # expect exactly two arguments: the counts file and the test file
        usage()
        sys.exit(2)

    try:
        counts_file = open(sys.argv[1], "r")
        test_file = open(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % sys.argv[1])
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read in counts
    counter.read_counts(counts_file)

    # Iterate over all test sentences
    test_sent_iterator = sent_iterator(word_iterator(test_file))
    for sentence in test_sent_iterator:
        # Viterbi algorithm
        n = len(sentence)
        pad_sent = 2 * ["*"]
        pad_sent.extend(sentence)
        pad_sent.append("STOP")

        # Initialize
        # K[0], K[-1] = "*", K[1...n] = all_states
        K = ["*"] + n * [counter.all_states] + ["*"]
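        # The dynamic program itself is truncated above; a minimal sketch
        # of how the recurrence might continue, assuming transition and
        # emission helpers q(v, w, u) and e(x, v) that return nonzero
        # probabilities (both helper names are assumptions, not the
        # original code). pi[(k, u, v)] is the max log-probability of any
        # tag sequence for words 1..k ending in the tag bigram (u, v);
        # bp stores the argmax tag w for backtracking.
        pi = {(0, "*", "*"): 0.0}
        bp = {}
        for k in range(1, n + 1):
            for u in K[k - 1]:
                for v in K[k]:
                    best, arg = None, None
                    for w in K[k - 2]:
                        if (k - 1, w, u) not in pi:
                            continue
                        score = (pi[(k - 1, w, u)]
                                 + math.log(q(v, w, u))              # q(v | w, u)
                                 + math.log(e(pad_sent[k + 1], v)))  # e(x_k | v)
                        if best is None or score > best:
                            best, arg = score, w
                    if best is not None:
                        pi[(k, u, v)] = best
                        bp[(k, u, v)] = arg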
if __name__ == "__main__": if len(sys.argv) < 4: # Expects atleast 3 arguments usage() sys.exit(2) try: input = file(sys.argv[1], "r") except IOError: sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % arg) sys.exit(1) # Initialize a trigram counter counter = Hmm(3) if (len(sys.argv) == 4): #to obtain original counts (em_count1, ngram_count1, infreq_word1, all_tags1, all_words1) = counter.read_counts(sys.argv[3]) #to process new data (em_count, ngram_count, infreq_word, all_tags, all_words) = counter.read_counts(sys.argv[1]) #to obtain emission prob emission_probabilities = emission_parameters(sys.argv[2], em_count, ngram_count[0], all_tags, all_words1, infreq_word1) else: #to process new data (em_count, ngram_count, infreq_word, all_tags, all_words) = counter.read_counts(sys.argv[1]) #to obtain trigram prob from samplefile trigram(ngram_count, sys.argv[4])