Example #1
def save_transition_probs(input_file):
    """
	Computes and stores trigrams and their respective transition probabilities from an input file containing the trigrams
	"""

    # read counts file
    counter = Hmm(3)
    counter.read_counts(file('ner_rare.counts'))

    out_lines_list = []
    l = input_file.readline()
    while l:
        line = l.strip()
        if line:  # Nonempty line
            trigram = tuple(line.split())
            # get transition probability of trigram
            prob = compute_transition_prob(
                counter.ngram_counts[1][(trigram[0], trigram[1])],
                counter.ngram_counts[2][trigram])
            # get log probability
            log_prob = math.log(prob)
            l = line + " " + str(log_prob)
        else:
            l = ""  # don't carry readline()'s trailing newline into the join

        out_lines_list.append(l)
        l = input_file.readline()
    out_lines = "\n".join(out_lines_list)

    # write trigrams and their log probs to file
    with open('5_1.txt', 'w') as out_file:
        out_file.write(out_lines)
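compute_transition_prob is not shown in Example #1. A minimal sketch consistent with how it is called above (conditioning bigram count first, trigram count second), assuming the standard MLE estimate:

def compute_transition_prob(bigram_count, trigram_count):
    # MLE estimate: q(y_i | y_{i-2}, y_{i-1}) = Count(y_{i-2}, y_{i-1}, y_i) / Count(y_{i-2}, y_{i-1})
    return trigram_count / float(bigram_count)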
Example #2
class Tagger:

    def __init__(self, common_file, counts_file):
        self.common_words = get_common_words(common_file)
        
        self.hmm = Hmm(3)
        self.hmm.read_counts(counts_file)
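get_common_words is likewise not shown. A plausible sketch (an assumption; the 5-occurrence cutoff matches the rare-word threshold used throughout these examples):

from collections import defaultdict

def get_common_words(common_file, threshold=5):
    # count word occurrences in a "word tag" training file
    counts = defaultdict(int)
    for line in common_file:
        parts = line.split()
        if parts:
            counts[parts[0]] += 1
    return set(word for word, c in counts.items() if c >= threshold)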
Example #3
def replace_rare(raw_data_file, raw_count_file, output_file, rare_counts=5):
    # read in the raw counts from hmm
    fp = open(raw_count_file, 'r')
    hmm = Hmm(3)
    hmm.read_counts(fp)
    fp.close()

    # accumulate the word counts from emission_counts
    word_count = defaultdict(int)
    for word_tag in hmm.emission_counts:
        word_count[word_tag[0]] += hmm.emission_counts[word_tag]
    rare_words = set([word for word in word_count if word_count[word] < rare_counts])
    #print rare_words

    # replace rare words with _RARE_
    input = open(raw_data_file, 'r')
    output = open(output_file, 'w')
    for line in input:
        line = line.strip()
        if line:
            word, tag = line.split(" ")
            if word in rare_words:
                word_class = get_word_class(word)
                output.write(" ".join([word_class, tag]))
                #output.write(" ".join(['_RARE_', tag]))
            else:
                output.write(line)
        output.write("\n")
    input.close()
    output.close()
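get_word_class is not defined in this example. A plausible sketch (an assumption), mirroring the informative word classes used in Example #12:

def get_word_class(word):
    # map a rare word to an informative pseudo-word class
    if word.isdigit():
        return '_DIGIT_'
    elif word.isupper():
        return '_UPPER_'
    elif not word.isalpha():
        return '_NOTALPHA_'
    else:
        return '_RARE_'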
Example #4
def gen_counts(input_path, output_path):
    if exists(output_path): return
    
    print 'Generating counts from: "%s"' % input_path
    counter = Hmm(3)
    counter.train(open(input_path, 'r'))
    counter.write_counts(open(output_path, 'w'))
Example #5
	def __init__(self, infile="ner_train.dat"):
		self.counter = Hmm(3)
		with open(infile) as f:
			self.counter.train(f)
		self.unigrams = {k[0]:v for k,v in self.counter.ngram_counts[0].iteritems()} #since the key is a one-word tuple
		self.bigrams = self.counter.ngram_counts[1]
		self.trigrams = self.counter.ngram_counts[2]
		self.words = [x[0] for x in self.counter.emission_counts.keys()]
Example #6
from count_freqs import Hmm

# NOTE: in the original snippet this wrapper class was itself named Hmm, which
# shadows count_freqs.Hmm and makes __init__ recurse; renamed here to fix that.
class HmmTagger(object):

    def __init__(self, counts_file="gene.counts"):
        self.hmm = Hmm(3)
        # read_counts expects an open file object, not a path
        with open(counts_file) as f:
            self.hmm.read_counts(f)

    def emission(self, x, y):
        # MLE emission estimate: e(x | y) = Count(y -> x) / Count(y)
        return self.hmm.emission_counts[(x, y)] / float(self.hmm.ngram_counts[0][(y,)])
Example #7
def problem4(count_file, dev_file):
    """Implement a simple named entity tagger and output predictions."""

    try:
        infile = file(count_file, "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % count_file)
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read counts
    counter.read_counts(infile)
    # Write the predictions
    counter.write_predicts(dev_file, sys.stdout)
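write_predicts is a custom extension of the course's Hmm class and is not shown. One plausible baseline (an assumption, not necessarily the author's method): tag each word with the state that maximizes the MLE emission probability.

def write_predicts(self, dev_path, output):
    for line in open(dev_path):
        word = line.strip()
        if not word:
            output.write("\n")  # preserve sentence boundaries
            continue
        # argmax over tags of e(word | tag) = Count(tag -> word) / Count(tag)
        tag = max(self.all_states,
                  key=lambda t: self.emission_counts[(word, t)] /
                                float(self.ngram_counts[0][(t,)]))
        output.write("%s %s\n" % (word, tag))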
Example #8
def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
	"""
	Implements a baseline tagger that uses only the emission probabilities to assign tags and stores in a file.
	"""

	# get frequently occurring words
	word_count_dict = get_word_counts(file('ner_train.dat'))
	freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]

	# compute emission probs
	counter = Hmm(3)
	counter.read_counts(counts_file)
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	out_lines_list = []
	l = dev_file.readline()
	while l:
		word = l.strip()
		if word:  # Nonempty line
			# use emission probabilities of rare_symbol to assign tag and its probability for rare or unseen words.
			if word not in freq_words:
				tag = sorted(emission_probs[rare_symbol], key=emission_probs[rare_symbol].get, reverse=True)[0]
				prob = emission_probs[rare_symbol][tag]

			# use emission probabilities of the word itself for frequently occurring words.
			else:
				tag = sorted(emission_probs[word], key=emission_probs[word].get, reverse=True)[0]
				prob = emission_probs[word][tag]
			log_prob = math.log(prob, 2)
			l = word + " " + tag + " " + str(log_prob)
		else:
			l = ""
		out_lines_list.append(l)
		l = dev_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_lines = out_lines + "\n"

	# write words, corresponding tags and log probs to file
	with open('4_2.txt','w') as out_file:
		out_file.write(out_lines)
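compute_emission_probs (used here and again in Examples #17 and #26) is not shown. A minimal sketch consistent with how its result is indexed (emission_probs[word][tag]), assuming the second argument holds the unigram tag counts:

from collections import defaultdict

def compute_emission_probs(emission_counts, unigram_counts):
    probs = defaultdict(dict)
    for (word, tag), count in emission_counts.items():
        # e(word | tag) = Count(tag -> word) / Count(tag)
        probs[word][tag] = count / float(unigram_counts[(tag,)])
    return probs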
Example #9
def gen_counts(input_path, output_path):
    if exists(output_path): return

    print 'Generating counts from: "%s"' % input_path
    counter = Hmm(3)
    counter.train(open(input_path, 'r'))
    counter.write_counts(open(output_path, 'w'))
Example #10
#! /usr/bin/python
#
__author__="Xiaochen Wei <*****@*****.**>"
__date__ ="$Sep 20, 2014"

from dataClean import *
from count_freqs import Hmm
import math

# the counts file computed from the training data
trainingDataFilePath = "ner.counts"
hmm = Hmm(3)
inputFile = open(trainingDataFilePath, "r")
hmm.read_counts(inputFile)

class SimpleNamedEntityTagger:


	'''
	Get the emission parameter e(target | targetType).

	INPUT: the target word and its tag (targetType)
	RETURN: the emission probability of the target given targetType
	'''
	def GetEmissionParameters(self, target, targetType):

		sumCount = 0
		count = 0
		
		if target not in [key[0] for key in hmm.emission_counts.keys()]:
Example #11
import sys
from collections import defaultdict
import math
from count_freqs import Hmm

"""
Implement the Viterbi algorithm to compute
argmax (y1...yn) p(x1...xn, y1...yn)
Your tagger should have the same basic functionality as the baseline tagger.
Instead of emission probabilities the third column should contain the log-probability
of the tagged sequence up to this word.
"""

if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts file and dev file
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % sys.argv[1])
        sys.exit(1)

    counter = Hmm(3)
    # Read counts
    counter.read_counts(counts_file)

    counter.viterbi_read(sys.argv[2])
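viterbi_read is another custom extension and is not shown. For reference, a self-contained sketch of the trigram Viterbi recurrence such a tagger implements, pi(k, u, v) = max over w of pi(k-1, w, u) * q(v | w, u) * e(x_k | v); the q(v, w, u) and e(x, v) helpers are assumed:

def viterbi(sentence, tags, q, e):
    # sentence: list of words; tags: list of tag names
    # q(v, w, u): transition prob q(v | w, u); e(x, v): emission prob e(x | v)
    n = len(sentence)
    pi = {(0, "*", "*"): 1.0}
    bp = {}
    K = lambda k: ["*"] if k <= 0 else tags
    for k in range(1, n + 1):
        for u in K(k - 1):
            for v in K(k):
                best_w, best_p = "O", 0.0  # default to O, as Example #13 does
                for w in K(k - 2):
                    p = pi.get((k - 1, w, u), 0.0) * q(v, w, u) * e(sentence[k - 1], v)
                    if p > best_p:
                        best_w, best_p = w, p
                pi[(k, u, v)], bp[(k, u, v)] = best_p, best_w
    # select the best final tag pair using the STOP transition
    u, v = max(((a, b) for a in K(n - 1) for b in K(n)),
               key=lambda ab: pi[(n, ab[0], ab[1])] * q("STOP", ab[0], ab[1]))
    seq = [None] * (n + 1)
    seq[n - 1], seq[n] = u, v
    for k in range(n - 2, 0, -1):
        seq[k] = bp[(k + 2, seq[k + 1], seq[k + 2])]
    return seq[1:]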
Example #12
            if word in infreq_words:
                if word.isupper():
                    f2.write("_UPPER_" + " " + parts[1] + "\n")
                elif word.isdigit():
                    f2.write("_DIGIT_" + " " + parts[1] + "\n")
                elif not word.isalpha():
                    f2.write("_NOTALPHA_" + " " + parts[1] + "\n")
                else:
                    f2.write("_RARE_" + " " + parts[1] + "\n")
            else:
                f2.write(line)
        f2.close()


def usage():
    print """
    python add_class.py [count_file] [training_data] 
    """

if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expects two arguments: the original count file and the training data file
        usage()
        sys.exit(2)

    counter = Hmm(3)
    # finds count information for words in file
    (em_count, ngram_count, infreq_word_set, all_tags, all_words) = counter.read_counts(sys.argv[1])
    #produces new file with _RARE_
    replace_class(sys.argv[2], infreq_word_set)      
Example #13
class Tagger(object):
	def __init__(self, infile="ner_train.dat"):
		self.counter = Hmm(3)
		with open(infile) as f:
			self.counter.train(f)
		self.unigrams = {k[0]:v for k,v in self.counter.ngram_counts[0].iteritems()} #since the key is a one-word tuple
		self.bigrams = self.counter.ngram_counts[1]
		self.trigrams = self.counter.ngram_counts[2]
		self.words = [x[0] for x in self.counter.emission_counts.keys()]
	"""
	conditional probability that the word maps to tag given the number of times the tag occurs
	"""
	def compute_emission(self, word, tag):
		em = self.counter.emission_counts
		if tag == '*':
			return 0
		if (word,tag) in em:
			return em[(word,tag)]/float(self.unigrams[tag])
		elif word in self.words:
			return 0
		else:
			return em[('_RARE_',tag)]/float(self.unigrams[tag])
	"""
	returns the trigram count over the bigram count, defaulting the dict gets so that there aren't division by 0 errors
	"""
	def compute_trigram(self,yi,y1,y2):
		return self.trigrams.get((y2,y1,yi),0)/float(self.bigrams.get((y2,y1),1))

	"""
	basic file replacement, writes to a new file called rare-{infile} where infile is provided. Can pass a threshold of how many common_words
	is considered "rare"
	"""
	def replace_rare(self,infile,threshold=5):
		wordcounts = defaultdict(int)
		for tup in self.counter.emission_counts.iteritems():
			wordcounts[tup[0][0]] += tup[1] # aggregates counts of words total, with any tag
		common_words = [k for k,v in wordcounts.iteritems() if v >= threshold]
		replaced = 0
		f = open(infile)
		f2 = open(infile.replace('.dat','-rare.dat'), 'w')
		for line in f:
			if len(line.split(' ')) == 2:
				if line.split(' ')[0] not in common_words: # closed set, there are more rare than not rare, we know it's one or the other
					f2.write(line.replace(line.split(' ')[0], '_RARE_', 1))
					replaced +=1
				else:
					f2.write(line)
			else:
				f2.write(line) # maintain stops
		f.close()
		f2.close()
	"""
	returns a dictionary of relative probabilities for emission counts
	"""
	def tag_probabilities(self,word):
		counts = {tag:self.compute_emission(word,tag) for tag in self.unigrams}
		prob = lambda v: v/sum(counts.values()) if sum(counts.values()) != 0 else 0
		return {k:prob(v) for k,v in counts.iteritems()}
	"""
	wrapper function for dynamic programming algorithm, writes to outfile
	"""
	def viterbi(self,infile,outfile):
		
		def write_to_pred_file(f,sentence):
			tag_seq = [" ".join(x) for x in self.tag_sequence(sentence)] #tuples of tag,probability
			for word,tag in itertools.izip(sentence,tag_seq): # word, tag, probability
				f.write('%s %s\n' % (word,tag))
			f.write('\n')

		with open(infile) as f, open(outfile,"w") as f2:
			sentence = []
			for line in f:
				if line == '\n':
					write_to_pred_file(f2,sentence)
					sentence = []
					continue
				else:
					sentence.append(line.strip())
			# write the last sentence (reached when the file doesn't end in a blank line); writes only a blank line if sentence is empty
			write_to_pred_file(f2,sentence) 


	def tag_sequence(self,sentence):
		if len(sentence) == 0:
			return []
		possible_tags = self.unigrams.keys()
		possible_tags.append('*')
		bp = {i:{} for i in range(len(sentence) + 1)}
		# initialization: pi(0,'*','*') = 1, pi(0,u,v) = 0
		bp[0] = {t:('O',0) for t in itertools.product(possible_tags,repeat=2)}
		bp[0][('*','*')] = (1.0,1.0)
		# at idx 1, u can only be *
		for v in possible_tags:
			tag_max = ('sentinel',-1) #a real probability (since logs are only computed at end) will never be negative, so this will be reset
			tags = {}
			for w in possible_tags:
				tags[w] = bp[0][(w,'*')][1]*self.compute_trigram(v,w,'*')*self.compute_emission(sentence[0],v)
				if tags[w] > tag_max[1] and tags[w] != 0:
					tag_max = (w,tags[w])
			bp[1][('*',v)] = tag_max if tag_max != ('sentinel',-1) else ('O',0) #default tag is no tag, so O -- no sequences with this u,v with a nonzero probability

		for i,word in enumerate(sentence[1:], start=2): #from 2...n
			for v,u in itertools.product(possible_tags,repeat=2): #same as nested for u in K, v in K
				tag_max = ('sentinel', -1)
				tags = {}
				for w in possible_tags:
					if (w,u) in bp[i-1]:
						tags[w] = bp[i-1][(w,u)][1]*self.compute_trigram(v,u,w)*self.compute_emission(word,v)
						if tags[w] > tag_max[1] and tags[w] != 0:
							tag_max = (w,tags[w])
				bp[i][(u,v)] = tag_max if tag_max != ('sentinel',-1) else ('O',0)

		n = len(sentence)
		last = {(u,v): bp[n][(u,v)][1]*self.compute_trigram('STOP',v,u) for u,v in bp[n].keys()}
		yn1,yn = max(last, key=last.get) # max probability for sequence ending in STOP
		conf = last[(yn1,yn)]
		seq = [(yn,str(ln(conf))), (yn1,str(ln(conf)))] #sequence will be yn...y0

		for i in xrange(len(sentence) - 2, 0, -1):
			u,v = tuple(x[0] for x in reversed(seq[-2:])) #previous two are yn-1, yn-2
			prev = bp[i+2][(u,v)]
			seq.append((prev[0], str(ln(prev[1]))))
		return reversed(seq) #reversed yn...y0 is y0...yn
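A hypothetical driver for this Tagger (file names are assumptions based on the defaults above). Note the counter must be retrained on the rare-replaced file so the ('_RARE_', tag) emission counts that compute_emission falls back on actually exist:

t = Tagger("ner_train.dat")
t.replace_rare("ner_train.dat", threshold=5)  # writes ner_train-rare.dat
t = Tagger("ner_train-rare.dat")              # retrain so ('_RARE_', tag) counts exist
t.viterbi("ner_dev.dat", "ner_dev.out")       # hypothetical output file name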
Example #14
        sys.stdout.write(line)


if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts and corresponding training data file
        usage()
        sys.exit(2)

    try:
        input = file(sys.argv[1], "r")
        output = sys.argv[2]
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % sys.argv[1])
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read in counts
    counter.read_counts(input)

    # Filter words with count < 5
    low_words = dict(
        (k, v) for k, v in counter.word_counts.iteritems() if v < 5)
    high_words = dict(
        (k, v) for k, v in counter.word_counts.iteritems() if v >= 5)

    # Replace each instance of word in low_words with _RARE_ in training set
    replace_all(output, low_words, '_RARE_')
Example #15
if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts file and the test data file
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
        test_file = file(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s or %s.\n" % (sys.argv[1], sys.argv[2]))
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read in counts
    counter.read_counts(counts_file)

    # Iterate through words in test data and calculate the log probability of each tag.
    for line in test_file:
        word = line.strip()

        if word:  # Nonempty line
            original_word = word
            # Check if word is absent in training set, if so, use _RARE_
            if word not in counter.all_words or counter.word_counts[word] < 5:
                word = "_RARE_"

            # Initialize dict to hold emission values
            candidates = defaultdict(float)
Example #16
if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts file and the test data file
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
        test_file = file(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s or %s.\n" % (sys.argv[1], sys.argv[2]))
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)

    # Read in counts
    counter.read_counts(counts_file)

    # Iterate over all test sentences
    test_sent_iterator = sent_iterator(word_iterator(test_file))
    for sentence in test_sent_iterator:
        # Viterbi Algorithm
        n = len(sentence)

        pad_sent = 2 * ["*"]
        pad_sent.extend(sentence)
        pad_sent.append("STOP")

        # Initialize
Example #17
    out_lines = "\n".join(out_lines_list)
    out_lines = out_lines + "\n"

    # write to file
    with open('5_2.txt', 'w') as out_file:
        out_file.write(out_lines)


if __name__ == "__main__":
    os.system('python 4_1.py')
    os.system('python count_freqs.py ner_train_rare.dat > ner_rare.counts')

    # get frequent words
    word_count_dict = get_word_counts(file('ner_train.dat'))
    freq_words = [
        word for word in word_count_dict if word_count_dict[word] >= 5
    ]

    # get transition and emission probs
    counter = Hmm(3)
    counter.read_counts(file('ner_rare.counts'))
    transition_probs = compute_transition_probs(counter.ngram_counts[1],
                                                counter.ngram_counts[2])
    emission_probs = compute_emission_probs(counter.emission_counts,
                                            counter.ngram_counts[0])

    # store tagged data with the log probs to file
    tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words)

    os.system('python eval_ne_tagger.py ner_dev.key 5_2.txt')
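compute_transition_probs, used here and in Example #26, is not shown either. A sketch consistent with its call signature (bigram counts, then trigram counts), mapping each observed trigram to its MLE transition probability:

def compute_transition_probs(bigram_counts, trigram_counts):
    # q(v | w, u) = Count(w, u, v) / Count(w, u) for every observed trigram (w, u, v)
    return dict((trigram, count / float(bigram_counts[trigram[:2]]))
                for trigram, count in trigram_counts.items())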
Example #18
    def __read_counts(self, count_file):
        fp = open(count_file, 'r')
        hmm = Hmm(3)
        hmm.read_counts(fp)
        fp.close()
        return hmm
Example #19
from collections import defaultdict
from count_freqs import Hmm
import math
import sys

def emission_probability(word, tag, emission_counts, ngram_counts):
  # float() guards against integer division under Python 2 if the counts were built as ints
  return emission_counts[(word, tag)] / float(ngram_counts[0][(tag,)])

if __name__ == "__main__":
  counts_file = open(sys.argv[1])
  sentences_file = open(sys.argv[2])
  
  hmm = Hmm()
  hmm.read_counts(counts_file)
  
  emission_counts = hmm.emission_counts
  ngram_counts = hmm.ngram_counts
  
  entity_tags = hmm.all_states
  trained_words = defaultdict(int)
  infrequent_words = defaultdict(int)
  
  for word, tag in emission_counts:
    trained_words[word] += hmm.emission_counts[(word, tag)]
  
  for word in trained_words:
    if trained_words[word] < 5:
      infrequent_words[word] = 1
  
  for word in infrequent_words:
Example #20
#!/usr/bin/python

import sys
from count_freqs import Hmm

countInput = file(sys.argv[1],"r")
hmm = Hmm(3)
hmm.read_counts(countInput)

for tag in hmm.all_states:
    hmm.emission_counts[("_RARE_",tag)]=0
    hmm.emission_counts[("_Numeric_",tag)]=0
    hmm.emission_counts[("_AllCapitals_",tag)]=0
    hmm.emission_counts[("_LastCapital_",tag)]=0

for key,value in hmm.emission_counts.items():
    #print value
    if key[0] == "_RARE_":
        continue
    if value < 5:
        if key[0].isdigit():
            hmm.emission_counts[("_Numeric_",key[1])] += value
            #print "%s delete %i to Numeric %i" %(key,value,hmm.emission_counts[("_Numeric_",key[1])])
        elif key[0].isalpha() and key[0].isupper():
            hmm.emission_counts[("_AllCapitals_",key[1])] += value
            #print "%s delete %i to Captital %i" %(key,value,hmm.emission_counts[("_AllCapitals_",key[1])])
        elif key[0].isalpha() and key[0][-1].isupper():
        #elif key[0][-1].isupper():
            hmm.emission_counts[("_LastCapital_",key[1])] += value
            #print "%s delete %i to LastCaptital %i" %(key,value,hmm.emission_counts[("_LastCapital_",key[1])])
        else:
            # remaining low-count words fall back to the generic _RARE_ class
            hmm.emission_counts[("_RARE_",key[1])] += value
Example #21
	else:
		return "I-GENE"

def get_rare_words(d):	
	temp_d = d.copy()
	O_words, GENE_words = set(), set()

	for key, value in d.iteritems():
		if value < 5 and get_max_value(temp_d, key[0]) < 5:
			if get_most_tag(temp_d, key[0]) == "O":
				O_words.add(key[0])
			else:
				GENE_words.add(key[0])

	return (O_words, GENE_words)

if __name__ == "__main__":
	counter = Hmm(3)
	counter.train(file("data/gene.train","r"), RARE=False)
	# print counter.emission_counts
	rare_words = get_rare_words(counter.emission_counts)
	# print len(rare_words[0]) #O_words = 19034
	# print len(rare_words[1]) #GENE_words = 6231

	with open("data/rare_words.pickle", "wb") as f:
		pickle.dump(rare_words, f)
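get_max_value and get_most_tag are not shown in Example #21. Plausible sketches (assumptions), keeping the example's Python 2 idioms:

def get_max_value(emission_counts, word):
    # highest count of `word` under any single tag
    return max(v for (w, t), v in emission_counts.iteritems() if w == word)

def get_most_tag(emission_counts, word):
    # tag under which `word` is observed most often
    return max(((w, t) for (w, t) in emission_counts if w == word),
               key=emission_counts.get)[1]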




Example #22
        for line in f.readlines():
            word = line[:-1]
            if len(word) != 0:
                if word in keys:
                    tag = viterbi(d[0], d[1], word)
                else:
                    tag = viterbi(d[0], d[1], classify(word))
                output.write("%s %s\n" % (word, tag))
                d.append(tag)
            else:
                output.write("\n")
                d = deque(["*", "*"], maxlen=2)


if __name__ == "__main__":
    counter = Hmm(3)
    counter.read_counts(file("outputs/p3_count.txt", "r"))
    bigram_counts = counter.ngram_counts[1]
    trigram_counts = counter.ngram_counts[2]

    keys = set()
    for k in counter.emission_counts.keys():
        keys.add(k[0])

    # FOR THE DEVELOPMENT FILE
    write_tags("data/gene.dev", keys, file("outputs/gene_dev.p3.out", "w"))
    """
	TO EVALUATE, RUN:
		>>> python eval_gene_tagger.py data/gene.key outputs/gene_dev.p3.out
	AND THE OUTPUT WILL BE:
		Found 404 GENEs. Expected 642 GENEs; Correct: 214.
Example #23
                max_prob = math.log(max_prob, 2)
            output_file.write("%s %s %f\n" % (word, max_tag, max_prob))


def usage():
    print """
    python simple_tagger.py [counts_file] [test_file] > [output_file]
    """


if __name__ == "__main__":
    if len(sys.argv) != 3:
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
        test_file = file(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s or %s.\n" % (sys.argv[1], sys.argv[2]))
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read counts
    counter.read_counts(counts_file)
    # Initialize a simple tagger
    tagger = SimpleTagger(counter)
    # Tag the data
    tagger.tag(test_file, sys.stdout)
Example #24
    def __init__(self, counts_file="gene.counts"):
        self.hmm = Hmm(3)
        # read_counts expects an open file object, not a path
        with open(counts_file) as f:
            self.hmm.read_counts(f)
Example #25
import sys
import operator
from collections import defaultdict
from count_freqs import Hmm

def p2_1emission (word,tag,hmm,countTag):

    #print "p2_1 " + word + " " + tag + " %i" %hmm.emission_counts[(word,tag)]
    if (word,tag) in hmm.emission_counts:
        return hmm.emission_counts[(word,tag)]/countTag[tag]
    else:
        return 0

if __name__ == "__main__":
    input = file(sys.argv[1],"r")
    model = Hmm(3)
    #print len(model.emission_counts)
    model.read_counts(input)
    #print len(model.emission_counts)
    #if ("BACKGROUND","O") in model.emission_counts:
        #print "yes"
    #print model.all_states 
    testFile = file(sys.argv[2],"r")

    tagsNum = len(model.all_states)
    countTag = dict.fromkeys(model.all_states,0)
    #print countTag

    for (word,tag) in model.emission_counts:
        countTag[tag] += model.emission_counts[(word,tag)]
    #print countTag
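The example is cut off here; a speculative continuation (an assumption, not the author's code) would tag each test word with the argmax-emission state:

for line in testFile:
    word = line.strip()
    if word:
        best_tag = max(model.all_states,
                       key=lambda t: p2_1emission(word, t, model, countTag))
        print "%s %s" % (word, best_tag)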
Example #26
			word = " ".join(fields[:-1])

			# replace word with its category if frequency < count_thresh
			if word_count_dict[word] < count_thresh:
				line = " ".join([get_category(word), fields[-1]])
		out_lines_list.append(line)
		l = in_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_file.write(out_lines)

if __name__ == "__main__":
	# replace infrequent words with categories and write to file
	replace_infrequent_words_with_categories(file('ner_train.dat'), file('ner_train_cats.dat', 'w'))

	# generate counts file
	os.system('python count_freqs.py ner_train_cats.dat > ner_cats.counts')

	# get frequent words
	word_count_dict = get_word_counts(file('ner_train.dat'))
	freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]

	# get transition and emission probabilities
	counter = Hmm(3)
	counter.read_counts(file('ner_cats.counts'))
	transition_probs = compute_transition_probs(counter.ngram_counts[1], counter.ngram_counts[2])
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	# store tagged data with the log probs to file
	tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words)

	os.system('python eval_ne_tagger.py ner_dev.key 6.txt')
Example #27
from collections import defaultdict
from count_freqs import Hmm
import math
import sys


def emission_probability(word, tag, emission_counts, ngram_counts):
    # float() guards against integer division under Python 2 if the counts were built as ints
    return emission_counts[(word, tag)] / float(ngram_counts[0][(tag, )])


if __name__ == "__main__":
    counts_file = open(sys.argv[1])
    sentences_file = open(sys.argv[2])

    hmm = Hmm()
    hmm.read_counts(counts_file)

    emission_counts = hmm.emission_counts
    ngram_counts = hmm.ngram_counts

    entity_tags = hmm.all_states
    trained_words = defaultdict(int)
    infrequent_words = defaultdict(int)

    for word, tag in emission_counts:
        trained_words[word] += hmm.emission_counts[(word, tag)]

    for word in trained_words:
        if trained_words[word] < 5:
            infrequent_words[word] = 1
Example #28
    with open(file, "r") as f:
        f2 = open("ner_train_rare.dat", "w")
        for line in f:
            parts = line.strip().split(" ")
            word = parts[0]
            if word in infreq_words:
                f2.write("_RARE_" + " " + parts[1] + "\n")
            else:
                f2.write(line)
        f2.close()


def usage():
    print """
    python add_rare.py [count_file] [training_data] 
    """

if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expects two arguments: the original count file and the training data file
        usage()
        sys.exit(2)

    counter = Hmm(3)
    # finds count information for words in file
    (em_count, ngram_count, infreq_word_set, all_tags, all_words) = counter.read_counts(sys.argv[1])

    #produces new file with _RARE_
    replace_rare(sys.argv[2], infreq_word_set)      

Example #29
	"""


if __name__ == "__main__":
    if len(sys.argv) < 4:  # Expects at least 3 arguments
        usage()
        sys.exit(2)
    try:
        input = file(sys.argv[1], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % sys.argv[1])
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    if (len(sys.argv) == 4):
        #to obtain original counts
        (em_count1, ngram_count1, infreq_word1, all_tags1,
         all_words1) = counter.read_counts(sys.argv[3])
        #to process new data
        (em_count, ngram_count, infreq_word, all_tags,
         all_words) = counter.read_counts(sys.argv[1])
        #to obtain emission prob
        emission_probabilities = emission_parameters(sys.argv[2], em_count,
                                                     ngram_count[0], all_tags,
                                                     all_words1, infreq_word1)
    else:
        #to process new data
        (em_count, ngram_count, infreq_word, all_tags,
         all_words) = counter.read_counts(sys.argv[1])
Example #30
    def train_ngram_and_emission_freq_from_corpus_file(self, corpus_file):
        counter = Hmm(3)
        counter.train(corpus_file)
        self.emission_counts = counter.emission_counts
        self.ngram_counts = counter.ngram_counts
Example #31
    output2.flush()
    
    
###################################################

###################################################


    print("\n2. Generate word count file.\n")
    
    freqs_input = open('gene.replace.train',"r")
    freqs_output = open('gene.counts', "w")

    
    # Initialize a trigram counter
    counter = Hmm(3)
    # Collect counts
    counter.train(freqs_input)
    # Write the counts
    counter.write_counts(freqs_output)
    
    freqs_output.flush()
    
    
    
###################################################

###################################################   

    
    print("\n3. Tag dev corpus with Viterbi tagger.\n")
Example #32
if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts file and the trigram file
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
        trigram_file = file(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s or %s.\n" % (sys.argv[1], sys.argv[2]))
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read in counts
    counter.read_counts(counts_file)

    # Iterate through trigrams in trigram_file and calculate the log probability of each trigram.
    for line in trigram_file:
        trigram = line.strip().split()

        if len(trigram) == 3:  # Nonempty line containing a full trigram
            prob = counter.calc_mle(trigram)

            # Get the log of the probability
            log_prob = math.log(prob)

            # Write log probability to output file
            sys.stdout.write("%s %s\n" % (line.strip(), log_prob))