def test_get_top_verb_tags():
    """Verify the three most frequent words tagged VERB in the training file."""
    expected = [('is', 1738), ('was', 808), ('have', 748)]
    counts_by_tag = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = counts_by_tag["VERB"].most_common(3)
    eq_(expected, actual, msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))
def test_get_top_noun_tags():
    """Verify the three most frequent words tagged NOUN in the training file."""
    expected = [('time', 385), ('people', 233), ('way', 187)]
    counts_by_tag = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = counts_by_tag["NOUN"].most_common(3)
    eq_(expected, actual, msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))
def compute_HMM_weights(trainfile, smoothing):
    """Compute all weights for the HMM.

    Combines log-probability transition weights with naive-Bayes emission
    weights into a single weight dict keyed by (tag, tag, TRANS) and
    (tag, word, EMIT) features.

    :param trainfile: training file
    :param smoothing: float for smoothing of both probability distributions
    :returns: defaultdict of weights, list of all possible tags (types)
    :rtype: defaultdict, list
    """
    tag_trans_counts = most_common.get_tag_trans_counts(trainfile)
    tag_word_counts = most_common.get_tag_word_counts(trainfile)
    all_tags = tag_trans_counts.keys()

    # Smoothed transition weights.
    weights = compute_transition_weights(tag_trans_counts, smoothing)

    # FIX: the hint "set weights for illegal transitions to -np.inf" was not
    # implemented.  Forbid any transition out of END_TAG and any transition
    # into START_TAG (keys are (curr_tag, prev_tag, TRANS), matching the
    # sibling implementations in this file).
    for tag in all_tags:
        weights[(tag, END_TAG, TRANS)] = -np.inf
        weights[(START_TAG, tag, TRANS)] = -np.inf
    weights[(END_TAG, END_TAG, TRANS)] = -np.inf
    weights[(START_TAG, START_TAG, TRANS)] = -np.inf
    weights[(START_TAG, END_TAG, TRANS)] = -np.inf
    weights[(END_TAG, START_TAG, TRANS)] = -np.inf

    # Emission weights from the naive-Bayes tagger, re-keyed as EMIT features.
    # FIX: skip OFFSET features — they are bias terms, not (tag, word)
    # emissions, and the other implementations in this file filter them.
    emission_weights = naive_bayes.estimate_nb_tagger(tag_word_counts, smoothing)
    tagged_emissions = {}
    for key in emission_weights:
        if key[1] != OFFSET:
            tagged_emissions[(key[0], key[1], EMIT)] = emission_weights[key]

    weights.update(tagged_emissions)
    return weights, all_tags
def compute_HMM_weights(trainfile, smoothing):
    """Compute all weights for the HMM.

    Builds transition weights from tag-transition counts, marks illegal
    transitions with -np.inf, and merges in emission weights from the
    naive-Bayes tagger as (tag, word, EMIT) features.

    :param trainfile: training file
    :param smoothing: float for smoothing of both probability distributions
    :returns: defaultdict of weights, list of all possible tags (types)
    :rtype: defaultdict, list
    """
    tag_trans_counts = most_common.get_tag_trans_counts(trainfile)
    all_tags = tag_trans_counts.keys()

    weights = compute_transition_weights(tag_trans_counts, smoothing)

    # Forbid transitions out of END_TAG and into START_TAG.
    # (Cleanup: the constant START/END assignments were previously repeated
    # on every loop iteration; hoisted out of the loop.)
    for prev_tag in all_tags:
        weights[(prev_tag, END_TAG, TRANS)] = -np.inf
        weights[(START_TAG, prev_tag, TRANS)] = -np.inf
    weights[(END_TAG, END_TAG, TRANS)] = -np.inf
    weights[(START_TAG, START_TAG, TRANS)] = -np.inf
    weights[(START_TAG, END_TAG, TRANS)] = -np.inf
    weights[(END_TAG, START_TAG, TRANS)] = -np.inf

    # Emission weights from the naive-Bayes tagger; skip OFFSET bias
    # features, which are not (tag, word) emissions.
    # (Cleanup: removed unused `counter_items = np.array(tag_word_counts.items())`,
    # which did no work here and fails on Python 3 dict views, plus
    # commented-out debug prints.)
    tag_word_counts = most_common.get_tag_word_counts(trainfile)
    nb_tagger = naive_bayes.estimate_nb_tagger(tag_word_counts, smoothing)
    emission_weights = defaultdict(float)
    for key in nb_tagger:
        if key[0] != OFFSET and key[1] != OFFSET:
            emission_weights[(key[0], key[1], EMIT)] = nb_tagger[key]

    # Merge: transition weights take precedence over emission weights,
    # matching the original update order.
    combined = defaultdict(float)
    combined.update(emission_weights)
    combined.update(weights)
    return combined, all_tags
def setup():
    """Module-level test fixture: build per-tag word counters, naive-Bayes
    weights, the vocabulary, and the corrected tagger weights as globals."""
    global counters, theta_nb, vocab, theta_nb_fixed, sorted_tags
    counters = most_common.get_tag_word_counts(constants.TRAIN_FILE)
    sorted_tags = sorted(counters.keys())
    theta_nb = naive_bayes.estimate_nb([counters[tag] for tag in sorted_tags],
                                       sorted_tags, .01)
    # FIX: compare with `!=` instead of `is not` — identity comparison on
    # strings depends on interning and is fragile.
    vocab = set(word for tag, word in theta_nb.keys()
                if word != constants.OFFSET)
    theta_nb_fixed = naive_bayes.estimate_nb_tagger(counters, .01)
def setup():
    """Module-level test fixture: build per-tag word counters, naive-Bayes
    weights, the vocabulary, and the corrected tagger weights as globals."""
    global counters, theta_nb, vocab, theta_nb_fixed, sorted_tags
    counters = most_common.get_tag_word_counts(constants.TRAIN_FILE)
    sorted_tags = sorted(counters.keys())
    theta_nb = naive_bayes.estimate_nb([counters[tag] for tag in sorted_tags],
                                       sorted_tags, .01)
    # FIX: compare with `!=` instead of `is not` — identity comparison on
    # strings depends on interning and is fragile.
    vocab = set(word for tag, word in theta_nb.keys()
                if word != constants.OFFSET)
    theta_nb_fixed = naive_bayes.estimate_nb_tagger(counters, .01)
def compute_HMM_weights(trainfile, smoothing):
    """Compute all weights for the HMM

    :param trainfile: training file
    :param smoothing: float for smoothing of both probability distributions
    :returns: defaultdict of weights, list of all possible tags (types)
    :rtype: defaultdict, list
    """
    trans_counts = most_common.get_tag_trans_counts(trainfile)
    tags = trans_counts.keys()

    # Smoothed transition weights, then zero out (in log space) the
    # illegal ones: nothing follows END_TAG, nothing precedes START_TAG.
    weights = compute_transition_weights(trans_counts, smoothing)
    for tag in tags:
        weights[(tag, END_TAG, TRANS)] = -np.inf
        weights[(START_TAG, tag, TRANS)] = -np.inf
    weights[(END_TAG, END_TAG, TRANS)] = -np.inf

    # Emission weights from the naive-Bayes tagger, re-keyed as EMIT
    # features; OFFSET bias terms are dropped.
    word_counts = most_common.get_tag_word_counts(trainfile)
    nb_weights = naive_bayes.estimate_nb_tagger(word_counts, smoothing)
    emissions = {(tag, word, EMIT): nb_weights[(tag, word)]
                 for tag, word in nb_weights
                 if word != OFFSET}
    weights.update(emissions)

    return weights, tags
def test_get_top_noun_tags():
    """Verify the three most frequent words tagged NOUN in the training file."""
    expected = [('people', 53), ('time', 48), ('world', 46)]
    counts_by_tag = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = counts_by_tag["NOUN"].most_common(3)
    eq_(expected, actual, msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))
def test_get_top_verb_tags():
    """Verify the three most frequent words tagged VERB in the training file."""
    expected = [('is', 335), ('was', 128), ('have', 110)]
    counts_by_tag = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = counts_by_tag["VERB"].most_common(3)
    eq_(expected, actual, msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))