Example #1
def test_get_top_verb_tags():
    expected = [('is', 1738), ('was', 808), ('have', 748)]
    tag_word_counts = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = tag_word_counts["VERB"].most_common(3)
    eq_(expected,
        actual,
        msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))
Example #2
def test_get_top_noun_tags():
    expected = [('time', 385), ('people', 233), ('way', 187)]
    tag_word_counts = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = tag_word_counts["NOUN"].most_common(3)
    eq_(expected,
        actual,
        msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))
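Examples 1 and 2 pin down the interface of most_common.get_tag_word_counts without showing its body: it must return a tag-indexed mapping of Counter objects, so that tag_word_counts["VERB"].most_common(3) works. Below is a minimal sketch of such a function, assuming a CoNLL-style training file with one word/tag pair per line and blank lines between sentences; the file format and the _sketch name are assumptions, not the actual implementation under test.

from collections import Counter, defaultdict

def get_tag_word_counts_sketch(trainfile):
    """Map each tag to a Counter over the words it labels."""
    tag_word_counts = defaultdict(Counter)
    with open(trainfile) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue  # blank line = sentence boundary (assumed format)
            word, tag = line.split()[:2]  # assumed "word tag" columns
            tag_word_counts[tag][word] += 1
    return tag_word_counts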
Example #3
def compute_HMM_weights(trainfile, smoothing):
    """Compute all weights for the HMM

    :param trainfile: training file
    :param smoothing: float for smoothing of both probability distributions
    :returns: defaultdict of weights, list of all possible tags (types)
    :rtype: defaultdict, list

    """
    # hint: these are your first two lines
    tag_trans_counts = most_common.get_tag_trans_counts(trainfile)
    tag_word_counts = most_common.get_tag_word_counts(trainfile)
    all_tags = list(tag_trans_counts.keys())

    # hint: call compute_transition_weights
    # hint: set weights for illegal transitions to -np.inf
    # hint: call get_tag_word_counts and estimate_nb_tagger
    # hint: Counter.update() combines two Counters

    # hint: return weights, all_tags
    trans_weights = compute_transition_weights(tag_trans_counts, smoothing)
    emission_weights = naive_bayes.estimate_nb_tagger(tag_word_counts,
                                                      smoothing)
    tagged_emission_weights = {}
    for key in emission_weights:
        # re-key (tag, word) emission features as (tag, word, EMIT)
        tagged_emission_weights[(key[0], key[1], EMIT)] = emission_weights[key]
    trans_weights.update(tagged_emission_weights)
    return trans_weights, all_tags
Example #4
def compute_HMM_weights(trainfile, smoothing):
    """Compute all weights for the HMM

    :param trainfile: training file
    :param smoothing: float for smoothing of both probability distributions
    :returns: defaultdict of weights, list of all possible tags (types)
    :rtype: defaultdict, list

    """
    # hint: these are your first two lines
    tag_trans_counts = most_common.get_tag_trans_counts(trainfile)
    all_tags = list(tag_trans_counts.keys())

    # hint: call compute_transition_weights
    weights = compute_transition_weights(tag_trans_counts, smoothing)

    # hint: set weights for illegal transitions to -np.inf
    # nothing may transition out of END_TAG or into START_TAG
    for tag in all_tags:
        weights[(tag, END_TAG, TRANS)] = -np.inf
        weights[(START_TAG, tag, TRANS)] = -np.inf
    weights[(END_TAG, END_TAG, TRANS)] = -np.inf
    weights[(START_TAG, START_TAG, TRANS)] = -np.inf
    weights[(START_TAG, END_TAG, TRANS)] = -np.inf
    weights[(END_TAG, START_TAG, TRANS)] = -np.inf

    # hint: call get_tag_word_counts and estimate_nb_tagger
    tag_word_counts = most_common.get_tag_word_counts(trainfile)
    nb_tagger = naive_bayes.estimate_nb_tagger(tag_word_counts, smoothing)

    # re-key emission features as (tag, word, EMIT), skipping offset features
    emission_weights = defaultdict(float)
    for key, value in nb_tagger.items():
        if key[0] != OFFSET and key[1] != OFFSET:
            emission_weights[(key[0], key[1], EMIT)] = value

    # hint: Counter.update() combines two Counters
    combined_weights = defaultdict(float)
    combined_weights.update(emission_weights)
    combined_weights.update(weights)

    # hint: return weights, all_tags
    return combined_weights, all_tags
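Every compute_HMM_weights variant in this listing delegates to a compute_transition_weights helper that is never shown. The sketch below reconstructs a plausible version from the call sites: keys are (tag, prev_tag, TRANS), as the -np.inf lines above imply, and values are add-smoothing log probabilities. The exact smoothing formula and the constants import path are assumptions.

from collections import defaultdict

import numpy as np

from constants import START_TAG, END_TAG, TRANS  # assumed import path

def compute_transition_weights_sketch(tag_trans_counts, smoothing):
    """Smoothed log p(tag | prev_tag), keyed (tag, prev_tag, TRANS)."""
    weights = defaultdict(float)
    # legal successors: every observed tag plus END_TAG, never START_TAG
    next_tags = (set(tag_trans_counts.keys()) - {START_TAG}) | {END_TAG}
    for prev_tag, counts in tag_trans_counts.items():
        denom = sum(counts.values()) + smoothing * len(next_tags)
        for tag in next_tags:
            weights[(tag, prev_tag, TRANS)] = np.log(
                (counts[tag] + smoothing) / denom)
    return weights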
Example #5
def setup():
    global counters, theta_nb, vocab, theta_nb_fixed, sorted_tags
    
    counters = most_common.get_tag_word_counts(constants.TRAIN_FILE)

    sorted_tags = sorted(counters.keys())

    theta_nb = naive_bayes.estimate_nb([counters[tag] for tag in sorted_tags],
                                         sorted_tags,
                                         .01)

    vocab = set([word for tag, word in theta_nb.keys() if word != constants.OFFSET])

    theta_nb_fixed = naive_bayes.estimate_nb_tagger(counters,.01)
Example #6
def setup():
    global counters, theta_nb, vocab, theta_nb_fixed, sorted_tags

    counters = most_common.get_tag_word_counts(constants.TRAIN_FILE)

    sorted_tags = sorted(counters.keys())

    theta_nb = naive_bayes.estimate_nb([counters[tag] for tag in sorted_tags],
                                       sorted_tags, .01)

    vocab = set([
        word for tag, word in theta_nb.keys() if word != constants.OFFSET
    ])

    theta_nb_fixed = naive_bayes.estimate_nb_tagger(counters, .01)
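Examples 5 and 6 contrast naive_bayes.estimate_nb, which takes a list of Counters plus a parallel list of labels, with the estimate_nb_tagger wrapper, which takes the tag-to-Counter dict directly. What the wrapper "fixes" is not shown here; one plausible reading, suggested by the theta_nb_fixed name, is that it re-estimates the offset (prior) weights as log relative tag frequencies. A hedged sketch, reusing naive_bayes and constants as imported by the examples above:

import numpy as np

def estimate_nb_tagger_sketch(counters, smoothing):
    """Wrap estimate_nb for tagging; the offset fix-up below is a guess."""
    sorted_tags = sorted(counters.keys())
    weights = naive_bayes.estimate_nb(
        [counters[tag] for tag in sorted_tags], sorted_tags, smoothing)
    # assumed fix-up: offset weight = log relative frequency of the tag
    total = sum(sum(counter.values()) for counter in counters.values())
    for tag in sorted_tags:
        weights[(tag, constants.OFFSET)] = np.log(
            sum(counters[tag].values()) / total)
    return weights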
Example #7
def compute_HMM_weights(trainfile, smoothing):
    """Compute all weights for the HMM

    :param trainfile: training file
    :param smoothing: float for smoothing of both probability distributions
    :returns: defaultdict of weights, list of all possible tags (types)
    :rtype: defaultdict, list

    """
    # hint: these are your first two lines
    tag_trans_counts = most_common.get_tag_trans_counts(trainfile)
    all_tags = list(tag_trans_counts.keys())

    # hint: call compute_transition_weights
    trans_weights = compute_transition_weights(tag_trans_counts, smoothing)

    # hint: set weights for illegal transitions to -np.inf
    for key in all_tags:
        trans_weights[(key, END_TAG, TRANS)] = -np.inf
        trans_weights[(START_TAG, key, TRANS)] = -np.inf

    trans_weights[(END_TAG, END_TAG, TRANS)] = -np.inf

    # hint: call get_tag_word_counts and estimate_nb_tagger
    tag_word_counts = most_common.get_tag_word_counts(trainfile)
    word_weights = naive_bayes.estimate_nb_tagger(tag_word_counts, smoothing)
    new_weights = {}

    for key in word_weights:
        new_key = (key[0], key[1], EMIT)
        if key[1] != OFFSET:
            new_weights[new_key] = word_weights[key]

    # hint: Counter.update() combines two Counters
    trans_weights.update(new_weights)

    # hint: return weights, all_tags
    return trans_weights, all_tags
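Whichever variant is used, the returned defaultdict scores a tagged sentence by summing one TRANS weight per tag bigram (including the START_TAG and END_TAG boundary transitions) and one EMIT weight per word/tag pair. The hypothetical helper below only illustrates those key conventions; it is not part of the assignment code.

def score_tag_sequence_sketch(words, tags, weights):
    """Sum HMM log-weights for one tagged sentence."""
    score = 0.0
    padded = [START_TAG] + list(tags) + [END_TAG]
    for prev_tag, tag in zip(padded, padded[1:]):
        score += weights[(tag, prev_tag, TRANS)]  # transition weight
    for word, tag in zip(words, tags):
        score += weights[(tag, word, EMIT)]  # emission weight
    return score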
Example #8
def test_get_top_noun_tags():
    expected = [('people', 53), ('time', 48), ('world', 46)]
    tag_word_counts = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = tag_word_counts["NOUN"].most_common(3)
    eq_(expected, actual, msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))
Example #9
def test_get_top_verb_tags():
    expected = [('is', 335), ('was', 128), ('have', 110)]
    tag_word_counts = most_common.get_tag_word_counts(TRAIN_FILE)
    actual = tag_word_counts["VERB"].most_common(3)
    eq_(expected, actual, msg="UNEQUAL Expected:%s, Actual:%s" % (expected, actual))