Example #1
def setup_module():
    # Need to do this because dataIterator depends on the BOW files
    # having already been generated.
    global ac_train
    global ac_dev
    docsToBOWs(TRAINKEY)
    docsToBOWs(DEVKEY)
    ac_train = getAllCounts(dataIterator(TRAINKEY))
    ac_dev = getAllCounts(dataIterator(DEVKEY))
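
A minimal sketch of how a test in the same module might consume the counts prepared above; the test name and assertions are illustrative assumptions, not part of the original suite.

def test_counts_are_populated():
    # setup_module() has already filled in the module-level globals.
    assert len(ac_train) > 0
    assert len(ac_dev) > 0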
Example #2
def setup_module():
    # Need to do this because dataIterator depends on the BOW files
    # having already been generated.
    global ac_train
    global ac_dev
    docsToBOWs(TRAINKEY)
    docsToBOWs(DEVKEY)
    ac_train = getAllCounts(dataIterator(TRAINKEY))
    ac_dev = getAllCounts(dataIterator(DEVKEY))
Example #3
from collections import defaultdict

import numpy as np

# most_common, naivebayes, preproc, and the EMIT/TRANS feature constants are
# assumed to be importable from the surrounding course codebase.


def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Parameters:
        trainfile -- name of the training file

    Returns:
        weights -- dict mapping (tag, word, EMIT) and (tag, prev_tag, TRANS)
            features to log-probabilities

    Notes:
        ngrams("I really like", 2) yields the bigrams (I, really), (really, like).
        The weight stored under (end_tag, 'N', TRANS) is
        log q(stop | N) = log(count(N, stop) / count(N)),
        computed from the tag bigram (N, end_tag).
    """
    # compute naive bayes weights
    counters = most_common.get_tags(trainfile)
    class_counts = most_common.get_class_counts(counters)
    allwords = set()
    for counts in counters.values():
        allwords.update(set(counts.keys()))

    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords, alpha=0.001)

    # convert NB weights to HMM emission weights; unseen features fall back to
    # a very negative default log-weight
    hmm_weights = defaultdict(lambda: -1000.0)
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight

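    # unigram and bigram counts over the tag sequences in the training file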
    unigram = preproc.getNgrams(trainfile)
    bigram = preproc.getNgrams(trainfile, 2)
    unigramCount = preproc.getAllCounts(unigram)
    bigramCount = preproc.getAllCounts(bigram)

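    # MLE transition weights: log q(tag2 | tag1) = log count(tag1, tag2) - log count(tag1)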
    for (tag1, tag2) in bigramCount.keys():
        hmm_weights[(tag2, tag1, TRANS)] = np.log(1.0 * bigramCount.get((tag1, tag2), 0)) - np.log(
            unigramCount.get(tag1, 0)
        )

    return hmm_weights
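
A minimal usage sketch of get_HMM_weights; the file name, tags, and words below are illustrative placeholders, and EMIT/TRANS are assumed to be the same feature constants used inside the function.

weights = get_HMM_weights('train.key')        # 'train.key' is a placeholder path
emission = weights[('N', 'dog', EMIT)]        # smoothed emission weight for ('N', 'dog') from the Naive Bayes step
transition = weights[('V', 'N', TRANS)]       # log q(next tag 'V' | previous tag 'N')
unseen = weights[('N', 'unseenword', EMIT)]   # unseen features fall back to the -1000.0 default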