def setup_module():
    # Need to do this because the dataIterator function depends
    # on the BOW file to be generated.
    global ac_train
    global ac_dev
    docsToBOWs(TRAINKEY)
    docsToBOWs(DEVKEY)
    ac_train = getAllCounts(dataIterator(TRAINKEY))
    ac_dev = getAllCounts(dataIterator(DEVKEY))
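# A minimal sketch of a test that uses the module-level fixtures built in
# setup_module(); the test name and assertions below are illustrative, not
# part of the project, and assume only that the training and dev files are
# non-empty.
def test_counts_are_populated():
    assert len(ac_train) > 0
    assert len(ac_dev) > 0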
from collections import defaultdict

import numpy as np

# most_common, naivebayes, preproc, EMIT, and TRANS are assumed to be
# imported from the surrounding project.


def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using the HMM transition model.

    Parameters:
    trainfile -- The name of the file to train weights from

    Returns:
    weights -- Weights dict with log-prob of transition and emission features

    Notes:
    ngrams("I really like", 2) yields (I, really), (really, like).
    A transition feature (end_tag, 'N', TRANS) scores the bigram (N, end_tag),
    i.e. q(stop | N) = count(N, stop) / count(N).
    """
    # Compute Naive Bayes weights, which become the emission weights.
    counters = most_common.get_tags(trainfile)
    class_counts = most_common.get_class_counts(counters)
    allwords = set()
    for counts in counters.values():
        allwords.update(set(counts.keys()))
    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords,
                                           alpha=0.001)

    # Convert Naive Bayes weights to HMM emission weights.
    hmm_weights = defaultdict(lambda: -1000.0)
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight

    # Estimate transition weights as
    # log q(tag2 | tag1) = log count(tag1, tag2) - log count(tag1).
    unigram = preproc.getNgrams(trainfile)
    bigram = preproc.getNgrams(trainfile, 2)
    unigramCount = preproc.getAllCounts(unigram)
    bigramCount = preproc.getAllCounts(bigram)
    for (tag1, tag2) in bigramCount.keys():
        hmm_weights[(tag2, tag1, TRANS)] = (
            np.log(1.0 * bigramCount.get((tag1, tag2), 0))
            - np.log(unigramCount.get(tag1, 0)))
    return hmm_weights
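# For reference, a self-contained sketch of the maximum-likelihood transition
# estimate q(t2 | t1) = count(t1, t2) / count(t1) that the loop above
# implements; the helper and the toy tag sequence below are hypothetical and
# not part of the project code.
import numpy as np
from collections import defaultdict


def toy_transition_log_probs(tag_sequences):
    """Estimate log q(t2 | t1) from tag bigram and unigram counts."""
    unigram_counts = defaultdict(int)
    bigram_counts = defaultdict(int)
    for tags in tag_sequences:
        for t1, t2 in zip(tags, tags[1:]):
            unigram_counts[t1] += 1
            bigram_counts[(t1, t2)] += 1
    return {(t1, t2): np.log(count) - np.log(unigram_counts[t1])
            for (t1, t2), count in bigram_counts.items()}


# Example: in the sequence below, N is followed by V in two of its three
# occurrences as a left tag, so q(V | N) = 2/3 and the log weight is
# log(2) - log(3).
log_q = toy_transition_log_probs([['N', 'V', 'N', 'N', 'V', '--END--']])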