def setup_module():
    """Module-level fixture: fill the globals shared by the tests.

    Stores the three values returned by ``getCountsAndKeys(TRAINKEY)`` in
    ``counts``, ``class_counts`` and ``allkeys``, then stores the result of
    ``learnNBWeights`` on those values in ``weights_nb``.
    """
    global weights_nb, counts, class_counts, allkeys
    training_stats = getCountsAndKeys(TRAINKEY)
    counts, class_counts, allkeys = training_stats
    weights_nb = learnNBWeights(counts, class_counts, allkeys)
def setup_module():
    """Prepare the shared module globals before any test runs.

    ``counts``, ``class_counts`` and ``allkeys`` come from
    ``getCountsAndKeys(TRAINKEY)``; ``weights_nb`` is what ``learnNBWeights``
    returns for them.
    """
    global weights_nb
    global counts, class_counts, allkeys
    (counts, class_counts, allkeys) = getCountsAndKeys(TRAINKEY)
    weights_nb = learnNBWeights(counts, class_counts, allkeys)
def test_nb_simple():
    '''
    Check naive Bayes weights learned with alpha=0 on two toy sentences:

        the/D man/N runs/V
        man/V the/D cannons/N
    '''
    vocabulary = ['the', 'man', 'runs', 'the', 'cannons']
    counts_by_tag = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({'man': 1, 'cannons': 1}),
        'V': Counter({'runs': 1, 'man': 1}),
    })
    tag_totals = Counter({'D': 2, 'N': 2, 'V': 2})
    weights = naivebayes.learnNBWeights(counts_by_tag, tag_totals,
                                        vocabulary, alpha=0)

    # (expected probability, weight key) pairs, checked in this order.
    expectations = [
        (0.5, ('N', 'man')),
        (0.5, ('V', 'man')),
        (1.0, ('D', 'the')),
        # offsets
        (0.333, ('N', OFFSET)),
        (0.333, ('V', OFFSET)),
        (0.333, ('D', OFFSET)),
    ]
    for expected, key in expectations:
        # Weights are log-probabilities, so exponentiate before comparing.
        assert_almost_equals(expected, np.exp(weights[key]), places=3)
def test_nb_smoothing():
    '''
    Check naive Bayes weights learned with smoothing of 0.5 on two toy
    sentences:

        the/D man/N runs/V
        man/V the/D cannons/N
    '''
    vocabulary = ['the', 'man', 'runs', 'the', 'cannons']
    counts_by_tag = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({'man': 1, 'cannons': 1}),
        'V': Counter({'runs': 1, 'man': 1}),
    })
    tag_totals = Counter({'D': 2, 'N': 2, 'V': 2})

    # smoothing of 0.5 reserves 1/2 probability mass for unknown
    weights = naivebayes.learnNBWeights(counts_by_tag, tag_totals,
                                        vocabulary, alpha=0.5)

    def prob(tag, feature):
        # Weights are log-probabilities; convert back for comparison.
        return np.exp(weights[(tag, feature)])

    assert_almost_equals(5.0 / 8.0, prob('D', 'the'), places=3)
    # This check deliberately keeps the default precision (no `places`).
    assert_almost_equals(1.0 / 8.0, prob('N', 'the'))

    # offsets unchanged
    for tag in ('N', 'V', 'D'):
        assert_almost_equals(0.333, prob(tag, OFFSET), places=3)
def setup_module():
    """Module-level fixture: train naive Bayes weights and collect the tags.

    Adds every word seen in ``most_common.get_tags(TRAIN_FILE)`` to the
    module-level ``allwords`` collection, stores the learned weights in
    ``weights_nb`` and the result of ``preproc.getAllTags(TRAIN_FILE)`` in
    ``alltags``.
    """
    global weights_nb, alltags, allwords
    tag_counters = most_common.get_tags(TRAIN_FILE)
    # allwords is mutated in place, so it must already exist at module level.
    for word_counts in tag_counters.values():
        allwords.update(word_counts.keys())
    tag_totals = most_common.get_class_counts(tag_counters)
    weights_nb = naivebayes.learnNBWeights(tag_counters, tag_totals, allwords)
    alltags = preproc.getAllTags(TRAIN_FILE)
def setup_module():
    """Prepare the shared globals ``weights_nb``, ``alltags`` and ``allwords``.

    The per-tag counters from ``most_common.get_tags(TRAIN_FILE)`` supply the
    words added to ``allwords`` and the class counts used to learn
    ``weights_nb``; ``alltags`` comes from ``preproc.getAllTags(TRAIN_FILE)``.
    """
    global weights_nb
    global alltags
    global allwords
    per_tag = most_common.get_tags(TRAIN_FILE)
    # Extend the existing module-level collection rather than rebinding it.
    for tag_word_counts in per_tag.values():
        allwords.update(tag_word_counts.keys())
    weights_nb = naivebayes.learnNBWeights(
        per_tag,
        most_common.get_class_counts(per_tag),
        allwords,
    )
    alltags = preproc.getAllTags(TRAIN_FILE)
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- Weights dict with log-prob of transition and emit features.
               Emission keys are (tag, word, EMIT), transition keys are
               (cur_tag, prev_tag, TRANS); any other key reads as -1000.
    """
    # compute naive bayes weights
    tag_counters = most_common.get_tags(trainfile)
    vocabulary = set()
    for word_counts in tag_counters.values():
        vocabulary.update(word_counts.keys())
    tag_totals = most_common.get_class_counts(tag_counters)
    nb_weights = naivebayes.learnNBWeights(tag_counters, tag_totals,
                                           vocabulary, 0.001)

    # Count tag-to-tag transitions; a blank line closes the current sentence.
    transitions = defaultdict(Counter)
    with open(trainfile) as instances:
        previous = START_TAG
        for raw_line in instances:
            stripped = raw_line.rstrip()
            if not stripped:
                transitions[previous][END_TAG] += 1
                previous = START_TAG
                continue
            fields = stripped.split()
            # The tag is the second field; lines without one count as UNKNOWN.
            current = fields[1] if len(fields) > 1 else UNKNOWN
            transitions[previous][current] += 1
            previous = current
    # Close a final sentence that was not followed by a blank line.
    if previous != START_TAG:
        transitions[previous][END_TAG] += 1

    # convert nb weights to hmm weights
    hmm_weights = defaultdict(lambda: -1000.)
    for key, weight in nb_weights.items():
        hmm_weights[(key[0], key[1], EMIT)] = weight

    for prev_tag, followers in transitions.items():
        log_total = np.log(sum(followers.values()))
        for cur_tag, pair_count in followers.items():
            hmm_weights[(cur_tag, prev_tag, TRANS)] = np.log(pair_count) - log_total
    return hmm_weights
def test_nb_one_class():
    '''
    With a single tag and three words seen once each (alpha=0), every word
    should get probability 1/3.
    '''
    vocabulary = ['football', 'spoon', 'dog']
    counts_by_tag = Counter({
        'N': Counter({'football': 1, 'spoon': 1, 'dog': 1}),
    })
    tag_totals = Counter({'N': 3})
    weights = naivebayes.learnNBWeights(counts_by_tag, tag_totals,
                                        vocabulary, alpha=0)
    for word in ('spoon', 'football', 'dog'):
        # Weights are log-probabilities, so exponentiate before comparing.
        assert_almost_equals(0.333, np.exp(weights[('N', word)]), places=3)
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- Weights dict with log-prob of transition and emit features.
               Keys are (tag, word, EMIT) for emissions and
               (cur_tag, prev_tag, TRANS) for transitions; unseen keys
               default to -1000.
    """
    # compute naive bayes weights
    counters = most_common.get_tags(trainfile)
    seen_words = set()
    for per_tag_counts in counters.values():
        seen_words.update(per_tag_counts.keys())
    nb_weights = naivebayes.learnNBWeights(
        counters,
        most_common.get_class_counts(counters),
        seen_words,
        0.001,
    )

    # Tally transitions, resetting to START_TAG at every blank line.
    pair_counts = defaultdict(Counter)
    last_tag = START_TAG
    with open(trainfile) as instances:
        for text in instances:
            text = text.rstrip()
            if text:
                columns = text.split()
                # Second column is the tag; fall back to UNKNOWN if absent.
                if len(columns) > 1:
                    tag_now = columns[1]
                else:
                    tag_now = UNKNOWN
                pair_counts[last_tag][tag_now] += 1
                last_tag = tag_now
            else:
                pair_counts[last_tag][END_TAG] += 1
                last_tag = START_TAG
    # The file may end without a trailing blank line; close that sentence.
    if last_tag != START_TAG:
        pair_counts[last_tag][END_TAG] += 1

    # convert nb weights to hmm weights
    hmm_weights = defaultdict(lambda: -1000.)
    hmm_weights.update(
        ((key[0], key[1], EMIT), value) for key, value in nb_weights.items()
    )

    for prev_tag, next_counts in pair_counts.items():
        total_pairs = sum(next_counts.values())
        log_total = np.log(total_pairs)
        for cur_tag, seen in next_counts.items():
            hmm_weights[(cur_tag, prev_tag, TRANS)] = np.log(seen) - log_total
    return hmm_weights
def test_nb_one_class():
    '''
    One tag ('N') with three words observed once each and alpha=0:
    each word is expected to have probability 1/3.
    '''
    words = ['football', 'spoon', 'dog']
    per_tag_counts = Counter({'N': Counter({word: 1 for word in words})})
    totals = Counter({'N': 3})
    weights = naivebayes.learnNBWeights(per_tag_counts, totals, words, alpha=0)

    def prob(word):
        # Convert the stored log-probability back to a probability.
        return np.exp(weights[('N', word)])

    assert_almost_equals(0.333, prob('spoon'), places=3)
    assert_almost_equals(0.333, prob('football'), places=3)
    assert_almost_equals(0.333, prob('dog'), places=3)
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Emission weights are the naive Bayes weights (learned with alpha=0.001)
    re-keyed as (tag, word, EMIT).  Transition weights are keyed as
    (tag2, tag1, TRANS) and hold log(count(tag1, tag2)) - log(count(tag1)),
    where the counts come from the tag bigrams and unigrams of trainfile.

    Example from the original notes: the key (end_tag, 'N', TRANS)
    corresponds to q(stop | N), taken from the bigram (N, end_tag).

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- Weights dict with log-prob of transition and emit features;
               keys that were never set read as -1000.0.
    """
    # compute naive bayes weights
    counters = most_common.get_tags(trainfile)
    class_counts = most_common.get_class_counts(counters)
    allwords = set()
    for counts in counters.values():
        allwords.update(set(counts.keys()))
    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords, alpha=0.001)

    # convert nb weights to hmm weights
    hmm_weights = defaultdict(lambda: -1000.0)
    # .items() rather than .iteritems(): the latter exists only on Python 2
    # and raises AttributeError on Python 3; .items() works on both.
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight

    unigram = preproc.getNgrams(trainfile)
    bigram = preproc.getNgrams(trainfile, 2)
    unigramCount = preproc.getAllCounts(unigram)
    bigramCount = preproc.getAllCounts(bigram)
    # NOTE(review): this assumes unigramCount is keyed by the bare tag; if
    # tag1 is missing, np.log(0) yields -inf -- confirm against preproc.
    for (tag1, tag2) in bigramCount.keys():
        hmm_weights[(tag2, tag1, TRANS)] = (
            np.log(1.0 * bigramCount.get((tag1, tag2), 0))
            - np.log(unigramCount.get(tag1, 0))
        )
    return hmm_weights
def test_nb_smoothing():
    '''
    Tests naive Bayes weights with smoothing of 0.5 on these two sentences:

        the/D man/N runs/V
        man/V the/D cannons/N
    '''
    words = ['the', 'man', 'runs', 'the', 'cannons']
    determiner_counts = Counter({'the': 2})
    noun_counts = Counter({'man': 1, 'cannons': 1})
    verb_counts = Counter({'runs': 1, 'man': 1})
    per_tag_counts = Counter({
        'D': determiner_counts,
        'N': noun_counts,
        'V': verb_counts,
    })
    totals = Counter({'D': 2, 'N': 2, 'V': 2})

    # smoothing of 0.5 reserves 1/2 probability mass for unknown
    weights = naivebayes.learnNBWeights(per_tag_counts, totals, words, alpha=0.5)

    seen_prob = np.exp(weights[('D', 'the')])
    assert_almost_equals(5.0 / 8.0, seen_prob, places=3)

    # Word never seen with tag N; checked at the default precision.
    unseen_prob = np.exp(weights[('N', 'the')])
    assert_almost_equals(1.0 / 8.0, unseen_prob)

    # offsets unchanged
    for offset_key in [('N', OFFSET), ('V', OFFSET), ('D', OFFSET)]:
        assert_almost_equals(0.333, np.exp(weights[offset_key]), places=3)
def test_nb_simple():
    '''
    Tests naive Bayes weights with alpha=0 on these two sentences:

        the/D man/N runs/V
        man/V the/D cannons/N
    '''
    words = ['the', 'man', 'runs', 'the', 'cannons']
    per_tag_counts = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({'man': 1, 'cannons': 1}),
        'V': Counter({'runs': 1, 'man': 1}),
    })
    totals = Counter({'D': 2, 'N': 2, 'V': 2})
    weights = naivebayes.learnNBWeights(per_tag_counts, totals, words, alpha=0)

    def check(expected, tag, feature):
        # Weights are log-probabilities; compare in probability space.
        assert_almost_equals(expected, np.exp(weights[(tag, feature)]), places=3)

    check(0.5, 'N', 'man')
    check(0.5, 'V', 'man')
    check(1.0, 'D', 'the')

    # offsets
    for tag in ('N', 'V', 'D'):
        check(0.333, tag, OFFSET)