def features_brown_news(path):
    """Extract suffix features for every tagged word in the Brown 'news'
    category, persist the feature sets under *path*, and return the
    dataset name used for the saved file.
    """
    name = 'features_brown_news'
    suffixes = get_common_suffixes()
    featuresets = []
    for word, tag in brown.tagged_words(categories='news'):
        featuresets.append((pos_features(suffixes, word), tag))
    log("featuresets")
    save_features(featuresets, path, name)
    log("dump featuresets")
    return name
def get_common_suffixes():
    """Return the 100 most frequent 1-3 character word suffixes in the
    Brown corpus, in decreasing frequency order.

    Fix: the original used FreqDist.inc() and suffix_fdist.keys()[:100].
    inc() was removed in NLTK 3, and dict key views are neither sliceable
    nor frequency-ordered there, so the slice no longer yields the most
    common suffixes. Counter-style increments plus most_common() express
    the original intent portably.
    """
    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the last 1, 2 and 3 characters of every word.
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1
    # The 100 most common suffixes, highest frequency first.
    common_suffixes = [suffix for suffix, _ in suffix_fdist.most_common(100)]
    log("common_suffixes")
    return common_suffixes
def train_naive_bayes(path, fname):
    """Train an NLTK NaiveBayes classifier on feature sets loaded from
    *path*/*fname* and save it under a name derived from *fname*.
    """
    name = 'naive_bayes' + '_._' + fname
    featuresets = load_features(path, fname)
    log("load featuresets")
    # Bug fix: the original slice [1:200] silently dropped the first
    # feature set; [:200] takes the intended first 200 examples.
    train_set = featuresets[:200]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    log("train classifier")
    save_classifiers(classifier, path, name)
    log("dump classifier")
def train_decision_tree(path, fname):
    """Train an NLTK DecisionTree classifier on feature sets loaded from
    *path*/*fname* and save it under a name derived from *fname*.
    """
    name = 'decision_tree' + '_._' + fname
    featuresets = load_features(path, fname)
    log("load featuresets")
    # Bug fix: the original slice [1:200] silently dropped the first
    # feature set; [:200] takes the intended first 200 examples.
    train_set = featuresets[:200]
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    log("train classifier")
    save_classifiers(classifier, path, name)
    log("dump classifier")
# NOTE(review): this is a verbatim duplicate of train_decision_tree defined
# earlier in this file; at import time this later definition silently
# replaces the earlier one. Candidate for removal — confirm it is not
# intentionally kept by a code generator before deleting.
def train_decision_tree(path, fname):
    """Train an NLTK DecisionTree classifier on feature sets loaded from
    *path*/*fname* and save it under a name derived from *fname*.
    """
    name = 'decision_tree' + '_._' + fname
    featuresets = load_features(path, fname)
    log("load featuresets")
    # NOTE(review): slice [1:200] skips the first feature set — looks like
    # an off-by-one; confirm whether [:200] was intended.
    train_set = featuresets[1:200]
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    log("train classifier")
    save_classifiers(classifier, path, name)
    log("dump classifier")
def pos_features(common_suffixes, word):
    """Map *word* to a dict of boolean suffix features.

    One entry per suffix in *common_suffixes*, keyed 'endswith(<suffix>)',
    True when *word* (lowercased) ends with that suffix.

    Fix: the original called log("pos_features") on every invocation, and
    features_brown_news() invokes this once per tagged word in the Brown
    'news' category — flooding the log with ~100k identical lines. The
    per-call log is removed; coarse-grained progress logging remains in
    the callers.
    """
    # Lowercase once instead of once per suffix.
    lowered = word.lower()
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = lowered.endswith(suffix)
    return features
#
# @file:   intfc_nltk_features.py
# @author: [email protected]
# @brief:  collect word-suffix features from the Brown corpus and dump them
#
# Stdlib / third-party imports, one per line (PEP 8); no import removed.
# NOTE(review): cPickle is Python 2 only — under Python 3 this would be
# `import pickle`; confirm the target interpreter.
import sys
import os
import time
import nltk
import cPickle as pickle

from intfc_common_io import log, home, save_features, load_features
# NOTE(review): wildcard import kept for compatibility — `brown` is the
# only corpus visibly used here; narrowing to `from nltk.corpus import
# brown` needs confirmation against the rest of the project.
from nltk.corpus import *

log("import nltk.corpus")


##
# @brief: find the 100 most common 1-3 character word suffixes in Brown.
#
# NOTE(review): this function is also defined earlier in the file; this
# later copy wins at import time. Candidate for de-duplication.
##
def get_common_suffixes():
    """Return the 100 most frequent 1-3 character word suffixes in the
    Brown corpus, in decreasing frequency order.

    Fix: FreqDist.inc() was removed in NLTK 3, and dict key views are
    neither sliceable nor frequency-ordered there, so keys()[:100] no
    longer yields the most common suffixes; use += 1 and most_common().
    """
    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the last 1, 2 and 3 characters of every word.
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1
    # The 100 most common suffixes, highest frequency first.
    common_suffixes = [suffix for suffix, _ in suffix_fdist.most_common(100)]
    log("common_suffixes")
    return common_suffixes