def make_classifier_builder(args):
    if isinstance(args.classifier, basestring):
        algos = [args.classifier]
    else:
        algos = args.classifier

    for algo in algos:
        if algo not in classifier_choices:
            raise ValueError('classifier %s is not supported' % algo)

    classifier_train_args = []

    for algo in algos:
        classifier_train_kwargs = {}

        if algo == 'DecisionTree':
            classifier_train = DecisionTreeClassifier.train
            classifier_train_kwargs['binary'] = False
            classifier_train_kwargs['entropy_cutoff'] = args.entropy_cutoff
            classifier_train_kwargs['depth_cutoff'] = args.depth_cutoff
            classifier_train_kwargs['support_cutoff'] = args.support_cutoff
            classifier_train_kwargs['verbose'] = args.trace
        elif algo == 'NaiveBayes':
            classifier_train = NaiveBayesClassifier.train
        elif algo == 'Scikits':
            classifier_train = ScikitsClassifier.train
        else:
            if algo != 'Maxent':
                classifier_train_kwargs['algorithm'] = algo

                if algo == 'MEGAM':
                    megam.config_megam()

            classifier_train = MaxentClassifier.train
            classifier_train_kwargs['max_iter'] = args.max_iter
            classifier_train_kwargs['min_ll'] = args.min_ll
            classifier_train_kwargs['min_lldelta'] = args.min_lldelta
            classifier_train_kwargs['trace'] = args.trace

        classifier_train_args.append((algo, classifier_train, classifier_train_kwargs))

    def trainf(train_feats):
        classifiers = []

        for algo, classifier_train, train_kwargs in classifier_train_args:
            if args.trace:
                print 'training %s classifier' % algo

            classifiers.append(classifier_train(train_feats, **train_kwargs))

        if len(classifiers) == 1:
            return classifiers[0]
        else:
            return AvgProbClassifier(classifiers)

    return trainf
    #return lambda(train_feats): classifier_train(train_feats, **classifier_train_kwargs)
def maximum_entropy(train_set, test_set):
    print "--- nltk.classify.maximum_entropy ---"
    from nltk.classify import megam
    megam.config_megam()
    classifier = nltk.classify.MaxentClassifier.train(train_set, "megam")
    print "Overall accuracy:", accuracy(classifier, test_set)
    classifier.show_most_informative_features(10)
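# Usage sketch for maximum_entropy() above (illustrative, not from the original
# module): train_set and test_set are lists of (featureset, label) pairs, where a
# featureset maps feature names to values. The word_feats helper and the toy data
# below are assumptions made for this example; accuracy() is
# nltk.classify.util.accuracy.
import nltk
from nltk.classify.util import accuracy

def word_feats(words):
    # simple bag-of-words presence features
    return dict((w, True) for w in words)

train_set = [(word_feats(['great', 'acting', 'plot']), 'pos'),
             (word_feats(['boring', 'plot']), 'neg')]
test_set = [(word_feats(['great', 'plot']), 'pos')]

# maximum_entropy(train_set, test_set)  # requires a working megam binary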
def __init__(self, train_sents):
    train_set = []
    for tagged_sent in train_sents:
        untagged_sent = nltk.tag.untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = npchunk_features(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)

    from nltk.classify import megam
    megam.config_megam(bin='/cs/fs/home/hxiao/code/megam_i686.opt')
    self.classifier = nltk.MaxentClassifier.train(
        train_set, algorithm='megam', trace=0)
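# Companion tag() method this trainer is normally paired with in the NLTK book's
# consecutive chunk-tagger pattern (a sketch, not taken from the original file):
def tag(self, sentence):
    history = []
    for i, word in enumerate(sentence):
        featureset = npchunk_features(sentence, i, history)
        tag = self.classifier.classify(featureset)
        history.append(tag)
    return zip(sentence, history)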
def main():
    megam.config_megam(megam_path)
    feature_extractor = BigramFeatureExtractor()

    training_documents = []
    training_documents.append((['muy', 'buena', 'comida', 'hola'], 'pos'))
    training_documents.append((['muy', 'mala', 'comida'], 'neg'))
    training_documents.append((['muy', 'mala', 'comida', 'hola'], 'neg'))
    training_documents.append((['buena', 'comida'], 'pos'))

    training_set = nltk.classify.util.apply_features(feature_extractor.extract,
                                                     training_documents)
    classifier = nltk.MaxentClassifier.train(training_set, algorithm='megam',
                                             explicit=False, bernoulli=True,
                                             model='binary')
    classifier.show_most_informative_features()
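# Follow-up sketch (not in the original): classify a new document with the trained
# model. Assumes BigramFeatureExtractor.extract accepts a single tokenized
# document, as implied by the apply_features call above; the helper name and the
# example sentence are illustrative.
def classify_example(classifier, feature_extractor):
    new_doc = ['muy', 'buena', 'comida']
    print(classifier.classify(feature_extractor.extract(new_doc)))
    print(classifier.prob_classify(feature_extractor.extract(new_doc)).prob('pos'))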
def make_classifier_builder(args):
    if isinstance(args.classifier, basestring):
        algos = [args.classifier]
    else:
        algos = args.classifier

    for algo in algos:
        if algo not in classifier_choices:
            raise ValueError('classifier %s is not supported' % algo)

    classifier_train_args = []

    for algo in algos:
        classifier_train_kwargs = {}

        if algo == 'DecisionTree':
            classifier_train = DecisionTreeClassifier.train
            classifier_train_kwargs['binary'] = False
            classifier_train_kwargs['entropy_cutoff'] = args.entropy_cutoff
            classifier_train_kwargs['depth_cutoff'] = args.depth_cutoff
            classifier_train_kwargs['support_cutoff'] = args.support_cutoff
            classifier_train_kwargs['verbose'] = args.trace
        elif algo == 'NaiveBayes':
            classifier_train = NaiveBayesClassifier.train
        elif algo == 'Svm':
            classifier_train = SvmClassifier.train
        elif algo.startswith('sklearn.'):
            # TODO: support many options for building an estimator pipeline
            pipe = [('classifier', make_sklearn_classifier(algo, args))]

            if args.tfidf:
                if args.trace:
                    print 'using tfidf transformer with norm %s' % args.penalty

                pipe.insert(0, ('tfidf', TfidfTransformer(norm=args.penalty)))

            sparse = pipe[-1][1].__class__.__name__ not in dense_classifiers

            if not sparse and args.trace:
                print 'using dense matrix'

            if args.value_type == 'bool' and not args.tfidf:
                dtype = bool
            elif args.value_type == 'int' and not args.tfidf:
                dtype = int
            else:
                dtype = float

            if args.trace:
                print 'using dtype %s' % dtype.__name__

            classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=dtype, sparse=sparse).train
        else:
            if algo != 'Maxent':
                classifier_train_kwargs['algorithm'] = algo

                if algo == 'MEGAM':
                    megam.config_megam()

            classifier_train = MaxentClassifier.train
            classifier_train_kwargs['max_iter'] = args.max_iter
            classifier_train_kwargs['min_ll'] = args.min_ll
            classifier_train_kwargs['min_lldelta'] = args.min_lldelta
            classifier_train_kwargs['trace'] = args.trace

        classifier_train_args.append((algo, classifier_train, classifier_train_kwargs))

    def trainf(train_feats):
        classifiers = []

        for algo, classifier_train, train_kwargs in classifier_train_args:
            if args.trace:
                print 'training %s classifier' % algo

            classifiers.append(classifier_train(train_feats, **train_kwargs))

        if len(classifiers) == 1:
            return classifiers[0]
        else:
            return AvgProbClassifier(classifiers)

    return trainf
def __init__(self, n_folds, corpus_size, fold_number, remove_stop_words, use_unigrams,
             use_unigrams_frequency, use_bigrams, use_all_bigrams, min_word_length,
             remove_duplicated_chars, process_negation, stem, transform_lower_case,
             remove_punctuation_marks, remove_accents, lemma, adjectives, allprepro,
             out_of_domain_test, proportion_of_positives):
    super(CrossValidatedMegamMaxEntClassifier, self).__init__(
        n_folds, corpus_size, fold_number, remove_stop_words, use_unigrams,
        use_unigrams_frequency, use_bigrams, use_all_bigrams, min_word_length,
        remove_duplicated_chars, process_negation, stem, transform_lower_case,
        remove_punctuation_marks, remove_accents, lemma, adjectives, allprepro,
        out_of_domain_test, proportion_of_positives)
    megam.config_megam(megam_path)
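# Sketch (illustrative, not from the original class) of the megam-backed training
# step such a cross-validated subclass typically supplies; the method name and
# signature are assumptions.
def train(self, train_documents):
    # train_documents: list of (featureset, label) pairs built according to the
    # preprocessing flags passed to __init__
    return nltk.MaxentClassifier.train(train_documents, algorithm='megam', trace=0)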
import nltk
from nltk.classify import megam

megam.config_megam('/Users/arlogb/ext_sources/megam_0.92/')

# We will use the nltk NaiveBayesClassifier

def tset(extractor, tok):
    """Function wrapping the apply_features function. Pass a feature-extracting
    function which returns a featureset - a dict mapping features to feature
    values. tok contains the tokens the extractor will be applied to."""
    trainset = nltk.classify.apply_features(extractor, tok)
    return trainset

def trainclassifier(trainset):
    return nltk.NaiveBayesClassifier.train(trainset)

def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method to allow
    inspection of what the method is actually doing and how long it's taking.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print 'There are ' + str(len(labeled_featuresets)) + ' labeled featuresets'

    # Count up how many times each feature value occurred, given
    # the label and feature name.
    print 'Counting feature value occurrence'
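    # Sketch of the counting loop that follows at this point in the NLTK 2.x
    # implementation being copied (reconstructed for illustration; not from the
    # original file, details may differ):
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # increment freq(fval | label, fname)
            feature_freqdist[label, fname].inc(fval)
            # record that fname can take the value fval
            feature_values[fname].add(fval)
            fnames.add(fname)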
help="use rules that are applied after the maxent algorithm") parser.add_argument("--classifier", help="classifying algorithm to use", choices=["maxent", "svm", "dt", "rf"], default="svm") args = parser.parse_args() inputdir = args.inputdirdevel if args.testdir == "devel" else args.inputdirtest train_file = "features/features_megan_train_" + str(args.version) + ".txt" test_file = "features/features_megan_" + args.testdir + "_" + str( args.version) + ".txt" if args.classifier == "maxent": # Train megam.config_megam("src/megam_i686.opt") train = megam.call_megam(["multiclass", train_file]) with open("features/weights", "w") as f: f.write(train) # Prediction predictions_text = megam.call_megam( ["-predict", "features/weights", "multiclass", test_file]) predictions = [ y.split("\t")[0] for y in predictions_text.split("\n") if len(y) > 1 ] else: # sklearn classifiers # Read train files with open(train_file, 'r') as f:
        self.old_stdout.flush()
        self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr

    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush()
        self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr


try:
    import nltk.classify.megam as megam
    import nltk.classify.maxent as maxent
    with RedirectStdStreams(stdout=sys.stderr):
        megam.config_megam()
        #megam.config_megam(bin="path/to/megam")
    use_megam = True
except:
    print >> sys.stderr, "ERROR: megam not found, configure it before using this"
    print >> sys.stderr, "using default nltk maxent"
    use_megam = False


class MaxentLearner:
    """
    wrapper to nltk version of max entropy classifier

    TODO: forces to use megam, could use a parameter for other algorithms.

    return_type: whether to return the "label" (a string) or an orange "value"
    """
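    # Sketch (not from the original class) of how the use_megam flag set above is
    # typically consumed when training: fall back to a pure-Python NLTK algorithm
    # when the megam binary is unavailable. The method name is illustrative.
    def _train(self, train_set):
        algorithm = "megam" if use_megam else "iis"
        return maxent.MaxentClassifier.train(train_set, algorithm=algorithm, trace=0)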
def make_classifier_builder(args):
    if isinstance(args.classifier, basestring):
        algos = [args.classifier]
    else:
        algos = args.classifier

    for algo in algos:
        if algo not in classifier_choices:
            raise ValueError("classifier %s is not supported" % algo)

    classifier_train_args = []

    for algo in algos:
        classifier_train_kwargs = {}

        if algo == "DecisionTree":
            classifier_train = DecisionTreeClassifier.train
            classifier_train_kwargs["binary"] = False
            classifier_train_kwargs["entropy_cutoff"] = args.entropy_cutoff
            classifier_train_kwargs["depth_cutoff"] = args.depth_cutoff
            classifier_train_kwargs["support_cutoff"] = args.support_cutoff
            classifier_train_kwargs["verbose"] = args.trace
        elif algo == "NaiveBayes":
            classifier_train = NaiveBayesClassifier.train
        elif algo == "Svm":
            classifier_train = SvmClassifier.train
        elif algo.startswith("sklearn."):
            # TODO: support many options for building an estimator pipeline
            pipe = [("classifier", make_sklearn_classifier(algo, args))]

            tfidf = getattr(args, "tfidf", None)
            penalty = getattr(args, "penalty", None)

            if tfidf and penalty:
                if args.trace:
                    print("using tfidf transformer with norm %s" % penalty)

                pipe.insert(0, ("tfidf", TfidfTransformer(norm=penalty)))

            sparse = pipe[-1][1].__class__.__name__ not in dense_classifiers

            if not sparse and args.trace:
                print("using dense matrix")

            value_type = getattr(args, "value_type", "bool")

            if value_type == "bool" and not tfidf:
                dtype = bool
            elif value_type == "int" and not tfidf:
                dtype = int
            else:
                dtype = float

            if args.trace:
                print("using dtype %s" % dtype.__name__)

            classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=dtype, sparse=sparse).train
        else:
            if algo != "Maxent":
                classifier_train_kwargs["algorithm"] = algo

                if algo == "MEGAM":
                    megam.config_megam()

            classifier_train = MaxentClassifier.train
            classifier_train_kwargs["max_iter"] = args.max_iter
            classifier_train_kwargs["min_ll"] = args.min_ll
            classifier_train_kwargs["min_lldelta"] = args.min_lldelta
            classifier_train_kwargs["trace"] = args.trace

        classifier_train_args.append((algo, classifier_train, classifier_train_kwargs))

    def trainf(train_feats):
        classifiers = []

        for algo, classifier_train, train_kwargs in classifier_train_args:
            if args.trace:
                print("training %s classifier" % algo)

            classifiers.append(classifier_train(train_feats, **train_kwargs))

        if len(classifiers) == 1:
            return classifiers[0]
        else:
            return AvgProbClassifier(classifiers)

    return trainf
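# Usage sketch for make_classifier_builder (illustrative, not from the original
# script): the attribute names on args mirror the options consumed above, and
# "Maxent" is assumed to appear in classifier_choices.
from argparse import Namespace

args = Namespace(classifier="Maxent", trace=1,
                 max_iter=10, min_ll=0, min_lldelta=0.1)
trainf = make_classifier_builder(args)
# classifier = trainf(train_feats)  # train_feats: list of (featureset, label) pairs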
import ipdb
ipdb.set_trace()

for clf in clfs:
    evaluate_cross_validataion(clf, data, target, 5)

import ipdb
ipdb.set_trace()

if not os.path.exists(MULTI_CLASSIFIER_NAME):
    st = str("/usr/local/bin/megam")
    import ipdb
    ipdb.set_trace()
    config_megam(st)

    rwords = reuters_high_info_words()
    featdet = lambda words: bag_of_words_in_set(words, rwords)
    multi_train_feats, multi_test_feats = reuters_train_test_feats(featdet)

    trainf = lambda train_feats: MaxentClassifier.train(
        train_feats, algorithm="megam", trace=0, max_iter=10)

    # labelset = set(reuters.categories())
    labelset = set(list(publics.keys()))
    classifiers = train_binary_classifiers(trainf, multi_train_feats, labelset)
    len(classifiers)

    multi_classifier = MultiBinaryClassifier(*classifiers.items())

    output = open(MULTI_CLASSIFIER_NAME, "wb")
    pickle.dump(multi_classifier, output)
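    # Sketch (not part of the original script): reload the pickled multi-label
    # classifier and classify one document. bag_of_words_in_set and rwords are the
    # helpers used above; the example sentence is illustrative.
    output.close()  # flush the dump above before re-reading
    with open(MULTI_CLASSIFIER_NAME, "rb") as f:
        multi_classifier = pickle.load(f)
    feats = bag_of_words_in_set("crude oil prices rose sharply".split(), rwords)
    print(multi_classifier.classify(feats))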
##########################

from nltk.data import path as nltk_data_path
nltk_data_location = os.getenv('NLTK_DATA_PATH')
if nltk_data_location is not None:
    nltk_data_path.append(nltk_data_location)

# EXTERNAL LIBRARIES
##########################
# NOTE Set this directory to wherever megam, MITIE, Stanford NER and SENNA are
# located.
EXTLIB_DIR = '/Data/nlp/utilities/'

#TODO this might need a try/except as well:
MEGAM_DIR = EXTLIB_DIR + 'megam_0.92/'
try:
    megam.config_megam(MEGAM_DIR + 'megam.opt')
except:
    print("megam is not installed or not configured correctly.")

MITIE_DIR = EXTLIB_DIR + 'MITIE/'
MITIE_LIB_DIR = MITIE_DIR + 'mitielib/'
sys.path.append(MITIE_LIB_DIR)
try:
    import mitie
except:
    print("To use the pre-trained MITIE model, you will need to install the "
          + "MITIE Python wrapper.")

try:
    stanford_ner_path = os.environ.get('CLASSPATH')
except:
import optparse
import sys
import bleu_smooth
from nltk.classify.megam import call_megam, parse_megam_weights, config_megam

optparser = optparse.OptionParser()
optparser.add_option("-k", "--kbest-list", dest="train", default="data/train.100best",
                     help="100-best translation lists")
optparser.add_option("-d", "--dev_kbest-list", dest="dev", default="data/dev+test.100best",
                     help="100-best translation lists")
optparser.add_option("-r", "--reference", dest="reference", default="data/train.ref",
                     help="Target language reference sentences")
(opts, _) = optparser.parse_args()

lm = tm1 = -0.92
tm2 = -1
megam_features = []
samples = []
sent_features = []
sign = lambda x: (1, -1)[x < 0]

config_megam("/usr/local/bin/")

#Read Reference Translation for Training
ref = [line.strip().split() for line in open(opts.reference)]

#Read Candidate Translations for Training
all_hyps = [pair.split(' ||| ') for pair in open(opts.train)]
num_sents = len(all_hyps) / 100

bleu_score_per_sent = []
for s in xrange(0, num_sents):
    del bleu_score_per_sent[:]
    del sent_features[:]
    del samples[:]
    empty = 0
    hyps_for_one_sent = all_hyps[s * 100:s * 100 + 100]

    #compute BLEU+1 for label and read/compute feature values
    for (num, hyp, feats) in hyps_for_one_sent:
        untranslated = 0