def rte_classifier(algorithm):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Train directly; wrapping this call in a lambda (as the original did)
        # would make the accuracy() call below fail.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = ("RTEClassifier only supports these algorithms:\n "
                   "'megam', 'BFGS', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
def rte_classifier(algorithm, sample_N=None):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
    test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])

    if sample_N is not None:
        train_set = train_set[:sample_N]
        test_set = test_set[:sample_N]

    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print("Training classifier...")
    if algorithm in ["megam"]:  # MEGAM based algorithms.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = ("RTEClassifier only supports these algorithms:\n "
                   "'megam', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)

    print("Testing classifier...")
    acc = accuracy(clf, featurized_test_set)
    print("Accuracy: %6.4f" % acc)
    return clf
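# Usage sketch for the variant above (assumes the NLTK RTE corpora are
# installed; the algorithm choice and sample size are illustrative).
clf = rte_classifier("IIS", sample_N=100)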
def rte_classifier():  # classifier
    featurized_train_set = rte_featurize(train_set, True)
    featurized_test_set_1 = rte_featurize(test_set_1, False, test_id=0)
    featurized_test_set_2 = rte_featurize(test_set_2, False, test_id=1)
    featurized_test_set_3 = rte_featurize(test_set_3, False, test_id=2)
    featurized_new_1 = rte_featurize(new_1, False, test_id=3)
    featurized_new_2 = rte_featurize(new_2, False, test_id=4)
    testing = [
        featurized_test_set_1, featurized_test_set_2, featurized_test_set_3,
        featurized_new_1, featurized_new_2
    ]
    print('Training classifier...')
    clf_svm = SklearnClassifier(LinearSVC()).train(featurized_train_set)
    clf_nb = nltk.NaiveBayesClassifier.train(featurized_train_set)
    clf_gis = MaxentClassifier.train(featurized_train_set, 'GIS')
    clf_iis = MaxentClassifier.train(featurized_train_set, 'IIS')
    clf_rf = SklearnClassifier(
        RandomForestClassifier(random_state=0)).train(featurized_train_set)
    print('Testing classifier...')
    # acc = m_accuracy(clf_rf, featurized_new_2, new_2)
    for testset in testing:
        print('=====Random Forest=====')
        m_accuracy(clf_rf, testset)
        print('=====SVM=====')
        m_accuracy(clf_svm, testset)
        print('=====Naive Bayes=====')
        m_accuracy(clf_nb, testset)
        print('=====MaxEnt GIS=====')
        m_accuracy(clf_gis, testset)
        print('======MaxEnt IIS======')
        m_accuracy(clf_iis, testset)
        print('===================================')
def main():
    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    parser.add_argument('--targetlang', type=str, required=True)
    parser.add_argument('--taggerhome', type=str, required=True)
    args = parser.parse_args()
    all_target_languages = "de es fr it nl".split()
    assert args.targetlang in all_target_languages
    target = args.targetlang
    sourceword = args.sourceword
    stanford.taggerhome = args.taggerhome

    gold_answers = read_gold.get_gold_answers(sourceword, target)

    instances = get_training_data(sourceword, target)
    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())

    ## with open("../eval/{0}.output".format(sourceword), "w") as outfile:
    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    problems = extract_wsd_problems(fn)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for key in dist.samples():
            print(" ", key, dist.prob(key))
def train_classifier(self, featureVector):
    print('Training the Naive Bayes Classifier..')
    nbclassifier = nbc.train(featureVector)
    print('success\n')
    nbclassifier.show_most_informative_features(20)
    print('Storing the classifier object...')
    pickle.dump(nbclassifier, open('data/trained_model_naivebayes.pickle', 'wb'))
    print('success')
    print('-------------------------\n')

    print('Training the Maximum Entropy Classifier..')
    meclassifier = mec.train(featureVector)
    print('success\n')
    meclassifier.show_most_informative_features()
    print('Storing the classifier object...')
    pickle.dump(meclassifier, open('data/trained_model_maxentropy.pickle', 'wb'))
    print('success')
    print('-------------------------\n')

    print('Training the SVM Classifier..')
    svmclassifier = SklearnClassifier(LinearSVC())
    svmclassifier.train(featureVector)
    print('success\n')
    print('Storing the classifier object...')
    pickle.dump(svmclassifier, open('data/trained_model_svm.pickle', 'wb'))
    print('success')
    print('-------------------------\n')
def train(self, clf_type):
    print('Training classifier...')
    words, labels = self.load_data(self.train_path)
    self.pos = [t[1] for t in nltk.pos_tag(words)]
    self.previous_labels = ["O"] + labels
    # next_labels = labels[1:] + ['O']
    features = [self.features(words, i) for i in range(len(words))]
    train_samples = [(f, l) for (f, l) in zip(features, labels)]
    if clf_type == 'SVM':
        # classifier = SklearnClassifier(
        #     make_pipeline(StandardScaler(with_mean=False),
        #                   SVC(kernel='rbf', probability=True,
        #                       max_iter=1000))).train(train_samples)
        classifier = SklearnClassifier(LinearSVC()).train(train_samples)
    elif clf_type == 'MLP':
        classifier = SklearnClassifier(MLPClassifier()).train(train_samples)
    elif clf_type == 'Naive Bayes':
        classifier = NaiveBayesClassifier.train(train_samples)
    else:
        classifier = MaxentClassifier.train(train_samples, max_iter=self.max_iter)
    self.dict_classifiers[clf_type] = classifier
    self.pos = self.previous_labels = None
def main():
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    with open(sys.argv[1]) as infile:
        lines = infile.readlines()
        words_to_include = [line.strip() for line in lines]

    print("extracting training instances...")
    for wordnum, sw in enumerate(words_to_include):
        instances = get_instances(sw)
        if not instances:
            print("no instances for {0}, skipping".format(sw))
            continue
        if len(instances) > MAX_TRAINING_INSTANCES:
            print("TOO MANY! Sampling {0} down.".format(sw))
            instances = random.sample(instances, MAX_TRAINING_INSTANCES)
        print("training", sw, "{0}/{1} with {2} instances".format(
            wordnum, len(words_to_include), len(instances)))
        classifier = MaxentClassifier.train(instances, trace=0, max_iter=10,
                                            algorithm='megam')
        picklestore.save(sw, classifier)
def test_feature_extraction_for_maxent_classifier(self):
    print("Testing Feature extraction for maxent classifier...")
    from oke.oak.nif2rdfProcessor import NIF2RDFProcessor
    dataProcessor = NIF2RDFProcessor()
    context_data = dataProcessor.aggregate_context_data(
        dataProcessor.graphData_goldstandards,
        'http://www.ontologydesignpatterns.org/data/oke-challenge/task-2/sentence-93#char=0,179',
        'The Southern Intercollegiate Athletic Conference is a College athletic conference consisting of historically black colleges and universities located in the southern United States.'
    )
    featFactory = FeatureFactory()
    datums = featFactory.compute_features(context_data)
    featFactory.writeData(datums, 'test_trainWithFeatures')
    datums = featFactory.readData('test_trainWithFeatures.json')
    train_set = [(datum.features, datum.label) for datum in datums]
    print(train_set)
    from nltk.classify.maxent import MaxentClassifier
    me_classifier = MaxentClassifier.train(train_set)
    predicted_label = me_classifier.classify({
        'word': 'conference',
        'word_root': 'conference',
        'word_pos': 'NN',
        'isEntity': 'N',
        'isStopWord': 'N',
        'prev_word_isStopWord': 'N'
    })
    print('predicted label:', predicted_label)
    print('========Show top 10 most informative features========')
    me_classifier.show_most_informative_features(10)
def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
          rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3, **cutoffs):
    """
    MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
    sentences.

    @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
    @param train_sents: A list of tagged sentences. Each sentence is
    represented by a list of tuples. Each tuple holds two strings, a
    word and its tag, e.g. ('company', 'NN').

    @type algorithm: C{str}
    @param algorithm: The algorithm that is used by
    L{nltk.MaxentClassifier.train()} to train and optimise the model. It is
    B{strongly recommended} to use the C{LM-BFGS} algorithm provided by the
    external package U{megam<http://hal3.name/megam/>}, as it is much faster
    and uses less memory than any of the algorithms provided by NLTK (i.e.
    C{GIS}, C{IIS}) or L{scipy} (e.g. C{CG} and C{BFGS}).

    @type rare_word_cutoff: C{int}
    @param rare_word_cutoff: Words with fewer occurrences than
    C{rare_word_cutoff} will be treated differently by L{extract_feats}
    than non-rare words (cf. Ratnaparkhi 1996).

    @type rare_feat_cutoff: C{int}
    @param rare_feat_cutoff: ignore features that occur fewer than
    C{rare_feat_cutoff} times during training.

    @type uppercase_letters: C{regex}
    @param uppercase_letters: a regular expression that covers all
    uppercase letters of the language of your corpus (e.g. '[A-ZÄÖÜ]'
    for German)

    @type trace: C{int}
    @param trace: The level of diagnostic output to produce. C{0} doesn't
    produce any output, while C{3} will give all the output that C{megam}
    produces plus the time it took to train the model.

    @param cutoffs: Arguments specifying various conditions under which the
    training should be halted. When using C{MEGAM}, only C{max_iter} should
    be relevant. For other cutoffs see L{nltk.MaxentClassifier}
        - C{max_iter=v}: Terminate after C{v} iterations.
    """
    self.uppercase_letters = uppercase_letters
    self.word_freqdist = self.gen_word_freqs(train_sents)
    self.featuresets = self.gen_featsets(train_sents, rare_word_cutoff)
    self.features_freqdist = self.gen_feat_freqs(self.featuresets)
    self.cutoff_rare_feats(self.featuresets, rare_feat_cutoff)

    t1 = time.time()
    self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                             trace, **cutoffs)
    t2 = time.time()
    if trace > 0:
        print("time to train the classifier: {0}".format(round(t2 - t1, 3)))
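# The docstring above strongly recommends megam; the external binary must be
# registered with NLTK before train() is called with algorithm='megam'.
# Sketch only: the binary path is an assumption (it matches the
# config_megam() calls used in other snippets on this page).
import nltk.classify.megam

nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')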
def get_maxent_classifier(sourceword, target):
    instances = get_training_data_from_extracted(sourceword, target)
    instances = train_from_extracted.remove_onecount_instances(instances)
    print("got {0} training instances!!".format(len(instances)))
    print("... training ...")
    classifier = MaxentClassifier.train(instances, trace=0, max_iter=20,
                                        algorithm="megam")
    print("LABELS", classifier.labels())
    return classifier
def train(self, train_sents, **cutoffs):
    self.word_freqdist = self.gen_word_freqs(train_sents)
    featuresets = self.gen_featsets(train_sents, self._rare_word_cutoff)
    print("Start training maxent...")
    self.classifier = MaxentClassifier.train(featuresets, self._algorithm,
                                             self._trace, **cutoffs)
    print("Finish training maxent!")
def memm_train():
    X_train, y_train = prep_memm_feats('assignment2dataset/train.txt')
    train_feature = [(a, b) for a, b in zip(X_train, y_train)]
    memm = MaxentClassifier.train(train_feature, max_iter=60)
    # Use a context manager so the pickle file is flushed and closed.
    with open("./MEMM/memm.pt", "wb") as fopen:
        pickle.dump(memm, fopen)
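# Companion sketch: reload the pickled MEMM trained above (same path as in
# memm_train; the feature-dict format is whatever prep_memm_feats produced).
import pickle

with open("./MEMM/memm.pt", "rb") as f:
    memm = pickle.load(f)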
def results(train, query_data, query_no_label, query_labels):
    print('\nCalculating final results...')
    # build and train the maxent classifier
    megam_classifier = MaxentClassifier.train(train, 'megam')
    # calculate the classification accuracy
    accu = accuracy(megam_classifier, query_data)
    # get a list of predicted labels
    predicted = megam_classifier.classify_many(query_no_label)
    # build confusion matrix
    cm = confusion_matrix(query_labels, predicted)
    return accu, cm
def classify(inputdir):
    # filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)
    feat_set = []
    sets = []
    for name in filenames:
        # print(name)
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training', sense)
        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()
                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_words(line), sense))
                # feat_set.append((get_feature2(line), sense))
            else:
                words = []
                tags = []
        file.close()
    random.shuffle(feat_set)
    random.shuffle(feat_set)
    # random.shuffle(feat_set)
    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    # classifier = MaxentClassifier.train(train_data)
    nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8,
                                                 depth_cutoff=5, support_cutoff=30)
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data, algorithm='iis',
                                                trace=0, max_iter=1,
                                                min_lldelta=0.5)
    print("nb accuracy " + str(accuracy(nb_classifier, test_data) * 100))
    print("dt accuracy " + str(accuracy(dt_classifier, test_data) * 100))
    print("entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100))
    mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier)
    print("max vote accuracy " + str(accuracy(mv_classifier, test_data) * 100))
def findMEPerformance(self):
    self.train_set_size, self.test_set_size, self.trainSet, self.testSet = self.findSet()
    classifier = mec.train(self.trainSet, algorithm='iis', max_iter=50)
    bull_precision, bear_precision, neutral_precision = self.findPrecision(classifier)
    bull_recall, bear_recall, neutral_recall = self.findRecall(classifier)
    bull_fmetric, bear_fmetric, neutral_fmetric = self.findFMetric(classifier)
    accuracy = self.findAccuracy(classifier)
    return (self.train_set_size, self.test_set_size, accuracy,
            bull_precision, bear_precision, neutral_precision,
            bull_recall, bear_recall, neutral_recall,
            bull_fmetric, bear_fmetric, neutral_fmetric)
def cross_validate(self):
    all_train_list = []
    all_held_list = []
    for ethnicity_list in self.training_lists:
        train_list, held_list = self.split_list_crossvalidation(ethnicity_list)
        all_train_list.append(train_list)
        all_held_list.append(held_list)
    toks = self.make_train_toks(all_train_list)
    self.classifier = mxc.train(toks)
    self.evaluate_success(all_held_list)
def train(self):
    print('Training classifier...')
    words, labels = self.load_data(self.train_path)
    previous_labels = ["O"] + labels
    features = [self.features(words, previous_labels[i], i)
                for i in range(len(words))]
    train_samples = [(f, l) for (f, l) in zip(features, labels)]
    classifier = MaxentClassifier.train(train_samples, max_iter=self.max_iter)
    self.classifier = classifier
def get_maxent_classifier(sourceword, target):
    instances = get_training_data_from_extracted(sourceword, target)
    print("got {0} training instances!!".format(len(instances)))
    without_onecounts = remove_onecount_instances(instances)
    print("removed {0} one-count instances!!".format(
        len(instances) - len(without_onecounts)))
    instances = without_onecounts
    print("... training ...")
    classifier = MaxentClassifier.train(instances, trace=0, max_iter=20,
                                        algorithm='megam')
    print("LABELS", classifier.labels())
    return classifier
def _train_ME_Classifier(extractedBases, lbls, params={}):
    """NLTK ME Training Wrapper"""
    trainset = [[eb, lbl] for eb, lbl in zip(extractedBases, lbls)]
    optimizer = params.get('optimizer', 'GIS')
    trace = params.get('trace', 3)
    encoding = params.get('encoding', None)
    labels = params.get('labels', None)
    sparse = params.get('sparse', True)
    gaussian_prior_sigma = params.get('gaussian_prior_sigma', 0)
    max_iter = params.get('max_iter', 25)
    classifier = nltkmec.train(trainset, optimizer, trace=trace,
                               encoding=encoding, labels=labels, sparse=sparse,
                               gaussian_prior_sigma=gaussian_prior_sigma,
                               max_iter=max_iter)
    return classifier, classifier.labels()
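# Call sketch for the wrapper above; the feature dicts, labels, and parameter
# values are illustrative placeholders, not part of the original code.
bases = [{'last_letter': 'a'}, {'last_letter': 'k'}]
lbls = ['female', 'male']
clf, label_set = _train_ME_Classifier(
    bases, lbls, params={'optimizer': 'IIS', 'max_iter': 10})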
def train(self, train_set):
    split_size_train = 0.7
    print(' split ', split_size_train * 100, '% from gold standards for training ... ')
    from nltk.classify.maxent import MaxentClassifier
    from nltk.classify.naivebayes import NaiveBayesClassifier
    # 10 fold test
    fold_n = 2
    all_f_measure = []
    all_precision = []
    all_recall = []
    import random
    for i in range(1, fold_n):
        print("start [%s] fold validation..." % i)
        random.shuffle(train_set)
        split_point = round(len(train_set) * split_size_train)
        _train_set, _test_set = train_set[:split_point], train_set[split_point:]
        me_classifier = MaxentClassifier.train(_train_set)
        # nb_classifier = NaiveBayesClassifier.train(_train_set)
        # from sklearn.svm import LinearSVC
        # from nltk.classify.scikitlearn import SklearnClassifier
        # print("training SVM Classifier...")
        # svm_classifier = SklearnClassifier(LinearSVC())
        # svm_classifier = svm_classifier.train(_train_set)
        # print("complete SVM training.")
        self.benchmarking(me_classifier, _test_set, all_f_measure,
                          all_precision, all_recall)
    print("all_f_measure,", all_f_measure)
    print("all_precision,", all_precision)
    print("all_recall", all_recall)
    print("Final F-measure", sum(all_f_measure) / float(len(all_f_measure)))
    print("Final precision", sum(all_precision) / float(len(all_precision)))
    print("Final recall", sum(all_recall) / float(len(all_recall)))
    self.save_classifier_model(me_classifier, 'me_class_inducer.m')
    return me_classifier
def train_classifier(classifier, directory, feature, name=None, scorethreshold=None):
    """Creates and trains an NLTK classifier from the nltk.classify package.

    classifier - a classifier class that supports training
    directory - directory containing the training set (inside wedt/training)
    feature - feature set function (features.py)
    """
    if classifier == "MaxEnt":
        from nltk.classify.maxent import MaxentClassifier as Classifier
    elif classifier == "PositiveNaiveBayes":
        from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier as Classifier
    else:
        from nltk.classify.naivebayes import NaiveBayesClassifier as Classifier
    featuresets = get_featuresets(directory, feature, scorethreshold)
    c = Classifier.train(featuresets)
    if name:
        # Pickle requires a binary-mode file handle.
        with open(os.path.join(classifier_path, name), 'wb') as file:
            pickle.dump((c, feature), file)
    return c
def trainClassifier(self):
    '''
    Calculates features and trains the maxent classifier, storing the
    resulting model in <self.model>
    '''
    # check if pickled
    pickled_model = self.checkForPickle()
    if pickled_model:
        self.model = pickled_model
    else:
        self.initFeatures()
        print('Done reading in training examples')
        kargs = {
            'algorithm': 'gis',
        }
        if self.max_iter is not None:
            kargs['max_iter'] = self.max_iter
        self.model = MaxentClassifier.train(self.shrunk_training_examples, **kargs)
        self.pickleModel()
def maxent(featuresets, num):
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    maxent_classifier = MaxentClassifier.train(train_set, trace=3, max_iter=num)
    return (maxent_classifier, test_set)
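# Usage sketch (illustrative): assumes featuresets is a list of
# (feature-dict, label) pairs and `from nltk.classify.util import accuracy`.
clf, held_out = maxent(featuresets, 10)
print(accuracy(clf, held_out))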
def classify(inputdir):
    # filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)
    feat_set = []
    sets = []
    for name in filenames:
        # print(name)
        labeledlist = []
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training', sense)
        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()
                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_bigrams_words(words), sense))
                # feat_set.append((context_feature(line), sense))
            else:
                words = []
                tags = []
        print(lineno)
        labeledlist.append((sense, allwords))
        # feat_set.append((bigram_feature(allwords), sense))
        file.close()
        high_info_words = set(high_information_words(labeledlist))
        for item in high_info_words:
            print(item)
    random.shuffle(feat_set)
    random.shuffle(feat_set)
    random.shuffle(feat_set)
    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    print("training on " + str(len(train_data)) + " instances")
    print("testing on " + str(len(test_data)) + " instances")
    # classifier = MaxentClassifier.train(train_data)
    # nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8,
                                                 depth_cutoff=7, support_cutoff=10)
    # print(dt_classifier.pp())
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data, algorithm='iis',
                                                trace=0, max_iter=2,
                                                min_lldelta=0.5)
    print("nb accuracy ")
    # print(accuracy(nb_classifier, test_data) * 100)
    # print("nb precision and recall")
    # print(precision_recall(nb_classifier, test_data))
    # print(nb_classifier.show_most_informative_features())
    # for item in nb_classifier.most_informative_features():
    #     print(item)
    # print("dt accuracy " + str(accuracy(dt_classifier, test_data) * 100))
    print("entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100))
def dateMap(L):
    return dict((row['date'], row) for row in L)

with open('counter-dump-normalized', 'r') as f:
    reader = csv.DictReader(f)
    W = dateMap(reader)

train_set = []
dates = set(W)
for ds in dates:
    try:
        ds_data = {}
        if bool(int(W[ds]['relaxation'])):
            ds_data['relaxation'] = True
        if int(W[ds]['caffeine']) > 0:
            ds_data['caffeine'] = True
        if int(W[ds]['sweets']) > 1:
            ds_data['sweets'] = True
        if int(W[ds]['alcohol']) > 4:
            ds_data['alcohol'] = True
    except (ValueError, KeyError):
        continue
    exercised = bool(int(W[ds]['exercise'])) and 'exercise' or 'no-exercise'
    train_set.append((ds_data, exercised))

classifier = MaxentClassifier.train(train_set, algorithm='IIS',
                                    max_iter=100, min_lldelta=0.0001)
classifier.show_most_informative_features()
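# Follow-up sketch (illustrative, not in the original): query the trained
# habits model for one hypothetical day, mirroring the feature keys built in
# the loop above.
dist = classifier.prob_classify({'caffeine': True, 'sweets': True})
print(dist.prob('exercise'), dist.prob('no-exercise'))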
def train(self):
    tokens = self.make_train_toks(self.training_lists)
    self.classifier = mxc.train(tokens, algorithm="iis")
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion="entropy",
                                    random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
cm_stats(y_test, y_pred, "Random Forest")

# Fitting CART to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
cm_stats(y_test, y_pred, "CART")

# Fitting Maximum Entropy to the Training set
from nltk.classify.maxent import MaxentClassifier
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
# NLTK's MaxentClassifier expects (feature-dict, label) pairs rather than
# arrays, so wrap the feature-matrix rows in dicts before training.
train_toks = [(dict(enumerate(row)), label)
              for row, label in zip(X_train, y_train)]
classifier = MaxentClassifier.train(train_toks, algorithm)

# Predicting the Test set results (classify_many is NLTK's batch predict)
y_pred = classifier.classify_many([dict(enumerate(row)) for row in X_test])
cm_stats(y_test, y_pred, "Maximum Entropy")
i = 0
while i < 1400:
    if i < 700:
        train_output.append(1)
    else:
        train_output.append(0)
    i = i + 1

train_set = []
for i in range(1400):
    f = trainDataVecs[i]
    f = f.tolist()
    output = train_output[i]
    train_set.append((dict(enumerate(f)), output))

maxent_classifier = MaxentClassifier.train(train_set, max_iter=25)

true_positives = 0.0
true_negatives = 0.0
false_positives = 0.0
false_negatives = 0.0
for i in range(600):
    input_feature = testDataVecs[i].tolist()
    prediction = maxent_classifier.classify(dict(enumerate(input_feature)))
    if i < 300:
        if prediction == 1:
            true_positives = true_positives + 1
        else:
            false_negatives = false_negatives + 1
    else:
        # Symmetric branch for the negative half of the test set (the snippet
        # was cut off here; this completion follows the counters initialized above).
        if prediction == 1:
            false_positives = false_positives + 1
        else:
            true_negatives = true_negatives + 1
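# Follow-up arithmetic sketch (not in the original snippet): the four
# counters above reduce to precision/recall/F1, assuming non-zero sums.
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1 = 2 * precision * recall / (precision + recall)
print("precision=%.4f recall=%.4f f1=%.4f" % (precision, recall, f1))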
def train(self, fset):
    self.classifier = MaxentClassifier.train(fset, encoding=self._encoding)
def train(self, max_iter):
    return MaxentClassifier.train(train_toks=self.trainTokens, max_iter=max_iter)
def clf(x):
    return MaxentClassifier.train(featurized_train_set, algorithm)
def build_classifier(trainSet):
    if not trainSet:
        return None
    classifier = MaxentClassifier.train(trainSet, algorithm='gis')
    return classifier
print('* Loaded training corpus', file=sys.stderr, end='\n ')
print('\n '.join(out), file=sys.stderr)
print('* Training model...', end=' ', file=sys.stderr)
if args.model_name == 'memo':
    train_toks = make_word_featuresets(train_corpus.reader)
    model = MemoTraining.train(train_toks)
elif args.model_name == 'maxent':
    contexts = args.contexts if args.contexts is not None else range(-2, 3)
    train_toks = make_maxent_featuresets(train_corpus.reader, ds=contexts)
    encoding = TypedMaxentFeatureEncoding.train(
        train_toks, count_cutoff=args.cutoff, alwayson_features=True)
    model = MaxentClassifier.train(train_toks, encoding=encoding,
                                   max_iter=args.max_iter)
else:
    train_toks = make_dummy_featuresets(train_corpus.reader)
    model = MajorityTag.train(train_toks)
print('done', file=sys.stderr)

with open(args.model_path, 'wb') as f:
    pickle.dump(model, f)
print(f'* Model saved to {args.model_path}', file=sys.stderr)

print('* Evaluate on training set:', file=sys.stderr)
train_featuresets = [fs for fs, _ in train_toks]
hyp_tags = model.classify_many(train_featuresets)
ref_tags = [tag for _, tag in train_corpus.reader.tagged_words()]
result = evaluate(ref_tags, hyp_tags)
print(featuresets[0:10])
train_set, test_set = featuresets[500:], featuresets[:500]
print(len(train_set))
print(len(test_set))

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))
print(classify.accuracy(nb_classifier, test_set))
print(nb_classifier.show_most_informative_features(5))

me_classifier = MaxentClassifier.train(train_set)
print(me_classifier.classify(gender_features('Gary')))
print(me_classifier.classify(gender_features('Grace')))
classify.accuracy(me_classifier, test_set)
me_classifier.show_most_informative_features(5)

def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features