Example #1
def rte_classifier(algorithm):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'BFGS', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
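For orientation, a minimal usage sketch of the function above (the download step and the call are assumptions, not part of the original snippet; the RTE corpora must be available locally):

import nltk

nltk.download('rte')           # fetches the rte1/rte2/rte3 dev and test sets used above
clf = rte_classifier('IIS')    # train and evaluate with the default IIS MaxEnt algorithm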
def rte_classifier(algorithm, sample_N=None):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
    test_set = rte_corpus.pairs(
        ["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])

    if sample_N is not None:
        train_set = train_set[:sample_N]
        test_set = test_set[:sample_N]

    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print("Training classifier...")
    if algorithm in ["megam"]:  # MEGAM based algorithms.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)
    print("Testing classifier...")
    acc = accuracy(clf, featurized_test_set)
    print("Accuracy: %6.4f" % acc)
    return clf
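The sample_N variant above is handy for quick smoke tests; a hedged example call (the sample size is arbitrary):

clf = rte_classifier('GIS', sample_N=100)   # train and test on the first 100 pairs only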
Example #3
def rte_classifier():  # classifier
    featurized_train_set = rte_featurize(train_set, True)
    featurized_test_set_1 = rte_featurize(test_set_1, False, test_id=0)
    featurized_test_set_2 = rte_featurize(test_set_2, False, test_id=1)
    featurized_test_set_3 = rte_featurize(test_set_3, False, test_id=2)
    featurized_new_1 = rte_featurize(new_1, False, test_id=3)
    featurized_new_2 = rte_featurize(new_2, False, test_id=4)
    testing = [
        featurized_test_set_1, featurized_test_set_2, featurized_test_set_3,
        featurized_new_1, featurized_new_2
    ]
    print('Training classifier...')
    clf_svm = SklearnClassifier(LinearSVC()).train(featurized_train_set)
    clf_nb = nltk.NaiveBayesClassifier.train(featurized_train_set)
    clf_gis = MaxentClassifier.train(featurized_train_set, 'GIS')
    clf_iis = MaxentClassifier.train(featurized_train_set, 'IIS')
    clf_rf = SklearnClassifier(
        RandomForestClassifier(random_state=0)).train(featurized_train_set)
    print('Testing classifier...')
    # acc = m_accuracy(clf_rf, featurized_new_2, new_2)
    for testset in testing:
        print "=====Random Forest====="
        m_accuracy(clf_rf, testset)
        print "=====SVM====="
        m_accuracy(clf_svm, testset)
        print "=====Naive Bayes====="
        m_accuracy(clf_nb, testset)
        print "=====MaxEnt GIS====="
        m_accuracy(clf_gis, testset)
        print "======MaxEnt IIS======"
        m_accuracy(clf_iis, testset)
        print '==================================='
Example #4
def rte_classifier(algorithm):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
        )
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
Example #5
def main():
    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    parser.add_argument('--targetlang', type=str, required=True)
    parser.add_argument('--taggerhome', type=str, required=True)
    args = parser.parse_args()

    all_target_languages = "de es fr it nl".split()
    assert args.targetlang in all_target_languages
    target = args.targetlang
    sourceword = args.sourceword
    stanford.taggerhome = args.taggerhome

    gold_answers = read_gold.get_gold_answers(sourceword, target)
    instances = get_training_data(sourceword, target)
    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())

    ## with open("../eval/{0}.output".format(sourceword), "w") as outfile:
    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    problems = extract_wsd_problems(fn)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for key in dist.samples():
            print(" ", key, dist.prob(key))
    def train_classifier(self, featureVector):

        print('Training the Naive Bayes Classifier..')
        nbclassifier = nbc.train(featureVector)
        print('success\n')
        nbclassifier.show_most_informative_features(20)
        print('Storing the classifier object...')
        pickle.dump(nbclassifier,
                    open('data/trained_model_naivebayes.pickle', 'wb'))
        print('success')
        print('-------------------------\n')

        print('Training the Maximum Entropy Classifier..')
        meclassifier = mec.train(featureVector)
        print('success\n')
        meclassifier.show_most_informative_features()
        print('Storing the classifier object...')
        pickle.dump(meclassifier,
                    open('data/trained_model_maxentropy.pickle', 'wb'))
        print('success')
        print('-------------------------\n')

        print('Training the SVM Classifier..')
        svmclassifier = SklearnClassifier(LinearSVC())
        svmclassifier.train(featureVector)
        print('success\n')
        print('Storing the classifier object...')
        pickle.dump(svmclassifier, open('data/trained_model_svm.pickle', 'wb'))
        print('success')
        print('-------------------------\n')
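A minimal sketch, assuming the pickle paths written above, of loading one of the stored models back for prediction (the featureset keys are purely illustrative):

import pickle

with open('data/trained_model_maxentropy.pickle', 'rb') as f:
    meclassifier = pickle.load(f)
# Placeholder featureset; use whatever keys featureVector actually contained.
print(meclassifier.classify({'example_feature': True}))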
Example #7
    def train(self, clf_type):
        print('Training classifier...')

        words, labels = self.load_data(self.train_path)

        self.pos = [t[1] for t in nltk.pos_tag(words)]

        self.previous_labels = ["O"] + labels
        # next_labels = labels[1:] + ['O']

        features = [self.features(words, i) for i in range(len(words))]
        train_samples = [(f, l) for (f, l) in zip(features, labels)]
        if clf_type == 'SVM':
            # classifier = SklearnClassifier( make_pipeline(StandardScaler(with_mean=False), SVC(kernel='rbf',
            # probability=True, max_iter=1000))).train(train_samples)
            classifier = SklearnClassifier(LinearSVC()).train(train_samples)
        elif clf_type == 'MLP':
            classifier = SklearnClassifier(
                MLPClassifier()).train(train_samples)
        elif clf_type == 'Naive Bayes':
            classifier = NaiveBayesClassifier.train(train_samples)
        else:
            classifier = MaxentClassifier.train(train_samples,
                                                max_iter=self.max_iter)
        self.dict_classifiers[clf_type] = classifier
        self.pos = self.previous_labels = None
Example #8
def main():
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')

    with open(sys.argv[1]) as infile:
        lines = infile.readlines()
    words_to_include = [line.strip() for line in lines]

    print("extracting training instances...")
    for wordnum, sw in enumerate(words_to_include):
        instances = get_instances(sw)
        if not instances:
            print("no instances for {0}, skipping".format(sw))
            continue

        if len(instances) > MAX_TRAINING_INSTANCES:
            print("TOO MANY! Sampling {0} down.".format(sw))
            instances = random.sample(instances, MAX_TRAINING_INSTANCES)
            
        print("training", sw, "{0}/{1} with {2} instances".format(
            wordnum, len(words_to_include), len(instances)))
        classifier = MaxentClassifier.train(instances,
                                            trace=0,
                                            max_iter=10,
                                            algorithm='megam')
        picklestore.save(sw, classifier)
Example #9
    def test_feature_extraction_for_maxent_classifier(self):
        print("Testing Feature extraction for maxent classifier...")
        from oke.oak.nif2rdfProcessor import NIF2RDFProcessor
        dataProcessor = NIF2RDFProcessor()
        context_data = dataProcessor.aggregate_context_data(
            dataProcessor.graphData_goldstandards,
            'http://www.ontologydesignpatterns.org/data/oke-challenge/task-2/sentence-93#char=0,179',
            'The Southern Intercollegiate Athletic Conference is a College athletic conference consisting of historically black colleges and universities located in the southern United States.'
        )
        featFactory = FeatureFactory()
        datums = featFactory.compute_features(context_data)
        featFactory.writeData(datums, 'test_trainWithFeatures')

        datums = featFactory.readData('test_trainWithFeatures.json')
        train_set = [(datum.features, datum.label) for datum in datums]
        print(train_set)

        from nltk.classify.maxent import MaxentClassifier

        me_classifier = MaxentClassifier.train(train_set)
        predit_label = me_classifier.classify({
            'word': 'conference',
            'word_root': 'conference',
            'word_pos': 'NN',
            'isEntity': 'N',
            'isStopWord': 'N',
            'prev_word_isStopWord': 'N'
        })
        print('predicted label:', predit_label)
        print('========Show top 10 most informative features========')
        me_classifier.show_most_informative_features(10)
Example #10
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
        sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is
        represented by a list of tuples. Each tuple holds two strings, a
        word and its tag, e.g. ('company','NN').

        @type algorithm: C{str}
        @param algorithm: The algorithm that is used by
        L{nltk.MaxentClassifier.train()} to train and optimise the model. It is
        B{strongly recommended} to use the C{LM-BFGS} algorithm provided by the
        external package U{megam<http://hal3.name/megam/>} as it is much faster
        and uses less memory than any of the algorithms provided by NLTK (i.e.
        C{GIS}, C{IIS}) or L{scipy} (e.g. C{CG} and C{BFGS}).

        @type rare_word_cutoff: C{int}
        @param rare_word_cutoff: Words with less occurrences than
        C{rare_word_cutoff} will be treated differently by L{extract_feats}
        than non-rare words (cf. Ratnaparkhi 1996).

        @type rare_feat_cutoff: C{int}
        @param rare_feat_cutoff: ignore features that occur less than
        C{rare_feat_cutoff} during training.

        @type uppercase_letters: C{regex}
        @param uppercase_letters: a regular expression that covers all
        uppercase letters of the language of your corpus (e.g. '[A-ZÄÖÜ]' for
        German)

        @type trace: C{int}
        @param trace: The level of diagnostic output to produce. C{0} doesn't
        produce any output, while C{3} will give all the output that C{megam}
        produces plus the time it took to train the model.

        @param cutoffs: Arguments specifying various conditions under
            which the training should be halted. When using C{MEGAM}, only
            C{max_iter} should be relevant. For other cutoffs see
            L{nltk.MaxentClassifier}

              - C{max_iter=v}: Terminate after C{v} iterations.
       """
        self.uppercase_letters = uppercase_letters
        self.word_freqdist = self.gen_word_freqs(train_sents)
        self.featuresets = self.gen_featsets(train_sents,
                rare_word_cutoff)
        self.features_freqdist = self.gen_feat_freqs(self.featuresets)
        self.cutoff_rare_feats(self.featuresets, rare_feat_cutoff)

        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
        if trace > 0:
            print "time to train the classifier: {0}".format(round(t2-t1, 3))
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        """
        MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
        sentences.

        @type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
        @param train_sents: A list of tagged sentences. Each sentence is
        represented by a list of tuples. Each tuple holds two strings, a
        word and its tag, e.g. ('company','NN').

        @type algorithm: C{str}
        @param algorithm: The algorithm that is used by
        L{nltk.MaxentClassifier.train()} to train and optimise the model. It is
        B{strongly recommended} to use the C{LM-BFGS} algorithm provided by the
        external package U{megam<http://hal3.name/megam/>} as it is much faster
        and uses less memory than any of the algorithms provided by NLTK (i.e.
        C{GIS}, C{IIS}) or L{scipy} (e.g. C{CG} and C{BFGS}).

        @type rare_word_cutoff: C{int}
        @param rare_word_cutoff: Words with less occurrences than
        C{rare_word_cutoff} will be treated differently by L{extract_feats}
        than non-rare words (cf. Ratnaparkhi 1996).

        @type rare_feat_cutoff: C{int}
        @param rare_feat_cutoff: ignore features that occur less than
        C{rare_feat_cutoff} during training.

        @type uppercase_letters: C{regex}
        @param uppercase_letters: a regular expression that covers all
        uppercase letters of the language of your corpus (e.g. '[A-ZÄÖÜ]' for
        German)

        @type trace: C{int}
        @param trace: The level of diagnostic output to produce. C{0} doesn't
        produce any output, while C{3} will give all the output that C{megam}
        produces plus the time it took to train the model.

        @param cutoffs: Arguments specifying various conditions under
            which the training should be halted. When using C{MEGAM}, only
            C{max_iter} should be relevant. For other cutoffs see
            L{nltk.MaxentClassifier}

              - C{max_iter=v}: Terminate after C{v} iterations.
       """
        self.uppercase_letters = uppercase_letters
        self.word_freqdist = self.gen_word_freqs(train_sents)
        self.featuresets = self.gen_featsets(train_sents,
                rare_word_cutoff)
        self.features_freqdist = self.gen_feat_freqs(self.featuresets)
        self.cutoff_rare_feats(self.featuresets, rare_feat_cutoff)

        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
        if trace > 0:
            print("time to train the classifier: {0}".format(round(t2-t1, 3)))
Example #12
def get_maxent_classifier(sourceword, target):
    instances = get_training_data_from_extracted(sourceword, target)
    instances = train_from_extracted.remove_onecount_instances(instances)
    print("got {0} training instances!!".format(len(instances)))
    print("... training ...")
    classifier = MaxentClassifier.train(instances, trace=0, max_iter=20, algorithm="megam")
    print("LABELS", classifier.labels())
    return classifier
Example #13
    def train(self, train_sents, **cutoffs):
        self.word_freqdist = self.gen_word_freqs(train_sents)
        featuresets = self.gen_featsets(train_sents, self._rare_word_cutoff)

        print("Start training maxent...")
        self.classifier = MaxentClassifier.train(featuresets, self._algorithm,
                                                 self._trace, **cutoffs)
        print("Finish training maxent!")
Example #14
def memm_train():
    X_train, y_train = prep_memm_feats('assignment2dataset/train.txt')

    train_feature = [(a, b) for a, b in zip(X_train, y_train)]
    memm = MaxentClassifier.train(train_feature, max_iter=60)

    with open("./MEMM/memm.pt", "wb") as fopen:
        pickle.dump(memm, fopen)
def results(train, query_data, query_no_label, query_labels):
    print('\nCalculating final results...')
    megam_classifier = MaxentClassifier.train(train, 'megam')  # build and train the maxent classifier
    accu = accuracy(megam_classifier, query_data)  # calculate the classification accuracy

    predicted = megam_classifier.classify_many(query_no_label)  # get a list of predicted labels
    cm = confusion_matrix(query_labels, predicted)  # build confusion matrix

    return accu, cm
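A hedged example of calling results(); the four arguments come from whatever feature extraction precedes it, and the type of cm depends on the confusion_matrix helper in scope:

accu, cm = results(train, query_data, query_no_label, query_labels)
print('accuracy: %6.4f' % accu)
print(cm)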
Example #16
def classify(inputdir):
        #filenames = os.listdir('d:\\shir\\')
        filenames = os.listdir(inputdir)

        feat_set = []
        sets = []
        for name in filenames:
        # print name
            lineno=0
            path = os.path.join(inputdir, name)
            sense = name.split('\\')[-1].split('.')[0]
            print('training', sense)

            file = codecs.open(path, 'r', 'utf-8')
            allwords = []
            for line in file:
              if len(line.split())>2:
                     lineno+=1
                     line = line.strip()
                     words=[]
                     tags=[]
                     tokens = line.split()

                     for item in tokens:
                           if len(item.split('\\'))==2:
                                word=item.split('\\')[0]
                                tag= item.split('\\')[1]
                                words.append(word)
                                tags.append(tag)
                                allwords.append(word)
                     feat_set.append((bag_of_words(line),sense))
                     #feat_set.append((get_feature2(line),sense))
              else:
                  words=[]
                  tags=[]
            file.close()

        random.shuffle(feat_set)
        random.shuffle(feat_set)
        #random.shuffle(feat_set)



        train_data = train_feats(feat_set)
        test_data  = test_feats(feat_set)
        #classifier=  MaxentClassifier.train(train_data)
        nb_classifier = NaiveBayesClassifier.train(train_data)
        dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
       # pickle.dump(classifier, classifier_save_file)
        entropy_classifier = MaxentClassifier.train(train_data,algorithm='iis', trace=0, max_iter=1, min_lldelta=0.5)
        print "nb accuracy "+ str(accuracy(nb_classifier, test_data) * 100)
        print "dt accuracy "+ str(accuracy(dt_classifier, test_data) * 100)
        print "entropy accuracy "+ str(accuracy(entropy_classifier, test_data) * 100)
        mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier)
        print "max vote accuracy "+ str(accuracy(mv_classifier, test_data) * 100)
Example #17
 def findMEPerformance(self):
     self.train_set_size, self.test_set_size, self.trainSet, self.testSet = self.findSet(
     )
     classifier = mec.train(self.trainSet, algorithm='iis', max_iter=50)
     bull_precision, bear_precision, neutral_precision = self.findPrecision(
         classifier)
     bull_recall, bear_recall, neutral_recall = self.findRecall(classifier)
     bull_fmetric, bear_fmetric, neutral_fmetric = self.findFMetric(
         classifier)
     accuracy = self.findAccuracy(classifier)
     return self.train_set_size, self.test_set_size, accuracy, bull_precision, bear_precision, neutral_precision, bull_recall, bear_recall, neutral_recall, bull_fmetric, bear_fmetric, neutral_fmetric
	def cross_validate(self):
		all_train_list = []
		all_held_list = []
		for ethnicity_list in self.training_lists:
			train_list, held_list = self.split_list_crossvalidation(ethnicity_list)
			all_train_list.append(train_list)
			all_held_list.append(held_list)

		toks = self.make_train_toks(all_train_list)
		self.classifier = mxc.train(toks)
		self.evaluate_success(all_held_list)
Example #19
 def train(self):
     print('Training classifier...')
     words, labels = self.load_data(self.train_path)
     previous_labels = ["O"] + labels
     features = [
         self.features(words, previous_labels[i], i)
         for i in range(len(words))
     ]
     train_samples = [(f, l) for (f, l) in zip(features, labels)]
     classifier = MaxentClassifier.train(train_samples,
                                         max_iter=self.max_iter)
     self.classifier = classifier
Example #20
    def cross_validate(self):
        all_train_list = []
        all_held_list = []
        for ethnicity_list in self.training_lists:
            train_list, held_list = self.split_list_crossvalidation(
                ethnicity_list)
            all_train_list.append(train_list)
            all_held_list.append(held_list)

        toks = self.make_train_toks(all_train_list)
        self.classifier = mxc.train(toks)
        self.evaluate_success(all_held_list)
Example #21
def get_maxent_classifier(sourceword, target):
    instances = get_training_data_from_extracted(sourceword, target)
    print("got {0} training instances!!".format(len(instances)))
    without_onecounts = remove_onecount_instances(instances)
    print("removed {0} one-count instances!!".format(
        len(instances) - len(without_onecounts)))
    instances = without_onecounts

    print("... training ...")
    classifier = MaxentClassifier.train(instances,
                                        trace=0,
                                        max_iter=20,
                                        algorithm='megam')
    print("LABELS", classifier.labels())
    return classifier
Example #22
    def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
              rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
              **cutoffs):
        self.uppercase_letters = uppercase_letters
        self.word_freqdist = self.gen_word_freqs(train_sents)
        self.featuresets = self.gen_featsets(train_sents,
                rare_word_cutoff)
        self.features_freqdist = self.gen_feat_freqs(self.featuresets)
        self.cutoff_rare_feats(self.featuresets, rare_feat_cutoff)

        t1 = time.time()
        self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
                                                 trace, **cutoffs)
        t2 = time.time()
        if trace > 0:
            print "time to train the classifier: {0}".format(round(t2-t1, 3))
Example #23
def _train_ME_Classifier(extractedBases, lbls, params = {}):
    """ NLTK ME Training Wrapper"""

    trainset = [[eb, lbl] for eb, lbl in zip(extractedBases, lbls)]

    optimizer = params.get('optimizer', 'GIS')
    trace = params.get('trace', 3)
    encoding = params.get('encoding',None)
    labels = params.get('labels', None)
    sparse = params.get('sparse', True)
    gaussian_prior_sigma = params.get('gaussian_prior_sigma', 0)
    max_iter = params.get('max_iter', 25)

    classifier = nltkmec.train(trainset, optimizer, trace=trace,
                               encoding=encoding, labels=labels, sparse=sparse,
                               gaussian_prior_sigma=gaussian_prior_sigma,
                               max_iter=max_iter)

    return classifier, classifier.labels()
    def train(self, train_set):
        split_size_train = 0.7
        print(' split ', split_size_train * 100,
              '% from gold standards for training ... ')

        from nltk.classify.maxent import MaxentClassifier
        from nltk.classify.naivebayes import NaiveBayesClassifier

        # n-fold validation (fold_n = 2 below, so range(1, fold_n) runs a single split)
        fold_n = 2
        all_f_measure = []
        all_precision = []
        all_recall = []
        import random
        for i in range(1, fold_n):
            print("start [%s] fold validation..." % i)
            random.shuffle(train_set)

            _train_set, _test_set = train_set[:round(
                len(train_set) * split_size_train
            )], train_set[round(len(train_set) * split_size_train):]

            me_classifier = MaxentClassifier.train(_train_set)
            #nb_classifier = NaiveBayesClassifier.train(_train_set)

            #from sklearn.svm import LinearSVC
            #from nltk.classify.scikitlearn import SklearnClassifier
            #print("training SVM Classifier...")
            #svm_classifier = SklearnClassifier(LinearSVC())
            #svm_classifier = svm_classifier.train(_train_set)
            #print("complete SVM training.")

            self.benchmarking(me_classifier, _test_set, all_f_measure,
                              all_precision, all_recall)
        print("all_f_measure,", all_f_measure)
        print("all_precision,", all_precision)
        print("all_recall", all_recall)

        print("Final F-measure",
              sum(all_f_measure) / float(len(all_f_measure)))
        print("Final precision",
              sum(all_precision) / float(len(all_precision)))
        print("Final recall", sum(all_recall) / float(len(all_recall)))

        self.save_classifier_model(me_classifier, 'me_class_inducer.m')
        return me_classifier
Example #25
def train_classifier(classifier, directory, feature, name=None, scorethreshold=None):
	"""Creates and trains a NLTK classifier from nltk.classify package.
	
	classifier	- a classifier class that supports training
	directory	- directory containing the training set (inside wedt/training)
	feature		- feature set function (features.py)
	"""
	if classifier=="MaxEnt":
		from nltk.classify.maxent import MaxentClassifier as Classifier
	elif classifier=="PositiveNaiveBayes":
		from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier as Classifier
	else:
		from nltk.classify.naivebayes import NaiveBayesClassifier as Classifier
	featuresets = get_featuresets(directory, feature, scorethreshold)
	c = Classifier.train(featuresets)
	if name:
		with open(os.path.join(classifier_path, name), 'wb') as file:
			pickle.dump((c,feature), file)
	return c
    def train(self, train_set):
        split_size_train=0.7
        print(' split ',split_size_train*100,'% from gold standards for training ... ')
        
        from nltk.classify.maxent import MaxentClassifier       
        from nltk.classify.naivebayes import NaiveBayesClassifier        
        
        # n-fold validation (fold_n=2 here, so range(1, fold_n) runs a single split)
        fold_n=2
        all_f_measure=[]
        all_precision=[]
        all_recall=[]
        import random
        for i in range(1,fold_n):
            print("start [%s] fold validation..." %i)
            random.shuffle(train_set)
        
            _train_set, _test_set=train_set[:round(len(train_set)*split_size_train)],train_set[round(len(train_set)*split_size_train):]
        
            me_classifier = MaxentClassifier.train(_train_set)
            #nb_classifier = NaiveBayesClassifier.train(_train_set)

            #from sklearn.svm import LinearSVC
            #from nltk.classify.scikitlearn import SklearnClassifier
            #print("training SVM Classifier...")
            #svm_classifier = SklearnClassifier(LinearSVC())
            #svm_classifier = svm_classifier.train(_train_set)
            #print("complete SVM training.")
        
            self.benchmarking(me_classifier,_test_set,all_f_measure, all_precision, all_recall)
        print("all_f_measure,",all_f_measure)
        print("all_precision,",all_precision)
        print("all_recall", all_recall)
        
        print("Final F-measure", sum(all_f_measure) / float(len(all_f_measure))) 
        print("Final precision", sum(all_precision) / float(len(all_precision))) 
        print("Final recall", sum(all_recall) / float(len(all_recall))) 
        
        self.save_classifier_model(me_classifier,'me_class_inducer.m')
        return me_classifier
    def trainClassifier(self):
        '''
        Calculates features and trains the maxent classifier, storing the resulting
        model in <self.model>
        '''
        # check if pickled
        pickled_model = self.checkForPickle()
        if pickled_model:
            self.model = pickled_model
        else:

            self.initFeatures()
            print('Done reading in training examples')
            kargs = {
                'algorithm' : 'gis',
            }
            if self.max_iter != None:
                kargs['max_iter'] = self.max_iter

            self.model = MaxentClassifier.train(self.shrunk_training_examples, **kargs)
            self.pickleModel()
        print('Reading Pickle files..')
    def trainClassifier(self):
        '''
        Calculates features and trains the maxent classifier, storing the resulting
        model in <self.model>
        '''
        # check if pickled
        pickled_model = self.checkForPickle()
        if pickled_model:
            self.model = pickled_model
        else:

            self.initFeatures()
            print('Done reading in training examples')
            kargs = {
                'algorithm': 'gis',
            }
            if self.max_iter != None:
                kargs['max_iter'] = self.max_iter

            self.model = MaxentClassifier.train(self.shrunk_training_examples,
                                                **kargs)
            self.pickleModel()
Example #29
 def test_feature_extraction_for_maxent_classifier(self):
     print("Testing Feature extraction for maxent classifier...")
     from oke.oak.nif2rdfProcessor import NIF2RDFProcessor
     dataProcessor=NIF2RDFProcessor()
     context_data=dataProcessor.aggregate_context_data(dataProcessor.graphData_goldstandards, 
                                                       'http://www.ontologydesignpatterns.org/data/oke-challenge/task-2/sentence-93#char=0,179',
                                                       'The Southern Intercollegiate Athletic Conference is a College athletic conference consisting of historically black colleges and universities located in the southern United States.')
     featFactory = FeatureFactory()
     datums=featFactory.compute_features(context_data)
     featFactory.writeData(datums,'test_trainWithFeatures')
     
     datums = featFactory.readData('test_trainWithFeatures.json')
     train_set = [(datum.features, datum.label) for datum in datums]
     print(train_set)
     
     from nltk.classify.maxent import MaxentClassifier
     
     me_classifier = MaxentClassifier.train(train_set)
     predit_label=me_classifier.classify({'word': 'conference', 'word_root': 'conference', 'word_pos': 'NN', 'isEntity': 'N', 'isStopWord': 'N', 'prev_word_isStopWord': 'N'})
     print('predicted label:',predit_label)
     print('========Show top 10 most informative features========')
     me_classifier.show_most_informative_features(10)
Example #30
def maxent(featuresets, num):
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    maxent_classifier = MaxentClassifier.train(train_set, trace=3, max_iter=num)
    return (maxent_classifier, test_set)
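A possible follow-up, assuming a featuresets list shaped like the one expected above, evaluating the held-out 10% split with NLTK's accuracy helper:

from nltk.classify.util import accuracy

maxent_classifier, test_set = maxent(featuresets, 25)   # 25 iterations, chosen arbitrarily
print('accuracy:', accuracy(maxent_classifier, test_set))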
Example #31
def classify(inputdir):
    #filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)

    feat_set = []
    sets = []

    for name in filenames:
        # print name
        labeledlist = []
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training', sense)

        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()

                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_bigrams_words(words), sense))
            # feat_set.append((context_feature(line),sense))
            else:
                words = []
                tags = []
        print(lineno)
        labeledlist.append((sense, allwords))

        #                feat_set.append((bigram_feature(allwords),sense))
        file.close()
    high_info_words = set(high_information_words(labeledlist))
    for item in high_info_words:
        print(item)

    random.shuffle(feat_set)
    random.shuffle(feat_set)
    random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    print "training on " + str(len(train_data)) + " instances"
    print "testting on " + str(len(test_data)) + " instances"
    #classifier=  MaxentClassifier.train(train_data)
    # nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data,
                                                 entropy_cutoff=0.8,
                                                 depth_cutoff=7,
                                                 support_cutoff=10)
    # print dt_classifier.pp()
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data,
                                                algorithm='iis',
                                                trace=0,
                                                max_iter=2,
                                                min_lldelta=0.5)
    print "nb accuracy "
    # print accuracy(nb_classifier, test_data) * 100
    # print "nb precision and recall"
    #        print precision_recall(nb_classifier,test_data)

    #    print   nb_classifier.show_most_informative_features()
    #        for item in  nb_classifier.most_informative_features():
    #            print item
    #   print "dt accuracy "+ str(accuracy(dt_classifier, test_data) * 100)
    print "entropy accuracy " + str(
        accuracy(entropy_classifier, test_data) * 100)
Example #32
def classify(inputdir):
        #filenames = os.listdir('d:\\shir\\')
        filenames = os.listdir(inputdir)

        feat_set = []
        sets = []

        for name in filenames:
        # print name
                labeledlist = []
                lineno=0
                path = os.path.join(inputdir, name)
                sense = name.split('\\')[-1].split('.')[0]
                print('training', sense)

                file = codecs.open(path, 'r', 'utf-8')
                allwords = []
                for line in file:
                      if len(line.split())>2:
                             lineno+=1
                             line = line.strip()
                             words=[]
                             tags=[]
                             tokens = line.split()

                             for item in tokens:
                                   if len(item.split('\\'))==2:
                                        word=item.split('\\')[0]
                                        tag= item.split('\\')[1]
                                        words.append(word)
                                        tags.append(tag)
                                        allwords.append(word)
                             feat_set.append((bag_of_bigrams_words(words),sense))
                            # feat_set.append((context_feature(line),sense))
                      else:
                          words=[]
                          tags=[]
                print(lineno)
                labeledlist.append((sense,allwords))


#                feat_set.append((bigram_feature(allwords),sense))
                file.close()
        high_info_words = set(high_information_words(labeledlist))
        for item in  high_info_words:
                      print(item)

        random.shuffle(feat_set)
        random.shuffle(feat_set)
        random.shuffle(feat_set)

        

        train_data = train_feats(feat_set)
        test_data  = test_feats(feat_set)
        print "training on "+str(len(train_data))+" instances"
        print "testting on "+str(len(test_data))+" instances"
        #classifier=  MaxentClassifier.train(train_data)
       # nb_classifier = NaiveBayesClassifier.train(train_data)
        dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=7, support_cutoff=10)
       # print dt_classifier.pp()
       # pickle.dump(classifier, classifier_save_file)
        entropy_classifier = MaxentClassifier.train(train_data,algorithm='iis', trace=0, max_iter=2, min_lldelta=0.5)
        print "nb accuracy "
       # print accuracy(nb_classifier, test_data) * 100
       # print "nb precision and recall"
#        print precision_recall(nb_classifier,test_data)

    #    print   nb_classifier.show_most_informative_features()
#        for item in  nb_classifier.most_informative_features():
#            print item
     #   print "dt accuracy "+ str(accuracy(dt_classifier, test_data) * 100)
        print "entropy accuracy "+ str(accuracy(entropy_classifier, test_data) * 100)
def dateMap(L):
    return dict((row['date'], row) for row in L)


with open('counter-dump-normalized', 'r') as f:
    reader = csv.DictReader(f)
    W = dateMap(reader)

train_set = []
dates = set(W)
for ds in dates:
    try:
        ds_data = {}
        if bool(int(W[ds]['relaxation'])):
            ds_data['relaxation'] = True
        if int(W[ds]['caffeine']) > 0:
            ds_data['caffeine'] = True
        if int(W[ds]['sweets']) > 1:
            ds_data['sweets'] = True
        if int(W[ds]['alcohol']) > 4:
            ds_data['alcohol'] = True
    except (ValueError, KeyError):
        continue
    exercised = bool(int(W[ds]['exercise'])) and 'exercise' or 'no-exercise'
    train_set.append((ds_data, exercised))

classifier = MaxentClassifier.train(train_set,
                                    algorithm='IIS',
                                    max_iter=100,
                                    min_lldelta=0.0001)
classifier.show_most_informative_features()
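The trained model can then be queried for a single day; a small hypothetical example using the same feature keys built above:

# How does the model label a day with caffeine and sweets but nothing else recorded?
print(classifier.classify({'caffeine': True, 'sweets': True}))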
	def train(self):
		tokens = self.make_train_toks(self.training_lists)
		self.classifier = mxc.train(tokens,algorithm="iis")
Example #35
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10,
                                    criterion="entropy",
                                    random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

cm_stats(y_test, y_pred, "Random Forest")

# Fitting CART to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

cm_stats(y_test, y_pred, "CART")

# Fitting Maximum Entropy to the Training set
from nltk.classify.maxent import MaxentClassifier
algorithm = MaxentClassifier.ALGORITHMS[0]
# NLTK expects (featureset-dict, label) pairs rather than numeric arrays,
# so convert each feature vector the same way as the dict(enumerate(...)) examples below.
train_toks = [(dict(enumerate(x)), y) for x, y in zip(X_train, y_train)]
classifier = MaxentClassifier.train(train_toks, algorithm=algorithm)

# Predicting the Test set results
y_pred = classifier.classify_many([dict(enumerate(x)) for x in X_test])

cm_stats(y_test, y_pred, "Maximum Entropy")
Example #36
i = 0
while (i < 1400):
    if (i < 700):
        train_output.append(1)
    else:
        train_output.append(0)
    i = i + 1

train_set = []
for i in range(1400):
    f = trainDataVecs[i]
    f = f.tolist()
    output = train_output[i]
    train_set.append((dict(enumerate(f)), output))

maxent_classifier = MaxentClassifier.train(train_set, max_iter=25)

true_positives = 0.0
true_negatives = 0.0
false_positives = 0.0
false_negatives = 0.0

for i in range(600):
    input_feature = testDataVecs[i].tolist()
    prediction = maxent_classifier.classify(dict(enumerate(input_feature)))
    if (i < 300):
        if (prediction == 1):
            true_positives = true_positives + 1
        else:
            false_negatives = false_negatives + 1
    else:
        if (prediction == 0):
            true_negatives = true_negatives + 1
        else:
            false_positives = false_positives + 1
Example #37
 def train(self, fset):
     self.classifier = MaxentClassifier.train(fset, encoding=self._encoding)
Example #38
 def train(self, max_iter):
   return MaxentClassifier.train(train_toks=self.trainTokens, max_iter=max_iter)
Example #39
 def clf(x):
     return MaxentClassifier.train(featurized_train_set, algorithm)
Example #40
def build_classifier(trainSet):
    if not trainSet:
        return None
    classifier = MaxentClassifier.train(trainSet, algorithm='gis')
    return classifier
Example #41
 def train(self):
     tokens = self.make_train_toks(self.training_lists)
     self.classifier = mxc.train(tokens, algorithm="iis")
Example #42
def build_classifier(trainSet):
    if not trainSet:
        return None
    classifier = MaxentClassifier.train(trainSet, algorithm='gis')
    return classifier
Example #43
i=0
while(i<1400):
	if (i<700):
		train_output.append(1)
	else:
		train_output.append(0)
	i=i+1

train_set=[]
for i in range(1400):
	f = trainDataVecs[i]
	f = f.tolist()
	output = train_output[i]
	train_set.append((dict(enumerate(f)), output))

maxent_classifier = MaxentClassifier.train(train_set, max_iter=25)

true_positives = 0.0
true_negatives = 0.0
false_positives = 0.0
false_negatives = 0.0

for i in range(600):
	input_feature = testDataVecs[i].tolist()
	prediction = maxent_classifier.classify(dict(enumerate(input_feature)))
	if(i<300):
		if(prediction==1):
			true_positives=true_positives+1
		else:
			false_negatives=false_negatives+1
	else:
		if (prediction == 0):
			true_negatives = true_negatives + 1
		else:
			false_positives = false_positives + 1
Example #44
        print('* Loaded training corpus', file=sys.stderr, end='\n  ')
        print('\n  '.join(out), file=sys.stderr)

        print('* Training model...', end=' ', file=sys.stderr)
        if args.model_name == 'memo':
            train_toks = make_word_featuresets(train_corpus.reader)
            model = MemoTraining.train(train_toks)
        elif args.model_name == 'maxent':
            contexts = args.contexts if args.contexts is not None else range(
                -2, 3)
            train_toks = make_maxent_featuresets(train_corpus.reader,
                                                 ds=contexts)
            encoding = TypedMaxentFeatureEncoding.train(
                train_toks, count_cutoff=args.cutoff, alwayson_features=True)
            model = MaxentClassifier.train(train_toks,
                                           encoding=encoding,
                                           max_iter=args.max_iter)
        else:
            train_toks = make_dummy_featuresets(train_corpus.reader)
            model = MajorityTag.train(train_toks)
        print('done', file=sys.stderr)

        with open(args.model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f'* Model saved to {args.model_path}', file=sys.stderr)

        print(f'* Evaluate on training set:', file=sys.stderr)
        train_featuresets = [fs for fs, _ in train_toks]
        hyp_tags = model.classify_many(train_featuresets)
        ref_tags = [tag for _, tag in train_corpus.reader.tagged_words()]
        result = evaluate(ref_tags, hyp_tags)
def dateMap(L):
  return dict((row['date'], row) for row in L)

with open('counter-dump-normalized', 'r') as f:
  reader = csv.DictReader(f)
  W = dateMap(reader)

train_set = []
dates = set(W)
for ds in dates:
  try:
    ds_data = {}
    if bool(int(W[ds]['relaxation'])):
      ds_data['relaxation'] = True
    if int(W[ds]['caffeine']) > 0:
      ds_data['caffeine'] = True
    if int(W[ds]['sweets']) > 1:
      ds_data['sweets'] = True
    if int(W[ds]['alcohol']) > 4:
      ds_data['alcohol'] = True
  except (ValueError, KeyError):
    continue
  exercised = bool(int(W[ds]['exercise'])) and 'exercise' or 'no-exercise'
  train_set.append((ds_data, exercised))

classifier = MaxentClassifier.train(
  train_set,
  algorithm='IIS',
  max_iter=100,
  min_lldelta=0.0001)
classifier.show_most_informative_features()
Example #46
print(featuresets[0:10])

train_set, test_set = featuresets[500:], featuresets[:500]

print(len(train_set))
print(len(test_set))

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))

print(classify.accuracy(nb_classifier, test_set))
print(nb_classifier.show_most_informative_features(5))

me_classifier = MaxentClassifier.train(train_set)

print(me_classifier.classify(gender_features('Gary')))
print(me_classifier.classify(gender_features('Grace')))

print(classify.accuracy(me_classifier, test_set))

me_classifier.show_most_informative_features(5)
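The MaxEnt model can be queried on unseen names just like the Naive Bayes one; a small hypothetical check, assuming the same gender_features helper used above:

print(me_classifier.classify(gender_features('Neo')))   # arbitrary unseen name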


def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)