Example #1
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)':sentence[i][-1:],
                    'suffix(2)':sentence[i][-2:],
                    'suffix(3)':sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i==0 else sentence[i-1]
        return features

    print(pos_features(brown.sents()[0], 8))

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print('DecisionTree accuracy: %f' % classify.accuracy(classifier, test_set))
Example #2
    def train(self) -> ClassifierI:
        """Train the classifier.

        This takes a very long time, so after training, pickle it and save it
        to disk.
        """
        training_set = apply_features(
            self.extract_features_from_tweet,
            self.training_set)
        testing_set = apply_features(
            self.extract_features_from_tweet,
            self.testing_set)
        self.testing_set = testing_set
        if self.classifier_name == self.NAIVE_BAYES:
            classifier = NaiveBayesClassifier.train(training_set)
        elif self.classifier_name == self.DECISION_TREE:
            classifier = DecisionTreeClassifier.train(training_set)
        else:
            raise ValueError("Couldn't create classifier")
        # Persist the trained model so later runs can load it from disk.
        with open('twitteranalyser/data/%s.p' % self.classifier_name,
                  'wb') as model_file:
            dump(classifier, model_file, HIGHEST_PROTOCOL)
        return classifier
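
Since training is slow, the pickled model is meant to be loaded back on later runs. A minimal sketch of the counterpart to the dump() call above, assuming the same twitteranalyser/data/ path and a hypothetical classifier_name value of 'naive_bayes':

from pickle import load

# Load a previously trained classifier instead of retraining.
# The file name mirrors the dump() in train(); 'naive_bayes' is an
# assumed value for classifier_name, not taken from the original code.
with open('twitteranalyser/data/naive_bayes.p', 'rb') as model_file:
    classifier = load(model_file)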
Example #3
    def test_classification(self):
        dataset_name = 'breast_cancer'
        mode = 'classification'
        X, y = get_data(dataset_name)
        est_dict = {
            'logreg': LogisticRegression(),
            'dtree': DecisionTreeClassifier(max_depth=5)
        }
        features_scores = get_scores_df(est_dict, X, y, mode)
        print(features_scores)
Example #4
    def cross_validation_train(self, dev_docs):
        '''
        Applies the k-fold cross validation technique to split the docs into
        different pairs of training and testing sets. For each pair, it trains
        and evaluates a classifier, keeping the one with the best accuracy.

        Parameters
        ----------
        dev_docs: iterable
            An iterable which yields a list of strings

        '''
        dev_docs = shuffled(dev_docs)
        accuracies = []
        best_accuracy = 0
        subset_size = int(len(dev_docs) / self.n_folds)

        for i in range(self.n_folds):
            train_docs = (dev_docs[(i + 1) * subset_size:] + \
                          dev_docs[:i * subset_size])
            test_docs = dev_docs[i * subset_size:(i + 1) * subset_size]
            train_set = apply_features(self.get_doc_features, train_docs)
            if self.t_classifier == "NB":
                classifier = NaiveBayesClassifier.train(train_set)
            elif self.t_classifier == "DT":
                classifier = DecisionTreeClassifier.train(train_set)
            elif self.t_classifier == "RF":
                classifier = SklearnClassifier(RandomForestClassifier())\
                                                       .train(train_set)
            elif self.t_classifier == "SVM":
                classifier = SklearnClassifier(LinearSVC(), sparse=False)\
                                                         .train(train_set)

            test_set = apply_features(self.get_doc_features, test_docs, True)
            accuracies.append((accuracy(classifier, test_set)) * 100)

            if accuracies[-1] > best_accuracy:
                best_accuracy = accuracies[-1]
                self._classifier = classifier
                self._train_docs = train_docs
                self._test_docs = test_docs
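
The wrap-around slice arithmetic above is easiest to check on a toy list. A minimal, standalone sketch (the sizes are hypothetical, not from the original code):

# Illustration of the fold slicing used in cross_validation_train.
dev_docs = list(range(10))                   # stand-in for shuffled documents
n_folds = 5
subset_size = int(len(dev_docs) / n_folds)   # -> 2

for i in range(n_folds):
    test_docs = dev_docs[i * subset_size:(i + 1) * subset_size]
    train_docs = dev_docs[(i + 1) * subset_size:] + dev_docs[:i * subset_size]
    print(i, test_docs, train_docs)
# Fold 0 tests on [0, 1] and trains on [2..9]; fold 1 tests on [2, 3], etc.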
Example #5
    def all_class_train(self, dev_docs):
        '''
        Train the classifier with a train_p fraction of the docs in every
        class. The remaining docs of each class are used for testing.

        Parameters
        ----------
        dev_docs: iterable
            An iterable which yields a list of strings
        '''
        categories_count = self.count_categories(dev_docs)

        labeled_docs = {}
        for (cat, count) in categories_count.items():
            labeled_docs[cat] = shuffled(
                [t for (t, k) in dev_docs if k == cat])

        train_docs = []
        test_docs = []

        for cat, l in labeled_docs.items():
            cat_limit = int(self.train_p * len(l))
            train_docs += [(t, cat) for t in l[:cat_limit]]
            test_docs += [(t, cat) for t in l[cat_limit:]]

        self._train_docs = train_docs
        self._test_docs = test_docs

        train_set = apply_features(self.get_doc_features, self._train_docs)
        # create and train the classification model according to t_classifier
        if self.t_classifier == "NB":
            self._classifier = NaiveBayesClassifier.train(train_set)
        elif self.t_classifier == "DT":
            self._classifier = DecisionTreeClassifier.train(train_set)
        elif self.t_classifier == "RF":
            self._classifier = SklearnClassifier(RandomForestClassifier())\
                                                         .train(train_set)
        elif self.t_classifier == "SVM":
            self._classifier = SklearnClassifier(LinearSVC(), sparse=False)\
                                                          .train(train_set)
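
A quick worked example of the train_p split (the numbers are hypothetical): with train_p = 0.8 and ten docs in a class, cat_limit = int(0.8 * 10) = 8, so the first eight shuffled docs go to training and the last two to testing.

train_p = 0.8
docs = ['doc%d' % i for i in range(10)]            # ten shuffled docs of one class
cat_limit = int(train_p * len(docs))               # -> 8
train, test = docs[:cat_limit], docs[cat_limit:]   # 8 train docs, 2 test docs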
Example #6
    def equitative_class_train(self, dev_docs):
        categories_count = self.count_categories(dev_docs)

        labeled_docs = {}
        for (cat, count) in categories_count.items():
            labeled_docs[cat] = shuffled(
                [t for (t, k) in dev_docs if k == cat])

        train_docs = []
        test_docs = []

        for cat, l in labeled_docs.items():
            cat_limit = int(self.train_p * len(l))
            train_docs += [(t, cat) for t in l[:cat_limit]]
            test_docs += [(t, cat) for t in l[cat_limit:]]

        self._train_docs = train_docs
        self._test_docs = test_docs

        # print("len dev docs", len(dev_docs))
        # print("categories count", categories_count)
        # print("count train", self.count_categories(train_docs))
        # print("count test", self.count_categories(test_docs))

        # split dev docs and create traning and test set
        # self.split_train_and_test(dev_docs)
        train_set = apply_features(self.get_doc_features, self._train_docs)
        # create and train the classification model according to t_classifier
        if self.t_classifier == "NB":
            self._classifier = NaiveBayesClassifier.train(train_set)
        elif self.t_classifier == "DT":
            self._classifier = DecisionTreeClassifier.train(train_set)
        elif self.t_classifier == "RF":
            self._classifier = SklearnClassifier(RandomForestClassifier())\
                                                         .train(train_set)
        elif self.t_classifier == "SVM":
            self._classifier = SklearnClassifier(LinearSVC(), sparse=False)\
                                                          .train(train_set)
Example #7
def pos_features(word):
    # Suffix features only, so the extractor can also be applied to a single
    # word (as in the classify() call below) without sentence context.
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:]
    }
    return features


tagged_words = brown.tagged_words(categories='news')

# print(tagged_words)

featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
# Use small fixed slices; NLTK's DecisionTreeClassifier is slow on the full set.
train_set, test_set = featuresets[:1000], featuresets[1000:2000]
classifier = DecisionTreeClassifier.train(train_set)
ac = accuracy(classifier, test_set)
print(ac)

print(classifier.classify(pos_features('cats')))

print(classifier.pseudocode(depth=4))
Example #8
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features


def gender_features3(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}


if __name__ == '__main__':
    print("Lab 3 - Exercise 2")
    data = get_data()
    train_set = apply_features(gender_features3, data[500:])
    test_set = apply_features(gender_features3, data[:500])

    print("Training classifiers")
    # Train the different classifiers on the training set
    classifiers = [(NaiveBayesClassifier.train(train_set), "NaiveBayes"),
                   (DecisionTreeClassifier.train(train_set), "DecisionTree"),
                   (MaxentClassifier.train(train_set, max_iter=10, trace=0), "MaxEntropy")]

    # Test all classifiers on the test set
    for classifier, name in classifiers:
        acc = accuracy(classifier, test_set)
        print("{} classifier test accuracy: {}".format(name, acc))
Example #9
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#KNN
# clf = KNeighborsClassifier(n_neighbors=1)
# clf.fit(features_train, labels_train)

#Random Forest
# clf = RandomForestClassifier(n_estimators=100)
# clf = clf.fit(features_train, labels_train)

#AdaBoost
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200)
clf = clf.fit(features_train, labels_train)

predict = clf.predict(features_test)
print(calculate_accuracy(labels_test, predict))

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
Example #10
featuresets = [(gender_feature(name), predictor)
               for (name, predictor) in gender]
trainingSet, testSet = featuresets[500:], featuresets[0:500]
# print(featuresets)
# # print(trainingSet)
# # print(testSet)

name = input('Enter a Name:')
classifiers = int(
    input(
        'Enter a Classifier:\n1:Naive Bayes Classifier\n2: Decision Tree\n3: Support Vector machine (SVM)'
    ))
accuracies = []
trainNaiveBaseClassifier = lambda data: NaiveBayesClassifier.train(data)
decisionTreeBaseClassifier = lambda decision: DecisionTreeClassifier.train(
    decision)
supportVectorMachine = lambda data: classify.SklearnClassifier(
    LinearSVC()).train(data)

if classifiers == 1:

    # naiveClassifier = NaiveBayesClassifier.train(trainingSet)

    naiveBayesClassifier = trainNaiveBaseClassifier(trainingSet)
    n = naiveBayesClassifier.classify(gender_feature(name))
    print('*' * 80)
    print('Naive Bayes Classifier')
    print('*' * 80)
    naiveBayesClassifier.show_most_informative_features()

    #test the accuracy  of the classifier using Naive Bayes
    # Observe that these character names from The Matrix are correctly classified. Although this science fiction movie
Example #11
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

from imputers import FillNaTransformer, HotDeckFullImputer, ModelBasedFullImputer
from transformers import BoxCoxTransformer, PolynomialsAdder, CustomBinner, OutliersClipper, FeatureProduct

baseline = {
    'DecisionTreeClassifier': {
        'params': {
            'predictor':
            DecisionTreeClassifier(max_depth=None),
            'scaler':
            None,
            'simple_imputer':
            FillNaTransformer(from_dict={},
                              mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                              median=[],
                              nan_flag=[],
                              zero=[])
        },
        'score': 0.1974390243902439,
        'std': 0.0756691348271984
    },
    'KNeighborsClassifier': {
        'params': {
            'predictor':
Example #12
pos_features('test')

# Using the feature extractor, we can train a new decision tree classifier:
tagged_words = brown.tagged_words(categories='news')

print(tagged_words)
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
featuresets[0]

from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]

# NLTK is a teaching toolkit and is not optimized for speed, so training this
# tree may take a very long time. For speed, use scikit-learn for the classifiers.
classifier = DecisionTreeClassifier.train(train_set)


accuracy(classifier, test_set)
classifier.classify(pos_features('cats'))
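
As the comment above notes, NLTK's pure-Python tree is slow at this scale. A minimal sketch of the scikit-learn route, wrapping sklearn's tree in nltk.classify.SklearnClassifier so it accepts the same (feature-dict, label) pairs; the wrapper choice is a suggestion, not part of the original snippet:

from nltk.classify import SklearnClassifier, accuracy
from sklearn.tree import DecisionTreeClassifier as SkDecisionTree

# SklearnClassifier vectorizes the feature dicts internally and delegates
# training to the much faster scikit-learn estimator.
sk_classifier = SklearnClassifier(SkDecisionTree()).train(train_set)
print(accuracy(sk_classifier, test_set))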


'''

To accompany the video, here is the sample code for NLTK part of speech tagging with lots of comments and info as well:

POS tag list:

CC coordinating conjunction
CD cardinal digit
DT determiner
Example #13
def dttrain(train_set):
    # Other keyword arguments (with defaults): entropy_cutoff=0.05,
    # depth_cutoff=100, support_cutoff=10, binary=False,
    # feature_values=None, verbose=False.
    classifier = DecisionTreeClassifier.train(train_set, depth_cutoff=10000)
    return classifier
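
The commented defaults above match DecisionTreeClassifier.train's keyword arguments. A minimal, self-contained usage sketch with a toy feature set (the data is invented for illustration):

from nltk import DecisionTreeClassifier

# Toy (feature-dict, label) pairs, invented for illustration.
train_set = [({'suffix(1)': 's'}, 'NNS'),
             ({'suffix(1)': 'd'}, 'VBD'),
             ({'suffix(1)': 'g'}, 'VBG')] * 10

classifier = DecisionTreeClassifier.train(
    train_set,
    entropy_cutoff=0.05,   # stop splitting when label entropy drops below this
    depth_cutoff=100,      # maximum depth of the tree
    support_cutoff=10)     # stop refining leaves with fewer examples
print(classifier.classify({'suffix(1)': 's'}))   # -> 'NNS'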