Пример #1
0
def score(classifier):
    """Train *classifier* on the global training set, persist it, and
    return precision/recall/F-score/support on the development set.

    :param classifier: a scikit-learn estimator instance.
    :return: result of sklearn.metrics.precision_recall_fscore_support
        comparing gold labels (tag_dev) with predictions.
    """
    classifier = nltk.SklearnClassifier(classifier)  # use scikit-learn through the nltk interface
    classifier.train(train)  # train the classifier
    # Bug fix: the original loaded 'model.m' here, discarding the model
    # trained above (the dump line was commented out). Persist the freshly
    # trained model and score it instead.
    joblib.dump(classifier, 'model.m')
    pred = classifier.classify_many(dev)  # predict labels for the development set

    return precision_recall_fscore_support(tag_dev, pred)
def get_classifier(classifier,
                   train,
                   mealgorithm='GIS',
                   sklalgorithm='bernoulli'):
    """Build and train an nltk classifier selected by name.

    :param classifier: one of 'NaiveBayes', 'DecisionTree', 'Maxent',
        'Sklearn', 'SVM'.
    :param train: training data in nltk (features, label) format.
    :param mealgorithm: algorithm name for the Maxent trainer (e.g. 'GIS').
    :param sklalgorithm: which scikit-learn estimator to wrap when
        classifier == 'Sklearn' ('bernoulli' or 'svc').
    :return: a trained classifier, or None for unknown/unimplemented names.
    """
    if classifier == 'NaiveBayes':
        return nltk.NaiveBayesClassifier.train(train)
    elif classifier == 'DecisionTree':
        return nltk.DecisionTreeClassifier.train(train)
    elif classifier == 'Maxent':
        # Bug fix: the original trained a DecisionTreeClassifier in this
        # branch; Maxent must use MaxentClassifier, with the algorithm
        # passed as its training keyword.
        return nltk.MaxentClassifier.train(train, algorithm=mealgorithm)
    elif classifier == 'Sklearn':
        if sklalgorithm == 'bernoulli':
            return nltk.SklearnClassifier(BernoulliNB()).train(train)
        elif sklalgorithm == 'svc':
            # sparse=False: SVC here is fed dense feature arrays.
            return nltk.SklearnClassifier(SVC(), sparse=False).train(train)
    elif classifier == 'SVM':
        pass  # not implemented; falls through and returns None
Пример #3
0
def score2(classifier):
    """Wrap a scikit-learn estimator for nltk, train it on the global
    training set, and return its accuracy on the development set."""
    wrapped = nltk.SklearnClassifier(classifier)  # scikit-learn via the nltk interface
    wrapped.train(train)  # fit on the global training data

    predictions = wrapped.classify_many(dev)  # label the development set

    # Compare predictions with the hand-annotated gold labels.
    return accuracy_score(tag_dev, predictions)
def score(classifier):
    """Train the given scikit-learn estimator through nltk and return
    (accuracy, precision, recall, f1) on the development set, treating
    "pos" as the positive class."""
    model = nltk.SklearnClassifier(classifier)  # scikit-learn via the nltk interface
    model.train(train)

    predictions = model.classify_many(dev)  # labels predicted for the dev set
    return (
        accuracy_score(tag_dev, predictions),
        precision_score(tag_dev, predictions, average='binary', pos_label="pos"),
        recall_score(tag_dev, predictions, average='binary', pos_label="pos"),
        f1_score(tag_dev, predictions, average='binary', pos_label="pos"),
    )
Пример #5
0
def score(classifier):
    """Train *classifier* on the global training set, persist it, and
    return its accuracy on the global evaluation data.

    :param classifier: a scikit-learn estimator instance.
    :return: fraction of predictions matching the gold `tag` labels.
    """
    classifier = nltk.SklearnClassifier(classifier)
    classifier.train(train)
    joblib.dump(classifier, 'LogisticRegression.model')
    pred = classifier.classify_many(data)
    # Idiomatic accuracy: count matching (prediction, gold) pairs instead
    # of the original index-based counting loop.
    correct = sum(1 for p, t in zip(pred, tag) if p == t)
    return correct / len(pred)
Пример #6
0
def saveBestClassifier(classifier,
                       fileName,
                       featureSet,
                       testRatio,
                       sklean=False):
    """Train `classifier` 100 times on random shuffles of `featureSet`,
    keep the most accurate model, and pickle it under ./classifiers/.

    :param classifier: an nltk classifier, or a scikit-learn estimator
        when `sklean` is True.
    :param fileName: base name of the output pickle file.
    :param featureSet: list of (features, label) pairs; shuffled in place.
    :param testRatio: fraction of `featureSet` held out for testing.
        Bug fix: the original ignored this parameter and hard-coded a
        1900-row training split.
    :param sklean: wrap `classifier` in nltk.SklearnClassifier when True.
    """
    rows = len(featureSet)
    # Honour testRatio instead of the hard-coded 1900-row split.
    splitIndex = int(rows * (1 - testRatio))
    print("Training %s 100 times..." % (fileName))

    if sklean:
        classifierBest = nltk.SklearnClassifier(classifier)
    else:
        classifierBest = classifier

    maxAcc = 0
    for i in range(100):
        random.shuffle(featureSet)

        # Splitting into train and test set
        trainingSet = featureSet[:splitIndex]
        testingSet = featureSet[splitIndex:]

        if sklean:
            classifierCurr = nltk.SklearnClassifier(classifier).train(
                trainingSet)
        else:
            classifierCurr = classifier.train(trainingSet)

        currAcc = nltk.classify.accuracy(classifierCurr, testingSet)
        if currAcc > maxAcc:
            classifierBest = classifierCurr
            maxAcc = currAcc
            print("New best accuracy for %s:" % (fileName), maxAcc)
        print("Done: %.2f" % (i / 100))
    print("Saving %s with max accuracy:" % (fileName), maxAcc)

    # Context manager guarantees the handle is closed even if dump raises.
    with open("./classifiers/" + fileName + ".pickle", "wb") as file:
        pickle.dump(classifierBest, file)
Пример #7
0
    def classify(self, training_set, classifierDumpFile):
        """Train a LinearSVC-backed nltk classifier and persist it.

        :param training_set: Features to train the classifier
        :param classifierDumpFile: File name to dump -save- the trained data.
        :return: classifier
        """
        # Train the module of the specified algorithm using the created training set.
        self.classifier = nltk.SklearnClassifier(LinearSVC())
        self.classifier.train(training_set)
        #self.classifier = nltk.NaiveBayesClassifier.train(training_set)
        # Bug fix: save to the file name passed as a parameter; the original
        # used self.classifierDumpFile, silently ignoring the argument.
        self.dump_files(self.classifier, classifierDumpFile)

        return self.classifier
Пример #8
0
def score(classifier):
    """Train `classifier` via the nltk scikit-learn wrapper and return
    its accuracy on the global evaluation data."""
    model = nltk.SklearnClassifier(classifier)  # scikit-learn through nltk
    model.train(train)  # fit on the global training set
    predicted = model.classify_many(data)  # predicted labels for the eval set
    return accuracy_score(tag, predicted)
# --- Baselines kept from earlier experiments (disabled) ---
# naive_bayes = NaiveBayesClassifier.train(train_set)
# print("Accuracy - Naive Bayes Classifier: ")
# print(nltk.classify.accuracy(naive_bayes, test_set))
# print("Most informative features - Naive Bayes Classifier:")
# print(naive_bayes.show_most_informative_features())

# maxent = MaxentClassifier.train(train_set, 'GIS', trace=0,
#                                 encoding=None, gaussian_prior_sigma=0, max_iter=100)
# print("Accuracy - Max Entropy Classifier: ")
# print(nltk.classify.accuracy(maxent, test_set))
# print("Most informative features - Max Entropy Classifier:")
# print(maxent.show_most_informative_features())

# Linear SVM trained through nltk's scikit-learn wrapper.
# sparse=False converts feature dicts to dense arrays before fitting.
linear_svm_classifier = nltk.SklearnClassifier(LinearSVC(C=2.0, dual=True, fit_intercept=True,
                                                         intercept_scaling=0.1, loss='squared_hinge',
                                                         max_iter=1500, penalty='l2', random_state=0,
                                                         tol=0.0001), sparse=False)
linear_svm_classifier.train(train_set)
print("Accuracy - Linear SVM Classifier: ")
print(nltk.classify.accuracy(linear_svm_classifier, test_set))


# Degree-5 polynomial-kernel SVM, same wrapper, trained inline.
nonlinear_svm = SklearnClassifier(SVC(gamma='scale', kernel='poly', coef0 = 5.0, degree = 5, C = 5.0, shrinking=True, probability=False, tol=1e-3), sparse=False).train(train_set)
print("Accuracy - Nonlinear SVM: ")
print(nltk.classify.accuracy(nonlinear_svm, test_set))
random_forest = SklearnClassifier(RandomForestClassifier(n_estimators = 100,
                                                         criterion = 'gini',
                                                         max_depth = 5,
                                                         min_samples_split = 2,
Пример #10
0
# Frequency distribution over every token collected above.
all_words = nltk.FreqDist(all_words)
# print(all_words.most_common(15))

# NOTE(review): FreqDist.keys() yields first-occurrence order, not
# frequency order — if the 3000 *most frequent* words were intended,
# this should be [w for w, _ in all_words.most_common(3000)]; confirm.
word_features = list(all_words.keys())[:3000]

def find_features(document):
    """Binary bag-of-words features: for every word in the global
    `word_features` vocabulary, whether it occurs in `document`."""
    tokens = nltk.word_tokenize(document)
    return {feature: feature in tokens for feature in word_features}

# Build (features, label) pairs for every document, then shuffle so the
# fixed split below is random.
featuresets = [(find_features(rev), category) for (category, rev) in documents]
random.shuffle(featuresets)

# Fixed split: first 4500 examples train, the remainder test.
training_set = featuresets[:4500]
testing_set = featuresets[4500:]

# classifier_f = open("Wilfred_naivebayes.pickle", "rb")
# classifier = pickle.load(classifier_f)
# classifier_f.close()
#
# classifier = nltk.NaiveBayesClassifier.train(training_set)
# print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
# classifier.show_most_informative_features(15)

# Train a 4000-tree random forest through nltk's scikit-learn wrapper,
# then report accuracy and wall-clock training time.
start_time = time.time()
classifier = nltk.SklearnClassifier(RandomForestClassifier(n_estimators=4000, criterion='entropy'))
classifier.train(training_set)
print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
print("%s seconds" % (time.time() - start_time))
Пример #11
0
def gender_features(word):
    """Feature extractor for name-gender classification: the final letter."""
    last = word[-1]
    return {'last_letter': last}


# Label every name from the NLTK names corpus by gender.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

# First 500 examples held out as the test set; the rest train.
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

# Multinomial naive Bayes via nltk's scikit-learn wrapper.
MNB_classifier = nltk.SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("MNB_classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

# print(classifier.classify(gender_features('Neo')))
# print(MNB_classifier.classify(gender_features('Neo')))

# Bernoulli naive Bayes (binary features) via the same wrapper.
BernoulliNB_classifier = nltk.SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_set)
print("BernoulliNB_classifier accuracy percent:",
      (nltk.classify.accuracy(BernoulliNB_classifier, test_set)) * 100)

# Logistic regression classifier; its accuracy print continues beyond
# this chunk of the file.
LogisticRegression_classifier = nltk.SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
print("LogisticRegression_classifier accuracy percent:",
Пример #12
0
def svm(training_set, dev_test_set):
    """Train an SVM (via nltk's scikit-learn wrapper) on `training_set`
    and return it together with its accuracy on `dev_test_set`."""
    model = nltk.SklearnClassifier(ls()).train(training_set)
    return (model, nltk.classify.accuracy(model, dev_test_set))
Пример #13
0
    n=1,
    extended=0,
):
    # make bag of words
    bow = dict()
    for word in nltk.ngrams(text, n):
        if extended == 0:
            if word not in bow:
                bow[word] = True
        else:
            bow[word] = bow.get(word, 0) + 1
    return bow


# Candidate (name, classifier) pairs to evaluate; only the SVM and
# NaiveBayes entries are currently enabled.
models = [
    ("SVM", nltk.SklearnClassifier(sklearn.svm.SVC(probability=True))),
    ("NaiveBayes", nltk.classify.NaiveBayesClassifier),
    # ("RandomForest", nltk.SklearnClassifier(ensemble.RandomForestClassifier())),
    # ("MaxEntropy", nltk.classify.MaxentClassifier),
    # ("MLP", nltk.SklearnClassifier(neural_network.MLPClassifier()))
]
# Reduced model list for quick runs.
onlyBayes = [("NaiveBayes", nltk.classify.NaiveBayesClassifier)]


def test_classifiers(classifiers, ngrams, cutoffs):
    test_counter = 0
    scores = dict()
    for i in range(2):
        for ndx, (name, cl) in enumerate(classifiers):
            for gram in ngrams:
                test_results = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
Пример #14
0
#main: build a toy question classifier and run an interactive query loop
data_list = getTrainingData()
question_list = [x for (x,y) in data_list]
label_list = [y for (x,y) in data_list]

print("\n question_list = ", question_list)
print("\n label_list = ", label_list)

# Bag-of-words vectorizer (min_df=1 keeps every term). It is used here
# only for the debug prints; the classifier itself consumes features
# from getContentFeature.
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(question_list)
print("\n X.toarray()=", X.toarray())
print("\n X.shape[0]=",X.shape[0])

test_v = vectorizer.transform(['it is a test of sales order'])
print("\n test_v.toarray()=", test_v.toarray())

# (features, label) pairs in nltk format, one per training question.
training_set= [ (getContentFeature(question_list[i]), label_list[i]) for i in range(X.shape[0])]
print("\n training_set=", training_set)
    
# SVC wrapped for nltk, trained inline.
myClassifier = nltk.SklearnClassifier(SVC()).train(training_set)

#Test: classify user queries until the process is interrupted
while True:
    query = str(input("Input query:")).strip().lower()
    feature = getContentFeature(query)
    #print feature
    print("\n result = ", myClassifier.classify(feature))