Example no. 1
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(versicolor_X), columns = ["Species"])
	if with_proba:
		species_proba = DataFrame(pipeline.predict_proba(versicolor_X), columns = ["probability(0)", "probability(1)"])
		species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)

if "Versicolor" in datasets:
	build_versicolor(DummyClassifier(strategy = "prior"), "DummyVersicolor")
	build_versicolor(GBDTLR(GradientBoostingClassifier(n_estimators = 11, random_state = 13), LogisticRegression(random_state = 13)), "GBDTLRVersicolor")
	build_versicolor(KNeighborsClassifier(), "KNNVersicolor", with_proba = False)
	build_versicolor(MLPClassifier(activation = "tanh", hidden_layer_sizes = (8,), solver = "lbfgs", random_state = 13, tol = 0.1, max_iter = 100), "MLPVersicolor")
	build_versicolor(SGDClassifier(random_state = 13, max_iter = 100), "SGDVersicolor", with_proba = False)
	build_versicolor(SGDClassifier(random_state = 13, loss = "log", max_iter = 100), "SGDLogVersicolor")
	build_versicolor(SVC(), "SVCVersicolor", with_proba = False)
	build_versicolor(NuSVC(), "NuSVCVersicolor", with_proba = False)

versicolor_X, versicolor_y = load_versicolor("Versicolor")

def build_versicolor_direct(classifier, name, with_proba = True, **pmml_options):
	transformer = ColumnTransformer([
		("all", "passthrough", ["Petal.Length", "Petal.Width"])
	], remainder = "drop")
	pipeline = PMMLPipeline([
		("transformer", transformer),
		("classifier", classifier)
	])
	pipeline.fit(versicolor_X, versicolor_y)
	pipeline.configure(**pmml_options)
	pipeline.verify(versicolor_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, name)
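This example opens partway through a build_versicolor helper whose signature can be inferred from the calls above; a hedged reconstruction of the missing head (the pipeline layout and the fit/configure/verify steps are assumptions modelled on build_versicolor_direct below):

def build_versicolor(classifier, name, with_proba = True, **pmml_options):
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(versicolor_X, versicolor_y)
	pipeline.configure(**pmml_options)
	pipeline.verify(versicolor_X.sample(frac = 0.10, random_state = 13))
	# ...continues with the store_pkl/predict lines shown at the top of this example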
Example no. 2
def main():
    # filepath: path to the sentence data file
    # vecfile: path to a pre-generated word vector file (GloVe)
    # vectype: vectorization method -- 1: average, 2: average + per-line tf-idf, 3: average + whole-data tf-idf
    # vec_path: directory where the generated vectors are saved

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/stem_testdata'  # 'data/data_test'
    vecfile = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt'

    vec_files = [
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.100d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.200d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.42B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.840B.300d.txt'
    ]
    # relative file paths raise a permission-denied error for a reason not yet understood,
    # so absolute paths are used for now
    vec_path = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/word_vector/'

    # Here we can choose the type of vectorization;
    # there are 6 word vector files downloaded from GloVe
    """
    vectype = 1
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path+name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 2
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_OnelineTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 3
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_WholeDataTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))
    """

    # everything from here on is scratch code and will be erased.

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    #filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/hyp1-hyp2-ref'
    vectype = 1
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_diffOrder'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 2
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_OnelineTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 3
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_WholeDataTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    vec_path = 'data/word_vector/glove.6B.50d_vec_diffOrder'
    wvec = load_wordvec(vec_path)
    target_path = 'data/dev.answers'
    answer = load_target(target_path)

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import NuSVC
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    clf1 = KNeighborsClassifier()
    clf2 = DecisionTreeClassifier()
    clf3 = ExtraTreeClassifier()
    clf4 = MLPClassifier()
    clf5nu = NuSVC()
    clf6lin = LinearSVC()
    # 'sag', 'saga' and 'lbfgs'

    print("Training Starts")
    X_train, X_test, y_train, y_test = train_test_split(wvec,
                                                        answer,
                                                        test_size=0.10,
                                                        random_state=42)
    #clf1.fit(X_train, y_train)
    clf1.fit(X_train, y_train)
    print('KNeighborsClassifier score 50d', clf1.score(X_test, y_test))
    clf2.fit(X_train, y_train)
    print('DecisionTreeClassifier score 50d', clf2.score(X_test, y_test))
    clf3.fit(X_train, y_train)
    print('ExtraTreeClassifier score 50d', clf3.score(X_test, y_test))
    clf4.fit(X_train, y_train)
    print('MLPClassifier score 50d', clf4.score(X_test, y_test))

    clf1 = OneVsRestClassifier(KNeighborsClassifier())
    clf2 = OneVsRestClassifier(DecisionTreeClassifier())
    clf3 = OneVsRestClassifier(ExtraTreeClassifier())
    clf4 = OneVsRestClassifier(MLPClassifier())
    clf5 = OneVsOneClassifier(NuSVC())
    clf6 = OneVsRestClassifier(LinearSVC())

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    clf7 = OneVsRestClassifier(SGDClassifier())
    clf8 = OneVsRestClassifier(Perceptron())
    clf9 = OneVsRestClassifier(PassiveAggressiveClassifier())

    print('One vs Rest methods case::')
    print('KNeighborsClassifier score 50d',
          clf1.fit(X_train, y_train).score(X_test, y_test))
    print('DecisionTreeClassifier score 50d',
          clf2.fit(X_train, y_train).score(X_test, y_test))
    print('ExtraTreeClassifier score 50d',
          clf3.fit(X_train, y_train).score(X_test, y_test))
    print('MLPClassifier score 50d',
          clf4.fit(X_train, y_train).score(X_test, y_test))

    print('SGDClassifier score 50d',
          clf7.fit(X_train, y_train).score(X_test, y_test))
    print('Perceptron score 50d',
          clf8.fit(X_train, y_train).score(X_test, y_test))
    print('PassiveAggressiveClassifier score 50d',
          clf9.fit(X_train, y_train).score(X_test, y_test))

    print('NuSVC score 50d', clf5.fit(X_train, y_train).score(X_test, y_test))
    print('LinearSVC score 50d',
          clf6.fit(X_train, y_train).score(X_test, y_test))

    clf5nu.fit(X_train, y_train)
    print('NuSVC score 50d', clf5nu.score(X_test, y_test))
    clf6lin.fit(X_train, y_train)
    print('LinearSVC score 50d', clf6lin.score(X_test, y_test))

    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFECV
    from sklearn.neighbors import KNeighborsClassifier
    estimator = DecisionTreeClassifier()
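The example breaks off right after importing RFECV; a minimal sketch, under the assumption that recursive feature elimination on the freshly created estimator was the intended next step (RFECV needs an estimator exposing coef_ or feature_importances_, which a decision tree provides):

    selector = RFECV(estimator, step=1, cv=5)
    selector.fit(X_train, y_train)
    print('Optimal number of features:', selector.n_features_)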
Example no. 3
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier Algo Accuracy: ",
      (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC Algo Accuracy: ",
      (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC Algo Accuracy: ",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC Algo Accuracy: ",
      (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

print("---------------\n")

from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
    # List of classifiers passed to the constructor
    def __init__(self, *classifiers):
        self._classifiers = classifiers
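The class is cut off after __init__; given the statistics.mode import, a plausible completion in the usual voting style (the classify/confidence methods below are a sketch, not the original code):

    def classify(self, features):
        # majority vote across the wrapped classifiers
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes)

    def confidence(self, features):
        # fraction of classifiers that agree with the winning label
        votes = [c.classify(features) for c in self._classifiers]
        return votes.count(mode(votes)) / len(votes)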
Example no. 4
def score(classifier):
    classifier = SklearnClassifier(classifier)  # use scikit-learn estimators through nltk's wrapper interface
    classifier.train(train_set)  # train the classifier
    pred = classifier.classify_many([fea for (fea, tag) in test_set])  # predict labels for the dev/test set
    return accuracy_score([tag for (fea, tag) in test_set],
                          pred)  # compare predictions with the gold labels to get the classifier's accuracy
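For context, train_set and test_set follow nltk's convention of (feature dict, label) pairs; a hypothetical illustration of the expected shape:

train_set = [({'great': True, 'awful': False}, 'pos'),
             ({'awful': True}, 'neg')]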


print "BernoulliNB's accuracy is %f" % score(BernoulliNB())
print "MultinomialNB's accuracy is %f" % score(MultinomialNB())
print "LogisticRegression's accuracy is %f" % score(LogisticRegression())
print "SVC's accuracy is %f" % score(SVC())
print "LinearSVC's accuracy is %f" % score(LinearSVC())
print "NuSVC's accuracy is %f" % score(NuSVC())

# Change the feature set
# Tokenize every comment
comment_words = []
for com in good['comment']:
    seg_list = jieba.cut(com.decode('utf-8'), cut_all=False)
    for seg in list(seg_list):
        comment_words.append(seg)

for com in bad['comment']:
    seg_list = jieba.cut(com.decode('utf-8'), cut_all=False)
    for seg in list(seg_list):
        comment_words.append(seg)

# Collect all tokens
Example no. 5
        segments = metadata.get_training_segments()
        logging.info("# of Segments: %s" % len(segments))

        features = metadata.get_training_author_or_not_features('James Joyce')
        #print "# of Features: %s" % len(features) 

        vec = CountVectorizer(min_df=1, ngram_range=args['ngrams'], analyzer=args['analyser'])
        counts = vec.fit_transform(segments)
        
        transformer = TfidfTransformer() 
        tfidf = transformer.fit_transform(counts) # counts
        
        if args['alg'] == 'SVC': 
            svm = SVC(**args['params'])
        elif args['alg'] == 'NuSVC':
            svm = NuSVC(**args['params'])
        elif args['alg'] == 'LinearSVC': 
            svm = LinearSVC(**args['params'])

        features = np.array(features)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(tfidf,features,test_size=0.4, random_state=0)
        clf = svm.fit(X_train, y_train)
        score = clf.score(X_test, y_test)  
        cross = cross_validation.cross_val_score(svm, tfidf, y=features, cv=5)

        logging.info("Score: %s" % score)
        logging.info("Five way cross validation: %s" % cross)
        logging.info("Mean of cross validation: %s" % np.mean(cross))
        logging.info("Variance of cross validation: %s" % np.var(cross))
        f = open(args['backing_store'], 'wb')
        p = pickle.Pickler(f) 
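The snippet stops as soon as the Pickler is constructed; presumably the fitted model is written to the backing store next. A minimal completion sketch (assuming clf is the object meant to be persisted):

        p.dump(clf)  # serialize the fitted SVM
        f.close()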
Example no. 6
def main():
    pos_tk_lst = iohelper.read_pickle2objects('./Reviews/pos_reviews.pkl')
    neg_tk_lst = iohelper.read_pickle2objects('./Reviews/neg_reviews.pkl')
    # shuffle the review collections into random order
    shuffle(pos_tk_lst)
    shuffle(neg_tk_lst)
    posWords = list(itertools.chain(*pos_tk_lst))  # flatten the nested token lists into a single flat list
    negWords = list(itertools.chain(*neg_tk_lst))  # likewise for the negative reviews

    # Choose one of the two (the first scores all words, the second all words plus bigrams; features are selected via a chi-squared test)
    # print('1.Word Feature Selection-Chi-sq!')
    # word_scores = create_word_scores(posWords, negWords)
    print('2.Word_Plus_Bigram Feature Selection-Chi-sq!')
    pos_tk_lst = words_plus_bigram(pos_tk_lst)
    neg_tk_lst = words_plus_bigram(neg_tk_lst)
    word_scores = create_word_bigram_scores(posWords, negWords)

    global best_words
    best_words = find_best_words(word_scores, 1500)
    iohelper.save_objects2pickle(best_words, './Reviews/best_words.pkl')

    posFeatures = pos_features(
        pos_tk_lst, best_word_features
    )  # [[{'':True, '':True,...}, 'pos'], [{'':True, '':True,...}, 'neg']]
    negFeatures = neg_features(neg_tk_lst, best_word_features)
    print('POS_FEATURES_LENGTH %d\tNEG_FEATURES_LENGTH %d' %
          (len(posFeatures), len(negFeatures)))
    assert len(posFeatures) == len(negFeatures)
    print('-------------------------------------------------')

    Classifier_Type = [
        'Lexicons', 'LR', 'BernoulliNB', 'MultinomialNB', 'LinearSVC', 'NuSVC'
    ]  # 'SVC' IS CANCELLED
    (pos_lexicon_dict, neg_lexicon_dict) = rp.load_sentiment_lexicon()

    # 10-fold cross-validation
    cut_size = int(len(posFeatures) * 0.9)
    offset_size = len(posFeatures) - cut_size
    avg_scores = {}
    avg_precision = {}
    avg_recall = {}
    avg_time = {}
    for tp in Classifier_Type:
        avg_scores[tp] = 0.0
        avg_precision[tp] = 0.0
        avg_recall[tp] = 0.0
        avg_time[tp] = 0.0
    posTmp = []
    negTmp = []
    # Compare the classifiers (lexicon-based vs. supervised learning)
    for tp in Classifier_Type:
        precision = 0.0
        recall = 0.0
        score = 0.0
        time = 0.0
        if tp == 'Lexicons':
            posTmp = posFeatures
            negTmp = negFeatures
            posFeatures = pos_tk_lst
            negFeatures = neg_tk_lst

        print('Classifier_Type : %s' % (tp))
        for k in range(1, 11):
            test_list = posFeatures[(k - 1) * offset_size:k *
                                    offset_size] + negFeatures[
                                        (k - 1) * offset_size:k * offset_size]
            if k == 1:
                train_list = posFeatures[k * offset_size:] + negFeatures[
                    k * offset_size:]
            elif k == 10:
                train_list = posFeatures[:(
                    k - 1) * offset_size] + negFeatures[:(k - 1) * offset_size]
            else:
                train_list = posFeatures[:(k - 1) * offset_size] + posFeatures[
                    k * offset_size:] + negFeatures[:(
                        k - 1) * offset_size] + negFeatures[k * offset_size:]

            if tp == 'Lexicons':
                test = test_list
                test_tag = ['pos' for i in range(offset_size)]
                test_tag.extend(['neg' for i in range(offset_size)])
                time, precision, recall, score = sentiment_lexicon_score(
                    pos_lexicon_dict, neg_lexicon_dict, test, test_tag)
            else:
                test, test_tag = zip(
                    *test_list
                )  # split the inner (dict, label) pairs into two tuples: ({}, {}, ...) and ('pos', 'pos', 'neg', ...)
                if tp == 'LR':
                    time, precision, recall, score = classifier_score(
                        tp, LogisticRegression(), train_list, test, test_tag)
                elif tp == 'BernoulliNB':
                    time, precision, recall, score = classifier_score(
                        tp, BernoulliNB(), train_list, test, test_tag)
                elif tp == 'MultinomialNB':
                    time, precision, recall, score = classifier_score(
                        tp, MultinomialNB(), train_list, test, test_tag)
                elif tp == 'LinearSVC':
                    time, precision, recall, score = classifier_score(
                        tp, LinearSVC(), train_list, test, test_tag)
                elif tp == 'NuSVC':
                    time, precision, recall, score = classifier_score(
                        tp, NuSVC(probability=True), train_list, test,
                        test_tag)
                elif tp == 'SVC':
                    precision, recall, score = classifier_score(
                        tp,
                        SVC(gamma=0.001,
                            C=100.,
                            kernel='linear',
                            probability=True), train_list, test, test_tag)
            avg_scores[tp] += score
            avg_precision[tp] += precision
            avg_recall[tp] += recall
            avg_time[tp] += time
            print(
                'The precision, recall, accuracy score and training time are respectively : %f %f %f %f'
                % (precision, recall, score, time))
        if tp == 'Lexicons':
            posFeatures = posTmp
            negFeatures = negTmp
            posTmp = []
            negTmp = []
        print('-------------------------------------------------')
    for tp in Classifier_Type:
        avg_scores[tp] = avg_scores[tp] / 10
        avg_precision[tp] = avg_precision[tp] / 10
        avg_recall[tp] = avg_recall[tp] / 10
        avg_time[tp] = avg_time[tp] / 10
        print ("The %s\'s average precision recall accuracy score and training time is repectively : %.2f %.2f %.2f %.2f" % \
            (tp, avg_precision[tp], avg_recall[tp], avg_scores[tp], avg_time[tp]))
    print("The End!")
Example no. 7
print "Accuracy", scores.mean()

print "\nUsing Passive aggressive Classifier"
pac = PassiveAggressiveClassifier()
scores = cross_val_score(pac, feature_normal, labels, cv=10, n_jobs = 4)
print scores
print "Accuracy", scores.mean()

print "\nUsing nearest centroid"
nc = NearestCentroid()
scores = cross_val_score(nc, feature_normal, labels, cv=10, n_jobs = 4)
print scores
print "Accuracy", scores.mean()

print "\nnusvc"
nusvc = NuSVC()
scores = cross_val_score(nusvc, feature_normal, labels, cv=10, n_jobs = 4)
print scores
print "Accuracy", scores.mean()

# This hangs my computer for some reason
#print "\n Using quadratic discriminant analysis"
#qda = QuadraticDiscriminantAnalysis(store_covariances=True)
#scores = cross_val_score(qda, feature_normal, labels, cv=10, n_jobs = 2)
#print scores
#print "Accuracy", scores.mean()

print "\nUsing Random Forest classifiers"
rfc = RandomForestClassifier(n_estimators=25)
scores = cross_val_score(rfc, feature_normal, labels, cv=10, n_jobs = 4)
print scores
Example no. 8
 def SVM_trainer(self):
     self.SVM_classifier = make_pipeline(StandardScaler(), NuSVC(degree=6))
     self.SVM_classifier.fit(self.input_train,self.target_train)
Example no. 9
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
)

from yellowbrick.datasets import load_mushroom
from yellowbrick.classifier import ClassificationReport

ESTIMATORS = {
    "SVC": {
        "model": SVC(gamma="auto"),
        "path": "images/tutorial/modelselect_svc.png"
    },
    "NuSVC": {
        "model": NuSVC(gamma="auto"),
        "path": "images/tutorial/modelselect_nu_svc.png",
    },
    "LinearSVC": {
        "model": LinearSVC(),
        "path": "images/tutorial/modelselect_linear_svc.png",
    },
    "SGD": {
        "model": SGDClassifier(max_iter=100, tol=1e-3),
        "path": "images/tutorial/modelselect_sgd_classifier.png",
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "path": "images/tutorial/modelselect_kneighbors_classifier.png",
    },
    "LR": {
Example no. 10
    cache_size=200, 
    class_weight=None, 
    verbose=False, 
    max_iter=-1, 
    decision_function_shape='ovr', 
    break_ties=False, 
    random_state=None)

NuSVC_C = NuSVC(
    nu=0.5, 
    kernel='rbf', 
    degree=3, 
    gamma='scale', 
    coef0=0.0, 
    shrinking=True, 
    probability=False, 
    tol=0.001, 
    cache_size=200, 
    class_weight=None, 
    verbose=False, 
    max_iter=-1, 
    decision_function_shape='ovr', 
    break_ties=False, 
    random_state=None)

LSVC_C = LinearSVC(
    penalty='l2', 
    loss='squared_hinge', 
    dual=True, 
    tol=0.0001,
    C=1.0, 
    multi_class='ovr', 
Example no. 11
def ggg_train():
    pos = [pos.strip() for pos in open("./sentiment/pos.txt").readlines()]
    neg = [neg.strip() for neg in open("./sentiment/neg.txt").readlines()]
    neutral = [
        neg.strip() for neg in open("./sentiment/neutral.txt").readlines()
    ]
    train_pos = pos
    train_neg = neg
    train_neutral = neutral

    training_data = list(zip(train_pos, ["pos"] * len(train_pos))) + list(
        zip(train_neg, ['neg'] * len(train_neg)))

    vocabulary = set(
        chain(*[(set(word_tokenize(i[0]))) for i in training_data]))
    feature_set = [({i: (i in word_tokenize(sentence))
                     for i in vocabulary}, tag)
                   for sentence, tag in training_data]

    classifier = nltk.NaiveBayesClassifier.train(feature_set)
    print("feature_set")
    with open("./sentiment/classifier.data", "wb") as out_strm:
        dill.dump(classifier, out_strm)
    out_strm.close()
    with open("./sentiment/vocabulary.data", "wb") as out_strm:
        dill.dump(vocabulary, out_strm)
    out_strm.close()

    # classifier = dill.load(open('./sentiment/sentiment.data', 'rb'))
    # print("classi")
    # print("ori acc:", nltk.classify.accuracy(classifier, feature_set_test))

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(feature_set)
    with open("./sentiment/MNB_classifier.data", "wb") as out_strm:
        dill.dump(MNB_classifier, out_strm)
    out_strm.close()
    # print("MNB acc:", nltk.classify.accuracy(MNB_classifier, feature_set_test))

    # print(classifier)
    # GaussianNB, BernoulliNB
    # GaussianNB_classifier = SklearnClassifier(GaussianNB())
    # GaussianNB_classifier.train(feature_set)
    # print("GaussianNB acc:",
    #       nltk.classify.accuracy(GaussianNB_classifier, feature_set_test))

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(feature_set)
    with open("./sentiment/BernoulliNB_classifier.data", "wb") as out_strm:
        dill.dump(BernoulliNB_classifier, out_strm)
    out_strm.close()
    #    LogisticRegression, SGDClassifier
    # from sklearn.svm import SVC, LinearSVC, NuSVC

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(feature_set)
    with open("./sentiment/LogisticRegression_classifier.data",
              "wb") as out_strm:
        dill.dump(LogisticRegression_classifier, out_strm)
    out_strm.close()
    # print(
    #     "LogisticRegression acc:",
    #     nltk.classify.accuracy(LogisticRegression_classifier,
    #                            feature_set_test))

    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(feature_set)
    with open("./sentiment/SGDClassifier_classifier.data", "wb") as out_strm:
        dill.dump(SGDClassifier_classifier, out_strm)
    out_strm.close()
    # print("SGDClassifier acc:",
    #       nltk.classify.accuracy(SGDClassifier_classifier, feature_set_test))
    # SVC_classifier = SklearnClassifier(SVC())
    # SVC_classifier.train(feature_set)
    # print("SVC acc:", nltk.classify.accuracy(SVC_classifier, feature_set_test))
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(feature_set)
    with open("./sentiment/LinearSVC_classifier.data", "wb") as out_strm:
        dill.dump(LinearSVC_classifier, out_strm)
    out_strm.close()
    # print("LinearSVC acc:",
    #       nltk.classify.accuracy(LinearSVC_classifier, feature_set_test))
    NuSVC_classifier = SklearnClassifier(NuSVC())
    NuSVC_classifier.train(feature_set)
    with open("./sentiment/NuSVC_classifier.data", "wb") as out_strm:
        dill.dump(NuSVC_classifier, out_strm)
    out_strm.close()
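Restoring any of the persisted classifiers mirrors the commented-out dill.load near the top of the function; a minimal sketch:

with open("./sentiment/NuSVC_classifier.data", "rb") as in_strm:
    NuSVC_classifier = dill.load(in_strm)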
Example no. 12
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import NuSVC

from sklearn.model_selection import GridSearchCV

#%%

pipelines = [
    Pipeline([('tfidf',
               TfidfVectorizer(binary=True,
                               analyzer='char',
                               ngram_range=(2, 5),
                               lowercase=True)), ('clf', NuSVC())]),
]

#%%

parameters = [{
    'clf__nu': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    'clf__kernel': (
        'linear',
        'poly',
    ),
}]

#%%

if __name__ == "__main__":
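The main guard is cut off; a hedged sketch of how the pipeline and parameter grid above are typically handed to GridSearchCV (texts and labels are hypothetical placeholders for the training corpus):

    for pipeline, params in zip(pipelines, parameters):
        grid = GridSearchCV(pipeline, params, cv=5, n_jobs=-1)
        grid.fit(texts, labels)
        print(grid.best_score_, grid.best_params_)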
Example no. 13
# build_model(BernoulliNB(),train)
# Naive Bayes
print("BernoulliNB's accuracy is %f" %
      score(BernoulliNB(), train, test_list, test_result_list))
# Naive Bayes for multinomially distributed data
print("MultinomialNB's accuracy is %f" %
      score(MultinomialNB(), train, test_list, test_result_list))
# Logistic regression
print("LogisticRegression's accuracy is %f" %
      score(LogisticRegression(), train, test_list, test_result_list))
# SVC
print("SVC's accuracy is %f" %
      score(SVC(), train, test_list, test_result_list))
# Linear SVC
print("LinearSVC's accuracy is %f" %
      score(LinearSVC(), train, test_list, test_result_list))
print("NuSVC's accuracy is %f" %
      score(NuSVC(), train, test_list, test_result_list))

# def predict(clf,comment):
#     # feat = []
#     comment_words = delivery_word(comment)
#     pred = clf.prob_classify(best_word_features(comment_words))
#     return pred
#
# model = load_model()
# comment = "很不错的软件,旧手机都没问题"
# pred = predict(model,comment)
# print("积极:"+str(pred.prob('pos')) + "  消极:" + str(pred.prob('neg')) + '\n')
Example no. 14
 def test_decision_function(self):
     model = NuSVC()
     self.assertRaise(
         lambda: dump_binary_classification(
             model, folder=self.folder, methods=['decision_function']),
         OnnxBackendAssertionError)
Example no. 15
# encode the class labels ('Tumor'/'Normal') as 1/0
dataframe = dataframe.replace('Tumor', 1)
dataframe = dataframe.replace('Normal', 0)

#The features X are everything except for the class.
X = np.array(dataframe.drop(['val'], axis=1))

# Y is just the class or the diagnosis column
y = np.array(dataframe['val'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

clf_lsvc = LinearSVC()
clf = svm.SVC()
clf_nu = NuSVC()

clf.fit(X_train, y_train)
clf_nu.fit(X_train, y_train)
clf_lsvc.fit(X_train, y_train)

accuracy_svm = clf.score(X_test, y_test)
accuracy_nu = clf_nu.score(X_test, y_test)
accuracy_lsvc = clf_lsvc.score(X_test, y_test)

print("SVC", accuracy_svm)
print("NuSVC", accuracy_nu)
print("Linear SVC", accuracy_lsvc)

# Cross Validation
predicted_svm = cross_val_predict(clf, X, y, cv=10)
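Because cross_val_predict returns one out-of-fold prediction per sample, cross-validated accuracy and a confusion matrix follow directly; a short sketch (the metrics import is an addition, not part of the original script):

from sklearn.metrics import accuracy_score, confusion_matrix
print("SVC CV accuracy", accuracy_score(y, predicted_svm))
print(confusion_matrix(y, predicted_svm))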
Example no. 16
#using QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
t0 = time.time()
scores = cross_val_score(clf, X, y, cv=10)
print("training time for  QuadraticDiscriminantAnalysis :",
      round(time.time() - t0, 5), "s")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("")
#clf.fit(X,y)
#joblib.dump(clf, 'Quad.pkl')

#using NuSVC
from sklearn.svm import NuSVC
clf = NuSVC()
t0 = time.time()
scores = cross_val_score(clf, X, y, cv=10)
print("training time for  NuSVC :", round(time.time() - t0, 5), "s")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("")

#using PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.datasets import make_classification
clf = PassiveAggressiveClassifier(random_state=0)
t0 = time.time()
scores = cross_val_score(clf, X, y, cv=10)
print("training time for  PassiveAggressiveClassifier :",
      round(time.time() - t0, 5), "s")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example no. 17
def speculate(context, data):
    """
    Execute orders according to our schedule_function() timing.
    """
    prices = data.history(assets=context.speculations, bar_count=context.historical_bars, frequency='1d',
                          fields='price')

    for stock in context.speculations:

        try:

            price_hist = data.history(stock, 'price', 50, '1d')
            ma1 = price_hist.mean()  # 50 day moving average
            price_hist = data.history(stock, 'price', 200, '1d')
            ma2 = price_hist.mean()  # 200 day moving average

            start_bar = context.feature_window
            price_list = prices[stock].tolist()

            X = []  # list of feature sets
            y = []  # list of labels, one for each feature set

            bar = start_bar

            # feature creation
            """
            Summary: Get the daily % return as a feature set, if tomorrow's price increased, label
            that feature set as a 1 (strong outlook/buy).
            
            Once we have generated a feature set for the last 100 days, in 10 day windows, we can train our model 
            to identify (fit) % returns (feature set) to a buy or sell (short) recommendation  (labels).
            """
            while bar < len(price_list) - 1:
                try:

                    end_price = price_list[bar + 1]  # tomorrow's price
                    begin_price = price_list[bar]  # today's price

                    pricing_list = []
                    xx = 0
                    for _ in range(context.feature_window):
                        price = price_list[bar - (context.feature_window - xx)]  # gather the trailing feature_window prices leading up to today
                        pricing_list.append(price)
                        xx += 1

                    # get the % change in daily prices of last 10 days, this will be our feature set
                    features = np.around(np.diff(pricing_list) / pricing_list[:-1] * 100.0, 1)

                    # if tomorrow's price is more than today's price
                    # label the feature set (% change in last 10 days)
                    # a 1 (strong outlook, buy) else -1 (weak outlook, sell)
                    if end_price > begin_price:
                        label = 1
                    else:
                        label = -1

                    bar += 1
                    X.append(features)
                    y.append(label)
                    # print(features)

                except Exception as e:
                    bar += 1
                    print(('feature creation', str(e)))

            clf1 = RandomForestClassifier()
            clf2 = LinearSVC()
            clf3 = NuSVC()
            clf4 = LogisticRegression()

            # now we get the prices and features for the last 10 days
            last_prices = price_list[-context.feature_window:]
            current_features = np.around(np.diff(last_prices) / last_prices[:-1] * 100.0, 1)

            # append the last 10 days' feature set
            # scale the data (zero mean, unit variance), necessary for the ML algos to work
            X.append(current_features)
            X = preprocessing.scale(X)

            # the current feature will be the last SCALED feature set
            # X will be all the feature sets, excluding the most recent one,
            # this is the feature set which we will be using to predict
            current_features = X[-1]
            X = X[:-1]

            # this is where the magic happens:
            # we will be training our algorithm here to see the correlation between
            # the features and the labels (this feature set, was a buy etc.)
            # the most CPU-intensive part of the program:
            # sklearn's documentation says its time complexity is quadratic in the number of samples,
            # which makes it difficult to scale to datasets larger than a couple of 10,000 samples
            # Bonus: How the documentation describes this function: Build a forest of trees from the training set (X, y).
            # we can also provide a sample_weight, if some samples are more important than others
            clf1.fit(X, y)
            clf2.fit(X, y)
            clf3.fit(X, y)
            clf4.fit(X, y)

            # then each classifier predicts how the current feature set
            # should be labelled: 1 (buy) or -1 (sell); predict returns an array of
            # labels, one per sample, so [0] extracts the single prediction
            p1 = clf1.predict([current_features])[0]
            p2 = clf2.predict([current_features])[0]
            p3 = clf3.predict([current_features])[0]
            p4 = clf4.predict([current_features])[0]

            # Counter('abracadabra').most_common(3)
            #   >>[('a', 5), ('r', 2), ('b', 2)]
            # if all the classifiers agree on the same prediction we will either buy or sell the stock
            # if there is no consensus, we do nothing

            if Counter([p1, p2, p3, p4]).most_common(1)[0][1] >= 4:
                p = Counter([p1, p2, p3, p4]).most_common(1)[0][0]

            else:
                p = 0

            print(('ma1_d: ', ma1))
            print(('ma2_d :', ma2))
            print(('p1 :',p1))
            print(('p2 :',p2))
            print(('p3 :',p3))
            print(('p4 :',p4))
            print(('Prediction', p))


            speculations_allocation = 0.2
            # Based on the voted prediction and the momentum of the moving averages,
            # We will either buy or short the stock (or do nothing).

            if p == 1 and ma1 > ma2:
                order_target_percent(stock, 0.11 * speculations_allocation)
            elif p == -1 and ma1 < ma2:
                order_target_percent(stock, -0.11 * speculations_allocation)

                # alternatively we could just do:
                # order_target_percent(stock,(p*0.11))

        except Exception as e:
            print(str(e))

    record('ma1', ma1)
    record('ma2', ma2)
    record('Leverage_Spec', context.account.leverage)
Example no. 18
build_versicolor(KNeighborsClassifier(), "KNNVersicolor", with_proba=False)
build_versicolor(
    MLPClassifier(activation="tanh",
                  hidden_layer_sizes=(8, ),
                  solver="lbfgs",
                  random_state=13,
                  tol=0.01,
                  max_iter=100), "MLPVersicolor")
build_versicolor(SGDClassifier(random_state=13, max_iter=100),
                 "SGDVersicolor",
                 with_proba=False)
build_versicolor(SGDClassifier(random_state=13, loss="log", max_iter=100),
                 "SGDLogVersicolor")
build_versicolor(SVC(), "SVCVersicolor", to_sparse=True, with_proba=False)
build_versicolor(NuSVC(), "NuSVCVersicolor", to_sparse=True, with_proba=False)

#
# Multi-class classification
#

iris_df = load_csv("Iris.csv")

print(iris_df.dtypes)

iris_mapper = DataFrameMapper([
    (["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"], [
        ContinuousDomain(),
        RobustScaler(),
        IncrementalPCA(n_components=3, whiten=True)
    ]), ("Species", None)
Example no. 19
def main():
    from sklearn.svm import NuSVC

    X_train, X_test, y_train, y_test = bench.load_data(params)
    y_train = np.asfortranarray(y_train).ravel()

    if params.gamma is None:
        params.gamma = 1.0 / X_train.shape[1]

    cache_size_bytes = bench.get_optimal_cache_size(
        X_train.shape[0], max_cache=params.max_cache_size)
    params.cache_size_mb = cache_size_bytes / 1024**2
    params.n_classes = len(np.unique(y_train))

    clf = NuSVC(nu=params.nu,
                kernel=params.kernel,
                cache_size=params.cache_size_mb,
                tol=params.tol,
                gamma=params.gamma,
                probability=params.probability,
                random_state=43,
                degree=params.degree)

    fit_time, _ = bench.measure_function_time(clf.fit,
                                              X_train,
                                              y_train,
                                              params=params)
    params.sv_len = clf.support_.shape[0]

    if params.probability:
        state_predict = 'predict_proba'
        clf_predict = clf.predict_proba
        y_proba_train = clf_predict(X_train)
        y_proba_test = clf_predict(X_test)
        train_log_loss = bench.log_loss(y_train, y_proba_train)
        test_log_loss = bench.log_loss(y_test, y_proba_test)
        train_roc_auc = bench.roc_auc_score(y_train, y_proba_train)
        test_roc_auc = bench.roc_auc_score(y_test, y_proba_test)
    else:
        state_predict = 'prediction'
        clf_predict = clf.predict
        train_log_loss = None
        test_log_loss = None
        train_roc_auc = None
        test_roc_auc = None

    predict_train_time, y_pred = bench.measure_function_time(clf_predict,
                                                             X_train,
                                                             params=params)
    train_acc = bench.accuracy_score(y_train, y_pred)

    _, y_pred = bench.measure_function_time(clf_predict, X_test, params=params)
    test_acc = bench.accuracy_score(y_test, y_pred)

    bench.print_output(
        library='sklearn',
        algorithm='nusvc',
        stages=['training', state_predict],
        params=params,
        functions=['NuSVC.fit', f'NuSVC.{state_predict}'],
        times=[fit_time, predict_train_time],
        metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'],
        metrics=[
            [train_acc, test_acc],
            [train_log_loss, test_log_loss],
            [train_roc_auc, test_roc_auc],
            [int(clf.n_support_.sum()),
             int(clf.n_support_.sum())],
        ],
        data=[X_train, X_train],
        alg_instance=clf,
    )
Example no. 20
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    linear_model.LogisticRegression(solver='lbfgs',
                                    multi_class='multinomial',
                                    C=100),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)
Example no. 21
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
                                     max_features='sqrt',
                                     min_samples_leaf=1,
                                     min_samples_split=2,
                                     n_estimators=300)
classifier3 = LogisticRegression(random_state=0,
                                 solver='lbfgs',
                                 max_iter=1000,
                                 multi_class='multinomial')
classifier4 = XGBClassifier(random_state=0,
                            n_jobs=-1,
                            learning_rate=0.1,
                            n_estimators=100,
                            max_depth=3)
classifier5 = NuSVC(gamma=0.005,
                    kernel="rbf",
                    nu=0.5,
                    class_weight=None,
                    probability=True,
                    decision_function_shape='ovr')

estimators = []
estimators.append(('SVC', classifier1))
estimators.append(('RF', classifier2))
estimators.append(('LR', classifier3))
estimators.append(('XGB', classifier4))
estimators.append(('NuSVC', classifier5))

vot_hard = VotingClassifier(estimators=estimators,
                            voting='hard',
                            verbose=False)
vot_hard.fit(X_train, Y_train)
y_pred = vot_hard.predict(X_test)
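A natural follow-up, assuming Y_test is the held-out target paired with X_test, is to score the hard-voting ensemble:

from sklearn.metrics import accuracy_score
print("Hard-voting accuracy:", accuracy_score(Y_test, y_pred))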
Example no. 22
                             random_state=random_state)
le = LabelEncoder()
X_train_t = scaler.fit_transform(X_train)
# X_train_t = X_train
y_train_t = le.fit_transform(y_train)
X_test_t = scaler.transform(X_val)
# X_test_t = X_val
y_test_t = le.transform(y_val)

# %%
models = {
    'LogisticRegression': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVC': SVC(),
    'NuSVC': NuSVC(),
    'LinearSVC': LinearSVC(),
    'SGDClass': SGDClassifier(),
    'DecisionTree': DecisionTreeClassifier(max_depth=10),
    'RandomForest': RandomForestClassifier(max_depth=10),
    # 'BoostedTree': GradientBoostingClassifier()
}

# %% Quick score test of basic models
print('\nScores only scaling: ')
scores = quick_model(models, X_train_t, X_test_t, y_train_t, y_test_t)

# %% Do a grid search
if train_models:
    log_params = {
        'C': [0.1, 1, 100, 1000, 10000, 100000, 1000000],
Example no. 23
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.neural_network import MLPClassifier

CLASSIFIERS = {
    "knn": KNeighborsClassifier(),
    "svm": SVC(),
    "nusvm": NuSVC(),
    "dtree": DecisionTreeClassifier(),
    "rdforest": RandomForestClassifier(),
    "adaboost": AdaBoostClassifier(),
    "grdboost": GradientBoostingClassifier(),
    "nbayes": GaussianNB(),
    "gaussproc": GaussianProcessClassifier(),
    "lda": LinearDiscriminantAnalysis(),
    "qda": QuadraticDiscriminantAnalysis(),
    "mlpc": MLPClassifier(),
}

TUNING = {
    "knn": [
        {
            "n_neighbors": range(1, 20),
Example no. 24
    if isinstance(clf, OneVsRestClassifier):
        assert_multiclass_linear_classifier_explained(
            newsgroups_train, clf, explain_prediction_sklearn)


@pytest.mark.parametrize(['clf'], [
    [LogisticRegression(random_state=42)],
    [LogisticRegressionCV(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
    [SGDClassifier(**SGD_KWARGS)],
    [SVC(kernel='linear', random_state=42)],
    [SVC(kernel='linear', random_state=42, decision_function_shape='ovr')],
    [SVC(kernel='linear', random_state=42, decision_function_shape='ovr',
         probability=True)],
    [SVC(kernel='linear', random_state=42, probability=True)],
    [NuSVC(kernel='linear', random_state=42)],
    [NuSVC(kernel='linear', random_state=42, decision_function_shape='ovr')],
])
def test_explain_linear_binary(newsgroups_train_binary, clf):
    assert_binary_linear_classifier_explained(newsgroups_train_binary, clf,
                                              explain_prediction)


def test_explain_one_class_svm():
    X = np.array([[0, 0], [0, 1], [5, 3], [93, 94], [90, 91]])
    clf = OneClassSVM(kernel='linear', random_state=42).fit(X)
    res = explain_prediction(clf, X[0])
    assert res.targets[0].score < 0
    for expl in format_as_all(res, clf):
        assert 'BIAS' in expl
        assert 'x0' not in expl
Example no. 25
    feature_sets.append((find_features(review), category))

random.shuffle(feature_sets)

save_file = open("pickled/feature_sets.pickle", "wb")
pickle.dump(feature_sets, save_file)
save_file.close()

training_set = feature_sets[:10000]
testing_set = feature_sets[10000:]

all_classifiers = [MultinomialNB(),
                   BernoulliNB(),
                   LogisticRegression(),
                   LinearSVC(),
                   NuSVC()]

all_classifier_names = ["MultinomialNB",
                        "BernoulliNB",
                        "Logistic Regression",
                        "LinearSVC",
                        "NuSVC"]

# Train all classifiers
for i in range(0, len(all_classifiers)):
    classifier = SklearnClassifier(all_classifiers[i])
    classifier.train(training_set)
    all_classifiers[i] = classifier
    print(all_classifier_names[i], " accuracy: ",
          (nltk.classify.accuracy(classifier, testing_set)) * 100,
          "%")
Example no. 26
def ClassiferSelect(X, y):

    knn = KNeighborsClassifier()
    k_range = list(range(1, 31))
    leaf_range = list(range(1, 40))
    weight_options = ['uniform', 'distance']
    algorithm_options = ['auto', 'ball_tree', 'kd_tree', 'brute']
    param_gridKnn = dict(n_neighbors=k_range,
                         weights=weight_options,
                         algorithm=algorithm_options
                         #leaf_size = leaf_range
                         )
    gridKNN = GridSearchCV(knn, param_gridKnn, cv=10, scoring='accuracy')
    gridKNN.fit(X, y)
    print "Knn Score " + str(gridKNN.best_score_)
    print "Knn  best Params " + str(gridKNN.best_params_)
    #LogReg with gridSearch

    logreg = LogisticRegression()
    penalty_options = ['l1', 'l2']
    solver_options = ['liblinear', 'newton-cg', 'lbfgs', 'sag']
    tol_options = [0.0001, 0.00001, 0.000001, 0.000001]
    param_gridLog = dict(penalty=penalty_options, tol=tol_options)
    gridLog = GridSearchCV(logreg, param_gridLog, cv=10, scoring='accuracy')
    gridLog.fit(X, y)

    print "LogReg Score " + str(gridLog.best_score_)
    print "LogReg  best Params " + str(gridLog.best_params_)
    #NN with gridSearch

    NN = MLPClassifier(hidden_layer_sizes=(8, 5, 4))
    activation_options = ['identity', 'logistic', 'tanh', 'relu']
    solver_options = ['lbfgs', 'sgd', 'adam']
    learning_rate_options = ['constant', 'invscaling', 'adaptive']
    param_gridNN = dict(activation=activation_options,
                        solver=solver_options,
                        learning_rate=learning_rate_options)
    gridNN = GridSearchCV(NN, param_gridNN, cv=10, scoring='accuracy')
    gridNN.fit(X, y)
    print "NN Score " + str(gridNN.best_score_)
    print "NN  best Params " + str(gridNN.best_params_)

    #SVM with SVC
    flag = True
    if flag is True:
        svm = NuSVC()
        kernel_options = ['linear', 'sigmoid', 'rbf']  # 'precomputed' needs a kernel matrix, not raw features
        nu_options = np.arange(0.1, 1, 0.1)
        param_gridSVM = dict(kernel=kernel_options, nu=nu_options)
        gridSVM = GridSearchCV(svm, param_gridSVM, cv=10, scoring='accuracy')
        gridSVM.fit(X, y)
        print "SVM Score " + str(gridSVM.best_score_)
        print "SVM Params" + str(gridSVM.best_params)

        #Decision Tree
        dtree = DecisionTreeClassifier(random_state=0)
        criterion_options = ['gini', 'entropy']
        splitter_options = ['best', 'random']

        param_gridDtree = dict(criterion=criterion_options,
                               splitter=splitter_options)

        gridDtree = GridSearchCV(dtree,
                                 param_gridDtree,
                                 cv=10,
                                 scoring='accuracy')
        gridDtree.fit(X, y)

        print "Decision Tree Score " + str(gridDtree.best_score_)
        print "Decision Tree params " + str(gridDtree.best_params_)

        #Random Forest Classifier with GridSearch
        random = RandomForestClassifier()
        n_estimators_range = list(range(1, 31))
        criterion_options = ['gini', 'entropy']
        max_features_options = ['auto', 'log2', None]
        param_grid = dict(n_estimators=n_estimators_range,
                          criterion=criterion_options,
                          max_features=max_features_options)
        gridRandom = GridSearchCV(random,
                                  param_grid,
                                  cv=10,
                                  scoring='accuracy')
        gridRandom.fit(X, y)

        print "RTrees Score " + str(gridRandom.best_score_)
        print "RTrees Best Params " + str(gridRandom.best_params_)
Example no. 27
#split the data set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)

#feature scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

#initialize training environment
from sklearn.svm import NuSVC
Linear_SVC = NuSVC(kernel='linear')
Poly_SVC = NuSVC(kernel='poly')
Rbf_SVC = NuSVC(kernel='rbf')
Sigmoid_SVC = NuSVC(kernel='sigmoid')

#training with SVM
Linear_SVC.fit(X_train, y_train)
Linear_SVC_y_predict = Linear_SVC.predict(X_test)
print 'The accuracy of Linear_SVC is:', Linear_SVC.score(X_test, y_test)

Poly_SVC.fit(X_train, y_train)
Poly_SVC_y_predict = Poly_SVC.predict(X_test)
print 'The accuracy of Poly_SVC is:', Poly_SVC.score(X_test, y_test)

Rbf_SVC.fit(X_train, y_train)
Rbf_SVC_y_predict = Rbf_SVC.predict(X_test)
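The example is truncated mid-pattern; the remaining kernels presumably repeat the same fit/predict/score steps (a hedged completion in the snippet's own Python 2 style):

print 'The accuracy of Rbf_SVC is:', Rbf_SVC.score(X_test, y_test)

Sigmoid_SVC.fit(X_train, y_train)
Sigmoid_SVC_y_predict = Sigmoid_SVC.predict(X_test)
print 'The accuracy of Sigmoid_SVC is:', Sigmoid_SVC.score(X_test, y_test)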
Example no. 28
 def __init__(self, featureset=None, target=None, mode='predict', path=''):
     if (mode == 'train'):
         self.__svm = SVC(C=1.0,
                          cache_size=200,
                          class_weight=None,
                          coef0=0.0,
                          decision_function_shape='ovr',
                          degree=3,
                          gamma='auto',
                          kernel='rbf',
                          max_iter=-1,
                          probability=False,
                          random_state=None,
                          shrinking=True,
                          tol=0.001,
                          verbose=False)
         self.__svr = SVR(C=1.0,
                          cache_size=200,
                          coef0=0.0,
                          degree=3,
                          epsilon=0.1,
                          gamma='auto',
                          kernel='rbf',
                          max_iter=-1,
                          shrinking=True,
                          tol=0.001,
                          verbose=False)
         self.__nusvm = NuSVC(cache_size=200,
                              class_weight=None,
                              coef0=0.0,
                              decision_function_shape='ovr',
                              degree=3,
                              gamma='auto',
                              kernel='rbf',
                              max_iter=-1,
                              nu=0.5,
                              probability=False,
                              random_state=None,
                              shrinking=True,
                              tol=0.001,
                              verbose=False)
         self.__nusvr = NuSVR(C=1.0,
                              cache_size=200,
                              coef0=0.0,
                              degree=3,
                              gamma='auto',
                              kernel='rbf',
                              max_iter=-1,
                              nu=0.5,
                              shrinking=True,
                              tol=0.001,
                              verbose=False)
         self.__linsvm = LinearSVC(C=1.0,
                                   class_weight=None,
                                   dual=True,
                                   fit_intercept=True,
                                   intercept_scaling=1,
                                   loss='squared_hinge',
                                   max_iter=1000,
                                   multi_class='ovr',
                                   penalty='l2',
                                   random_state=None,
                                   tol=0.0001,
                                   verbose=0)
         self.__linsvr = LinearSVR(C=1.0,
                                   dual=True,
                                   epsilon=0.0,
                                   fit_intercept=True,
                                   intercept_scaling=1.0,
                                   loss='epsilon_insensitive',
                                   max_iter=1000,
                                   random_state=None,
                                   tol=0.0001,
                                   verbose=0)
         self.__mlpc = MLPC(activation='relu',
                            alpha=1e-05,
                            batch_size='auto',
                            beta_1=0.9,
                            beta_2=0.999,
                            early_stopping=False,
                            epsilon=1e-08,
                            hidden_layer_sizes=(100, 25),
                            learning_rate='constant',
                            learning_rate_init=0.001,
                            max_iter=200,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            power_t=0.5,
                            random_state=1,
                            shuffle=True,
                            solver='lbfgs',
                            tol=0.0001,
                            validation_fraction=0.1,
                            verbose=False,
                            warm_start=False)
         self.__mlpr = MLPR(activation='relu',
                            alpha=0.0001,
                            batch_size='auto',
                            beta_1=0.9,
                            beta_2=0.999,
                            early_stopping=False,
                            epsilon=1e-08,
                            hidden_layer_sizes=(100, 25),
                            learning_rate='constant',
                            learning_rate_init=0.001,
                            max_iter=200,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            power_t=0.5,
                            random_state=None,
                            shuffle=True,
                            solver='adam',
                            tol=0.0001,
                            validation_fraction=0.1,
                            verbose=False,
                            warm_start=False)
         self.__dtc = DTC(class_weight=None,
                          criterion='gini',
                          max_depth=None,
                          max_features=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          presort=False,
                          random_state=None,
                          splitter='best')
         self.__dtr = DTR(criterion='mse',
                          max_depth=None,
                          max_features=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          presort=False,
                          random_state=None,
                          splitter='best')
         self.__rfc = RFC(bootstrap=True,
                          class_weight=None,
                          criterion='gini',
                          max_depth=100,
                          max_features='auto',
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          n_estimators=50,
                          n_jobs=1,
                          oob_score=False,
                          random_state=None,
                          verbose=0,
                          warm_start=False)
         self.__rfr = RFR(bootstrap=True,
                          criterion='mse',
                          max_depth=None,
                          max_features='auto',
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          n_estimators=10,
                          n_jobs=1,
                          oob_score=False,
                          random_state=None,
                          verbose=0,
                          warm_start=False)
         (self.__svm, self.__svr, self.__nusvm, self.__nusvr, self.__linsvm,
          self.__linsvr, self.__mlpc, self.__mlpr, self.__dtc, self.__dtr,
          self.__rfc, self.__rfr) = self.__trainAll(X=list(featureset),
                                                    Y=list(target))
         self.__saveModelsToFile(path)
     else:
         self.__svm = joblib.load(path + 'Mel_SVM.pkl')
         self.__svr = joblib.load(path + 'Mel_SVR.pkl')
         self.__nusvm = joblib.load(path + 'Mel_NuSVM.pkl')
         self.__nusvr = joblib.load(path + 'Mel_NuSVR.pkl')
         self.__linsvm = joblib.load(path + 'Mel_LinSVM.pkl')
         self.__linsvr = joblib.load(path + 'Mel_LinSVR.pkl')
         self.__mlpc = joblib.load(path + 'Mel_MLPC.pkl')
         self.__mlpr = joblib.load(path + 'Mel_MLPR.pkl')
         self.__dtc = joblib.load(path + 'Mel_DTC.pkl')
         self.__dtr = joblib.load(path + 'Mel_DTR.pkl')
         self.__rfc = joblib.load(path + 'Mel_RFC.pkl')
         self.__rfr = joblib.load(path + 'Mel_RFR.pkl')
Example no. 29
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

from yellowbrick.utils.timer import Timer

warnings.filterwarnings("ignore")

# Try them all!
models = [
    LinearSVC(),
    SVC(gamma='auto'),
    NuSVC(gamma='auto'),
    BaggingClassifier(),
    KNeighborsClassifier(),
    LogisticRegressionCV(cv=3),
    LogisticRegression(solver='lbfgs'),
    SGDClassifier(max_iter=100, tol=1e-3),
    MLPClassifier(alpha=1, max_iter=1000),
    ExtraTreesClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100),
    GaussianProcessClassifier(1.0 * RBF(1.0))
]

short_list = [
    LogisticRegression(solver='lbfgs'),  # simple model that scales well
    MLPClassifier(alpha=1, max_iter=1000),  # complex model
    SVC(gamma='auto'),  # more complex model
print("Usando AD extra se tiene una tasa de acierto del ",np.mean(scoresADex)*100)

# Validation matrices
print("Normal decision tree matrix: ", matrizCruzada(predADnor))
print("Extra decision tree matrix: ", matrizCruzada(predADex))


# Plot the trees
tree.plot_tree(arbNor)
tree.plot_tree(arbEx)


# Third algorithm: SUPPORT VECTOR MACHINE

#SVM - NuSVC
svr_nu = NuSVC()
svr_nu.fit(data_train, target_train)
predsvNu = svr_nu.predict(data_test)
scoresNu = cross_val_score(svr_nu, atributos, target, cv=5, scoring='accuracy')

#SVM - SVC
svr_svc = SVC()
svr_svc.fit(data_train, target_train)
predsvSvc = svr_svc.predict(data_test)
scoresSvc = cross_val_score(svr_svc, atributos, target, cv=5, scoring='accuracy')


# Accuracy percentages
print("Using NuSVC the accuracy rate is ", np.mean(scoresNu)*100)
print("Using SVC the accuracy rate is ", np.mean(scoresSvc)*100)