예제 #1
0
def main():
    """
    Compare multi-class strategies for a linear SVM on MNIST.

    One vs the rest : 77.61%
    Default         : 77.61%
    One vs one      : 85.07%
    """
    seed = 123456789
    np.random.seed(seed)
    ntrain, ntest = 800, 200

    # Stack the train and test folds; MNISTCV re-splits them per iteration.
    (tr_x, tr_y), (te_x, te_y) = load_mnist()
    x = np.vstack((tr_x, te_x))
    y = np.hstack((tr_y, te_y))
    cv = MNISTCV(tr_y, te_y, ntrain, ntest, 1, seed)

    # One-vs-rest wrapper, plain LinearSVC (one-vs-rest internally), one-vs-one.
    builders = (
        lambda: OneVsRestClassifier(LinearSVC(random_state=seed), -1),
        lambda: LinearSVC(random_state=seed),
        lambda: OneVsOneClassifier(LinearSVC(random_state=seed), -1),
    )
    for tr, te in cv:
        for build in builders:
            clf = build()
            clf.fit(x[tr], y[tr])
            print(clf.score(x[te], y[te]))
def main():
    word_vec_dict = readGloveData("./glove.twitter.27B/glove.twitter.27B.25d.txt")
    tweets = readTweets("./dataset_raw/semeval2016-task6-trainingdata.txt")

    tweetVectors = getTweetVectors(tweets[0 : len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    tweetClasses = set(tweets[-1])

    mapping = {"favor": 1, "none": 0, "against": 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC())
    # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543)
    X_train = X[0 : int(0.7 * len(X))]
    y_train = Y[0 : int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    for indexMax in xrange(len(y_test)):
        print str(y_pred[indexMax]) + " " + str(y_test[indexMax])
def main():
    word_vec_dict = readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')

    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    tweetClasses = set(tweets[-1])

    mapping = {'favor': 1, 'none': 0, 'against': 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
예제 #4
0
def test_solve_primal_l2_svc_with_line_search_optimizers():
    """Primal L2 (squared-hinge) SVC trained with four line-search optimizers.

    For each optimizer, every per-class estimator must converge to the loss
    minimizer, and test accuracy must clear a loose floor.
    """
    X, y = load_iris(return_X_y=True)
    X_scaled = MinMaxScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=123456)

    for optimizer in (SteepestGradientDescent, ConjugateGradient, Newton, BFGS):
        svc = OVR(SVC(loss=squared_hinge, optimizer=optimizer))
        svc = svc.fit(X_train, y_train)
        # (fix) the original asserted a bare generator expression, which is
        # always truthy and therefore never failed; wrap it in all() so the
        # per-estimator convergence check is actually evaluated.
        assert all(np.allclose(np.hstack((estimator.coef_, estimator.intercept_)),
                               estimator.loss.x_star())
                   for estimator in svc.estimators_)
        assert svc.score(X_test, y_test) >= 0.57
예제 #5
0
def test_solve_dual_l2_svc_with_AdaGrad():
    """Dual L2 (squared-hinge) Gaussian-kernel SVC optimized with AdaGrad.

    Runs once with the intercept regularized and once without; both
    variants must reach at least 97% accuracy on the held-out iris split.
    """
    X, y = load_iris(return_X_y=True)
    X_scaled = MinMaxScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=123456)

    for reg_intercept in (True, False):
        svc = OVR(
            SVC(loss=squared_hinge,
                kernel=gaussian,
                reg_intercept=reg_intercept,
                dual=True,
                optimizer=AdaGrad,
                learning_rate=1.))
        svc = svc.fit(X_train, y_train)
        assert svc.score(X_test, y_test) >= 0.97
예제 #6
0
def main():
    word_vec_dict = readGloveData('./glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('./dataset_raw/semeval2016-task6-trainingdata.txt')

    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    tweetClasses = set(tweets[-1])

    mapping = {'favor': 1, 'none': 0, 'against': 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC())
    # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543)
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    for indexMax in xrange(len(y_test)):
        print str(y_pred[indexMax]) + ' ' + str(y_test[indexMax]) 
예제 #7
0
def test_solve_dual_l1_svc_with_proximal_bundle():
    """Dual L1 (hinge) Gaussian-kernel SVC optimized with ProximalBundle.

    Trains one variant with a regularized intercept and one without;
    each must score at least 97% on the held-out iris split.
    """
    X, y = load_iris(return_X_y=True)
    X_scaled = MinMaxScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=123456)

    for reg_intercept in (True, False):
        svc = OVR(
            SVC(loss=hinge,
                kernel=gaussian,
                reg_intercept=reg_intercept,
                dual=True,
                optimizer=ProximalBundle,
                max_iter=150))
        svc = svc.fit(X_train, y_train)
        assert svc.score(X_test, y_test) >= 0.97
예제 #8
0
def fit_multiclass_svm(documents, idfs):
	model = gensim.models.Word2Vec.load("train_word2vec.model")
	dim = 50;
	X = np.zeros([4000, dim]);
	X_test = np.zeros([490, dim]);
	y = np.zeros(4000);
	y_test = np.zeros(490);
	i = 0
	for doc in documents[:4000]:
		x = np.zeros(dim)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X[i, :] = x/count
		y[i] = doc["topic_id"]
		i = i + 1;
	svm_model = OneVsRestClassifier(LinearSVC(random_state=0, C = 1)).fit(X, y)
	
	
	i = 0
	for doc in documents[4000:4490]:
		x = np.zeros(dim)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X_test[i, :] = x/count
		y_test[i] = doc["topic_id"]
		i = i + 1;
	print svm_model.score(X_test, y_test)
예제 #9
0
def main():
    word_vec_dict = readGloveData(
        '../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')

    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    tweetClasses = set(tweets[-1])

    mapping = {'favor': 1, 'none': 0, 'against': 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)):len(X)]
    y_test = Y[int(0.7 * len(Y)):len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
예제 #10
0
def fit_multiclass_svm1(documents, idfs):
	model = gensim.models.doc2vec.Doc2Vec.load("train_doc2vec.model")
	X = np.zeros([4000, 300]);
	X_test = np.zeros([490, 300]);
	y = np.zeros(4000);
	y_test = np.zeros(490);
	i = 0
	for doc in documents[:4000]:
		x = np.zeros(300)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X[i, :] = x/count
		y[i] = doc["topic_id"]
		i = i + 1;
	svm_model = OneVsRestClassifier(svm.SVC(kernel='poly', gamma=2)).fit(X, y)
	
	
	i = 0
	for doc in documents[4000:4490]:
		x = np.zeros(300)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X_test[i, :] = x/count
		y_test[i] = doc["topic_id"]
		i = i + 1;
	print svm_model.score(X_test, y_test)
예제 #11
0
def one_vs_all(X, y, test_size=0.2, run_num = 100, svm_type='linear'):
    """Trains 15 1 vs all SVM classifiers of specified type.

    Repeats `run_num` random train/test splits, refitting the one-vs-rest
    wrapper on each, and records the training and test accuracy per run.

    Returns:
        ovr: the classifier fitted on the final split.
        acc_tr, acc_tst: per-run training / test accuracy lists.
    """
    # Python has a wonderful wrapper function that creates 1 vs all classifiers!
    # (fix) the original tested `type == 'linear'`, comparing the *builtin*
    # `type` to a string — always False, so LinearSVC was unreachable.
    # Test the `svm_type` parameter instead.
    if svm_type == 'linear':
        estimator = LinearSVC()
    else:
        # This will automatically use RBF functions
        estimator = SVC()

    ovr = OneVsRestClassifier(estimator = estimator)

    acc_tr = []
    acc_tst = []

    for i in range(run_num):
        [X_train, X_test, y_train, y_test] = train_test_split(X, y,
                                                              test_size=test_size)
        # Train the classifier
        ovr.fit(X_train, y_train.ravel())

        # Work out the score on the training data. However there is nothing
        # to optimise for - we are just getting an idea of the accuracy for
        # training vs test data. box plot opportunity!
        tr_acc = ovr.score(X_train, y_train.ravel())
        tst_acc = ovr.score(X_test, y_test.ravel())

        acc_tr.append(tr_acc)
        acc_tst.append(tst_acc)

        # All the data isn't used here as it tends to overtrain the classifier.

    return ovr, acc_tr, acc_tst
예제 #12
0
def main():
    """Train a one-vs-rest RBF SVM on MNIST-style data and report metrics."""
    # load train data and test data
    print("loading data...")
    train_datas, train_labels = load_mnist_data(
        file_name='data/train_data/TrainSamples',
        label_filename='data/train_data/TrainLabels')
    test_datas, test_labels = load_mnist_data(
        file_name='data/valid_data/ValidSamples',
        label_filename='data/valid_data/ValidLabels')
    print("Train data size: {}, Test data size: {}".format(
        len(train_datas), len(test_datas)))

    # Standardize features; statistics are fitted on the training split only.
    scaler = StandardScaler()
    train_datas = scaler.fit_transform(train_datas)
    test_datas = scaler.transform(test_datas)

    # create models and train
    print("training the model....")
    base_svc = svm.SVC(C=10, kernel='rbf', gamma='auto')
    clf = OneVsRestClassifier(estimator=base_svc, n_jobs=4)
    clf.fit(train_datas, train_labels)
    print("training is done!")
    print("fit result: ", clf.score(train_datas, train_labels))
    # test on the test data
    acc = clf.score(test_datas, test_labels)
    print("test acc: ", acc)
    predict = clf.predict(test_datas)
    print("Classification report:\n ",
          metrics.classification_report(test_labels, predict))
    print("Confusion matrix:\n ",
          metrics.confusion_matrix(test_labels, predict))
예제 #13
0
def tfidfDoClassify(X_train, X_test, y_train, y_test, labels, label,
                    n_components):
    resultDict = {}
    X_train = StandardScaler(with_mean=False).fit_transform(X_train)
    X_test = StandardScaler(with_mean=False).fit_transform(X_test)
    # iterate over classifiers
    for name, aclf in zip(names, classifiers):
        print n_components, label, name
        if name != "Logistic Regression":
            clf = OneVsRestClassifier(aclf)
        else:
            clf = aclf
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_ = clf.predict(X_train)
        prf1sDict = {}
        # chi,pval = chi2(X_train, y_train)
        # prf1sDict["chi2"] = chi
        # prf1sDict["pval"] = pval
        precision = 0
        recall = 0
        fscore = 0
        support = 0
        try:
            precision, recall, fscore, support = precision_recall_fscore_support(
                y_test, y_pred, average='weighted', labels=labels)
            logging.debug(
                str(precision) + "," + str(recall) + "," + str(fscore) + "," +
                str(support) + "," + name + "," + str(n_components) + "," +
                label)
            score = clf.score(X_test, y_test)
            prf1sDict["testReport"] = classification_report(y_test,
                                                            y_pred,
                                                            labels=labels)
            prf1sDict["testConfusionMatrix"] = confusion_matrix(
                y_train, y_pred)
            # pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
            prf1sDict["testScore"] = score
            prf1sDict["testPrecision"] = precision
            prf1sDict["testRecall"] = recall
            prf1sDict["testFscore"] = fscore
            precision_, recall_, fscore_, support_ = precision_recall_fscore_support(
                y_train, y_pred_, average='weighted', labels=labels)
            score_ = clf.score(X_train, y_train)
            prf1sDict["trainReport"] = classification_report(y_train,
                                                             y_pred_,
                                                             labels=labels)
            prf1sDict["trainConfusionMatrix"] = confusion_matrix(
                y_train, y_pred_)
            # pd.crosstab(y_train, y_pred_, rownames=['True'], colnames=['Predicted'], margins=True)
            prf1sDict["trainScore"] = score_
            prf1sDict["trainPrecision"] = precision_
            prf1sDict["trainRecall"] = recall_
            prf1sDict["trainFscore"] = fscore_
            resultDict[name] = prf1sDict
        except ValueError:
            print name
            continue
    return resultDict
예제 #14
0
class SVMSentiment:

    def __init__(self):
       self.max_length = 500
       self.batch_size=50
       self.model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=1,C = 1,tol=0.0001,cache_size=5000)  )


    def configureSVMModel(self,TrainX,TrainY,validX,validY):
       print('Configuring the SVM Model')
       currPath = os.getcwd()
       currFiles =  os.listdir(currPath)
       print('################### Test #####################')
       print(currFiles.count('SVMScores.pkl'))
       if(currFiles.count('SVMScores.pkl')==0):
          self.model.fit(TrainX, TrainY)
          # Saving model scores
          joblib.dump(self.model,currPath+'/SVMScores.pkl')
       else:
          print('Loading already existing Model')
          self.model = joblib.load(currPath+'/SVMScores.pkl')
       

    def evaluateSVMModel(self,TestX,TestY):
       print self.model.score(TestX, TestY)

       predicted_data=[]
       for i in range(len(TestX)):
          predicted_data.append(list([self.model.predict (TestX[i].reshape(1,-1)) ,TestY[i]]) )

       print "Predicted Data"
       print predicted_data
       #print TestY

    def predictSentiment(self,dataX,dataY):
       print('@@@@@@@@@@@@@@@@ Length of test data : ',len(dataX))
       for i in range(len(dataX)):
         predicted_data = self.model.predict(dataX[i].reshape(1,-1))
         expected_out = dataY[i]

       print('############### Predicted data :',predicted_data,' ; ; ',expected_out)
       return predicted_data

    def getTrainTestData(self):
       print('Loading Training and Test data')
       trainX=[]
       trainY=[]
       testX=[]
       testY = []
       f= open('trainingdata.pkl','rb')
       (trainX,trainY) = cPickle.load(f)       
       f= open('testingdata.pkl','rb')
       (testX,testY)  = cPickle.load(f)

       return ((trainX,trainY),(testX,testY))

    def getValidationData(self,dataX,dataY):
       return dataX[0:self.batch_size,:],dataY[0:self.batch_size,:]
예제 #15
0
class SVMSentiment:
    def __init__(self):
        self.max_length = 500
        self.batch_size = 50
        self.model = OneVsRestClassifier(
            svm.SVC(kernel='rbf', gamma=1, C=1, tol=0.0001, cache_size=5000))

    def configureSVMModel(self, TrainX, TrainY, validX, validY):
        print('Configuring the SVM Model')
        currPath = os.getcwd()
        currFiles = os.listdir(currPath)
        print('################### Test #####################')
        print(currFiles.count('SVMScores.pkl'))
        if (currFiles.count('SVMScores.pkl') == 0):
            self.model.fit(TrainX, TrainY)
            # Saving model scores
            joblib.dump(self.model, currPath + '/SVMScores.pkl')
        else:
            print('Loading already existing Model')
            self.model = joblib.load(currPath + '/SVMScores.pkl')

    def evaluateSVMModel(self, TestX, TestY):
        print self.model.score(TestX, TestY)

        predicted_data = []
        for i in range(len(TestX)):
            predicted_data.append(
                list([self.model.predict(TestX[i].reshape(1, -1)), TestY[i]]))

        print "Predicted Data"
        print predicted_data
        #print TestY

    def predictSentiment(self, dataX, dataY):
        print('@@@@@@@@@@@@@@@@ Length of test data : ', len(dataX))
        for i in range(len(dataX)):
            predicted_data = self.model.predict(dataX[i].reshape(1, -1))
            expected_out = dataY[i]

        print('############### Predicted data :', predicted_data, ' ; ; ',
              expected_out)
        return predicted_data

    def getTrainTestData(self):
        print('Loading Training and Test data')
        trainX = []
        trainY = []
        testX = []
        testY = []
        f = open('trainingdata.pkl', 'rb')
        (trainX, trainY) = cPickle.load(f)
        f = open('testingdata.pkl', 'rb')
        (testX, testY) = cPickle.load(f)

        return ((trainX, trainY), (testX, testY))

    def getValidationData(self, dataX, dataY):
        return dataX[0:self.batch_size, :], dataY[0:self.batch_size, :]
def train_model(reviews, result):
    X_train, X_test, y_train, y_test = train_test_split(reviews,
                                                        result,
                                                        test_size=0.2,
                                                        random_state=42)
    svm_classifier = OneVsRestClassifier(LinearSVC(random_state=0))
    svm_classifier.fit(X_train, y_train)
    print svm_classifier.score(X_test, y_test)
    joblib.dump(svm_classifier, './model_svm/svm_model.pkl')
def doClassify(X, y):
    resultDict = {}
    X_train, X_test, y_train, y_test = \
     train_test_split(X, y, test_size=testRatio, random_state=42)
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        print "Running cliasifer:", name
        if name != "Logistic Regression":
            clf = OneVsRestClassifier(clf)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_ = clf.predict(X_train)
        prf1sDict = {}
        precision = 0
        recall = 0
        fscore = 0
        support = 0
        try:
            precision, recall, fscore, support = precision_recall_fscore_support(
                y_test, y_pred, average="weighted")
            logging.debug(
                str(precision) + "," + str(recall) + "," + str(fscore) + "," +
                str(support) + "," + name)
            score = clf.score(X_test, y_test)
            prf1sDict["testReport"] = classification_report(y_test, y_pred)
            labels = list(set(y_test))
            confMat = confusion_matrix(y_test, y_pred, labels=labels)
            print "confMat type:", type(confMat)
            print "confMat len:", len(confMat)
            print "confMat:"
            print confMat
            print labels
            prf1sDict["testConfMat"] = confMat.tolist()
            prf1sDict["testScore"] = score
            prf1sDict["testPrecision"] = precision
            prf1sDict["testRecall"] = recall
            prf1sDict["testFscore"] = fscore
            precision_, recall_, fscore_, support_ = precision_recall_fscore_support(
                y_train, y_pred_, average="weighted")
            score_ = clf.score(X_train, y_train)
            prf1sDict["trainReport"] = classification_report(y_train, y_pred_)
            prf1sDict["trainConfMat"] = confusion_matrix(y_train,
                                                         y_pred_).tolist()
            prf1sDict["trainScore"] = score_
            prf1sDict["trainPrecision"] = precision_
            prf1sDict["trainRecall"] = recall_
            prf1sDict["trainFscore"] = fscore_
            resultDict[name] = prf1sDict
        except ValueError:
            print "Error for claissifier:", name
            print "Unexpected error in test:", sys.exc_info()
            continue
    return resultDict
예제 #18
0
def OneVsAll(train_data, test_data, train_label, test_label, numFeatures):
    """Fit a one-vs-rest MSE binary classifier on the first numFeatures
    columns and print train/test accuracy percentages."""
    model = OneVsRestClassifier(MSE_binary())
    train_X = train_data[:, :numFeatures]
    test_X = test_data[:, :numFeatures]
    model.fit(train_X, train_label)
    train_pct = model.score(train_X, train_label) * 100
    test_pct = model.score(test_X, test_label) * 100
    print('Using', numFeatures, 'features:')
    print("Training Accuracy:  " + str(train_pct) + " %")
    print("Testing Accuracy:   " + str(test_pct) + " %")
예제 #19
0
def doClassify(jsonDict, label, feature, gramIndex):
    termDocMatrix, allSyscallsVector = cd.createTermDocMatrix(
        jsonDict, feature)
    labels = getAppLabelList(termDocMatrix, label)
    X, y = generateNormalFeatureMatrix(termDocMatrix, allSyscallsVector, label,
                                       labels)
    resultDict = {}
    X_train, X_test, y_train, y_test = \
     train_test_split(X, y, test_size=testRatio, random_state=42)
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    # iterate over classifiers
    for name, aclf in zip(names, classifiers):
        print gramIndex, label, feature, name
        if name != "Logistic Regression":
            clf = OneVsRestClassifier(aclf)
        else:
            clf = aclf
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_ = clf.predict(X_train)
        prf1sDict = {}
        precision = 0
        recall = 0
        fscore = 0
        support = 0
        try:
            precision, recall, fscore, support = precision_recall_fscore_support(
                y_test, y_pred, average='weighted')
            logging.debug(
                str(precision) + "," + str(recall) + "," + str(fscore) + "," +
                str(support) + "," + name + "," + str(gramIndex) + "," +
                label + "," + feature)
            score = clf.score(X_test, y_test)
            prf1sDict["testReport"] = classification_report(y_test, y_pred)
            prf1sDict["testScore"] = score
            prf1sDict["testPrecision"] = precision
            prf1sDict["testRecall"] = recall
            prf1sDict["testFscore"] = fscore
            precision_, recall_, fscore_, support_ = precision_recall_fscore_support(
                y_train, y_pred_, average='weighted')
            score_ = clf.score(X_train, y_train)
            prf1sDict["trainReport"] = classification_report(y_train, y_pred_)
            prf1sDict["trainScore"] = score_
            prf1sDict["trainPrecision"] = precision_
            prf1sDict["trainRecall"] = recall_
            prf1sDict["trainFscore"] = fscore_
            resultDict[name] = prf1sDict
        except ValueError:
            print name
            continue
    return resultDict
예제 #20
0
def clasificar_OVA(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
	"""Train a one-vs-all decision tree and report train/test CCR.

	Returns the test accuracy. `X`, `y` and `df` are kept for interface
	compatibility but are no longer used here.
	"""
	print("\n[" + str(graphname) + "]")
	clfBase=DecisionTreeClassifier()
	# (fix) removed an unused 10-fold `cross_val_score(clfBase, X, y, cv=10)`
	# call whose result was discarded — it only wasted training time.
	clf=OneVsRestClassifier(clfBase)
	clf=clf.fit(trainInputs, trainOutputs)
	precisionTrain = clf.score(trainInputs, trainOutputs)
	precisionTest = clf.score(testInputs, testOutputs)
	print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))
	prediccion_test = clf.predict(testInputs)
	print(prediccion_test)
	print(testOutputs)
	return precisionTest
예제 #21
0
 def classifiers(self):
     """Fit a one-vs-rest RBF SVM and record predictions and accuracies.

     Stores y_predict, y_predict_proba (decision-function scores),
     train_accuracy and test_accuracy on self, and returns the fitted
     classifier for use in cross-validation.
     """
     # For multi-class SVM with ROC curves, OneVsRestClassifier is required;
     # a bare SVC can classify but its output cannot be used to draw the ROC.
     classifier = OneVsRestClassifier(svm.SVC(kernel='rbf', probability=True,))
     classifier.fit(self.x_train,self.y_train)
     # Expose the evaluation outputs on self for later scoring.
     self.y_predict=classifier.predict(self.x_test)
     # Both predict_proba and decision_function can produce an ROC curve, with
     # different results — the original author left the reason as an open
     # question; decision_function is the one used here.
     #self.y_predict_proba=classifier.predict_proba(self.x_test)
     self.y_predict_proba=classifier.decision_function(self.x_test)
     self.train_accuracy=classifier.score(self.x_train,self.y_train)
     self.test_accuracy=classifier.score(self.x_test,self.y_test)
     # Return the classifier for cross-validation evaluation.
     return classifier
예제 #22
0
def start_classification(training_props):
    #    print "training SVM model"
    vectorizer = DictVectorizer()
    X = convert_props(training_props)
    X = vectorizer.fit_transform(X)
    mb = MultiLabelBinarizer()
    Y = mb.fit_transform(zip(*training_props)[0])
    X_train, X_test, y_train, y_test=train_test_split(X, Y,\
            test_size=0.1, random_state=0)
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
    predicted = clf.predict(X_test)
    print classification_report(y_test, predicted)
예제 #23
0
def evaluate(train_vector, test_vector):
    """Fit a one-vs-rest logistic regression, persist it, and log accuracy.

    Each vector is indexable as (features, labels).
    """
    log.info('total training instances: {0}'.format(len(train_vector[0])))
    log.info('total testing instances: {0}'.format(len(test_vector[0])))

    train_X, train_y = train_vector[0], train_vector[1]
    test_X, test_y = test_vector[0], test_vector[1]

    classifier = OneVsRestClassifier(LogisticRegression(C=10), n_jobs=-1)
    classifier.fit(train_X, train_y)

    # Persist the fitted model before scoring.
    with safe_open(DEEP_DAN_CLASSIFIER_TARGET, 'wb') as f:
        pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

    log.info('accuracy train: {0}'.format(classifier.score(X=train_X, y=train_y)))
    log.info('accuracy test: {0}'.format(classifier.score(X=test_X, y=test_y)))
예제 #24
0
def evaluate(train_vector, test_vector):
    """Train, serialize, and score a one-vs-rest logistic-regression model.

    train_vector / test_vector are indexable as (features, labels).
    """
    log.info('total training instances: {0}'.format(len(train_vector[0])))
    log.info('total testing instances: {0}'.format(len(test_vector[0])))

    base = LogisticRegression(C=10)
    classifier = OneVsRestClassifier(base, n_jobs=-1)
    classifier.fit(train_vector[0], train_vector[1])

    # Serialize the trained model before computing scores.
    with safe_open(DEEP_DAN_CLASSIFIER_TARGET, 'wb') as f:
        pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

    accuracies = {
        'train': classifier.score(X=train_vector[0], y=train_vector[1]),
        'test': classifier.score(X=test_vector[0], y=test_vector[1]),
    }
    log.info('accuracy train: {0}'.format(accuracies['train']))
    log.info('accuracy test: {0}'.format(accuracies['test']))
예제 #25
0
def main():
    number = [1000, 5000, 10000, 15000, 60000]
    for data in number:
        x_train, y_train = m.read_mnist(
            'MNIST_data/train-images-idx3-ubyte.gz',
            'MNIST_data/train-labels-idx1-ubyte.gz')
        x_train = x_train[:data]
        y_train = y_train[:data]
        X_train = x_train.reshape(-1, 28 * 28).astype(np.float32)
        X_train = X_train * (2.0 / 255.0) - 1.0
        x_test, y_test = m.read_mnist('MNIST_data/t10k-images-idx3-ubyte.gz',
                                      'MNIST_data/t10k-labels-idx1-ubyte.gz')
        X_test = x_test.reshape(-1, 28 * 28).astype(np.float32)
        X_test = X_test * (2.0 / 255.0) - 1.0

        classif = OneVsRestClassifier(LinearSVC(C=100.))

        print "Started learning..."
        before1 = dt.datetime.now()
        classif.fit(X_train, y_train)
        after1 = dt.datetime.now()
        print "Done learning!"
        beforeA = dt.datetime.now()
        scoreA = classif.score(X_test, y_test)
        afterA = dt.datetime.now()
        beforeTA = dt.datetime.now()
        scoreTA = classif.score(X_train, y_train)
        afterTA = dt.datetime.now()
        print "Test data accuracy: ", scoreA
        print "Training data accuracy: ", scoreTA
        print "Time it took to train once: ", after1 - before1
        print "Time it took to verify test: ", afterA - beforeA
        print "Time it took to verify training: ", afterTA - beforeTA

        print "Learning again..."
        before2 = dt.datetime.now()
        classif.fit(X_train, y_train)
        after2 = dt.datetime.now()
        print "Done learning!"
        beforeB = dt.datetime.now()
        scoreB = classif.score(X_test, y_test)
        afterB = dt.datetime.now()
        beforeTB = dt.datetime.now()
        scoreTB = classif.score(X_train, y_train)
        afterTB = dt.datetime.now()
        print "Test data accuracy: ", scoreB
        print "Training data accuracy: ", scoreTB
        print "Time it took to train once: ", after2 - before2
        print "Time it took to verify test: ", afterB - beforeB
        print "Time it took to verify training: ", afterTB - beforeTB
예제 #26
0
def get_trained_regr_classifier(training_data, test_data):
    """Fit a one-vs-rest linear SVM on the labels common to both dicts.

    Each dict maps label -> list of samples. Returns a dict holding the
    fitted classifier plus its training and test accuracy.
    """
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    X, y = [], []
    X_test, y_test = [], []
    # Only labels present in BOTH splits contribute samples.
    for key in training_data.keys() & test_data.keys():
        train_samples = training_data[key]
        test_samples = test_data[key]
        X.extend(train_samples)
        y.extend([key] * len(train_samples))
        X_test.extend(test_samples)
        y_test.extend([key] * len(test_samples))

    ovr.fit(X, y)
    return {
        "ovr": ovr,
        "training_score": ovr.score(X, y),
        "test_score": ovr.score(X_test, y_test),
    }
예제 #27
0
def strat_1_v_rest_rf(X, y):
    """StratifiedShuffleSplit + OneVsRestClassifier(RandomForestClassifier).

    Trains one random forest per split, reports the mean accuracy over all
    splits, shows a confusion matrix for the last split, and pickles the
    last fitted model.  best mean accuracy = 0.7571428571428571

    BUG FIX: the mean-accuracy print, the confusion matrix and the model
    dump used to sit *inside* the split loop, re-printing a running mean
    and re-writing the pickle file on every fold; they now run once after
    cross-validation finishes.
    """
    sss = sratifier(X, y)
    sss_rf_scores = []

    # Forest hyper-parameters (hand-tuned).
    trees_to_grow = 155
    depth = None
    sample_split = 2
    feature_selection = 'auto'

    sss_rf = None
    y_hat = y_test = None
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        sss_rf = OneVsRestClassifier(RandomForestClassifier(
            n_jobs=-1, max_features=feature_selection, max_depth=depth,
            n_estimators=trees_to_grow, min_samples_split=sample_split,
            random_state=38))
        sss_rf.fit(X_train, y_train)
        y_hat = sss_rf.predict(X_test)
        sss_rf_scores.append(sss_rf.score(X_test, y_test))

    sss_rf_mean_accuracy = sum(sss_rf_scores) / len(sss_rf_scores)
    print(f'sss_rf mean accuracy= {sss_rf_mean_accuracy}')
    # Confusion matrix for the last fold only.
    confuse_mattrix(y_test, y_hat)
    filename = '../data/models/Strat_oneVRest_Random_Forest.sav'
    pickle.dump(sss_rf, open(filename, 'wb'))
def oneVsRest_LogReg_TfIdf(X_train, X_test, Y_train, Y_test, word_dict, tags_dict, data_files, test_doc_ids ):
  """Multi-label tag prediction: one-vs-rest logistic regression on tf-idf.

  BUG FIXES vs. the original:
    * the test split is vectorised/weighted with ``transform`` instead of
      ``fit_transform``, so no idf statistics leak from the test data;
    * prediction now uses the tf-idf matrix (``X_test_tf``) — the model is
      trained on tf-idf but used to predict on raw counts (``X_v_test``).
  """
  print('Processing : oneVsRest_LogReg_TfIdf')
  print('-'*50)

  Y_original = Y_test
  # Fixed vocabulary keeps the feature columns aligned for train and test.
  vectorizer = CountVectorizer(min_df=1, vocabulary=word_dict)
  X_v_train = vectorizer.fit_transform(X_train)
  X_v_test = vectorizer.transform(X_test)
  transformer = TfidfTransformer(smooth_idf=False)
  X_train_tf = transformer.fit_transform(X_v_train)
  X_test_tf = transformer.transform(X_v_test)

  uniq_tags_names = list(tags_dict.keys())
  mlb = preprocessing.MultiLabelBinarizer(classes=uniq_tags_names)
  Y_train = mlb.fit_transform(Y_train)
  Y_test = mlb.transform(Y_test)

  classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.01))
  classifier.fit(X_train_tf, Y_train)
  score = classifier.score(X_test_tf, Y_test)
  print('-' * 50)
  print('Score oneVsRest_LogReg_TfIdf : {}'.format(score))
  print('-' * 50)
  Y_pred = classifier.predict(X_test_tf)
  Y_back = mlb.inverse_transform(Y_pred)
  write_to_file(Y_original, Y_back, 'oneVsRest_LogREg', score, data_files, test_doc_ids)
예제 #29
0
def test_solve_svc_as_bcqp_with_active_set():
    """Multiclass dual SVC (BCQP, active-set optimizer) reaches >= 0.97 on iris."""
    features, targets = load_iris(return_X_y=True)
    features = MinMaxScaler().fit_transform(features)
    tr_X, te_X, tr_y, te_y = train_test_split(
        features, targets, train_size=0.75, random_state=1)
    model = OneVsRestClassifier(DualSVC(kernel=gaussian, optimizer=ActiveSet))
    model.fit(tr_X, tr_y)
    assert model.score(te_X, te_y) >= 0.97
예제 #30
0
def svm_one_vs_rest_class(x_tr, x_ts, y_tr, y_ts):
    """Fit an RBF-kernel SVC in a one-vs-rest wrapper and print
    training/prediction timings, accuracy and MSE."""
    print("\nSVM One vs Rest Classification")
    x_train, x_test = x_tr, x_ts
    y_train, y_test = y_tr, y_ts

    print("Fitting Data...")
    fit_started = time.time()
    model = OneVsRestClassifier(
        SVC(kernel='rbf', gamma='auto', C=1000000)).fit(x_train, y_train)
    fit_finished = time.time()
    print("SVM OneVsRest Train Time",
          str(round(fit_finished - fit_started, 2)) + " sec")

    print("Predicting Data...")
    pred_started = time.time()
    prediction = model.predict(x_test)
    pred_finished = time.time()
    print("SVM OneVsRest Test Time",
          str(round(pred_finished - pred_started, 2)) + " sec")

    accuracy = model.score(x_test, y_test)
    print("SVM OneVsRest Accuracy", str(accuracy * 100) + ' %')
    print("SVM OneVsRest MSE",
          metrics.mean_squared_error(np.asarray(y_test), prediction))
def main():
    """Train a one-vs-all LinearSVC on the Cleveland heart-disease data
    and save the resulting confusion matrix to CSV."""
    train = False
    print('Reading data....')
    column_names = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
        'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
        'ca', 'thal', 'num'
    ]
    data = pd.read_csv('processed.cleveland.data',
                       names=column_names,
                       header=None)
    print('Removing invalid values from data....')
    data = filterDataset(data)

    ## split data into train and test
    feature_cols = [c for c in data.columns if c != 'num']
    target = data['num'].copy()
    input_data = data[feature_cols].copy()
    dTrain, dTest, targetTrain, targetTest = train_test_split(
        input_data, target, test_size=0.20)

    # Standardise features using statistics from the training split only.
    scaler = preprocessing.StandardScaler().fit(dTrain)
    dTrain = scaler.transform(dTrain)
    # if train:
    ovr = OneVsRestClassifier(LinearSVC(random_state=0), n_jobs=-1)
    print('Training model...')
    ovr.fit(dTrain, targetTrain)
    #     joblib.dump(ovr, 'oneVsAll.pkl')
    #     print('Model saved!')
    # else:
    #     ovr = joblib.load('oneVsAll.pkl')
    dTest = scaler.transform(dTest)
    pred = ovr.predict(dTest)
    print(ovr.score(dTest, targetTest))
    cm = confusion_matrix(targetTest, pred, labels=[0, 1, 2, 3, 4])
    np.savetxt("confusion_matrix_ova.csv", cm, delimiter=",")
def predict(X_train, X_test, y_train, y_test, k, method_name):
    """Fit k-NN classifiers in one-vs-one and one-vs-rest wrappings,
    print their accuracies, and emit confusion-matrix / ROC plots."""
    print('Start knn predicting...')

    base_knn = neighbors.KNeighborsClassifier(n_neighbors=k,
                                              weights='distance',
                                              algorithm='auto',
                                              leaf_size=30,
                                              p=2,
                                              metric='minkowski',
                                              metric_params=None,
                                              n_jobs=-1)

    ovo_model = OneVsOneClassifier(base_knn)
    ovo_model.fit(X_train, y_train.values.ravel())
    print('Accuracy score of knn_ovo: ' +
          '%.3f' % ovo_model.score(X_test, y_test))

    ovr_model = OneVsRestClassifier(base_knn)
    ovr_model.fit(X_train, y_train.values.ravel())

    print('Accuracy score of knn_ovr: ' +
          '%.3f' % ovr_model.score(X_test, y_test))

    plot.plot_conf_matrix(X_test, y_test, ovr_model, method_name + '_ovr')
    plot.plot_conf_matrix(X_test, y_test, ovo_model, method_name + '_ovo')
    plot.plot_roc(X_train, X_test, y_train, y_test, ovr_model,
                  method_name + '_ovr')
예제 #33
0
def test_solve_linear_svc_with_proximal_bundle():
    """Hinge-loss primal SVC optimised with the proximal bundle method
    reaches at least 0.57 accuracy on the iris hold-out split."""
    data, labels = load_iris(return_X_y=True)
    data = MinMaxScaler().fit_transform(data)
    tr_X, te_X, tr_y, te_y = train_test_split(
        data, labels, train_size=0.75, random_state=1)
    clf = OneVsRestClassifier(PrimalSVC(loss=hinge, optimizer=ProximalBundle))
    clf.fit(tr_X, tr_y)
    assert clf.score(te_X, te_y) >= 0.57
def classification_with_SVM(Fglobal, y):
    """Evaluate an RBF one-vs-rest SVM with 10-fold stratified shuffle-split CV.

    Each fold is standardised and projected onto 60 principal components
    (via the project's Preprocessor) before fitting.  Prints the mean and
    std of the per-fold accuracy and pickles the preprocessing module plus
    the classifier fitted on the last fold.

    Cleanup vs. the original: the ``database`` pickle load, ``n_classes``,
    the ``acc`` array, the per-fold ``ypred`` predictions and the ``prfs``
    list were all computed but never used, so they were removed.
    """
    print(
        "transforming the model with Principle Component Analysis and evaluating the model with k-folds cross-validations..."
    )

    k_folds = 10
    sss = StratifiedShuffleSplit(n_splits=k_folds,
                                 test_size=0.3,
                                 random_state=1)

    pp = Preprocessor('standard', n_components=60)
    clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=10, gamma=0.01))
    scores = []
    for train, test in sss.split(Fglobal, y):
        Ftrain, Ftest = Fglobal[train], Fglobal[test]
        (Ftrain, Ftest) = pp.standardize(Ftrain, Ftest)
        (Ftrain, Ftest) = pp.project_on_pc(Ftrain, Ftest)
        clf.fit(Ftrain, y[train])
        scores.append(clf.score(Ftest, y[test]))

    print("\nAccuracy =  %0.2f (%0.2f)\n" % (np.mean(scores), np.std(scores)))

    joblib.dump(pp, open('Metadata/transformation_module.pkl', 'wb'))
    joblib.dump(clf, open('Metadata/classifier.pkl', 'wb'))
예제 #35
0
def classify_images_one_v_all(train_df, test_df):
    """One vs. All linear SVC with Grid Search for image classification.

    Loads pickled (transposed) DataFrames holding features plus a 'y'
    label column, fits a grid-searched LinearSVC per class, prints the
    test accuracy and plots a normalized confusion matrix.

    FIX: ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in
    1.0; label-based ``.loc`` is the drop-in replacement for these
    boolean-mask column selections.
    """
    param_grid = {'C': [0.1, 0.5, 1.0, 5., 10.]}
    clf = OneVsRestClassifier(GridSearchCV(LinearSVC(), param_grid=param_grid))
    with open(train_df, 'rb') as train:
        train_df = pickle.load(train)
        with open(test_df, 'rb') as test:
            test_df = pickle.load(test)
            # Every column except the label column 'y' is a feature.
            clf.fit(train_df.T.loc[:, train_df.T.columns != 'y'],
                    train_df.T['y'])

            test_features = test_df.T.loc[:, test_df.T.columns != 'y']
            print('score: ' + str(clf.score(test_features, test_df.T['y'])))

            # Compute confusion matrix
            cnf_matrix = confusion_matrix(test_df.T['y'],
                                          clf.predict(test_features))
            np.set_printoptions(precision=2)

            # Plot normalized confusion matrix
            plt.figure(figsize=(10, 10))
            plot_confusion_matrix(cnf_matrix,
                                  classes=[
                                      'ant', 'bee', 'butterfly', 'centipede',
                                      'dragonfly', 'ladybug', 'tick', 'beetle',
                                      'termite', 'worm'
                                  ],
                                  normalize=True,
                                  title='Normalized confusion matrix')

            plt.show()
예제 #36
0
def hardMarginSVM_subclass(features, labels, g_vals, d_vals, k):
    """Grid-search a polynomial-kernel SVM over (degree, gamma) with k-fold CV.

    Parameters
    ----------
    features, labels : length-k sequences of per-fold 2-D arrays.
    g_vals, d_vals : gamma / degree candidates (scalars or 1-d arrays).
    k : number of cross-validation folds.

    Returns a ``(len(d_vals), len(g_vals), k)`` array of validation accuracies.

    BUG FIX: the fold matrices used to be seeded with ``np.zeros`` of a
    full fold's shape and then ``vstack``-ed, silently injecting a fold's
    worth of all-zero fake samples (with all-zero labels) into every
    training and validation set.  Folds are now built from real data only.
    """
    d_count = len(np.atleast_1d(d_vals))
    g_count = len(np.atleast_1d(g_vals))
    accuracy_store = np.zeros((d_count, g_count, k))

    for g in range(d_count):
        print(g)
        for i in range(g_count):
            for j in range(k):
                # prep data: fold j is validation, the rest is training.
                features_t = np.vstack([features[m] for m in range(k) if m != j])
                labels_t = np.vstack([labels[m] for m in range(k) if m != j])
                features_v = features[j]
                labels_v = labels[j]

                # SVM
                classifier = OneVsRestClassifier(
                    SVC(kernel='poly', degree=d_vals[g], gamma=g_vals[i]))
                classifier.fit(features_t, labels_t)

                accuracy_store[g, i, j] = classifier.score(features_v, labels_v)
    return accuracy_store
예제 #37
0
def run(data_path):
    # Fetch MNIST, train a one-vs-rest LinearSVC on a small random subset,
    # and print precision / recall / F1 / accuracy on a held-out subset.
    # NOTE(review): `data_path` is printed but never used — the data comes
    # from fetch_mldata, which was removed from modern scikit-learn
    # (fetch_openml is its replacement); this is Python 2 legacy code.
    print "Reading the dataset:", data_path
    mnist = fetch_mldata('MNIST original')
    mnist.data, mnist.target = shuffle(mnist.data, mnist.target)

    # Trunk the data
    n_train = 600
    n_test = 400

    # Define training and testing sets
    # NOTE(review): train and test indices are sampled independently, so
    # they may overlap — confirm whether train/test leakage is acceptable.
    indices = arange(len(mnist.data))
    random.seed(0)
    train_idx = random.sample(indices, n_train)
    test_idx = random.sample(indices, n_test)
    X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]
    X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]

    # Apply a learning algorithm
    print "Applying a learning algorithm..."
    clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)

    # Make a prediction
    print "Making predictions..."
    y_pred = clf.predict(X_test)

    print y_pred

    # Evaluate the prediction
    # NOTE(review): with >2 classes, recent scikit-learn requires an
    # explicit `average=` argument for precision/recall/F1 — verify the
    # scikit-learn version this snippet was written against.
    print "Evaluating results..."
    print "Precision: \t", metrics.precision_score(y_test, y_pred)
    print "Recall: \t", metrics.recall_score(y_test, y_pred)
    print "F1 score: \t", metrics.f1_score(y_test, y_pred)
    print "Mean accuracy: \t", clf.score(X_test, y_test)
예제 #38
0
def Main():
	# Train a one-vs-rest LinearSVC on parsed data and report accuracy.
	# NOTE(review): the model is fitted on the *test* split and scored on
	# the *training* split — this looks like swapped arguments; confirm
	# the intent before changing it.
	directory = 'data'
	X_train, y_train = parseData('training', directory)
	X_test, y_test = parseData('testing', directory)
	# num_labels and lamd are assigned but not used in this implementation.
	num_labels = 10
	lamd = 0.1

	a = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_test,y_test)#.predict(X_train)
	#print(a)
	print(a.score(X_train, y_train))
def svm_Model(X, y, Xt, yt):
	# Fit a linear SVM (one-vs-rest, C=1) on (X, y) and print the mean
	# accuracy on the held-out set (Xt, yt).  Python 2 syntax.
	model = OneVsRestClassifier(svm.LinearSVC(C=1)).fit(X,y)

	# Predictions are computed but only used by the commented-out debug prints.
	predicted = model.predict(Xt)

	acc = model.score(Xt,yt)

	#print 'Actual output = \n',yt
	#print 'Predicted output = \n',predicted

	print 'Accuracy = ', acc* 100 ,'%'
class SVMSentiment:
    """Sentiment classifier backed by a one-vs-rest RBF-kernel SVM.

    The fitted model is cached on disk as ``SVMScores.pkl`` and reloaded
    on subsequent runs instead of being retrained.
    """

    def __init__(self):
       # `l` is the logging module (aliased at import time elsewhere in the file).
       l.getLogger("SVMSentimentAnalysis")
       l.basicConfig(level=l.ERROR)
       l.debug('Initializing the SVM Model')
       self.max_length = 500  # not referenced within this class
       self.batch_size=50  # rows taken for the validation slice
       self.model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=3,C = 0.5,tol=0.0001,cache_size=5000))

    def configureSVMModel(self,TrainX,TrainY,validX,validY):
       """Fit the SVM, or load a previously pickled model if one exists.

       NOTE(review): validX/validY are accepted but never used.  The pickle
       is looked for in the current working directory but saved/loaded under
       ./SpeechTextModels/ — confirm both paths are intentional.
       """
       l.debug('Configuring the SVM Model')
       currPath = os.getcwd()
       currFiles =  os.listdir(currPath)
       if(currFiles.count('SVMScores.pkl')==0):
          self.model.fit(TrainX, TrainY)
          # Saving model scores
          joblib.dump(self.model,currPath+'/SpeechTextModels/SVMScores.pkl')
       else:
          l.debug('Loading already existing Model')
          self.model = joblib.load(currPath+'/SpeechTextModels//SVMScores.pkl')
       

    def evaluateSVMModel(self,TestX,TestY):
       """Log the mean accuracy and every (prediction, truth) pair."""
       l.debug("Model Score:::%s",self.model.score(TestX, TestY))
       predicted_data=[]
       for i in range(len(TestX)):
          predicted_data.append(list([self.model.predict (TestX[i].reshape(1,-1)) ,TestY[i]]) )
       l.debug("Current Model Prediction::: %s",str(predicted_data))

    def predictSentiment(self,dataX):
       """Predict sentiment for the rows of dataX.

       NOTE(review): only the prediction for the *last* row is returned,
       and an empty dataX would leave `predicted_data` undefined — likely
       a latent bug; confirm callers always pass exactly one sample.
       """
       for i in range(len(dataX)):
         predicted_data = self.model.predict(dataX[i].reshape(1,-1))
       return predicted_data

    def getTrainTestData(self):
       """Load the pickled (trainX, trainY) and (testX, testY) tuples from disk."""
       l.debug('Loading Training and Test data')
       (trainX,trainY) = cPickle.load(open('trainingdata.pkl','rb'))
       (testX,testY)  = cPickle.load(open('testingdata.pkl','rb'))
       return ((trainX,trainY),(testX,testY))

    def getValidationData(self,dataX,dataY):
       """Return the first batch_size rows/labels as a validation slice."""
       return dataX[0:self.batch_size,:],dataY[0:self.batch_size,:]
예제 #41
0
def run():
    """Load train/test matrices exported from Octave, fit a one-vs-rest
    LinearSVC, and print the test accuracy plus an N x N confusion matrix."""
    N = 5  # number of classes
    X = sio.loadmat('octave_X.mat')['Norm_X']
    Y = sio.loadmat('octave_Y.mat')['Y']
    XT = sio.loadmat('octave_XT.mat')['Norm_XT']
    YT = sio.loadmat('octave_YT.mat')['Y_Test']

    model = OneVsRestClassifier(LinearSVC(random_state=0))
    model.fit(X, Y)
    prediction = model.predict(XT)

    print("Accuracy : %f" % model.score(XT, YT))

    # Rows = true class, columns = predicted class.
    confusion = np.zeros((N, N))
    for sample_idx, predicted_class in enumerate(prediction):
        true_class = int(YT[sample_idx][0])
        confusion[true_class][int(predicted_class)] += 1

    print("Confusion Matrix")
    for row in confusion:
        print(row)
예제 #42
0
파일: svm.py 프로젝트: souvenir13/libact
class SVM(ContinuousModel):

    """C-Support Vector Machine Classifier

    When decision_function_shape == 'ovr', we use OneVsRestClassifier(SVC) from
    sklearn.multiclass instead of the output from SVC directory since it is not
    exactly the implementation of One Vs Rest.

    BUG FIX: ``self.decision_function_shape`` used to be assigned only in
    the 'ovr' branch, so ``predict_real`` raised AttributeError for any
    other shape; the attribute is now recorded unconditionally.

    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    """

    def __init__(self, *args, **kwargs):
        self.model = sklearn.svm.SVC(*args, **kwargs)
        # Record the shape unconditionally so predict_real can always consult it.
        self.decision_function_shape = self.model.decision_function_shape
        if self.decision_function_shape == 'ovr':
            # sklearn's ovr isn't real ovr
            self.model = OneVsRestClassifier(self.model)

    def train(self, dataset, *args, **kwargs):
        """Fit the underlying model on ``dataset.format_sklearn()``."""
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        """Predict class labels for the given features."""
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        """Return the mean accuracy on the given testing dataset."""
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)

    def predict_real(self, feature, *args, **kwargs):
        """Return decision values; for binary problems, expand to two columns."""
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        else:
            if self.decision_function_shape != 'ovr':
                LOGGER.warn("SVM model support only 'ovr' for multiclass"
                            "predict_real.")
            return dvalue
예제 #43
0
def multiclass_SVC(X, y):
    """Compare one-vs-rest and one-vs-one LinearSVC on a 65/35 random split.

    Returns ``(one_vs_rest_score, one_vs_one_score)``.

    FIXES: ``sklearn.cross_validation`` was deprecated in 0.18 and removed
    in 0.20; the split now comes from ``sklearn.model_selection``.  The
    comment labelling OneVsOneClassifier as "one-vs-all" was corrected.
    """
    from sklearn.svm import LinearSVC

    from sklearn.model_selection import train_test_split

    # first move: split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

    # one-vs-rest implementation
    from sklearn.multiclass import OneVsRestClassifier

    ovr = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)

    # one-vs-one implementation
    from sklearn.multiclass import OneVsOneClassifier

    ovo = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)

    one_vs_rest = ovr.score(X_test, y_test)
    one_vs_one = ovo.score(X_test, y_test)

    return one_vs_rest, one_vs_one
예제 #44
0
파일: ova.py 프로젝트: MikulasZelinka/mccd
def ova(train_values, train_classes, test_values, test_classes, desc, deg=2, perc=100):
    """One-vs-all polynomial SVC after univariate feature selection.

    Parameters
    ----------
    train_values, train_classes : training samples and labels.
    test_values, test_classes : evaluation samples and labels.
    desc : title passed to the confusion-matrix plot.
    deg : polynomial kernel degree.
    perc : percentile of features kept by SelectPercentile.

    Returns the accuracy on the test samples.

    GENERALIZATION: the train/test boundary used to be hard-coded at row
    144; it is now derived from ``len(train_values)``, so the function
    works for any training-set size (behaviour is unchanged when the
    training set has exactly 144 rows).
    """
    n_train = len(train_values)

    all_values = numpy.concatenate((train_values, test_values))
    all_classes = numpy.concatenate((train_classes, test_classes))

    # NOTE(review): the selector is fitted on train+test together, which
    # leaks test-set information into feature selection.
    x = SelectPercentile(f_classif, percentile=perc).fit_transform(all_values, all_classes)

    svm = SVC(kernel='poly', degree=deg, random_state=0)

    ova = OneVsRestClassifier(svm)
    ova.fit(x[:n_train, :], train_classes)
    score = ova.score(x[n_train:, :], test_classes)

    cm = confusion_matrix(test_classes, ova.predict(x[n_train:, :]))
    print('Confusion matrix')
    print(cm)

    # Normalise each row by its total so rows sum to 1.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)

    plot_confusion_matrix(desc, cm_normalized, title='Normalized confusion matrix')
    return score
예제 #45
0
#                  class_weight=None, # all classes are treated equally 
#                  verbose=False, # print the logs 
#                  max_iter=-1, # no limit, let it run
#                  decision_function_shape=None, # will use one vs rest explicitly 
#                  random_state=None)

svc_model = OneVsRestClassifier (classifier, n_jobs=1)
svc_model.fit(X_train,y_train)

# knc_model =KNeighborsClassifier(n_neighbors=5)
# knc_model.fit(X_train,y_train)
# predicted = svc_model.predict(X_test)

scores = cross_val_score(knc_model, X_train, y_train, cv=5)

print 'RF: Accuracy with a single train/test split', svc_model.score(y_test, predicted)
predicted = svc_model.predict(X_test)
print 'RF: Accuracy with a single train/test split', accuracy_score(y_test, predicted)

scores = cross_val_score(svc_model, X_train, y_train, cv=5)

print 'RF: the mean of Accuracy with a cross value train/test split is: ', scores.mean()

print 'RF:The std of Accuracy with a cross value train/test split is', scores.std()



############################ Predict the test ###################################

sub = pd.read_csv("../input/sample_submission.csv")
sub['id'] = test_data.sort_values(by='id' , ascending=True)
예제 #46
0
class speechLSTM:
    # Initializing the LSTM Model
    def __init__(self):
       self.prevData = 100
       self.batchsize=200
       self.model = OneVsRestClassifier(svm.SVC(kernel='poly',gamma=1,C = 1,tol=0.0001,cache_size=5000)  )     #self.model = OneVsRestClassifier(LinearSVC(random_state=0))


    def load_data_file(self):
        outputdata = []
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/DC/*.wav"):
            frate, inputdata = sc.read(f)
            pitch=lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            else:
                emotion = filename[0]
                ##emotion =  float(int(hashlib.md5(emotion).hexdigest(), 16))
            outputdata.append(list([loudness,pitch, emotion]))
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/JE/*.wav"):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            else:
                emotion = filename[0]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            outputdata.append(list([loudness, pitch, emotion]))
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/JK/*.wav"):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            else:
                emotion = filename[0]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            outputdata.append(list([loudness, pitch, emotion]))
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/KL/*.wav"):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            else:
                emotion = filename[0]
                ##emotion = float(int(hashlib.md5(emotion).hexdigest(), 16))
            outputdata.append(list([loudness, pitch, emotion]))
        return outputdata

    def get_train_test_data(self,data,percent_split):
        noOfSamples = len(data)*(1-percent_split)
        print("No of Samples", noOfSamples)
        test =  data.iloc[0:int(noOfSamples), 2:]
        test1=[]
        for i in range(len(test)):
            test1 = np.append(test1,test.iloc[i].values[0])

        return data.iloc[int(noOfSamples):, 0:2], data.iloc[int(noOfSamples):, 2:],data.iloc[0:int(noOfSamples), 0:2],np.array(test1)

    def trainNNet(self,data_,label_):
        #data = data_/data_.max(axis=0)
        #label = label_/label_.max(axis=0)
        data=data_
        label=label_
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, label.astype(str), test_size = 0.045, random_state = 0)
        self.model.fit(X_train,y_train)
        print("score",self.model.score(X_test,y_test))
        #print (cross_validation.cross_val_score(self.model, data, label.astype(str), cv =4))

    def predict(self,ftest_,ltest_):
        #ltest=ltest_/ltest_.max(axis=0)
        #ftest=ftest_/ftest_.max(axis=0)
        ftest=ftest_
        ltest=ltest_
        predicted_data = []

        count=0
        for i in range(len(ftest)):
            predicted_data.append(self.model.predict(ftest.iloc[i].values.reshape(1,-1)))
        print predicted_data
        print ltest
# Binary/multiclass logistic-regression demo (Python 2).
# NOTE(review): `X`, `y` and `iris` must be defined earlier in this example;
# newer scikit-learn also requires 2-D input, so iris.data[-1,:] would need
# .reshape(1, -1) - this snippet targets an old API.
logistic=LogisticRegression()
logistic.fit(X,y)
print 'Predicted class %s, real class %s' % (logistic.predict(iris.data[-1,:]), iris.target[-1])
print 'Probabilities for each class from 0 to 2: %s' % (logistic.predict_proba(iris.data[-1,:]))

#*******************Logistic Regression on multiClass*****************************
# Digits: first 1700 samples train, the remainder test.
from sklearn.datasets import load_digits
digits=load_digits()
X,y=digits.data[:1700,:],digits.target[:1700]
tX,ty=digits.data[1700:,:],digits.target[1700:]

from sklearn.multiclass import OneVsRestClassifier 
from sklearn.multiclass import OneVsOneClassifier
OVR=OneVsRestClassifier(LogisticRegression()).fit(X,y) 
OVO=OneVsOneClassifier(LogisticRegression()).fit(X, y) 
# NOTE(review): OVO is trained but its accuracy is never reported.
print 'One vs rest accuracy: %.3f'% OVR.score(tX, ty)














예제 #48
0
def doAnalysis(city, state, category, filterConditions, k=10):
    """Rank the k most predictive business features for Yelp star-rating bands.

    Loads 'projectdata\\<city><category>income.json', flattens each business
    record into a flat {feature: value} dict (recursing one and two levels
    into nested dicts, and expanding list items into boolean features), and
    buckets the "stars" field into three successively finer labelings:
    PB/NB (<=2.5 vs >2.5), then EG/G within NB, then F/AAvg within G.
    For each labeling it fits a OneVsRest random forest and an RFE-selected
    random forest on a stratified split, printing accuracies and feature
    importances, and dumps the top-k OvR importances to output.json.

    NOTE(review): the `state` parameter and the local `filtercondition` flag
    are never used. Records matching filterConditions are SKIPPED (the
    `continue` on a truthy evalFilter), not selected — confirm intent.

    Returns (rfeimps, nbrfeimps, frfeimps): lists of (feature name,
    normalized importance) pairs from the three RFE-fitted forests.
    """
    with open('projectdata\\'+city + category + 'income.json','r') as r_file:
            data = json.load(r_file)
            keyset =set()
            unkeyset = set()
            unkvalueset  =set()
            newdata = []
            nbnewdata = []
            flabels = []
            fnewdata = []
            labels = []
            nblabels = []
            maxfeats = 0
            filtercondition = True
            # Flatten each record into newdata; any value of an unhandled
            # type is recorded in unkeyset/unkvalueset for debugging.
            for dno,datum in enumerate(data):
               #  #filter
               # if 'Chinese' not in datum['categories'] and 'Pizza' not in datum['categories'] and 'Italian' not in datum['categories']:
               #     continue
               # if 'Wi-Fi' in datum['attributes'] and datum['attributes']['Wi-Fi'] == 'no':
               #     continue
               # #if 'Ambience' in datum['attributes'] and 'casual' in datum['Ambience'] and datum['Ambience']['casual'] == False:
               # #    continue
               # if 'Price Range' in datum['attributes'] and datum['attributes']['Price Range'] != 2:
               #     continue
               #if datum["stars"]>=3.0:
               if evalFilter(datum, filterConditions):
                   continue
               newdatum = {}
               label = {}
               featcnt = 0
               for key in datum:
                   if key == "stars":
                       #labels.append(datum[key])
                       #label[key] = str(datum[key])
                       #labels.append(str(datum[key]))

                       ##jj
                       # if datum[key] <= 2.5:
                       # #if datum[key]==3.0: #and datum[key] <= 3.5:
                       #     labels.append("B")
                       #     #nblabels.append("na")
                       # # elif datum[key] >= 4.5:
                       # #     labels.append("G")
                       # #else:
                       # #elif datum[key] == 3.5:

                       # Coarse label first (NB/PB), then refine within NB
                       # (EG/G), then within G (F/AAvg). The three label
                       # lists therefore cover nested subsets of the data.
                       if datum[key] > 2.5:
                           labels.append("NB")
                           if datum[key] >= 4.5:
                               nblabels.append("EG")
                            # elif datum[key] >= 3.5:
                            #     nblabels.append("G")
                           else:
                               nblabels.append("G")
                               if datum[key]==3.0:
                                   flabels.append("F")
                               else:
                                   flabels.append("AAvg")
                       # else:
                       #     labels.append("G")
                       else:
                           labels.append("PB")
                   elif key!="name" and key!="business_id" and key!="full_address" and key!="latitude" and key!="longitude" and key!="hours" and key[0]!=" " and key[0]!="$" and key!='review_count' and key!='categories':
                       if isinstance(datum[key], int) or isinstance(datum[key], long) or isinstance(datum[key], float) \
                               or isinstance(datum[key], str) or isinstance(datum[key], bool):
                           newdatum[key] = datum[key]
                           keyset.add(key)
                           featcnt+=1
                       elif isinstance(datum[key],dict):
                           for subkey in datum[key]:
                               if isinstance(datum[key][subkey], int) or isinstance(datum[key][subkey], long) or \
                                       isinstance(datum[key][subkey], float) or isinstance(datum[key][subkey], str) or \
                                       isinstance(datum[key][subkey], bool):
                                   newdatum[key+" "+subkey] = datum[key][subkey]
                                   keyset.add(subkey)
                                   featcnt+=1
                               elif isinstance(datum[key][subkey], unicode):
                                   ascval = unicodedata.normalize('NFKD', datum[key][subkey]).encode('ascii','ignore')
                                   newdatum[key+" "+subkey] = ascval
                                   keyset.add(subkey)
                                   featcnt+=1
                               elif isinstance(datum[key][subkey],dict):
                                   for subsubkey in datum[key][subkey]:
                                       if isinstance(datum[key][subkey][subsubkey], int) or \
                                               isinstance(datum[key][subkey][subsubkey], long) or \
                                               isinstance(datum[key][subkey][subsubkey], float) or \
                                               isinstance(datum[key][subkey][subsubkey], str) or \
                                               isinstance(datum[key][subkey][subsubkey], bool):
                                           newdatum[key+" "+subkey+" "+subsubkey] = datum[key][subkey][subsubkey]
                                           keyset.add(subsubkey)
                                           featcnt+=1
                                       elif isinstance(datum[key][subkey][subsubkey], unicode):
                                           ascval = unicodedata.normalize('NFKD', datum[key][subkey][subsubkey]).encode('ascii','ignore')
                                           newdatum[key+" "+subkey+" "+subsubkey] = ascval
                                           keyset.add(subsubkey)
                                           featcnt+=1
                                       else:
                                           unkeyset.add(key+ " "+subsubkey)
                                           unkvalueset.add(type(datum[key][subkey][subsubkey]))

                               else:
                                   unkeyset.add(key+ " "+subkey)
                                   unkvalueset.add(type(datum[key][subkey]))
                       elif isinstance(datum[key],list):
                           for itnum, item in enumerate(datum[key]):
                               if isinstance(item, int) or isinstance(item, long) or isinstance(item, float) or \
                                       isinstance(item, str) or isinstance(item, bool):
                                   newdatum[key +" "+str(item)] = True
                                   keyset.add(key)
                                   featcnt+=1
                               elif isinstance(item, unicode):
                                   ascval = unicodedata.normalize('NFKD', item).encode('ascii','ignore')
                                   newdatum[key +" "+ascval] = True
                                   keyset.add(key)
                                   featcnt+=1
                               else:
                                   unkeyset.add(key)
                                   unkvalueset.add(type(item))
                       else:
                            if isinstance(datum[key], unicode):
                                ascval = unicodedata.normalize('NFKD', datum[key]).encode('ascii','ignore')
                                newdatum[key] = ascval
                                keyset.add(key)
                                featcnt+=1
                            else:
                                unkeyset.add(key)
                                unkvalueset.add(type(datum[key]))
               newdata.append(newdatum)
               if datum["stars"] > 2.5:
                    nbnewdata.append(newdatum)
                    if datum["stars"] < 4.5:
                        fnewdata.append(newdatum)
               if featcnt > maxfeats:
                   maxfeats = featcnt
               #labels.append(label)
            #print keyset
            print unkeyset
            print unkvalueset
            # for unk in unkeyset:
            #     if isinstance(unk, unicode):
            #         print "yeah"
            #     else:
            #         print "nah"
            # One-hot/encode the flat dicts into dense matrices; a separate
            # DictVectorizer per subset keeps each feature space aligned
            # with its own label list.
            dv = DictVectorizer(sparse=False)
            vectdata = dv.fit_transform(newdata)
            #print vectdata[0]


            nbdv = DictVectorizer(sparse=False)
            nbvectdata = nbdv.fit_transform(nbnewdata)
            #print nbvectdata[0]

            fdv = DictVectorizer(sparse=False)
            fvectdata = fdv.fit_transform(fnewdata)
            #print nbvectdata[0]
            print len(flabels), len(fnewdata), len(fvectdata)
            # split = (int)(round(len(vectdata)*0.7))
            # print split
            # traindata = vectdata[0:split]
            # trainlabels = labels[0:split]
            # testdata = vectdata[split:]
            # testlabels = labels[split:]

            #traindata, testdata, trainlabels, testlabels = train_test_split(vectdata, labels, test_size=0.33, random_state=42, stratify=)
            #nbtraindata, nbtestdata, nbtrainlabels, nbtestlabels = train_test_split(vectdata, nblabels, test_size=0.33, random_state=42)
            traindata = []
            trainlabels =[]
            testdata = []
            testlabels = []

            # NOTE(review): this is the pre-0.18 sklearn.cross_validation
            # StratifiedShuffleSplit API (labels first, n_iter=). test_size=0.7
            # holds out 70% for TESTING, training on only 30% — confirm intent.
            sssidxs = StratifiedShuffleSplit(labels, n_iter=1, test_size=0.7, random_state=0)
            #print len(sssidxs)
            for train_index, test_index in sssidxs:
                #print train_index#, test_index
                #print("TRAIN:", train_index, "TEST:", test_index)
                traindata, testdata = vectdata[train_index], vectdata[test_index]
                for tr_idx in train_index:
                    trainlabels.append(labels[tr_idx])
                for ts_idx in test_index:
                    testlabels.append(labels[ts_idx])

            #nbvectdata = []
            #nbpoplabels= []
            nbtraindata = []
            nbtrainlabels =[]
            nbtestdata = []
            nbtestlabels = []

            # for nbd, nbdatum in enumerate(vectdata):
            #     if nblabels[nbd] != "na":
            #         nbpoplabels.append(nblabels[nbd])
            #         nbvectdata.append(nbdatum)

            nbsssidxs = StratifiedShuffleSplit(nblabels, n_iter=1, test_size=0.7, random_state=0)
            for train_index, test_index in nbsssidxs:
                #print("TRAIN:", train_index, "TEST:", test_index)
                nbtraindata, nbtestdata = nbvectdata[train_index], nbvectdata[test_index]
                #nbtrainlabels, nbtestlabels = nblabels[train_index], nblabels[test_index]
                for tr_idx in train_index:
                    #nbtraindata.append(nbvectdata[tr_idx])
                    nbtrainlabels.append(nblabels[tr_idx])
                for ts_idx in test_index:
                    #nbtestdata.append(nbvectdata[ts_idx])
                    nbtestlabels.append(nblabels[ts_idx])


            ftraindata = []
            ftrainlabels =[]
            ftestdata = []
            ftestlabels = []

            # for nbd, nbdatum in enumerate(vectdata):
            #     if nblabels[nbd] != "na":
            #         nbpoplabels.append(nblabels[nbd])
            #         nbvectdata.append(nbdatum)

            fsssidxs = StratifiedShuffleSplit(flabels, n_iter=1, test_size=0.7, random_state=0)
            for train_index, test_index in fsssidxs:
                #print("TRAIN:", train_index, "TEST:", test_index)
                ftraindata, ftestdata = fvectdata[train_index], fvectdata[test_index]
                #nbtrainlabels, nbtestlabels = nblabels[train_index], nblabels[test_index]
                for tr_idx in train_index:
                    #nbtraindata.append(nbvectdata[tr_idx])
                    ftrainlabels.append(flabels[tr_idx])
                for ts_idx in test_index:
                    #nbtestdata.append(nbvectdata[ts_idx])
                    ftestlabels.append(flabels[ts_idx])





            # vectpca = PCA(n_components=4)
            # vectpca.fit(vectdata)

            # pcacorr = []
            # for var in vectdata:
            #     for comp in vectpca.components_:
            #         corr = pearsonr(var, comp)
            #         #print corr
            #         pcacorr.append(corr)


             # vectlabels = dv.fit_transform(labels)

            # #jj
            # print "LDA"
            # vectlda = LinearDiscriminantAnalysis(n_components=10)
            # ldacomps = vectlda.fit(traindata, trainlabels).transform(traindata)
            # print "==========================="
            # print maxfeats
            # #end jj



            #
            # for jdx, coef in enumerate(vectlda.coef_):
            #     print vectlda.classes_[jdx]
            #     for idx,(k,v) in enumerate(sorted(dv.vocabulary_.items(),key=itemgetter(1))):
            #         print k, coef[idx]
            #     print "==============="

            # #jj
            # ldaacc = vectlda.score(testdata,testlabels)
            # print "ldaacc ", ldaacc
            # #end jj

            # ldacorr = []
            # for var in vectdata:
            #     for comp in ldacomps:
            #         corr = pearsonr(var, comp)
            #         #print corr
            #         ldacorr.append(corr)

             # vectlabels = dv.fit_transform(labels)

            # #jj
            # print "QDA"
            # vectqda = QuadraticDiscriminantAnalysis()
            # qdacomps = vectqda.fit(traindata, trainlabels)#.transform(vectdata)
            # print "==========================="
            # print maxfeats
            # qdaacc = vectqda.score(testdata, testlabels)
            # print "qdaacc ", qdaacc
            # #end jj

            # for jdx, coef in enumerate(vectqda.coef_):
            #     print vectqda.classes_[jdx]
            #     for idx,(k,v) in enumerate(sorted(dv.vocabulary_.items(),key=itemgetter(1))):
            #         print k, coef[idx]
            #     print "==============="
            # ldacorr = []
            # for var in vectdata:
            #     for comp in ldacomps:
            #         corr = pearsonr(var, comp)
            #         #print corr
            #         ldacorr.append(corr)

            # rcf = RandomForestClassifier(n_estimators=200, warm_start=True,oob_score=True)
            # rcfcomps = rcf.fit(traindata,trainlabels).transform(traindata)
            #rcfacc = rcf
            #print "rcfacc ", rcfacc

            # #jj
            # adb = AdaBoostClassifier(n_estimators=200)
            # adcomps = adb.fit(traindata, trainlabels)#.transform(traindata)
            # adbacc = adb.score(testdata, testlabels)
            # print "adbacc ", adbacc
            # print adb.feature_importances_
            #
            # gdb = GradientBoostingClassifier(n_estimators=200)
            # gdcomps = gdb.fit(traindata,trainlabels)#.transform(traindata)
            # gdbacc = gdb.score(testdata,testlabels)
            # print "gdbacc ", gdbacc
            # print gdb.feature_importances_
            # #end jj


            # One-vs-rest random forest on the coarse PB/NB labels.
            ovs = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))

            ovcomps = ovs.fit(traindata, trainlabels)
            ovsacc = ovs.score(testdata, testlabels)
            print "ovsacc ", ovsacc
            ovsestimators = ovs.estimators_
            ovsfeatimps = []
            # NOTE(review): this loop overwrites ovsfeatimps each iteration,
            # so only the LAST per-class estimator's importances survive —
            # confirm that is intended (for binary labels there are few).
            for i,ovsest in enumerate(ovsestimators):
                #print ovs.classes_[i], ovsest.feature_importances_
                ovsfeatimps = ovsest.feature_importances_
            #print len(dv.feature_names_)
            #print dv.feature_names_
            #print dv.vocabulary_[' $100000 to $124999']


            ovsfimps =  sorted(zip(dv.feature_names_,ovsfeatimps),key=itemgetter(1),reverse=True)[:k]
            #print sum(ovsfimps[:][2])
            print ovsfimps


            # rfecv = RFECV(estimator=rfc, step=100, cv=StratifiedKFold(trainlabels, 2),
            #   scoring='accuracy')
            # rfecv.fit(traindata, trainlabels)
            #
            # print("Optimal number of features : %d" % rfecv.n_features_)
            #
            # # Plot number of features VS. cross-validation scores
            # plt.figure()
            # plt.xlabel("Number of features selected")
            # plt.ylabel("Cross validation score (nb of correct classifications)")
            # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
            # plt.show()
            # print rfecv.ranking_
            # Recursive feature elimination down to k features, then score
            # the final estimator on the test rows restricted to those
            # surviving features (rfe.support_ mask).
            rfc = RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True)
            rfe = RFE(estimator=rfc, n_features_to_select=k,step = 0.1)
            rfe.fit(traindata, trainlabels)
            print rfe.n_features_
            print len(rfe.ranking_)
            rfetestdata = [[each_list[i] for i, supp in enumerate(rfe.support_) if supp == True ] for each_list in testdata]
            print "rfeacc ", rfe.estimator_.score(rfetestdata, testlabels)
            # plt.figure()
            # plt.xlabel("Number of features selected")
            # plt.ylabel("Cross validation score (nb of correct classifications)")
            # plt.plot(range(1, len(rfe.scores_) + 1), rfe.scores_)
            # plt.show()
            rfefeatimps = []
            rfefeatnames = [dv.feature_names_[i] for i, supp in enumerate(rfe.support_) if supp == True ]
            impsums = sum(rfe.estimator_.feature_importances_)
            rfeimps =  sorted(zip(rfefeatnames,rfe.estimator_.feature_importances_/impsums),key=itemgetter(1),reverse=True)
            print rfeimps
            print sum([pair[1] for pair in rfeimps])




            # #jj
            # ovs2 = OneVsRestClassifier(AdaBoostClassifier(n_estimators=200))
            # ovcomps2 = ovs2.fit(traindata, trainlabels)
            # ovsacc2 = ovs2.score(testdata, testlabels)
            # print "ovsacc2 ", ovsacc2
            #
            # ovo = OneVsOneClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
            # ovocomps = ovo.fit(traindata, trainlabels)
            # ovoacc = ovo.score(testdata, testlabels)
            # print "ovoacc ", ovoacc
            #
            #
            # ovo2 = OneVsOneClassifier(AdaBoostClassifier(n_estimators=200))
            # ovocomps2 = ovo2.fit(traindata, trainlabels)
            # ovoacc2 = ovo2.score(testdata, testlabels)
            # print "ovoacc2 ", ovoacc2
            # #print ovs.coef_
            #
            # ovosvm  = OneVsOneClassifier(NuSVC(nu=0.1,kernel='poly',random_state=0))
            # ovosvm.fit(traindata, trainlabels)
            # ovosvmacc = ovosvm.score(testdata, testlabels)
            # print "ovosvmacc ", ovosvmacc
            # #end jj


            # clf = NuSVR(kernel = 'rbf',C=1.0, nu=0.5)
            # clf.fit(traindata, trainlabels)
            # nusvacc = clf.score(testdata, testlabels)
            # print "nusvacc ", nusvacc


            # Same OvR-forest + RFE pipeline on the EG/G labels (NB subset).
            print "===========================NB================================="


            # #jj
            # nbadb = AdaBoostClassifier(n_estimators=200)
            # nbadcomps = nbadb.fit(nbtraindata, nbtrainlabels)#.transform(traindata)
            # nbadbacc = nbadb.score(nbtestdata, nbtestlabels)
            # print "nbadbacc ", nbadbacc
            # print nbadb.feature_importances_
            #
            # gdb = GradientBoostingClassifier(n_estimators=200)
            # gdcomps = gdb.fit(nbtraindata,nbtrainlabels)#.transform(traindata)
            # gdbacc = gdb.score(nbtestdata,nbtestlabels)
            # print "gdbacc ", gdbacc
            # print gdb.feature_importances_
            # #end jj

            nbovs = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
            nbovcomps = nbovs.fit(nbtraindata, nbtrainlabels)
            nbovsacc = nbovs.score(nbtestdata, nbtestlabels)
            print "nbovsacc ", nbovsacc
            nbovsestimators = nbovs.estimators_
            nbovsfeatimps = []
            for i,nbovsest in enumerate(nbovsestimators):
               # print nbovs.classes_[i], nbovsest.feature_importances_
                nbovsfeatimps = nbovsest.feature_importances_
            nbovsfimps =  sorted(zip(nbdv.feature_names_,nbovsfeatimps),key=itemgetter(1),reverse=True)[:k]
            print nbovsfimps

            nbrfc = RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True)
            nbrfe = RFE(estimator=nbrfc, n_features_to_select=k,step = 0.1)
            nbrfe.fit(nbtraindata, nbtrainlabels)
            print nbrfe.n_features_
            print len(nbrfe.ranking_)
            nbrfetestdata = [[each_list[i] for i, supp in enumerate(nbrfe.support_) if supp == True ] for each_list in nbtestdata]
            print "nbrfeacc ", nbrfe.estimator_.score(nbrfetestdata, nbtestlabels)

            # NOTE(review): nbrfe was fit on nbdv-vectorized data, but the
            # names below index dv.feature_names_ — likely should be
            # nbdv.feature_names_; the two feature spaces differ. Confirm.
            nbrfefeatnames = [dv.feature_names_[i] for i, supp in enumerate(nbrfe.support_) if supp == True ]
            nbimpsums = sum(nbrfe.estimator_.feature_importances_)
            nbrfeimps =  sorted(zip(nbrfefeatnames,nbrfe.estimator_.feature_importances_/nbimpsums),key=itemgetter(1),reverse=True)
            print nbrfeimps
            print sum([pair[1] for pair in nbrfeimps])

            # #jj
            # nbovs2 = OneVsRestClassifier(AdaBoostClassifier(n_estimators=200))
            # nbovcomps2 = nbovs2.fit(nbtraindata, nbtrainlabels)
            # nbovsacc2 = nbovs2.score(nbtestdata, nbtestlabels)
            # print "nbovsacc2 ", nbovsacc2
            #
            # nbovo = OneVsOneClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
            # nbovocomps = nbovo.fit(nbtraindata, nbtrainlabels)
            # nbovoacc = nbovo.score(nbtestdata, nbtestlabels)
            # print "nbovoacc ", nbovoacc
            #
            # nbovo2 = OneVsOneClassifier(AdaBoostClassifier(n_estimators=200))
            # nbovocomps2 = nbovo2.fit(nbtraindata, nbtrainlabels)
            # nbovoacc2 = nbovo2.score(nbtestdata, nbtestlabels)
            # print "nbovoacc2 ", nbovoacc2
            # #print ovs.coef_
            #
            # nbovosvm  = OneVsOneClassifier(LinearSVC(random_state=0))
            # nbovosvm.fit(nbtraindata, nbtrainlabels)
            # nbovosvmacc = nbovosvm.score(nbtestdata, nbtestlabels)
            # print "nbovosvmacc ", nbovosvmacc
            # #end jj

            # Same pipeline once more on the F/AAvg labels (G subset).
            print "===========================f================================="

            # #jj
            # fadb = AdaBoostClassifier(n_estimators=200)
            # fadcomps = fadb.fit(ftraindata, ftrainlabels)#.transform(traindata)
            # fadbacc = fadb.score(ftestdata, ftestlabels)
            # print "fadbacc ", fadbacc
            # print fadb.feature_importances_
            #
            # gdb = GradientBoostingClassifier(n_estimators=200)
            # gdcomps = gdb.fit(ftraindata,ftrainlabels)#.transform(traindata)
            # gdbacc = gdb.score(ftestdata,ftestlabels)
            # print "gdbacc ", gdbacc
            # print gdb.feature_importances_
            # #end jj

            fovs = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
            fovcomps = fovs.fit(ftraindata, ftrainlabels)
            fovsacc = fovs.score(ftestdata, ftestlabels)
            print "fovsacc ", fovsacc
            fovsestimators = fovs.estimators_
            fovsfeatimps = []
            for i,fovsest in enumerate(fovsestimators):
                #print fovs.classes_[i], fovsest.feature_importances_
                fovsfeatimps = fovsest.feature_importances_
            fovsfimps = sorted(zip(fdv.feature_names_,fovsfeatimps),key=itemgetter(1),reverse=True)[:k]
            print fovsfimps

            # Persist the three top-k OvR importance lists for later use.
            owfile = 'output.json'
            with open(owfile,'w') as owrfile:
                owrfile.write(json.dumps([dict(ovsfimps), dict(nbovsfimps), dict(fovsfimps)]))


            frfc = RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True)
            frfe = RFE(estimator=frfc, n_features_to_select=k,step = 0.1)
            frfe.fit(ftraindata, ftrainlabels)
            print frfe.n_features_
            print len(frfe.ranking_)
            frfetestdata = [[each_list[i] for i, supp in enumerate(frfe.support_) if supp == True ] for each_list in ftestdata]
            print "frfeacc ", frfe.estimator_.score(frfetestdata, ftestlabels)

            # NOTE(review): same name/feature-space mismatch as above —
            # frfe was fit on fdv-vectorized data but names come from dv.
            frfefeatnames = [dv.feature_names_[i] for i, supp in enumerate(frfe.support_) if supp == True ]
            fimpsums = sum(frfe.estimator_.feature_importances_)
            frfeimps =  sorted(zip(frfefeatnames,frfe.estimator_.feature_importances_/fimpsums),key=itemgetter(1),reverse=True)
            print frfeimps
            print sum([pair[1] for pair in frfeimps])

            #

            # #jj
            # fovs2 = OneVsRestClassifier(AdaBoostClassifier(n_estimators=200))
            # fovcomps2 = fovs2.fit(ftraindata, ftrainlabels)
            # fovsacc2 = fovs2.score(ftestdata, ftestlabels)
            # print "fovsacc2 ", fovsacc2
            #
            # fovo = OneVsOneClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
            # fovocomps = fovo.fit(ftraindata, ftrainlabels)
            # fovoacc = fovo.score(ftestdata, ftestlabels)
            # print "fovoacc ", fovoacc
            #
            # fovo2 = OneVsOneClassifier(AdaBoostClassifier(n_estimators=200))
            # fovocomps2 = fovo2.fit(ftraindata, ftrainlabels)
            # fovoacc2 = fovo2.score(ftestdata, ftestlabels)
            # print "fovoacc2 ", fovoacc2
            # #print ovs.coef_
            #
            # fovosvm  = OneVsOneClassifier(NuSVC(nu=0.435,kernel='poly',random_state=0))
            # fovosvm.fit(ftraindata, ftrainlabels)
            # fovosvmacc = fovosvm.score(ftestdata, ftestlabels)
            # print "fovosvmacc ", fovosvmacc
            # #end jj

            #return ovsfimps, nbovsfimps, fovsfimps
            return rfeimps, nbrfeimps, frfeimps
예제 #49
0
# Define training and testing sets
# NOTE(review): train_idx and test_idx are drawn INDEPENDENTLY from the same
# index pool, so the two sets can overlap — potential train/test leakage.
# Sampling once and splitting the result would avoid it.
print "Splitting into a training and a testing set..."
indices = arange(len(raw_data))
random.seed(0)
train_idx = random.sample(indices, n_train)
test_idx = random.sample(indices, n_test)
# Column 0 holds the label; remaining columns are the features.
X_train, y_train = raw_data[train_idx, 1:], raw_data[train_idx, 0]
X_test, y_test = raw_data[test_idx, 1:], raw_data[test_idx, 0]

# Apply a learning algorithm
print "Applying a learning algorithm..."
clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)

# Make a prediction
print "Making predictions..."
y_pred = clf.predict(X_test)

print y_pred

# Evaluate the prediction
# NOTE(review): with multiclass labels, modern scikit-learn requires an
# explicit average= argument on precision/recall/f1 — confirm the version.
print "Evaluating results..."
print "Precision: \t", metrics.precision_score(y_test, y_pred)
print "Recall: \t", metrics.recall_score(y_test, y_pred)
print "F1 score: \t", metrics.f1_score(y_test, y_pred)
print "Mean accuracy: \t", clf.score(X_test, y_test)

# Calculate overall time
end_time = time.time()
print "Overall running time:", end_time - start_time
df = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')
df['Interest.Rate'] = df['Interest.Rate'].map(lambda x: round(float(x.rstrip('%'))/100, 4))
df['FICO.Score'] = df['FICO.Range'].map(lambda x: int(x.split('-')[0]))

k = df['Interest.Rate'] >= .12
df['IR_TF'] = k.astype(int)


X = np.column_stack((df['FICO.Score'], df['Amount.Requested']))
Y = df['IR_TF']

classifier = OneVsRestClassifier(LogisticRegression(penalty='l1')).fit(X, Y)
print 'Coefficients: ', classifier.coef_
print 'Intercept" ', classifier.intercept_
print 'Accuracy: ', classifier.score(X, Y)

coeff = classifier.coef_
intercept = classifier.intercept_

FICOScore_IRTF0 = df.loc[df['IR_TF']==0, 'FICO.Score']
FICOScore_IRTF1 = df.loc[df['IR_TF']==1, 'FICO.Score']

AmountRequested_IRTF0 = df.loc[df['IR_TF']==0, 'Amount.Requested']
AmountRequested_IRTF1 = df.loc[df['IR_TF']==1, 'Amount.Requested']

fig = plt.figure(figsize = (10, 8))

plt.plot(FICOScore_IRTF0, AmountRequested_IRTF0, '.', label = 'Interest rate < 12% (class 0)',mfc = 'None', mec='coral')
plt.plot(FICOScore_IRTF1, AmountRequested_IRTF1, '.', label = 'Interest rate >= 12% (class 1)',mfc = 'None', mec='steelblue')
100XP
Import LogisticRegression from sklearn.linear_model and OneVsRestClassifier from sklearn.multiclass.
Instantiate the classifier clf by placing LogisticRegression() inside OneVsRestClassifier().
Fit the classifier to the training data X_train and y_train.
Compute and print the accuracy of the classifier using its .score() method, which accepts two arguments: X_test and y_test.
'''
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Create the DataFrame: numeric_data_only
# Missing values are filled with a sentinel (-1000) rather than imputed.
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

# Create training and test sets
# multilabel_train_test_split is a project helper that stratifies across
# the label matrix; `size` is its test fraction.
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only,
                                                               label_dummies,
                                                               size=0.2, 
                                                               seed=123)

# Instantiate the classifier: clf
# OneVsRest fits one LogisticRegression per label column.
clf = OneVsRestClassifier(LogisticRegression())

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Print the accuracy
# NOTE(review): for a multilabel target, .score() reports subset accuracy
# (all labels of a row must match) — expect low values; confirm that is
# the intended metric.
print("Accuracy: {}".format(clf.score(X_test, y_test)))
        precision = 100.0*tp/(tp+fp);
        recall = 100.0*tp/(tp+fn);
        F1 = 2*precision *recall /(precision + recall) ;
	#fpr,tpr,tresholds = roc_curve(y_actual,y_predict)
	area = roc_auc_score(y_predict,y_actual)	
        return (precision,recall,F1,area)

# Compare OneVsRest(LinearSVC), plain SVC, and KNN on pre-extracted
# train/CV feature files (getData is defined elsewhere in this module).
if __name__=="__main__":
	(train_data,y)= getData("trainingData2.csv");
	(test_data,y_test)=getData("CVData2.csv");
	# Start OneVsRest Strategy

	cOneVsRest = OneVsRestClassifier(LinearSVC()).fit(train_data,y)
	oVrPreds = cOneVsRest.predict(test_data)
	# "Percentage Goods" = raw fraction of matching predictions; .score()
	# below should agree (mean accuracy) up to the 100x scaling.
	print "oVr Percentage Goods :",100.0*sum(oVrPreds == y_test)/len(y_test)
	print "Score of oVr is :",cOneVsRest.score(test_data,y_test)
	
	#scores = cross_val_score(cOneVsRest, test_data, y_test,cv=10)
	#print "Value of mean is :",scores.mean()

	# Start SVM Strategy

	print "Started SVM Training and Classification"

	cSVM = svm.SVC().fit(train_data,y)
	SVMPreds = cSVM.predict(test_data)
	print "SVM Percentage Goods :",100.0*sum(SVMPreds == y_test) /len(y_test)

	#Start KNN Ways of prediction	

	cKNN = KNeighborsClassifier(n_neighbors=4).fit(train_data,y)
예제 #53
0
# Hold out 20% of the training data, fit OvR logistic regression, report
# accuracy and log loss, then write the predicted class probabilities as a
# Kaggle-style submission file.
from sklearn.cross_validation import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import time
t=time.time()

#Convert list of labels to binary matrix

random_state = np.random.RandomState(0)
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_train, test_size=.2,random_state=random_state)#splitting the data for cross validation (80-20)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

from sklearn.linear_model import LogisticRegression
OVR = OneVsRestClassifier(LogisticRegression())#using logistic regression for classification
OVR.fit(X_ptrain, y_ptrain)
print "accuracy", OVR.score(X_ptest,y_ptest)#checking the accuracy
from sklearn.metrics import log_loss#for checking the log loss
Y1_final =OVR.predict_proba(X_ptest)
print (Y1_final[0])
print log_loss(y_ptest,Y1_final)#log loss value
test_ids = [[str(x)] for x in test_photos['id']]#getting the test ids for writing the final submission
print type(test_ids)
a = np.asarray(test_ids)
print type(a),a.shape,a[1]

import csv



# NOTE(review): output filename "submisoon1.csv" looks like a typo for
# "submission1.csv" — left unchanged because downstream tooling may expect it.
np.savetxt("submisoon1.csv",Y1_final.astype(dtype = float),delimiter =",",fmt = '%1.5f')#stores only the posterior probabilities into 8 columns for each image
with open(data_root+"submission_label.csv",'w') as wr:#writes only the ids into a csv file and the we have to merge both the files manually
예제 #54
0
파일: knn1.py 프로젝트: naritapandhe/mltest
# Fit the logistic-regression model created earlier in this script.
logreg.fit(X_train, y_train)

# Make an array of predictions on the test set
pred = logreg.predict(X_test)

# Output the hitrate and the confusion matrix for each model
print(logreg.score(X_test, y_test))
#print(confusion_matrix(pred, y_test)) """



# One-vs-rest wrapper around a 2-nearest-neighbour classifier.
from sklearn.neighbors import KNeighborsClassifier
neigh = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=2))
neigh.fit(X_train, y_train) 
pred = neigh.predict(X_test)
print(neigh.score(X_test,y_test))


#print iris
"""
k_means = cluster.KMeans(n_clusters=20)
k_means.fit(X_train,y_train)
pred = k_means.predict(X_test)
print pred
print y_test
#print(k_means.score(X_test, y_test))
"""

# One-vs-rest decision tree.
# NOTE(review): min_samples_split=1 is rejected by scikit-learn >= 0.18
# (must be >= 2, or a float fraction) — confirm the targeted version.
from sklearn.tree import DecisionTreeClassifier
clf = OneVsRestClassifier(DecisionTreeClassifier(max_depth=None, min_samples_split=1,random_state=10))
clf.fit(X_train, y_train) 
예제 #55
0


# ratio imbalance

# train

classifier = OneVsRestClassifier(LinearSVC())
# classifier = OneVsRestClassifier(sklearn.naive_bayes.MultinomialNB())
# classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
# classifier = SGDClassifier(alpha = 0.00001, l1_ratio=0.015)


classifier.fit(train_matrix, negative_cases_train)

predict_sentiment = classifier.predict(test_matrix)
# predict_probs = classifier.predict_proba(test_matrix)

accuracy = classifier.score(test_matrix, negative_cases_test)
precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    negative_cases_test, predict_sentiment)

print(" LogisticRegression, no preprocesing, unigram only")
print ("accuracy = ", accuracy)
print (" precision =", precision)
print ("recal = ", recall)
print ("f1 score = ", f1)

end_time = time.time()
print 'Iterations took %f seconds.' % (end_time - start_time)
예제 #56
0
def test_ovr_fit_predict_svc():
    """One-vs-rest wrapping of an SVC fits one estimator per iris class
    and reaches better than 90% training accuracy."""
    clf = OneVsRestClassifier(svm.SVC())
    clf.fit(iris.data, iris.target)
    n_estimators = len(clf.estimators_)
    assert_equal(n_estimators, 3)
    training_accuracy = clf.score(iris.data, iris.target)
    assert_greater(training_accuracy, .9)
def main(inputFileName):
    """Train (or load from cache) an SVM sentiment model over the aclImdb
    review dump and return the model's prediction for *inputFileName*.

    The Spark-based preprocessing, vocabulary build and model fitting run
    only when no pickled dictionary exists on disk; otherwise the cached
    model, vocabulary and metadata are loaded from pickle/joblib files.
    """
    model = ""
    finalDict={}
    details = {}
    # Retrain only when no cached dictionary pickle is present.
    if len(gb.glob("./diction*.pkl")) == 0:

        configuartion=py.SparkConf()                                # setting the Spark Configuration
        sContext=py.SparkContext(conf=configuartion)                # setting the Spark context
        sContext.defaultParallelism
        print ("Data preprocessing start time:", datetime.datetime.now().time())
        traindataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/pos1/*.txt"))
        posData = traindataPos.flatMap(getdata)

        testdataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/pos1/*.txt"))
        postestData = testdataPos.flatMap(getdata)

        # NOTE(review): `+` is not RDD concatenation and this combines the
        # raw filename RDD handles, not the flatMapped review text;
        # posData.union(postestData) was probably intended.
        newposData = traindataPos + testdataPos

        traindataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/neg1/*.txt"))
        negData = traindataNeg.flatMap(getdata)

        testdataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/neg1/*.txt"))
        negtestData = testdataNeg.flatMap(getdata)

        # NOTE(review): same `+` concern as newposData above.
        newNegData = traindataNeg + testdataNeg

        # Word-frequency counts for each polarity, summed per word.
        posDataFrequency = newposData.flatMap(mapper).reduceByKey(lambda a,b: a + b)
        negDataFrequency = newNegData.flatMap(mapper).reduceByKey(lambda a,b: a + b)

        dataFrequency = posDataFrequency + negDataFrequency
        dataFrequencySorted = dataFrequency.sortBy(lambda a: a[1],ascending=False)
        finalDict = {}

        # Vocabulary ids start at 2 (0/1 presumably reserved -- TODO confirm).
        newCount= 2

        # Map each word to an integer id in descending-frequency order.
        for key,value in dataFrequencySorted.collect():
            finalDict.update({key:newCount})
            newCount+=1

        dictDump = open("dictionary.pkl", "wb")
        cPickle.dump(finalDict, dictDump, -1)

        # NOTE(review): the two test-set conversions below are immediately
        # overwritten by the train-set conversions -- the test data is lost
        # and train/test end up using the same arrays.
        finalposData,maxFeatures1 = getIntDataFormat(list(postestData.collect()),finalDict)
        finalnegData,maxFeatures2 = getIntDataFormat(list(negtestData.collect()),finalDict)
        finalposData,maxFeatures1 = getIntDataFormat(list(posData.collect()),finalDict)
        finalnegData,maxFeatures2 = getIntDataFormat(list(negData.collect()),finalDict)
        XtrainNeg,XtrainPos,XtestPos,XtestNeg=[],[],[],[]

        # Pad/truncate every sample to the larger of the two feature widths
        # so positive and negative matrices line up.
        if maxFeatures1 < maxFeatures2:
            XtrainNeg = datapreprocessing(finalnegData,maxFeatures2)
            XtrainPos = datapreprocessing(finalposData,maxFeatures2)
            XtestNeg = datapreprocessing(finalnegData, maxFeatures2)
            XtestPos = datapreprocessing(finalposData, maxFeatures2)
            details.update({"maxfeature":maxFeatures2})
        elif maxFeatures1 > maxFeatures2:
            XtrainNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtrainPos = datapreprocessing(finalposData, maxFeatures1)
            XtestNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtestPos = datapreprocessing(finalposData, maxFeatures1)
            details.update({"maxfeature": maxFeatures1})
        else:
            XtrainNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtrainPos = datapreprocessing(finalposData, maxFeatures1)
            XtestNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtestPos = datapreprocessing(finalposData, maxFeatures1)
            details.update({"maxfeature": maxFeatures1})



        YtrainNeg,YtrainPos,YtestNeg,YtestPos = [],[],[],[]

        # Balance the training classes by truncating the larger one.
        if len(XtrainPos)< len(XtrainNeg):
            print "Imbalance Dataset.. Balancing out commencing"
            XtrainNeg = XtrainNeg[0:len(XtrainPos)]
            YtrainNeg = getLabel(len(XtrainPos),"neg")
            YtrainPos = getLabel(len(XtrainPos),"pos")
        elif len(XtrainPos)> len(XtrainNeg):
            print "Imbalance Dataset.. Balancing out commencing"
            XtrainPos = XtrainPos[0:len(XtrainNeg)]
            YtrainNeg = getLabel(len(XtrainNeg), "neg")
            YtrainPos = getLabel(len(XtrainNeg), "pos")
        else:
            print "Balance Dataset"
            YtrainNeg = getLabel(len(XtrainNeg), "neg")
            YtrainPos = getLabel(len(XtrainNeg), "pos")

        # Same balancing for the test split.
        if len(XtestPos) < len(XtestNeg):
                print "Imbalance Dataset.. Balancing out commencing"
                XtestNeg = XtestNeg[0:len(XtestPos)]
                YtestNeg = getLabel(len(XtestPos), "neg")
                YtestPos = getLabel(len(XtestPos), "pos")
        elif len(XtestPos) > len(XtestNeg):
                print "Imbalance Dataset.. Balancing out commencing"
                # NOTE(review): slices the *train* positives here -- XtestPos
                # on the right-hand side was probably intended.
                XtestPos = XtrainPos[0:len(XtestNeg)]
                YtestNeg = getLabel(len(XtestNeg), "neg")
                YtestPos = getLabel(len(XtestNeg), "pos")
        else:
                print "Balance Dataset"
                YtestNeg = getLabel(len(XtestNeg), "neg")
                YtestPos = getLabel(len(XtestNeg), "pos")


        Xtrain = XtrainPos+XtrainNeg
        Ytrain = YtrainPos+YtrainNeg

        Xtest = XtestPos + XtestNeg
        Ytest = YtestPos + YtestNeg


        Xtrain = np.array(Xtrain)
        Ytrain = np.array(Ytrain)

        # RBF-kernel SVM wrapped one-vs-rest; hyperparameters hard-coded.
        model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=3,C = 0.5,tol=0.0001,cache_size=5000)  )
        model.fit(Xtrain,Ytrain)
        print model.score(Xtest, Ytest)
        details.update({"score":model.score(Xtest, Ytest)})

        dictDump = open("datadetails.pkl", "wb")
        cPickle.dump(details, dictDump, -1)

        joblib.dump(model, "./SpeechTextModels/SVM_SpeechText_Model.pkl")

    else:
        # Cached path: reload model, metadata and vocabulary from disk.
        detailsFile = open("./datadetails.pkl", 'rb')
        dictFile = open("./dictionary.pkl", 'rb')
        model = joblib.load("./SpeechTextModels/SVM_SpeechText_Model.pkl")
        details = cPickle.load(detailsFile)
        finalDict = cPickle.load(dictFile)
    ########## End of If Loop MOdel TRAINED ##############################
    # Convert the single input transcript to the model's integer format,
    # padded to the feature width the model was trained with.
    dataList=[]
    testDataList = getTestData(inputFileName)
    dataList.append(testDataList)
    finalData, maxFeatures = getIntDataFormat(dataList, finalDict)

    modelTrainFeatures = details.get("maxfeature")
    if modelTrainFeatures > maxFeatures:
        Xtest = np.array(datapreprocessing(finalData, modelTrainFeatures))
    else:
        Xtest = np.array(finalData[0:modelTrainFeatures])
    return model.predict(Xtest)
    # NOTE(review): everything below is unreachable -- dead code after the
    # return above; it references names (TRAIN_FEATURES, features, mapping,
    # dist, ...) that are never defined in this function.
    pc=0
    nc=0
    classifier = OneVsRestClassifier(LinearSVC(C=2.0,random_state=0))
    classifier.fit(TRAIN_FEATURES,TRAIN_ATTRIBUTE)
    decision = classifier.decision_function(TEST_FEATURES)
    prediction = classifier.predict(TEST_FEATURES)
    for i in range(0,len(TEST_ATTRIBUTE)):
        # NOTE(review): 22 appears to be a hard-coded attribute count -- confirm.
        for j in range(22):
            if prediction[i][j]==TEST_ATTRIBUTE[i][j]:
                pc+=1
            else:
                nc+=1
        # print prediction[i],TEST_ATTRIBUTE[i],TEST_LABELS[i], decision[i]
    print pc,nc
    print classifier.score(TEST_FEATURES,TEST_ATTRIBUTE)

    TRAIN_LABELS = []
    TRAIN_FEATURES = []
    TRAIN_ATTRIBUTE = []

    TEST_LABELS = []
    TEST_FEATURES = []
    TEST_ATTRIBUTE = []

    for feature,label in zip(features['win_feature'],labels['vlabels'][0]):
        if mapping[int(label)]['action'] in NovelClass:
            attributes = classifier.predict(feature.reshape(1, -1))
            TEST_LABELS.append(mapping[int(label)]['action'])
            TEST_FEATURES.append(numpy.array(attributes[0]))
            A = dist.pairwise([attributes[0],numpy.array(attribute_mapping[NovelClass[0]])])[0][1]
예제 #59
0
class speechSVM:
    """Speech-emotion classifier: extracts loudness/pitch features from wav
    files and trains a one-vs-rest RBF SVM per feature set, caching fitted
    models and their scores on disk."""

    # Initializing the SVM Model
    def __init__(self):
       self.model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=2,C = 0.9,tol=0.0001,cache_size=5000)  )     #self.model = OneVsRestClassifier(LinearSVC(random_state=0))
       self.working_directory = os.getcwd()+"/"
       self.model_prediction_score = {}  # feature_name -> hold-out score

    # Function to read the emotion prediction probability
    def get_Model_Score(self):
        """Load the pickled score dict written by set_Model_Score."""
        # NOTE(review): the file is binary pickle despite the .txt extension.
        filename = self.working_directory + "Models/scorefile.txt"
        return pickle.load(open(filename, "rb"))

    # Function to save Emotion prediction probability
    def set_Model_Score(self):
        """Persist the accumulated per-feature scores as a pickle."""
        filename = self.working_directory+"Models/scorefile.txt"
        pickle.dump(self.model_prediction_score, open(filename, "wb"))

    #  Function to load the wav dataset and extract the features from it
    def load_data_file(self):
        """Scan AudioData/*/*.wav and return [loudness, pitch, emotion] rows.

        The emotion label is derived from the filename prefix ("s.." files
        get a two-letter label, everything else a one-letter label).
        """
        outputdata = []         # Variable to store the speech features and emotions

        # Looping all the wave files present in the path
        for f in gb.glob(self.working_directory+"AudioData/*/*.wav"):
            frate, inputdata = sc.read(f)
            # Extracting the pitch from the wav file using Aubio speech API
            pitch=lp.getPitch(f,frate)
            # Extracting loudness of the voice from the Wave file
            loudness = abs(an.loudness(inputdata))

            # Extracting the emotion type from the wave file only for training stage
            filename = f.split("/")[-1].split(".")[0]

            # Condition to differentiate the various types of emotions
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            # Creating the dataset consisting of list of features and corresponding emotion type
            outputdata.append(list([loudness,pitch, emotion]))
        return outputdata

    # Function to split test and train data
    def get_train_test_data(self,data,percent_split):
        """Split *data* (a DataFrame-like: cols 0-1 features, col 2 label)
        into (train_X, train_y, test_X, test_y_array); the first
        (1 - percent_split) fraction of rows becomes the test set."""
        noOfSamples = len(data)*(1-percent_split)
        test =  data.iloc[0:int(noOfSamples), 2:]
        testsample=[]
        # Flatten the single-column label frame into a 1-D array.
        for i in range(len(test)):
            testsample = np.append(testsample,test.iloc[i].values[0])
        return data.iloc[int(noOfSamples):, 0:2], data.iloc[int(noOfSamples):, 2:],data.iloc[0:int(noOfSamples), 0:2],np.array(testsample)

    # Function to fit the SVM Model
    def trainNNet(self,data,label,feature_name):
        """Fit (or reload) the SVM for *feature_name*, caching it under
        Models/SVM_<feature_name>.pkl via joblib."""
        filenamelist =  gb.glob(self.working_directory+"Models/*")
        filename = "Models/SVM_" + feature_name + ".pkl"
        #print filenamelist.count(self.working_directory+"Models/SVM_"+feature_name+".pkl")
        if filenamelist.count(self.working_directory+"Models/SVM_"+feature_name+".pkl") == 0:
            X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, label.astype(str), test_size = 0.045, random_state = 0)
            self.model.fit(X_train,y_train)
            print("score",self.model.score(X_test,y_test))
            #print (cross_validation.cross_val_score(self.model, data, label.astype(str), cv =4))
            joblib.dump(self.model,  self.working_directory+filename)
        else:
            self.model = joblib.load(self.working_directory+filename)
            print "model already exists for feature "+feature_name+" !! training exiting"

    # Function h to predict batch input
    def predict(self,ftest,ltest,data,feature_name):
        """Predict each test row and record the model's score under
        *feature_name*. NOTE(review): *data* and the collected predictions
        are unused; only the score side effect matters here."""
        predicted_data = []

        # Loop to traverse through the Test data and predict the corresponding
        for i in range(len(ftest)):
            predicted_data.append(self.model.predict(ftest.iloc[i].values.reshape(1,-1)))
        score = self.model.score(ftest, ltest)
        self.model_prediction_score.update({feature_name:score})

    # Function to predict single input data
    def predict_emotion(self,data):
        """Run every cached model over one feature row; return the list of
        non-'NA' predictions. NOTE(review): the `emotion` parsed from each
        model filename below is never used."""
        emotion_list=[]
        for modelfilepath in gb.glob(self.working_directory+"Models/*.pkl"):
            print modelfilepath
            emotion = modelfilepath.split("/")[-1].split(".")[0]
            model = joblib.load(modelfilepath)
            modelprediction = model.predict(data.values.reshape(1,-1))
            print modelprediction
            if modelprediction[0] !='NA':
                emotion_list.append(modelprediction[0])
            print emotion_list
        return emotion_list

    # converting a single wave file into a List of speech properties
    def load_data(self,filename):
        """Extract [loudness, pitch, emotion] rows for the wav file(s)
        matching *filename* (same labeling rule as load_data_file).
        NOTE(review): the loop rebinds the `filename` parameter."""
        outputdata=[]
        # Loop to traverse through the input data file path
        for f in gb.glob(filename):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f,frate)
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            outputdata.append(list([loudness, pitch, emotion]))
        return outputdata