def main():
    """Use a linear SVM for multi-class classification on MNIST.

    Observed accuracies:
        One vs the rest : 77.61%
        Default         : 77.61%
        One vs one      : 85.07%
    """
    seed = 123456789
    np.random.seed(seed)
    ntrain, ntest = 800, 200
    (tr_x, tr_y), (te_x, te_y) = load_mnist()
    # Pool train+test; MNISTCV yields index splits into the pooled arrays.
    x, y = np.vstack((tr_x, te_x)), np.hstack((tr_y, te_y))
    cv = MNISTCV(tr_y, te_y, ntrain, ntest, 1, seed)
    for tr, te in cv:
        # BUG FIX: n_jobs was passed positionally as the wrappers' second
        # argument; it is keyword-only in modern scikit-learn.
        clf = OneVsRestClassifier(LinearSVC(random_state=seed), n_jobs=-1)
        clf.fit(x[tr], y[tr])
        print(clf.score(x[te], y[te]))

        # LinearSVC's built-in multi-class handling ("default").
        clf = LinearSVC(random_state=seed)
        clf.fit(x[tr], y[tr])
        print(clf.score(x[te], y[te]))

        clf = OneVsOneClassifier(LinearSVC(random_state=seed), n_jobs=-1)
        clf.fit(x[tr], y[tr])
        print(clf.score(x[te], y[te]))
def main(): word_vec_dict = readGloveData("./glove.twitter.27B/glove.twitter.27B.25d.txt") tweets = readTweets("./dataset_raw/semeval2016-task6-trainingdata.txt") tweetVectors = getTweetVectors(tweets[0 : len(tweets) - 1], word_vec_dict) print tweets[0] print getSumVectors(tweets[0], word_vec_dict) tweetClasses = set(tweets[-1]) mapping = {"favor": 1, "none": 0, "against": 1} tweetClasses = np.asarray([mapping[x] for x in tweets[-1]]) tweetData = np.asarray(tweetVectors) print tweetClasses.shape print tweetData.shape X = tweetData Y = tweetClasses clf = OneVsRestClassifier(LinearSVC()) # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543) X_train = X[0 : int(0.7 * len(X))] y_train = Y[0 : int(0.7 * len(Y))] X_test = X[int(0.7 * len(X)) : len(X)] y_test = Y[int(0.7 * len(Y)) : len(Y)] clf.fit(X_train, y_train) print clf.score(X_test, y_test) y_pred = clf.predict(X_test) for indexMax in xrange(len(y_test)): print str(y_pred[indexMax]) + " " + str(y_test[indexMax])
def main():
    # Python 2 script: one-vs-rest linear SVM stance classifier over GloVe
    # tweet vectors (SemEval-2016 task 6), evaluated on a 70/30 split.
    word_vec_dict = readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')
    # All entries except the last (the stance labels) are tweet texts.
    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    # NOTE(review): this set is immediately overwritten below -- dead assignment.
    tweetClasses = set(tweets[-1])
    # NOTE(review): 'favor' and 'against' both map to 1 -- presumably
    # stance-vs-none; confirm this is intended.
    mapping = {'favor': 1, 'none': 0, 'against': 1}
    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    # Deterministic 70/30 split (no shuffling).
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
def test_solve_primal_l2_svc_with_line_search_optimizers():
    """Primal L2-SVC (squared hinge) should converge to the known optimum with
    each line-search optimizer and score at least 57% on held-out iris data."""
    X, y = load_iris(return_X_y=True)
    X_scaled = MinMaxScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                        train_size=0.75,
                                                        random_state=123456)
    # The four optimizer cases were identical; run them in one loop.
    for optimizer in (SteepestGradientDescent, ConjugateGradient, Newton, BFGS):
        svc = OVR(SVC(loss=squared_hinge, optimizer=optimizer))
        svc = svc.fit(X_train, y_train)
        # BUG FIX: `assert (generator expression)` is always truthy because a
        # generator object is truthy; wrap in all() so the per-estimator
        # closeness check is actually evaluated.
        assert all(np.allclose(np.hstack((estimator.coef_, estimator.intercept_)),
                               estimator.loss.x_star())
                   for estimator in svc.estimators_)
        assert svc.score(X_test, y_test) >= 0.57
def test_solve_dual_l2_svc_with_AdaGrad():
    """Dual L2-SVC with a Gaussian kernel trained via AdaGrad scores >= 97%,
    with and without intercept regularization."""
    X, y = load_iris(return_X_y=True)
    X_scaled = MinMaxScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, train_size=0.75, random_state=123456)
    # Same config twice, toggling intercept regularization.
    for reg_intercept in (True, False):
        svc = OVR(SVC(loss=squared_hinge, kernel=gaussian,
                      reg_intercept=reg_intercept, dual=True,
                      optimizer=AdaGrad, learning_rate=1.))
        svc = svc.fit(X_train, y_train)
        assert svc.score(X_test, y_test) >= 0.97
def main():
    # Python 2 script: one-vs-rest linear SVM stance classifier over GloVe
    # tweet vectors, evaluated on a 70/30 split; prints per-sample predictions.
    word_vec_dict = readGloveData('./glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('./dataset_raw/semeval2016-task6-trainingdata.txt')
    # All entries except the last (the stance labels) are tweet texts.
    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    # NOTE(review): this set is immediately overwritten below -- dead assignment.
    tweetClasses = set(tweets[-1])
    # NOTE(review): 'favor' and 'against' both map to 1 -- presumably
    # stance-vs-none; confirm this is intended.
    mapping = {'favor': 1, 'none': 0, 'against': 1}
    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC())
    # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543)
    # Deterministic 70/30 split (no shuffling).
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    # Dump predicted vs. expected label for every test sample.
    for indexMax in xrange(len(y_test)):
        print str(y_pred[indexMax]) + ' ' + str(y_test[indexMax])
def test_solve_dual_l1_svc_with_proximal_bundle():
    """Dual hinge-loss SVC with a Gaussian kernel optimized by the proximal
    bundle method scores >= 97%, with and without intercept regularization."""
    X, y = load_iris(return_X_y=True)
    X_scaled = MinMaxScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, train_size=0.75, random_state=123456)
    # Same config twice, toggling intercept regularization.
    for reg_intercept in (True, False):
        svc = OVR(SVC(loss=hinge, kernel=gaussian,
                      reg_intercept=reg_intercept, dual=True,
                      optimizer=ProximalBundle, max_iter=150))
        svc = svc.fit(X_train, y_train)
        assert svc.score(X_test, y_test) >= 0.97
def fit_multiclass_svm(documents, idfs):
    # Python 2: train a one-vs-rest linear SVM over idf-weighted mean word2vec
    # summary vectors. Documents [0, 4000) are training data; [4000, 4490)
    # form the held-out test set. Prints the test accuracy.
    model = gensim.models.Word2Vec.load("train_word2vec.model")
    dim = 50;  # word2vec embedding dimensionality
    X = np.zeros([4000, dim]);
    X_test = np.zeros([490, dim]);
    y = np.zeros(4000);
    y_test = np.zeros(490);
    i = 0
    for doc in documents[:4000]:
        x = np.zeros(dim)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    # idf-weighted sum of in-vocabulary word vectors.
                    x = x + (idfs[word] * model[word])
                    count += 1
        # NOTE(review): count is 0 if no summary word is in the vocabulary,
        # which divides by zero here -- confirm inputs guarantee coverage.
        X[i, :] = x/count
        y[i] = doc["topic_id"]
        i = i + 1;
    svm_model = OneVsRestClassifier(LinearSVC(random_state=0, C = 1)).fit(X, y)
    i = 0
    # Same featurization for the test documents.
    for doc in documents[4000:4490]:
        x = np.zeros(dim)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    x = x + (idfs[word] * model[word])
                    count += 1
        X_test[i, :] = x/count
        y_test[i] = doc["topic_id"]
        i = i + 1;
    print svm_model.score(X_test, y_test)
def main():
    # Python 2 script: one-vs-rest linear SVM stance classifier over GloVe
    # tweet vectors, evaluated on a deterministic 70/30 split.
    word_vec_dict = readGloveData(
        '../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')
    # All entries except the last (the stance labels) are tweet texts.
    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    # NOTE(review): this set is immediately overwritten below -- dead assignment.
    tweetClasses = set(tweets[-1])
    # NOTE(review): 'favor' and 'against' both map to 1 -- presumably
    # stance-vs-none; confirm this is intended.
    mapping = {'favor': 1, 'none': 0, 'against': 1}
    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    # Deterministic 70/30 split (no shuffling).
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)):len(X)]
    y_test = Y[int(0.7 * len(Y)):len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
def fit_multiclass_svm1(documents, idfs):
    # Python 2: same pipeline as fit_multiclass_svm but with 300-d doc2vec
    # word vectors and a polynomial-kernel SVC. Documents [0, 4000) train,
    # [4000, 4490) test. Prints the test accuracy.
    model = gensim.models.doc2vec.Doc2Vec.load("train_doc2vec.model")
    X = np.zeros([4000, 300]);
    X_test = np.zeros([490, 300]);
    y = np.zeros(4000);
    y_test = np.zeros(490);
    i = 0
    for doc in documents[:4000]:
        x = np.zeros(300)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    # idf-weighted sum of in-vocabulary word vectors.
                    x = x + (idfs[word] * model[word])
                    count += 1
        # NOTE(review): count is 0 if no summary word is in the vocabulary,
        # which divides by zero here -- confirm inputs guarantee coverage.
        X[i, :] = x/count
        y[i] = doc["topic_id"]
        i = i + 1;
    svm_model = OneVsRestClassifier(svm.SVC(kernel='poly', gamma=2)).fit(X, y)
    i = 0
    # Same featurization for the test documents.
    for doc in documents[4000:4490]:
        x = np.zeros(300)
        count = 0
        for sent in doc["summary"]:
            for word in sent.split():
                if word in model:
                    x = x + (idfs[word] * model[word])
                    count += 1
        X_test[i, :] = x/count
        y_test[i] = doc["topic_id"]
        i = i + 1;
    print svm_model.score(X_test, y_test)
def one_vs_all(X, y, test_size=0.2, run_num=100, svm_type='linear'):
    """Train one-vs-all SVM classifiers of the specified type.

    Repeats `run_num` random train/test splits and records train/test
    accuracy for each run (box-plot material).

    Args:
        X, y: feature matrix and labels.
        test_size: fraction held out per split.
        run_num: number of random splits to evaluate.
        svm_type: 'linear' for LinearSVC, anything else for RBF SVC.

    Returns:
        (ovr, acc_tr, acc_tst): the last fitted classifier and the per-run
        training / test accuracy lists.
    """
    # Python has a wonderful wrapper that creates 1-vs-all classifiers!
    # BUG FIX: the original compared the builtin `type` to 'linear', which is
    # always False, so the RBF SVC was used regardless of `svm_type`.
    if svm_type == 'linear':
        estimator = LinearSVC()
    else:
        # SVC defaults to an RBF kernel.
        estimator = SVC()
    ovr = OneVsRestClassifier(estimator=estimator)
    acc_tr = []
    acc_tst = []
    for i in range(run_num):
        [X_train, X_test, y_train, y_test] = train_test_split(
            X, y, test_size=test_size)
        ovr.fit(X_train, y_train.ravel())
        # Score both splits; nothing is optimised here -- purely diagnostic.
        acc_tr.append(ovr.score(X_train, y_train.ravel()))
        acc_tst.append(ovr.score(X_test, y_test.ravel()))
    # All the data isn't used here as it tends to overtrain the classifier.
    return ovr, acc_tr, acc_tst
def main():
    """Train and evaluate a one-vs-rest RBF-SVM digit classifier."""
    print("loading data...")
    train_datas, train_labels = load_mnist_data(
        file_name='data/train_data/TrainSamples',
        label_filename='data/train_data/TrainLabels')
    test_datas, test_labels = load_mnist_data(
        file_name='data/valid_data/ValidSamples',
        label_filename='data/valid_data/ValidLabels')
    print("Train data size: {}, Test data size: {}".format(
        len(train_datas), len(test_datas)))
    # Standardize features; the scaler is fitted on the training set only.
    ss = StandardScaler()
    train_datas = ss.fit_transform(train_datas)
    test_datas = ss.transform(test_datas)
    print("training the model....")
    clf = OneVsRestClassifier(
        estimator=svm.SVC(C=10, kernel='rbf', gamma='auto'), n_jobs=4)
    clf.fit(train_datas, train_labels)
    print("training is done!")
    print("fit result: ", clf.score(train_datas, train_labels))
    # Held-out evaluation plus per-class diagnostics.
    acc = clf.score(test_datas, test_labels)
    print("test acc: ", acc)
    predict = clf.predict(test_datas)
    print("Classification report:\n ",
          metrics.classification_report(test_labels, predict))
    print("Confusion matrix:\n ",
          metrics.confusion_matrix(test_labels, predict))
def tfidfDoClassify(X_train, X_test, y_train, y_test, labels, label, n_components): resultDict = {} X_train = StandardScaler(with_mean=False).fit_transform(X_train) X_test = StandardScaler(with_mean=False).fit_transform(X_test) # iterate over classifiers for name, aclf in zip(names, classifiers): print n_components, label, name if name != "Logistic Regression": clf = OneVsRestClassifier(aclf) else: clf = aclf clf.fit(X_train, y_train) y_pred = clf.predict(X_test) y_pred_ = clf.predict(X_train) prf1sDict = {} # chi,pval = chi2(X_train, y_train) # prf1sDict["chi2"] = chi # prf1sDict["pval"] = pval precision = 0 recall = 0 fscore = 0 support = 0 try: precision, recall, fscore, support = precision_recall_fscore_support( y_test, y_pred, average='weighted', labels=labels) logging.debug( str(precision) + "," + str(recall) + "," + str(fscore) + "," + str(support) + "," + name + "," + str(n_components) + "," + label) score = clf.score(X_test, y_test) prf1sDict["testReport"] = classification_report(y_test, y_pred, labels=labels) prf1sDict["testConfusionMatrix"] = confusion_matrix( y_train, y_pred) # pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True) prf1sDict["testScore"] = score prf1sDict["testPrecision"] = precision prf1sDict["testRecall"] = recall prf1sDict["testFscore"] = fscore precision_, recall_, fscore_, support_ = precision_recall_fscore_support( y_train, y_pred_, average='weighted', labels=labels) score_ = clf.score(X_train, y_train) prf1sDict["trainReport"] = classification_report(y_train, y_pred_, labels=labels) prf1sDict["trainConfusionMatrix"] = confusion_matrix( y_train, y_pred_) # pd.crosstab(y_train, y_pred_, rownames=['True'], colnames=['Predicted'], margins=True) prf1sDict["trainScore"] = score_ prf1sDict["trainPrecision"] = precision_ prf1sDict["trainRecall"] = recall_ prf1sDict["trainFscore"] = fscore_ resultDict[name] = prf1sDict except ValueError: print name continue return resultDict
class SVMSentiment:
    # One-vs-rest RBF-kernel SVM sentiment classifier with on-disk caching of
    # the fitted model (SVMScores.pkl). Mixes py2 print statements with
    # print() calls -- Python 2 only.

    def __init__(self):
        self.max_length = 500  # maximum sequence length (not used in this class)
        self.batch_size=50     # rows taken for the validation slice
        self.model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=1,C = 1,tol=0.0001,cache_size=5000) )

    def configureSVMModel(self,TrainX,TrainY,validX,validY):
        # Fit the SVM, or load a previously saved model if one exists in the
        # current working directory. validX/validY are accepted but unused.
        print('Configuring the SVM Model')
        currPath = os.getcwd()
        currFiles = os.listdir(currPath)
        print('################### Test #####################')
        print(currFiles.count('SVMScores.pkl'))
        if(currFiles.count('SVMScores.pkl')==0):
            self.model.fit(TrainX, TrainY)
            # Saving model scores
            joblib.dump(self.model,currPath+'/SVMScores.pkl')
        else:
            print('Loading already existing Model')
            self.model = joblib.load(currPath+'/SVMScores.pkl')

    def evaluateSVMModel(self,TestX,TestY):
        # Print the mean accuracy, then every (prediction, expected) pair.
        print self.model.score(TestX, TestY)
        predicted_data=[]
        for i in range(len(TestX)):
            predicted_data.append(list([self.model.predict (TestX[i].reshape(1,-1)) ,TestY[i]]) )
        print "Predicted Data"
        print predicted_data
        #print TestY

    def predictSentiment(self,dataX,dataY):
        # Predict each sample individually, printing prediction vs expected.
        # NOTE(review): only the LAST prediction is returned -- confirm intended.
        print('@@@@@@@@@@@@@@@@ Length of test data : ',len(dataX))
        for i in range(len(dataX)):
            predicted_data = self.model.predict(dataX[i].reshape(1,-1))
            expected_out = dataY[i]
            print('############### Predicted data :',predicted_data,' ; ; ',expected_out)
        return predicted_data

    def getTrainTestData(self):
        # Load pickled (trainX, trainY) and (testX, testY) tuples from disk.
        # NOTE(review): files are opened without close() -- consider `with`.
        print('Loading Training and Test data')
        trainX=[]
        trainY=[]
        testX=[]
        testY = []
        f= open('trainingdata.pkl','rb')
        (trainX,trainY) = cPickle.load(f)
        f= open('testingdata.pkl','rb')
        (testX,testY) = cPickle.load(f)
        return ((trainX,trainY),(testX,testY))

    def getValidationData(self,dataX,dataY):
        # First batch_size rows of each 2-D array.
        return dataX[0:self.batch_size,:],dataY[0:self.batch_size,:]
class SVMSentiment: def __init__(self): self.max_length = 500 self.batch_size = 50 self.model = OneVsRestClassifier( svm.SVC(kernel='rbf', gamma=1, C=1, tol=0.0001, cache_size=5000)) def configureSVMModel(self, TrainX, TrainY, validX, validY): print('Configuring the SVM Model') currPath = os.getcwd() currFiles = os.listdir(currPath) print('################### Test #####################') print(currFiles.count('SVMScores.pkl')) if (currFiles.count('SVMScores.pkl') == 0): self.model.fit(TrainX, TrainY) # Saving model scores joblib.dump(self.model, currPath + '/SVMScores.pkl') else: print('Loading already existing Model') self.model = joblib.load(currPath + '/SVMScores.pkl') def evaluateSVMModel(self, TestX, TestY): print self.model.score(TestX, TestY) predicted_data = [] for i in range(len(TestX)): predicted_data.append( list([self.model.predict(TestX[i].reshape(1, -1)), TestY[i]])) print "Predicted Data" print predicted_data #print TestY def predictSentiment(self, dataX, dataY): print('@@@@@@@@@@@@@@@@ Length of test data : ', len(dataX)) for i in range(len(dataX)): predicted_data = self.model.predict(dataX[i].reshape(1, -1)) expected_out = dataY[i] print('############### Predicted data :', predicted_data, ' ; ; ', expected_out) return predicted_data def getTrainTestData(self): print('Loading Training and Test data') trainX = [] trainY = [] testX = [] testY = [] f = open('trainingdata.pkl', 'rb') (trainX, trainY) = cPickle.load(f) f = open('testingdata.pkl', 'rb') (testX, testY) = cPickle.load(f) return ((trainX, trainY), (testX, testY)) def getValidationData(self, dataX, dataY): return dataX[0:self.batch_size, :], dataY[0:self.batch_size, :]
def train_model(reviews, result): X_train, X_test, y_train, y_test = train_test_split(reviews, result, test_size=0.2, random_state=42) svm_classifier = OneVsRestClassifier(LinearSVC(random_state=0)) svm_classifier.fit(X_train, y_train) print svm_classifier.score(X_test, y_test) joblib.dump(svm_classifier, './model_svm/svm_model.pkl')
def doClassify(X, y): resultDict = {} X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=testRatio, random_state=42) X_train = StandardScaler().fit_transform(X_train) X_test = StandardScaler().fit_transform(X_test) # iterate over classifiers for name, clf in zip(names, classifiers): print "Running cliasifer:", name if name != "Logistic Regression": clf = OneVsRestClassifier(clf) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) y_pred_ = clf.predict(X_train) prf1sDict = {} precision = 0 recall = 0 fscore = 0 support = 0 try: precision, recall, fscore, support = precision_recall_fscore_support( y_test, y_pred, average="weighted") logging.debug( str(precision) + "," + str(recall) + "," + str(fscore) + "," + str(support) + "," + name) score = clf.score(X_test, y_test) prf1sDict["testReport"] = classification_report(y_test, y_pred) labels = list(set(y_test)) confMat = confusion_matrix(y_test, y_pred, labels=labels) print "confMat type:", type(confMat) print "confMat len:", len(confMat) print "confMat:" print confMat print labels prf1sDict["testConfMat"] = confMat.tolist() prf1sDict["testScore"] = score prf1sDict["testPrecision"] = precision prf1sDict["testRecall"] = recall prf1sDict["testFscore"] = fscore precision_, recall_, fscore_, support_ = precision_recall_fscore_support( y_train, y_pred_, average="weighted") score_ = clf.score(X_train, y_train) prf1sDict["trainReport"] = classification_report(y_train, y_pred_) prf1sDict["trainConfMat"] = confusion_matrix(y_train, y_pred_).tolist() prf1sDict["trainScore"] = score_ prf1sDict["trainPrecision"] = precision_ prf1sDict["trainRecall"] = recall_ prf1sDict["trainFscore"] = fscore_ resultDict[name] = prf1sDict except ValueError: print "Error for claissifier:", name print "Unexpected error in test:", sys.exc_info() continue return resultDict
def OneVsAll(train_data, test_data, train_label, test_label, numFeatures):
    """Fit a one-vs-rest wrapper around the binary MSE classifier using only
    the first numFeatures columns, then print train/test accuracy."""
    wrapped = OneVsRestClassifier(MSE_binary())
    wrapped.fit(train_data[:, :numFeatures], train_label)
    print('Using', numFeatures, 'features:')
    train_acc = wrapped.score(train_data[:, :numFeatures], train_label) * 100
    test_acc = wrapped.score(test_data[:, :numFeatures], test_label) * 100
    print("Training Accuracy: " + str(train_acc) + " %")
    print("Testing Accuracy: " + str(test_acc) + " %")
def doClassify(jsonDict, label, feature, gramIndex):
    # Build an n-gram term-document feature matrix from the syscall logs,
    # then evaluate each configured classifier on a train/test split,
    # collecting weighted precision/recall/F1, scores, and reports.
    termDocMatrix, allSyscallsVector = cd.createTermDocMatrix(
        jsonDict, feature)
    labels = getAppLabelList(termDocMatrix, label)
    X, y = generateNormalFeatureMatrix(termDocMatrix, allSyscallsVector, label,
                                       labels)
    resultDict = {}
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=testRatio, random_state=42)
    # NOTE(review): the test set is standardized with its own scaler rather
    # than the one fitted on the training data -- possible leakage; confirm.
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    # iterate over classifiers
    for name, aclf in zip(names, classifiers):
        print gramIndex, label, feature, name
        # Logistic regression is natively multi-class; wrap the others.
        if name != "Logistic Regression":
            clf = OneVsRestClassifier(aclf)
        else:
            clf = aclf
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_ = clf.predict(X_train)
        prf1sDict = {}
        precision = 0
        recall = 0
        fscore = 0
        support = 0
        try:
            precision, recall, fscore, support = precision_recall_fscore_support(
                y_test, y_pred, average='weighted')
            logging.debug(
                str(precision) + "," + str(recall) + "," + str(fscore) + "," +
                str(support) + "," + name + "," + str(gramIndex) + "," +
                label + "," + feature)
            score = clf.score(X_test, y_test)
            prf1sDict["testReport"] = classification_report(y_test, y_pred)
            prf1sDict["testScore"] = score
            prf1sDict["testPrecision"] = precision
            prf1sDict["testRecall"] = recall
            prf1sDict["testFscore"] = fscore
            precision_, recall_, fscore_, support_ = precision_recall_fscore_support(
                y_train, y_pred_, average='weighted')
            score_ = clf.score(X_train, y_train)
            prf1sDict["trainReport"] = classification_report(y_train, y_pred_)
            prf1sDict["trainScore"] = score_
            prf1sDict["trainPrecision"] = precision_
            prf1sDict["trainRecall"] = recall_
            prf1sDict["trainFscore"] = fscore_
            resultDict[name] = prf1sDict
        except ValueError:
            # A classifier that cannot handle this data is skipped silently.
            print name
            continue
    return resultDict
def clasificar_OVA(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
    """One-vs-all classification with a decision-tree base estimator.

    Prints the train/test CCR (correct classification rate) and the test
    predictions, and returns the test accuracy.
    """
    print("\n[" + str(graphname) + "]")
    clfBase = DecisionTreeClassifier()
    # FIX: removed an unused 10-fold cross_val_score(clfBase, X, y) call whose
    # result was never read -- it only wasted ten extra fits.
    clf = OneVsRestClassifier(clfBase)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
def classifiers(self):
    # For multi-class SVM with ROC curves, OneVsRestClassifier must be used;
    # a plain SVC can classify but its outputs cannot be used to draw the ROC.
    classifier = OneVsRestClassifier(svm.SVC(kernel='rbf', probability=True,))
    classifier.fit(self.x_train,self.y_train)
    # Expose predictions/scores/accuracies on self for later evaluation.
    self.y_predict=classifier.predict(self.x_test)
    # Both predict_proba and decision_function can be used to draw an ROC
    # curve here, with different results -- reason unknown (original author's
    # open question).
    #self.y_predict_proba=classifier.predict_proba(self.x_test)
    self.y_predict_proba=classifier.decision_function(self.x_test)
    self.train_accuracy=classifier.score(self.x_train,self.y_train)
    self.test_accuracy=classifier.score(self.x_test,self.y_test)
    # Return the classifier so it can be reused for cross-validation.
    return classifier
def start_classification(training_props):
    # Python 2: train a multi-label one-vs-rest linear SVM over
    # dict-vectorized properties and print a classification report on a
    # 90/10 split.
    # print "training SVM model"
    vectorizer = DictVectorizer()
    X = convert_props(training_props)
    X = vectorizer.fit_transform(X)
    mb = MultiLabelBinarizer()
    # Python 2 only: zip(...) returns an indexable list. The first field of
    # each property tuple holds its label set.
    Y = mb.fit_transform(zip(*training_props)[0])
    X_train, X_test, y_train, y_test=train_test_split(X, Y,\
    test_size=0.1, random_state=0)
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    # NOTE(review): the score result is discarded -- presumably it was meant
    # to be printed or returned; confirm.
    clf.score(X_test, y_test)
    predicted = clf.predict(X_test)
    print classification_report(y_test, predicted)
def evaluate(train_vector, test_vector):
    """Fit a one-vs-rest logistic-regression classifier, pickle it to
    DEEP_DAN_CLASSIFIER_TARGET, and log train/test accuracy.

    Each vector is a (features, labels) pair.
    """
    log.info('total training instances: {0}'.format(len(train_vector[0])))
    log.info('total testing instances: {0}'.format(len(test_vector[0])))
    model = OneVsRestClassifier(LogisticRegression(C=10), n_jobs=-1)
    model.fit(train_vector[0], train_vector[1])
    # Persist the fitted classifier before scoring.
    with safe_open(DEEP_DAN_CLASSIFIER_TARGET, 'wb') as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
    train_accuracy = model.score(X=train_vector[0], y=train_vector[1])
    test_accuracy = model.score(X=test_vector[0], y=test_vector[1])
    log.info('accuracy train: {0}'.format(train_accuracy))
    log.info('accuracy test: {0}'.format(test_accuracy))
def main():
    # Python 2: benchmark a one-vs-rest linear SVM on MNIST for increasing
    # training-set sizes, timing fit and score; the second fit/score round
    # measures how repeatable the timings are.
    number = [1000, 5000, 10000, 15000, 60000]
    for data in number:
        x_train, y_train = m.read_mnist(
            'MNIST_data/train-images-idx3-ubyte.gz',
            'MNIST_data/train-labels-idx1-ubyte.gz')
        x_train = x_train[:data]
        y_train = y_train[:data]
        # Flatten to 784-d vectors and rescale pixels from [0,255] to [-1,1].
        X_train = x_train.reshape(-1, 28 * 28).astype(np.float32)
        X_train = X_train * (2.0 / 255.0) - 1.0
        x_test, y_test = m.read_mnist('MNIST_data/t10k-images-idx3-ubyte.gz',
                                      'MNIST_data/t10k-labels-idx1-ubyte.gz')
        X_test = x_test.reshape(-1, 28 * 28).astype(np.float32)
        X_test = X_test * (2.0 / 255.0) - 1.0
        classif = OneVsRestClassifier(LinearSVC(C=100.))
        print "Started learning..."
        before1 = dt.datetime.now()
        classif.fit(X_train, y_train)
        after1 = dt.datetime.now()
        print "Done learning!"
        beforeA = dt.datetime.now()
        scoreA = classif.score(X_test, y_test)
        afterA = dt.datetime.now()
        beforeTA = dt.datetime.now()
        scoreTA = classif.score(X_train, y_train)
        afterTA = dt.datetime.now()
        print "Test data accuracy: ", scoreA
        print "Training data accuracy: ", scoreTA
        print "Time it took to train once: ", after1 - before1
        print "Time it took to verify test: ", afterA - beforeA
        print "Time it took to verify training: ", afterTA - beforeTA
        # Second round: refit the same estimator and re-time everything.
        print "Learning again..."
        before2 = dt.datetime.now()
        classif.fit(X_train, y_train)
        after2 = dt.datetime.now()
        print "Done learning!"
        beforeB = dt.datetime.now()
        scoreB = classif.score(X_test, y_test)
        afterB = dt.datetime.now()
        beforeTB = dt.datetime.now()
        scoreTB = classif.score(X_train, y_train)
        afterTB = dt.datetime.now()
        print "Test data accuracy: ", scoreB
        print "Training data accuracy: ", scoreTB
        print "Time it took to train once: ", after2 - before2
        print "Time it took to verify test: ", afterB - beforeB
        print "Time it took to verify training: ", afterTB - beforeTB
def get_trained_regr_classifier(training_data, test_data):
    """Train a one-vs-rest linear SVM on the labels common to both dicts.

    Each dict maps label -> list of feature vectors; only keys present in
    both dicts are used.

    Returns:
        dict with keys "ovr" (fitted classifier), "training_score", and
        "test_score".
    """
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    X, y, X_test, y_test = [], [], [], []
    for label in training_data.keys() & test_data.keys():
        # Training samples
        X.extend(training_data[label])
        y.extend([label] * len(training_data[label]))
        # Test samples
        X_test.extend(test_data[label])
        y_test.extend([label] * len(test_data[label]))
    clf.fit(X, y)
    return {"ovr": clf,
            "training_score": clf.score(X, y),
            "test_score": clf.score(X_test, y_test)}
def strat_1_v_rest_rf(X,y):
    # StratifiedShuffleSplit-OneVsRestClassifier-RandomForestClassifier
    # best mean accuracy = 0.7571428571428571
    sss=sratifier(X,y)
    sss_rf_scores =[]
    # Random-forest hyperparameters for every split.
    trees_to_grow=155
    depth = None
    sample_split=2
    feature_selection='auto'
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        sss_rf = OneVsRestClassifier(RandomForestClassifier(
            n_jobs=-1, max_features = feature_selection, max_depth= depth,
            n_estimators=trees_to_grow, min_samples_split=sample_split,random_state=38))
        sss_rf.fit(X_train, y_train)
        y_hat=sss_rf.predict(X_test)
        #print(y_hat,y_test)
        score=sss_rf.score(X_test,y_test)
        sss_rf_scores.append(score)
        #print(score)
    # Mean accuracy across all stratified splits.
    sss_rf_mean_accuracy=sum(sss_rf_scores)/len(sss_rf_scores)
    print(f'sss_rf mean accuracy= {sss_rf_mean_accuracy}')
    # NOTE(review): only the LAST split's model and predictions are used for
    # the confusion matrix and the saved model -- confirm this is intended.
    confuse_mattrix(y_test,y_hat)
    filename = '../data/models/Strat_oneVRest_Random_Forest.sav'
    pickle.dump(sss_rf, open(filename, 'wb'))
def oneVsRest_LogReg_TfIdf(X_train, X_test, Y_train, Y_test, word_dict,
                           tags_dict, data_files, test_doc_ids):
    """Multi-label one-vs-rest logistic regression on tf-idf features.

    Vectorizes train/test documents with a fixed vocabulary, converts counts
    to tf-idf, binarizes the label sets, fits the classifier, prints its
    score, and writes predictions to file.
    """
    print('Processing : oneVsRest_LogReg_TfIdf')
    print('-'*50)
    Y_original = Y_test
    # Fixed vocabulary, so fit_transform on the test set yields the same
    # column space as the training set.
    vectorizer = CountVectorizer(min_df=1, vocabulary=word_dict)
    X_v_train = vectorizer.fit_transform(X_train)
    X_v_test = vectorizer.fit_transform(X_test)
    # BUG FIX: the idf weights must be fitted on the training counts only and
    # reused on the test counts (was fit_transform on both).
    transformer = TfidfTransformer(smooth_idf=False)
    X_train_tf = transformer.fit_transform(X_v_train)
    X_test_tf = transformer.transform(X_v_test)
    uniq_tags_names = list(tags_dict.keys())
    mlb = preprocessing.MultiLabelBinarizer(classes=uniq_tags_names)
    Y_train = mlb.fit_transform(Y_train)
    Y_test = mlb.fit_transform(Y_test)
    classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.01))
    classifier.fit(X_train_tf, Y_train)
    score = classifier.score(X_test_tf, Y_test)
    print('-' * 50)
    print('Score oneVsRest_LogReg_TfIdf : {}'.format(score))
    print('-' * 50)
    # BUG FIX: predictions were made on the raw count matrix (X_v_test) while
    # the model was trained on tf-idf features; predict on the tf-idf matrix.
    Y_pred = classifier.predict(X_test_tf)
    Y_back = mlb.inverse_transform(Y_pred)
    write_to_file(Y_original, Y_back, 'oneVsRest_LogREg', score, data_files,
                  test_doc_ids)
def test_solve_svc_as_bcqp_with_active_set():
    """Dual SVC solved as a BCQP via the active-set method scores >= 97%."""
    X, y = load_iris(return_X_y=True)
    features = MinMaxScaler().fit_transform(X)
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, y, train_size=0.75, random_state=1)
    model = OneVsRestClassifier(DualSVC(kernel=gaussian, optimizer=ActiveSet))
    model = model.fit(X_tr, y_tr)
    assert model.score(X_te, y_te) >= 0.97
def svm_one_vs_rest_class(x_tr, x_ts, y_tr, y_ts):
    """Fit a one-vs-rest RBF SVM and report fit/predict timings, accuracy,
    and mean squared error on the test set."""
    print("\nSVM One vs Rest Classification")
    x_train, x_test = x_tr, x_ts
    y_train, y_test = y_tr, y_ts
    print("Fitting Data...")
    fit_start = time.time()
    model = OneVsRestClassifier(
        SVC(kernel='rbf', gamma='auto', C=1000000)).fit(x_train, y_train)
    fit_end = time.time()
    print("SVM OneVsRest Train Time",
          str(round(fit_end - fit_start, 2)) + " sec")
    print("Predicting Data...")
    pred_start = time.time()
    prediction = model.predict(x_test)
    pred_end = time.time()
    print("SVM OneVsRest Test Time",
          str(round(pred_end - pred_start, 2)) + " sec")
    accuracy = model.score(x_test, y_test)
    print("SVM OneVsRest Accuracy", str(accuracy * 100) + ' %')
    print("SVM OneVsRest MSE",
          metrics.mean_squared_error(np.asarray(y_test), prediction))
def main():
    """Train a one-vs-rest linear SVM on the Cleveland heart-disease data and
    save its confusion matrix to confusion_matrix_ova.csv."""
    train = False
    print('Reading data....')
    column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
    data = pd.read_csv('processed.cleveland.data', names=column_names,
                       header=None)
    print('Removing invalid values from data....')
    data = filterDataset(data)
    # Split into features and target ('num' is the disease severity label).
    feature_cols = list(data.columns)
    feature_cols.remove('num')
    target = data['num'].copy()
    input_data = data[feature_cols].copy()
    dTrain, dTest, targetTrain, targetTest = train_test_split(
        input_data, target, test_size=0.20)
    # Standardize using statistics from the training split only.
    scaler = preprocessing.StandardScaler().fit(dTrain)
    dTrain = scaler.transform(dTrain)
    # if train:
    ovr = OneVsRestClassifier(LinearSVC(random_state=0), n_jobs=-1)
    print('Training model...')
    ovr.fit(dTrain, targetTrain)
    # joblib.dump(ovr, 'oneVsAll.pkl')
    # print('Model saved!')
    # else:
    # ovr = joblib.load('oneVsAll.pkl')
    dTest = scaler.transform(dTest)
    pred = ovr.predict(dTest)
    print(ovr.score(dTest, targetTest))
    cm = confusion_matrix(targetTest, pred, labels=[0, 1, 2, 3, 4])
    np.savetxt("confusion_matrix_ova.csv", cm, delimiter=",")
def predict(X_train, X_test, y_train, y_test, k, method_name):
    """Fit a k-NN classifier in both one-vs-one and one-vs-rest wrappers,
    print their accuracies, and plot confusion matrices plus the OvR ROC."""
    print('Start knn predicting...')
    base = neighbors.KNeighborsClassifier(
        n_neighbors=k, weights='distance', algorithm='auto', leaf_size=30,
        p=2, metric='minkowski', metric_params=None, n_jobs=-1)
    ovo = OneVsOneClassifier(base)
    ovo.fit(X_train, y_train.values.ravel())
    print('Accuracy score of knn_ovo: ' + '%.3f' % ovo.score(X_test, y_test))
    ovr = OneVsRestClassifier(base)
    ovr.fit(X_train, y_train.values.ravel())
    print('Accuracy score of knn_ovr: ' + '%.3f' % ovr.score(X_test, y_test))
    plot.plot_conf_matrix(X_test, y_test, ovr, method_name + '_ovr')
    plot.plot_conf_matrix(X_test, y_test, ovo, method_name + '_ovo')
    plot.plot_roc(X_train, X_test, y_train, y_test, ovr, method_name + '_ovr')
def test_solve_linear_svc_with_proximal_bundle():
    """Primal hinge-loss SVC optimized with the proximal-bundle method scores
    at least 57% on held-out iris data."""
    X, y = load_iris(return_X_y=True)
    scaled = MinMaxScaler().fit_transform(X)
    X_tr, X_te, y_tr, y_te = train_test_split(
        scaled, y, train_size=0.75, random_state=1)
    clf = OneVsRestClassifier(PrimalSVC(loss=hinge, optimizer=ProximalBundle))
    clf = clf.fit(X_tr, y_tr)
    assert clf.score(X_te, y_te) >= 0.57
def classification_with_SVM(Fglobal, y):
    # Evaluate a one-vs-rest RBF SVM with standardization + PCA inside a
    # stratified shuffle-split CV loop, then persist the preprocessor and the
    # classifier fitted on the LAST split.
    database = joblib.load(open('Metadata/database.pkl', 'rb'))
    print(
        "transforming the model with Principle Component Analysis and evaluating the model with k-folds cross-validations..."
    )
    k_folds = 10
    sss = StratifiedShuffleSplit(n_splits=k_folds, test_size=0.3, random_state=1)
    splits = sss.split(Fglobal, y)
    pp = Preprocessor('standard', n_components=60)
    n_classes = len(database.classes)
    clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=10, gamma=0.01))
    prfs = []
    scores = []
    # NOTE(review): `acc` is never updated or read again -- dead variable.
    acc = np.zeros(n_classes)
    for (train, test) in splits:
        Ftrain = Fglobal[train]
        Ftest = Fglobal[test]
        # Standardize and project onto principal components per split.
        (Ftrain, Ftest) = pp.standardize(Ftrain, Ftest)
        (Ftrain, Ftest) = pp.project_on_pc(Ftrain, Ftest)
        clf.fit(Ftrain, y[train])
        ypred = clf.predict(Ftest)
        scores.append(clf.score(Ftest, y[test]))
        prfs.append(precision_recall_fscore_support(y[test], ypred))
    print("\nAccuracy = %0.2f (%0.2f)\n" % (np.mean(scores), np.std(scores)))
    joblib.dump(pp, open('Metadata/transformation_module.pkl', 'wb'))
    joblib.dump(clf, open('Metadata/classifier.pkl', 'wb'))
def classify_images_one_v_all(train_df, test_df):
    """One vs. All linear SVC with Grid Search for image classification.

    Args:
        train_df, test_df: paths to pickled DataFrames whose transpose holds
            one row per sample: feature columns plus a 'y' label column.
    """
    param_grid = {'C': [0.1, 0.5, 1.0, 5., 10.]}
    clf = OneVsRestClassifier(GridSearchCV(LinearSVC(), param_grid=param_grid))
    with open(train_df, 'rb') as train:
        train_df = pickle.load(train)
    with open(test_df, 'rb') as test:
        test_df = pickle.load(test)
    # BUG FIX: DataFrame.ix was deprecated and removed in pandas 1.0; use the
    # label-based .loc indexer for the boolean column mask.
    X_train = train_df.T.loc[:, train_df.T.columns != 'y']
    X_test = test_df.T.loc[:, test_df.T.columns != 'y']
    clf.fit(X_train, train_df.T['y'])
    print('score: ' + str(clf.score(X_test, test_df.T['y'])))
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(test_df.T['y'], clf.predict(X_test))
    np.set_printoptions(precision=2)
    # Plot normalized confusion matrix
    plt.figure(figsize=(10, 10))
    plot_confusion_matrix(cnf_matrix,
                          classes=[
                              'ant', 'bee', 'butterfly', 'centipede',
                              'dragonfly', 'ladybug', 'tick', 'beetle',
                              'termite', 'worm'
                          ],
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
def hardMarginSVM_subclass(features, labels, g_vals, d_vals, k):
    """Grid-search a polynomial-kernel SVM over (degree, gamma) with k-fold CV.

    Parameters
    ----------
    features, labels : sequences of k numpy arrays, one per fold
        (all folds assumed to share the shape of features[0]/labels[0]).
    g_vals : scalar or sequence of gamma values to try.
    d_vals : scalar or sequence of polynomial degrees to try.
    k : int
        Number of folds; fold j is the validation set in iteration j.

    Returns
    -------
    accuracy_store : ndarray of shape (len(d_vals), len(g_vals), k)
        Validation accuracy for every (degree, gamma, fold) combination.
    """
    d_vals = np.atleast_1d(d_vals)
    g_vals = np.atleast_1d(g_vals)
    accuracy_store = np.zeros((len(d_vals), len(g_vals), k))
    for g in range(len(d_vals)):
        print(g)
        for i in range(len(g_vals)):
            for j in range(k):
                # BUG FIX: the arrays were previously seeded with np.zeros
                # blocks before vstack-ing, which injected a fold's worth of
                # fake all-zero samples into every training AND validation
                # set.  Stack only the real folds instead.
                features_t = np.vstack([features[m] for m in range(k) if m != j])
                labels_t = np.vstack([labels[m] for m in range(k) if m != j])
                features_v = features[m] if k == 1 else features[j]
                labels_v = labels[j]
                features_v = features[j]
                # SVM
                # NOTE(review): labels arrive 2-D and are passed to
                # OneVsRestClassifier as-is (multilabel interpretation) --
                # confirm that encoding is intended.
                classifier = OneVsRestClassifier(
                    SVC(kernel='poly', degree=d_vals[g], gamma=g_vals[i]))
                classifier.fit(features_t, labels_t)
                accuracy_store[g, i, j] = classifier.score(features_v, labels_v)
    return accuracy_store
def run(data_path):
    """Fetch MNIST, subsample it, train a one-vs-rest linear SVM, print metrics.

    Python 2 syntax (print statements).
    """
    print "Reading the dataset:", data_path
    # NOTE(review): data_path is only printed; the data actually comes from
    # fetch_mldata('MNIST original') -- confirm this is intended.
    mnist = fetch_mldata('MNIST original')
    mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
    # Trunk the data
    n_train = 600
    n_test = 400
    # Define training and testing sets
    indices = arange(len(mnist.data))
    random.seed(0)  # deterministic subsample
    # NOTE(review): train and test indices are drawn independently and can
    # overlap -- confirm that is acceptable.
    train_idx = random.sample(indices, n_train)
    test_idx = random.sample(indices, n_test)
    X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]
    X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]
    # Apply a learning algorithm
    print "Applying a learning algorithm..."
    clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)
    # Make a prediction
    print "Making predictions..."
    y_pred = clf.predict(X_test)
    print y_pred
    # Evaluate the prediction
    print "Evaluating results..."
    print "Precision: \t", metrics.precision_score(y_test, y_pred)
    print "Recall: \t", metrics.recall_score(y_test, y_pred)
    print "F1 score: \t", metrics.f1_score(y_test, y_pred)
    print "Mean accuracy: \t", clf.score(X_test, y_test)
def Main():
    """Load digit data and print one-vs-rest linear SVC accuracy."""
    directory = 'data'
    X_train, y_train = parseData('training', directory)
    X_test, y_test = parseData('testing', directory)
    num_labels = 10  # currently unused
    lamd = 0.1       # currently unused
    # NOTE(review): the classifier is fitted on the TEST split and scored on
    # the TRAIN split -- the arguments look swapped; confirm intent before
    # changing, as the commented .predict(X_train) suggests it may be
    # deliberate.
    a = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_test,y_test)#.predict(X_train)
    #print(a)
    print(a.score(X_train, y_train))
def svm_Model(X, y, Xt, yt):
    """Fit a one-vs-rest linear SVC on (X, y) and print accuracy on (Xt, yt).

    Python 2 syntax (print statements).
    """
    model = OneVsRestClassifier(svm.LinearSVC(C=1)).fit(X,y)
    # predictions only feed the commented-out diagnostic prints below
    predicted = model.predict(Xt)
    acc = model.score(Xt,yt)
    #print 'Actual output = \n',yt
    #print 'Predicted output = \n',predicted
    print 'Accuracy = ', acc* 100 ,'%'
class SVMSentiment:
    """RBF-kernel SVM sentiment classifier with on-disk model caching."""

    def __init__(self):
        l.getLogger("SVMSentimentAnalysis")
        l.basicConfig(level=l.ERROR)
        l.debug('Initializing the SVM Model')
        self.max_length = 500  # currently unused in this class
        self.batch_size=50     # slice size used by getValidationData
        self.model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=3,C = 0.5,tol=0.0001,cache_size=5000))

    def configureSVMModel(self,TrainX,TrainY,validX,validY):
        """Fit the model, or load a previously pickled one if present.

        NOTE(review): the existence check scans the current directory while
        the pickle is saved/loaded under SpeechTextModels/ -- confirm the
        paths agree.  validX/validY are currently unused.
        """
        l.debug('Configuring the SVM Model')
        currPath = os.getcwd()
        currFiles = os.listdir(currPath)
        if(currFiles.count('SVMScores.pkl')==0):
            self.model.fit(TrainX, TrainY)
            # Saving model scores
            joblib.dump(self.model,currPath+'/SpeechTextModels/SVMScores.pkl')
        else:
            l.debug('Loading already existing Model')
            self.model = joblib.load(currPath+'/SpeechTextModels//SVMScores.pkl')

    def evaluateSVMModel(self,TestX,TestY):
        """Log mean accuracy and per-sample (prediction, truth) pairs."""
        l.debug("Model Score:::%s",self.model.score(TestX, TestY))
        predicted_data=[]
        for i in range(len(TestX)):
            predicted_data.append(list([self.model.predict (TestX[i].reshape(1,-1)) ,TestY[i]]) )
        l.debug("Current Model Prediction::: %s",str(predicted_data))

    def predictSentiment(self,dataX):
        """Predict each sample in dataX.

        NOTE(review): the loop overwrites predicted_data every iteration, so
        only the LAST sample's prediction is returned -- confirm intent.
        """
        for i in range(len(dataX)):
            predicted_data = self.model.predict(dataX[i].reshape(1,-1))
        return predicted_data

    def getTrainTestData(self):
        """Load pickled ((trainX, trainY), (testX, testY)) from the cwd."""
        l.debug('Loading Training and Test data')
        (trainX,trainY) = cPickle.load(open('trainingdata.pkl','rb'))
        (testX,testY) = cPickle.load(open('testingdata.pkl','rb'))
        return ((trainX,trainY),(testX,testY))

    def getValidationData(self,dataX,dataY):
        """Return the first batch_size rows as a validation slice."""
        return dataX[0:self.batch_size,:],dataY[0:self.batch_size,:]
def run():
    """Train a one-vs-rest linear SVC on Octave-exported matrices, then print
    its test accuracy and an N x N confusion matrix (rows = truth,
    columns = prediction)."""
    N = 5  # number of classes
    # Load the train/test matrices exported from Octave.
    X = sio.loadmat('octave_X.mat')['Norm_X']
    Y = sio.loadmat('octave_Y.mat')['Y']
    XT = sio.loadmat('octave_XT.mat')['Norm_XT']
    YT = sio.loadmat('octave_YT.mat')['Y_Test']
    model = OneVsRestClassifier(LinearSVC(random_state=0))
    model.fit(X, Y)
    predicted = model.predict(XT)
    print("Accuracy : %f" % model.score(XT, YT))
    # Tally every (true label, predicted label) pair.
    counts = np.zeros((N, N))
    for idx, pred_label in enumerate(predicted):
        true_label = int(YT[idx][0])
        counts[true_label][int(pred_label)] += 1
    print("Confusion Matrix")
    for row in counts:
        print(row)
class SVM(ContinuousModel):
    """C-Support Vector Machine Classifier.

    When decision_function_shape == 'ovr', we use OneVsRestClassifier(SVC)
    from sklearn.multiclass instead of the output from SVC directly since it
    is not exactly the implementation of One Vs Rest.

    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    """

    def __init__(self, *args, **kwargs):
        self.model = sklearn.svm.SVC(*args, **kwargs)
        # BUG FIX: record the shape unconditionally.  predict_real() reads
        # this attribute on every multiclass call, but it was previously set
        # only inside the 'ovr' branch, raising AttributeError for any other
        # decision_function_shape.
        self.decision_function_shape = self.model.decision_function_shape
        if self.model.decision_function_shape == 'ovr':
            # sklearn's ovr isn't real ovr
            self.model = OneVsRestClassifier(self.model)

    def train(self, dataset, *args, **kwargs):
        """Fit on the (X, y) pair produced by dataset.format_sklearn()."""
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        """Predict class labels for the given feature matrix."""
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        """Return mean accuracy on the testing dataset."""
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)

    def predict_real(self, feature, *args, **kwargs):
        """Return per-class decision values.

        Binary problems yield a 1-D decision array; it is expanded to two
        columns (-d, d) so callers always see one column per class.
        """
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        else:
            if self.decision_function_shape != 'ovr':
                LOGGER.warn("SVM model support only 'ovr' for multiclass"
                            "predict_real.")
            return dvalue
def multiclass_SVC(X, y):
    """Compare one-vs-rest and one-vs-one linear SVMs on a random 65/35 split.

    Returns
    -------
    (one_vs_rest, one_vs_one) : tuple of float
        Mean test-set accuracy of each multiclass strategy.
    """
    from sklearn.svm import LinearSVC
    from sklearn import cross_validation
    # Hold out 35% of the samples for testing.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.35)
    # One-vs-rest strategy.
    from sklearn.multiclass import OneVsRestClassifier
    rest_clf = OneVsRestClassifier(LinearSVC(random_state=0))
    rest_clf.fit(X_train, y_train)
    # One-vs-one strategy (trains a classifier per class pair).
    from sklearn.multiclass import OneVsOneClassifier
    pair_clf = OneVsOneClassifier(LinearSVC(random_state=0))
    pair_clf.fit(X_train, y_train)
    return rest_clf.score(X_test, y_test), pair_clf.score(X_test, y_test)
def ova(train_values, train_classes, test_values, test_classes, desc, deg=2,
        perc=100, n_train=144):
    """One-vs-rest polynomial SVM with univariate feature selection.

    Parameters
    ----------
    train_values, train_classes : training samples and labels.
    test_values, test_classes : test samples and labels.
    desc : description string forwarded to plot_confusion_matrix.
    deg : int, polynomial kernel degree.
    perc : percentile of features kept by SelectPercentile.
    n_train : int, number of leading rows (after concatenation) that belong
        to the training set.  Previously hard-coded to 144; the default keeps
        the old behavior.

    Returns
    -------
    score : float, mean accuracy on the test rows.

    NOTE(review): SelectPercentile is fitted on train+test combined, which
    leaks test information into feature selection -- confirm acceptable.
    """
    all_values = numpy.concatenate((train_values, test_values))
    all_classes = numpy.concatenate((train_classes, test_classes))
    x = SelectPercentile(f_classif, percentile=perc).fit_transform(all_values,
                                                                   all_classes)
    svm = SVC(kernel='poly', degree=deg, random_state=0)
    ova = OneVsRestClassifier(svm)
    # First n_train rows of the selected matrix are the training set.
    ova.fit(x[:n_train, :], train_classes)
    score = ova.score(x[n_train:, :], test_classes)
    cm = confusion_matrix(test_classes, ova.predict(x[n_train:, :]))
    print('Confusion matrix')
    print(cm)
    # Normalize each row to sum to 1 (per-class recall view).
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)
    plot_confusion_matrix(desc, cm_normalized,
                          title='Normalized confusion matrix')
    return score
# class_weight=None, # all classes are treated equally # verbose=False, # print the logs # max_iter=-1, # no limit, let it run # decision_function_shape=None, # will use one vs rest explicitly # random_state=None) svc_model = OneVsRestClassifier (classifier, n_jobs=1) svc_model.fit(X_train,y_train) # knc_model =KNeighborsClassifier(n_neighbors=5) # knc_model.fit(X_train,y_train) # predicted = svc_model.predict(X_test) scores = cross_val_score(knc_model, X_train, y_train, cv=5) print 'RF: Accuracy with a single train/test split', svc_model.score(y_test, predicted) predicted = svc_model.predict(X_test) print 'RF: Accuracy with a single train/test split', accuracy_score(y_test, predicted) scores = cross_val_score(svc_model, X_train, y_train, cv=5) print 'RF: the mean of Accuracy with a cross value train/test split is: ', scores.mean() print 'RF:The std of Accuracy with a cross value train/test split is', scores.std() ############################ Predict the test ################################### sub = pd.read_csv("../input/sample_submission.csv") sub['id'] = test_data.sort_values(by='id' , ascending=True)
class speechLSTM:
    """Speech-emotion classifier (despite the name, backed by a poly-kernel
    SVM, not an LSTM).

    Builds [loudness, pitch, emotion] rows from .wav files; the emotion label
    is derived from the leading character(s) of each file name.
    Python 2 syntax (print statements).
    """
    # Initializing the LSTM Model
    def __init__(self):
        self.prevData = 100   # currently unused
        self.batchsize=200    # currently unused
        self.model = OneVsRestClassifier(svm.SVC(kernel='poly',gamma=1,C = 1,tol=0.0001,cache_size=5000) )
        #self.model = OneVsRestClassifier(LinearSVC(random_state=0))

    def load_data_file(self):
        """Scan four speaker directories (DC/JE/JK/KL) and return
        [loudness, pitch, emotion] rows.  Labels starting with 's' use the
        first two characters, otherwise the first character."""
        outputdata = []
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/DC/*.wav"):
            frate, inputdata = sc.read(f)
            pitch=lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            outputdata.append(list([loudness,pitch, emotion]))
        # Same extraction for the remaining three speaker directories.
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/JE/*.wav"):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            outputdata.append(list([loudness, pitch, emotion]))
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/JK/*.wav"):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            outputdata.append(list([loudness, pitch, emotion]))
        for f in gb.glob("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/KL/*.wav"):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f)
            emotion = ""
            loudness = abs(an.loudness(inputdata))
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            outputdata.append(list([loudness, pitch, emotion]))
        return outputdata

    def get_train_test_data(self,data,percent_split):
        """Split a DataFrame: the first len(data)*(1-percent_split) rows are
        the test portion, the remainder the train portion.

        Returns (train_X, train_y, test_X, test_y_array); columns 0-1 are the
        features (loudness, pitch), column 2 the label.
        """
        noOfSamples = len(data)*(1-percent_split)
        print("No of Samples", noOfSamples)
        test = data.iloc[0:int(noOfSamples), 2:]
        test1=[]
        # Flatten the single-column label frame into a 1-D array.
        for i in range(len(test)):
            test1 = np.append(test1,test.iloc[i].values[0])
        return data.iloc[int(noOfSamples):, 0:2], data.iloc[int(noOfSamples):, 2:],data.iloc[0:int(noOfSamples), 0:2],np.array(test1)

    def trainNNet(self,data_,label_):
        """Fit the SVM on a 95.5/4.5 split and print the held-out score."""
        #data = data_/data_.max(axis=0)
        #label = label_/label_.max(axis=0)
        data=data_
        label=label_
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, label.astype(str), test_size = 0.045, random_state = 0)
        self.model.fit(X_train,y_train)
        print("score",self.model.score(X_test,y_test))
        #print (cross_validation.cross_val_score(self.model, data, label.astype(str), cv =4))

    def predict(self,ftest_,ltest_):
        """Predict every row of ftest_ and print predictions plus truths."""
        #ltest=ltest_/ltest_.max(axis=0)
        #ftest=ftest_/ftest_.max(axis=0)
        ftest=ftest_
        ltest=ltest_
        predicted_data = []
        count=0  # currently unused
        for i in range(len(ftest)):
            predicted_data.append(self.model.predict(ftest.iloc[i].values.reshape(1,-1)))
        print predicted_data
        print ltest
# Fit logistic regression on iris, show one held-out example, then compare
# one-vs-rest and one-vs-one logistic regression on digits.
# Python 2 syntax (print statements).
logistic=LogisticRegression()
logistic.fit(X,y)
# NOTE(review): iris.data[-1,:] is a 1-D sample; newer sklearn requires a
# 2-D array (reshape(1, -1)) -- this relies on an older sklearn accepting
# 1-D inputs.
print 'Predicted class %s, real class %s' % (logistic.predict(iris.data[-1,:]), iris.target[-1])
print 'Probabilities for each class from 0 to 2: %s' % (logistic.predict_proba(iris.data[-1,:]))
#*******************Logistic Regression on multiClass*****************************
from sklearn.datasets import load_digits
digits=load_digits()
# First 1700 digits train, the rest test.
X,y=digits.data[:1700,:],digits.target[:1700]
tX,ty=digits.data[1700:,:],digits.target[1700:]
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
OVR=OneVsRestClassifier(LogisticRegression()).fit(X,y)
OVO=OneVsOneClassifier(LogisticRegression()).fit(X, y)
print 'One vs rest accuracy: %.3f'% OVR.score(tX, ty)
def doAnalysis(city, state, category, filterConditions, k=10):
    """Flatten Yelp-style business records into feature dicts, train
    one-vs-rest random forests under three star-rating label schemes, and
    report the top-k most important features for each.

    Python 2 code (print statements, long/unicode types, old sklearn
    StratifiedShuffleSplit/RFE APIs).

    Label schemes:
      labels   -- "NB" (stars > 2.5) vs "PB"
      nblabels -- "EG" (stars >= 4.5) vs "G", within the stars > 2.5 subset
      flabels  -- "F" (stars == 3.0) vs "AAvg", within 2.5 < stars < 4.5

    Parameters
    ----------
    city, category : str, used to build the input JSON path (state unused).
    filterConditions : passed to evalFilter; matching records are skipped.
    k : int, number of features selected/reported per scheme.

    Returns
    -------
    (rfeimps, nbrfeimps, frfeimps) : per-scheme lists of
    (feature name, normalized importance) chosen by RFE, sorted descending.
    Also writes the three one-vs-rest top-k importance dicts to output.json.
    """
    with open('projectdata\\'+city + category + 'income.json','r') as r_file:
        data = json.load(r_file)
    keyset =set()
    unkeyset = set()    # keys whose value types could not be flattened
    unkvalueset =set()  # the offending value types
    newdata = []        # flattened feature dicts, all kept businesses
    nbnewdata = []      # subset with stars > 2.5
    flabels = []
    fnewdata = []       # subset with 2.5 < stars < 4.5
    labels = []
    nblabels = []
    maxfeats = 0
    filtercondition = True  # currently unused
    for dno,datum in enumerate(data):
        # (older hard-coded category/attribute filters removed; filtering is
        #  delegated to evalFilter now)
        if evalFilter(datum, filterConditions):
            continue
        newdatum = {}
        label = {}   # currently unused
        featcnt = 0
        for key in datum:
            if key == "stars":
                # Derive the three label schemes from the star rating.
                if datum[key] > 2.5:
                    labels.append("NB")
                    if datum[key] >= 4.5:
                        nblabels.append("EG")
                    else:
                        nblabels.append("G")
                        if datum[key]==3.0:
                            flabels.append("F")
                        else:
                            flabels.append("AAvg")
                else:
                    labels.append("PB")
            elif key!="name" and key!="business_id" and key!="full_address" and key!="latitude" and key!="longitude" and key!="hours" and key[0]!=" " and key[0]!="$" and key!='review_count' and key!='categories':
                # Flatten scalars, dicts (two levels deep) and lists into a
                # flat string-keyed feature dict.
                if isinstance(datum[key], int) or isinstance(datum[key], long) or isinstance(datum[key], float) \
                        or isinstance(datum[key], str) or isinstance(datum[key], bool):
                    newdatum[key] = datum[key]
                    keyset.add(key)
                    featcnt+=1
                elif isinstance(datum[key],dict):
                    for subkey in datum[key]:
                        if isinstance(datum[key][subkey], int) or isinstance(datum[key][subkey], long) or \
                                isinstance(datum[key][subkey], float) or isinstance(datum[key][subkey], str) or \
                                isinstance(datum[key][subkey], bool):
                            newdatum[key+" "+subkey] = datum[key][subkey]
                            keyset.add(subkey)
                            featcnt+=1
                        elif isinstance(datum[key][subkey], unicode):
                            # Normalize unicode values to plain ASCII strings.
                            ascval = unicodedata.normalize('NFKD', datum[key][subkey]).encode('ascii','ignore')
                            newdatum[key+" "+subkey] = ascval
                            keyset.add(subkey)
                            featcnt+=1
                        elif isinstance(datum[key][subkey],dict):
                            for subsubkey in datum[key][subkey]:
                                if isinstance(datum[key][subkey][subsubkey], int) or \
                                        isinstance(datum[key][subkey][subsubkey], long) or \
                                        isinstance(datum[key][subkey][subsubkey], float) or \
                                        isinstance(datum[key][subkey][subsubkey], str) or \
                                        isinstance(datum[key][subkey][subsubkey], bool):
                                    newdatum[key+" "+subkey+" "+subsubkey] = datum[key][subkey][subsubkey]
                                    keyset.add(subsubkey)
                                    featcnt+=1
                                elif isinstance(datum[key][subkey][subsubkey], unicode):
                                    ascval = unicodedata.normalize('NFKD', datum[key][subkey][subsubkey]).encode('ascii','ignore')
                                    newdatum[key+" "+subkey+" "+subsubkey] = ascval
                                    keyset.add(subsubkey)
                                    featcnt+=1
                                else:
                                    unkeyset.add(key+ " "+subsubkey)
                                    unkvalueset.add(type(datum[key][subkey][subsubkey]))
                        else:
                            unkeyset.add(key+ " "+subkey)
                            unkvalueset.add(type(datum[key][subkey]))
                elif isinstance(datum[key],list):
                    # List entries become boolean presence features "key item".
                    for itnum, item in enumerate(datum[key]):
                        if isinstance(item, int) or isinstance(item, long) or isinstance(item, float) or \
                                isinstance(item, str) or isinstance(item, bool):
                            newdatum[key +" "+str(item)] = True
                            keyset.add(key)
                            featcnt+=1
                        elif isinstance(item, unicode):
                            ascval = unicodedata.normalize('NFKD', item).encode('ascii','ignore')
                            newdatum[key +" "+ascval] = True
                            keyset.add(key)
                            featcnt+=1
                        else:
                            unkeyset.add(key)
                            unkvalueset.add(type(item))
                else:
                    if isinstance(datum[key], unicode):
                        ascval = unicodedata.normalize('NFKD', datum[key]).encode('ascii','ignore')
                        newdatum[key] = ascval
                        keyset.add(key)
                        featcnt+=1
                    else:
                        unkeyset.add(key)
                        unkvalueset.add(type(datum[key]))
        newdata.append(newdatum)
        if datum["stars"] > 2.5:
            nbnewdata.append(newdatum)
            if datum["stars"] < 4.5:
                fnewdata.append(newdatum)
        if featcnt > maxfeats:
            maxfeats = featcnt
    # Report any keys/types the flattener could not handle.
    print unkeyset
    print unkvalueset
    # Vectorize each record subset into a dense feature matrix.
    dv = DictVectorizer(sparse=False)
    vectdata = dv.fit_transform(newdata)
    nbdv = DictVectorizer(sparse=False)
    nbvectdata = nbdv.fit_transform(nbnewdata)
    fdv = DictVectorizer(sparse=False)
    fvectdata = fdv.fit_transform(fnewdata)
    print len(flabels), len(fnewdata), len(fvectdata)
    # Stratified 30/70 train/test split per label scheme (old sklearn API:
    # the splitter is constructed from the labels directly).
    traindata = []
    trainlabels =[]
    testdata = []
    testlabels = []
    sssidxs = StratifiedShuffleSplit(labels, n_iter=1, test_size=0.7, random_state=0)
    for train_index, test_index in sssidxs:
        traindata, testdata = vectdata[train_index], vectdata[test_index]
        for tr_idx in train_index:
            trainlabels.append(labels[tr_idx])
        for ts_idx in test_index:
            testlabels.append(labels[ts_idx])
    nbtraindata = []
    nbtrainlabels =[]
    nbtestdata = []
    nbtestlabels = []
    nbsssidxs = StratifiedShuffleSplit(nblabels, n_iter=1, test_size=0.7, random_state=0)
    for train_index, test_index in nbsssidxs:
        nbtraindata, nbtestdata = nbvectdata[train_index], nbvectdata[test_index]
        for tr_idx in train_index:
            nbtrainlabels.append(nblabels[tr_idx])
        for ts_idx in test_index:
            nbtestlabels.append(nblabels[ts_idx])
    ftraindata = []
    ftrainlabels =[]
    ftestdata = []
    ftestlabels = []
    fsssidxs = StratifiedShuffleSplit(flabels, n_iter=1, test_size=0.7, random_state=0)
    for train_index, test_index in fsssidxs:
        ftraindata, ftestdata = fvectdata[train_index], fvectdata[test_index]
        for tr_idx in train_index:
            ftrainlabels.append(flabels[tr_idx])
        for ts_idx in test_index:
            ftestlabels.append(flabels[ts_idx])
    # (experimental PCA/LDA/QDA/AdaBoost/GradientBoosting/OvO variants
    #  removed; see version history)
    # --- scheme 1: NB vs PB ------------------------------------------------
    ovs = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
    ovcomps = ovs.fit(traindata, trainlabels)
    ovsacc = ovs.score(testdata, testlabels)
    print "ovsacc ", ovsacc
    ovsestimators = ovs.estimators_
    ovsfeatimps = []
    # NOTE(review): this loop overwrites ovsfeatimps each pass, so only the
    # LAST estimator's importances survive -- confirm intended (same pattern
    # in the nb and f sections below).
    for i,ovsest in enumerate(ovsestimators):
        ovsfeatimps = ovsest.feature_importances_
    ovsfimps = sorted(zip(dv.feature_names_,ovsfeatimps),key=itemgetter(1),reverse=True)[:k]
    print ovsfimps
    # Recursive feature elimination down to k features, then score the test
    # rows restricted to the selected columns.
    rfc = RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True)
    rfe = RFE(estimator=rfc, n_features_to_select=k,step = 0.1)
    rfe.fit(traindata, trainlabels)
    print rfe.n_features_
    print len(rfe.ranking_)
    rfetestdata = [[each_list[i] for i, supp in enumerate(rfe.support_) if supp == True ] for each_list in testdata]
    print "rfeacc ", rfe.estimator_.score(rfetestdata, testlabels)
    rfefeatimps = []
    rfefeatnames = [dv.feature_names_[i] for i, supp in enumerate(rfe.support_) if supp == True ]
    impsums = sum(rfe.estimator_.feature_importances_)
    # Importances normalized to sum to 1 over the selected features.
    rfeimps = sorted(zip(rfefeatnames,rfe.estimator_.feature_importances_/impsums),key=itemgetter(1),reverse=True)
    print rfeimps
    print sum([pair[1] for pair in rfeimps])
    print "===========================NB================================="
    # --- scheme 2: EG vs G (stars > 2.5 subset) ----------------------------
    nbovs = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
    nbovcomps = nbovs.fit(nbtraindata, nbtrainlabels)
    nbovsacc = nbovs.score(nbtestdata, nbtestlabels)
    print "nbovsacc ", nbovsacc
    nbovsestimators = nbovs.estimators_
    nbovsfeatimps = []
    for i,nbovsest in enumerate(nbovsestimators):
        nbovsfeatimps = nbovsest.feature_importances_
    nbovsfimps = sorted(zip(nbdv.feature_names_,nbovsfeatimps),key=itemgetter(1),reverse=True)[:k]
    print nbovsfimps
    nbrfc = RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True)
    nbrfe = RFE(estimator=nbrfc, n_features_to_select=k,step = 0.1)
    nbrfe.fit(nbtraindata, nbtrainlabels)
    print nbrfe.n_features_
    print len(nbrfe.ranking_)
    nbrfetestdata = [[each_list[i] for i, supp in enumerate(nbrfe.support_) if supp == True ] for each_list in nbtestdata]
    print "nbrfeacc ", nbrfe.estimator_.score(nbrfetestdata, nbtestlabels)
    # NOTE(review): names come from dv (full vectorizer) while support_ was
    # fitted on the nb-subset vectorizer's columns -- possible feature-name
    # mismatch; confirm (same in the f section below).
    nbrfefeatnames = [dv.feature_names_[i] for i, supp in enumerate(nbrfe.support_) if supp == True ]
    nbimpsums = sum(nbrfe.estimator_.feature_importances_)
    nbrfeimps = sorted(zip(nbrfefeatnames,nbrfe.estimator_.feature_importances_/nbimpsums),key=itemgetter(1),reverse=True)
    print nbrfeimps
    print sum([pair[1] for pair in nbrfeimps])
    print "===========================f================================="
    # --- scheme 3: F vs AAvg (2.5 < stars < 4.5 subset) --------------------
    fovs = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True))
    fovcomps = fovs.fit(ftraindata, ftrainlabels)
    fovsacc = fovs.score(ftestdata, ftestlabels)
    print "fovsacc ", fovsacc
    fovsestimators = fovs.estimators_
    fovsfeatimps = []
    for i,fovsest in enumerate(fovsestimators):
        fovsfeatimps = fovsest.feature_importances_
    fovsfimps = sorted(zip(fdv.feature_names_,fovsfeatimps),key=itemgetter(1),reverse=True)[:k]
    print fovsfimps
    # Persist the three top-k one-vs-rest importance dicts.
    owfile = 'output.json'
    with open(owfile,'w') as owrfile:
        owrfile.write(json.dumps([dict(ovsfimps), dict(nbovsfimps), dict(fovsfimps)]))
    frfc = RandomForestClassifier(n_estimators=200, warm_start=True, oob_score=True)
    frfe = RFE(estimator=frfc, n_features_to_select=k,step = 0.1)
    frfe.fit(ftraindata, ftrainlabels)
    print frfe.n_features_
    print len(frfe.ranking_)
    frfetestdata = [[each_list[i] for i, supp in enumerate(frfe.support_) if supp == True ] for each_list in ftestdata]
    print "frfeacc ", frfe.estimator_.score(frfetestdata, ftestlabels)
    frfefeatnames = [dv.feature_names_[i] for i, supp in enumerate(frfe.support_) if supp == True ]
    fimpsums = sum(frfe.estimator_.feature_importances_)
    frfeimps = sorted(zip(frfefeatnames,frfe.estimator_.feature_importances_/fimpsums),key=itemgetter(1),reverse=True)
    print frfeimps
    print sum([pair[1] for pair in frfeimps])
    return rfeimps, nbrfeimps, frfeimps
# Define training and testing sets print "Splitting into a training and a testing set..." indices = arange(len(raw_data)) random.seed(0) train_idx = random.sample(indices, n_train) test_idx = random.sample(indices, n_test) X_train, y_train = raw_data[train_idx, 1:], raw_data[train_idx, 0] X_test, y_test = raw_data[test_idx, 1:], raw_data[test_idx, 0] # Apply a learning algorithm print "Applying a learning algorithm..." clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train) # Make a prediction print "Making predictions..." y_pred = clf.predict(X_test) print y_pred # Evaluate the prediction print "Evaluating results..." print "Precision: \t", metrics.precision_score(y_test, y_pred) print "Recall: \t", metrics.recall_score(y_test, y_pred) print "F1 score: \t", metrics.f1_score(y_test, y_pred) print "Mean accuracy: \t", clf.score(X_test, y_test) # Calculate overall time end_time = time.time() print "Overall running time:", end_time - start_time
# Predict whether a loan's interest rate is >= 12% from FICO score and
# requested amount (L1-regularized logistic regression), then plot the two
# classes.  Python 2 syntax (print statements).
df = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')
# '8.90%' -> 0.089
df['Interest.Rate'] = df['Interest.Rate'].map(lambda x: round(float(x.rstrip('%'))/100, 4))
# '735-739' -> 735 (lower bound of the FICO range)
df['FICO.Score'] = df['FICO.Range'].map(lambda x: int(x.split('-')[0]))
k = df['Interest.Rate'] >= .12
df['IR_TF'] = k.astype(int)  # binary target: 1 iff rate >= 12%
X = np.column_stack((df['FICO.Score'], df['Amount.Requested']))
Y = df['IR_TF']
classifier = OneVsRestClassifier(LogisticRegression(penalty='l1')).fit(X, Y)
print 'Coefficients: ', classifier.coef_
print 'Intercept" ', classifier.intercept_
# NOTE: accuracy is measured on the training data itself (no held-out set).
print 'Accuracy: ', classifier.score(X, Y)
coeff = classifier.coef_
intercept = classifier.intercept_
# Split the points by class for plotting.
FICOScore_IRTF0 = df.loc[df['IR_TF']==0, 'FICO.Score']
FICOScore_IRTF1 = df.loc[df['IR_TF']==1, 'FICO.Score']
AmountRequested_IRTF0 = df.loc[df['IR_TF']==0, 'Amount.Requested']
AmountRequested_IRTF1 = df.loc[df['IR_TF']==1, 'Amount.Requested']
fig = plt.figure(figsize = (10, 8))
plt.plot(FICOScore_IRTF0, AmountRequested_IRTF0, '.', label = 'Interest rate < 12% (class 0)',mfc = 'None', mec='coral')
plt.plot(FICOScore_IRTF1, AmountRequested_IRTF1, '.', label = 'Interest rate >= 12% (class 1)',mfc = 'None', mec='steelblue')
100XP Import LogisticRegression from sklearn.linear_model and OneVsRestClassifier from sklearn.multiclass. Instantiate the classifier clf by placing LogisticRegression() inside OneVsRestClassifier(). Fit the classifier to the training data X_train and y_train. Compute and print the accuracy of the classifier using its .score() method, which accepts two arguments: X_test and y_test. '''
# (the text above is the tail of an exercise docstring opened earlier in the
#  file; left untouched)
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# Create the DataFrame: numeric_data_only
# (missing numeric values are filled with the sentinel -1000)
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)
# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])
# Create training and test sets via the project's multilabel-aware splitter
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only, label_dummies, size=0.2, seed=123)
# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())
# Fit the classifier to the training data
clf.fit(X_train, y_train)
# Print the accuracy
print("Accuracy: {}".format(clf.score(X_test, y_test)))
precision = 100.0*tp/(tp+fp); recall = 100.0*tp/(tp+fn); F1 = 2*precision *recall /(precision + recall) ; #fpr,tpr,tresholds = roc_curve(y_actual,y_predict) area = roc_auc_score(y_predict,y_actual) return (precision,recall,F1,area) if __name__=="__main__": (train_data,y)= getData("trainingData2.csv"); (test_data,y_test)=getData("CVData2.csv"); # Start OneVsRest Stratergy cOneVsRest = OneVsRestClassifier(LinearSVC()).fit(train_data,y) oVrPreds = cOneVsRest.predict(test_data) print "oVr Percentage Goods :",100.0*sum(oVrPreds == y_test)/len(y_test) print "Score of oVr is :",cOneVsRest.score(test_data,y_test) #scores = cross_val_score(cOneVsRest, test_data, y_test,cv=10) #print "Value of mean is :",scores.mean() # Start SVM Stratergy print "Started SVM Training and Classification" cSVM = svm.SVC().fit(train_data,y) SVMPreds = cSVM.predict(test_data) print "SVM Percentage Goods :",100.0*sum(SVMPreds == y_test) /len(y_test) #Start KNN Ways of prediction cKNN = KNeighborsClassifier(n_neighbors=4).fit(train_data,y)
# NOTE(review): sklearn.cross_validation is long deprecated; modern scikit-learn
# ships train_test_split in sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import time
t=time.time()
#Convert list of labels to binary matrix
random_state = np.random.RandomState(0)
# splitting the data for cross validation (80-20)
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_train, test_size=.2,random_state=random_state)
print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"
from sklearn.linear_model import LogisticRegression
# using logistic regression for classification: one binary model per label
OVR = OneVsRestClassifier(LogisticRegression())
OVR.fit(X_ptrain, y_ptrain)
print "accuracy", OVR.score(X_ptest,y_ptest)#checking the accuracy
from sklearn.metrics import log_loss#for checking the log loss
Y1_final =OVR.predict_proba(X_ptest)  # per-class posterior probabilities
print (Y1_final[0])
print log_loss(y_ptest,Y1_final)#log loss value
# getting the test ids for writing the final submission
test_ids = [[str(x)] for x in test_photos['id']]
print type(test_ids)
a = np.asarray(test_ids)
print type(a),a.shape,a[1]
import csv
# stores only the posterior probabilities into 8 columns for each image
# NOTE(review): output name "submisoon1.csv" looks like a typo — confirm before renaming,
# downstream tooling may expect it as-is.
np.savetxt("submisoon1.csv",Y1_final.astype(dtype = float),delimiter =",",fmt = '%1.5f')
# writes only the ids into a csv file and then we have to merge both files manually
# (the body of this `with` block continues beyond this chunk)
with open(data_root+"submission_label.csv",'w') as wr:
logreg.fit(X_train, y_train) # Make an array of predictions on the test set pred = logreg.predict(X_test) # Output the hitrate and the confusion matrix for each model print(logreg.score(X_test, y_test)) #print(confusion_matrix(pred, y_test)) """ from sklearn.neighbors import KNeighborsClassifier neigh = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=2)) neigh.fit(X_train, y_train) pred = neigh.predict(X_test) print(neigh.score(X_test,y_test)) #print iris """ k_means = cluster.KMeans(n_clusters=20) k_means.fit(X_train,y_train) pred = k_means.predict(X_test) print pred print y_test #print(k_means.score(X_test, y_test)) """ from sklearn.tree import DecisionTreeClassifier clf = OneVsRestClassifier(DecisionTreeClassifier(max_depth=None, min_samples_split=1,random_state=10)) clf.fit(X_train, y_train)
# ratio imbalance # train classifier = OneVsRestClassifier(LinearSVC()) # classifier = OneVsRestClassifier(sklearn.naive_bayes.MultinomialNB()) # classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0)) # classifier = SGDClassifier(alpha = 0.00001, l1_ratio=0.015) classifier.fit(train_matrix, negative_cases_train) predict_sentiment = classifier.predict(test_matrix) # predict_probs = classifier.predict_proba(test_matrix) accuracy = classifier.score(test_matrix, negative_cases_test) precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support( negative_cases_test, predict_sentiment) print(" LogisticRegression, no preprocesing, unigram only") print ("accuracy = ", accuracy) print (" precision =", precision) print ("recal = ", recall) print ("f1 score = ", f1) end_time = time.time() print 'Iterations took %f seconds.' % (end_time - start_time)
def test_ovr_fit_predict_svc():
    """OvR wrapped around SVC trains one estimator per iris class and fits well."""
    clf = OneVsRestClassifier(svm.SVC()).fit(iris.data, iris.target)
    # iris has three classes, so OvR must have built three binary estimators
    assert_equal(len(clf.estimators_), 3)
    # training-set accuracy should clear 90%
    assert_greater(clf.score(iris.data, iris.target), .9)
def main(inputFileName):
    """Train (or reload) an SVM sentiment model over the IMDB review corpus,
    then classify the file named by inputFileName and return the prediction.

    Side effects on first run: writes dictionary.pkl, datadetails.pkl and the
    joblib model under ./SpeechTextModels/.
    """
    model = ""
    finalDict={}   # word -> integer index, assigned in descending corpus frequency
    details = {}   # persisted metadata: feature width ("maxfeature") and test "score"
    # Train only when no pickled dictionary exists yet; otherwise reload everything.
    if len(gb.glob("./diction*.pkl")) == 0:
        configuartion=py.SparkConf() # setting the Spark Configuration
        sContext=py.SparkContext(conf=configuartion) # setting the Spark context
        sContext.defaultParallelism
        print ("Data preprocessing start time:", datetime.datetime.now().time())
        # Positive reviews: train + test partitions, tokenised via getdata
        traindataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/pos1/*.txt"))
        posData = traindataPos.flatMap(getdata)
        testdataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/pos1/*.txt"))
        postestData = testdataPos.flatMap(getdata)
        # NOTE(review): `+` on RDDs of file paths — presumably intended as union; confirm.
        newposData = traindataPos + testdataPos
        # Negative reviews: same layout as above
        traindataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/neg1/*.txt"))
        negData = traindataNeg.flatMap(getdata)
        testdataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/neg1/*.txt"))
        negtestData = testdataNeg.flatMap(getdata)
        newNegData = traindataNeg + testdataNeg
        # Word-frequency counts over both polarities, sorted most-frequent first
        posDataFrequency = newposData.flatMap(mapper).reduceByKey(lambda a,b: a + b)
        negDataFrequency = newNegData.flatMap(mapper).reduceByKey(lambda a,b: a + b)
        dataFrequency = posDataFrequency + negDataFrequency
        dataFrequencySorted = dataFrequency.sortBy(lambda a: a[1],ascending=False)
        finalDict = {}
        newCount= 2   # indices start at 2 (0/1 presumably reserved — confirm)
        for key,value in dataFrequencySorted.collect():
            finalDict.update({key:newCount})
            newCount+=1
        # Persist the vocabulary so later runs skip training
        dictDump = open("dictionary.pkl", "wb")
        cPickle.dump(finalDict, dictDump, -1)
        # NOTE(review): the first two assignments (from the *test* RDDs) are
        # immediately clobbered by the train-RDD assignments below, so the test
        # split is never encoded separately — looks like a bug; confirm intent.
        finalposData,maxFeatures1 = getIntDataFormat(list(postestData.collect()),finalDict)
        finalnegData,maxFeatures2 = getIntDataFormat(list(negtestData.collect()),finalDict)
        finalposData,maxFeatures1 = getIntDataFormat(list(posData.collect()),finalDict)
        finalnegData,maxFeatures2 = getIntDataFormat(list(negData.collect()),finalDict)
        XtrainNeg,XtrainPos,XtestPos,XtestNeg=[],[],[],[]
        # Pad every sample to the larger of the two feature widths.
        # NOTE(review): train and test matrices are built from the same data here
        # (see note above), so the reported score is effectively training accuracy.
        if maxFeatures1 < maxFeatures2:
            XtrainNeg = datapreprocessing(finalnegData,maxFeatures2)
            XtrainPos = datapreprocessing(finalposData,maxFeatures2)
            XtestNeg = datapreprocessing(finalnegData, maxFeatures2)
            XtestPos = datapreprocessing(finalposData, maxFeatures2)
            details.update({"maxfeature":maxFeatures2})
        elif maxFeatures1 > maxFeatures2:
            XtrainNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtrainPos = datapreprocessing(finalposData, maxFeatures1)
            XtestNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtestPos = datapreprocessing(finalposData, maxFeatures1)
            details.update({"maxfeature": maxFeatures1})
        else:
            XtrainNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtrainPos = datapreprocessing(finalposData, maxFeatures1)
            XtestNeg = datapreprocessing(finalnegData, maxFeatures1)
            XtestPos = datapreprocessing(finalposData, maxFeatures1)
            details.update({"maxfeature": maxFeatures1})
        YtrainNeg,YtrainPos,YtestNeg,YtestPos = [],[],[],[]
        # Downsample the larger training class so pos/neg counts match,
        # then generate matching label vectors.
        if len(XtrainPos)< len(XtrainNeg):
            print "Imbalance Dataset.. Balancing out commencing"
            XtrainNeg = XtrainNeg[0:len(XtrainPos)]
            YtrainNeg = getLabel(len(XtrainPos),"neg")
            YtrainPos = getLabel(len(XtrainPos),"pos")
        elif len(XtrainPos)> len(XtrainNeg):
            print "Imbalance Dataset.. Balancing out commencing"
            XtrainPos = XtrainPos[0:len(XtrainNeg)]
            YtrainNeg = getLabel(len(XtrainNeg), "neg")
            YtrainPos = getLabel(len(XtrainNeg), "pos")
        else:
            print "Balance Dataset"
            YtrainNeg = getLabel(len(XtrainNeg), "neg")
            YtrainPos = getLabel(len(XtrainNeg), "pos")
        # Same balancing for the test split.
        if len(XtestPos) < len(XtestNeg):
            print "Imbalance Dataset.. Balancing out commencing"
            XtestNeg = XtestNeg[0:len(XtestPos)]
            YtestNeg = getLabel(len(XtestPos), "neg")
            YtestPos = getLabel(len(XtestPos), "pos")
        elif len(XtestPos) > len(XtestNeg):
            print "Imbalance Dataset.. Balancing out commencing"
            # NOTE(review): slices XtrainPos, not XtestPos — likely a typo; confirm.
            XtestPos = XtrainPos[0:len(XtestNeg)]
            YtestNeg = getLabel(len(XtestNeg), "neg")
            YtestPos = getLabel(len(XtestNeg), "pos")
        else:
            print "Balance Dataset"
            YtestNeg = getLabel(len(XtestNeg), "neg")
            YtestPos = getLabel(len(XtestNeg), "pos")
        # Assemble final matrices and fit a one-vs-rest RBF SVM.
        Xtrain = XtrainPos+XtrainNeg
        Ytrain = YtrainPos+YtrainNeg
        Xtest = XtestPos + XtestNeg
        Ytest = YtestPos + YtestNeg
        Xtrain = np.array(Xtrain)
        Ytrain = np.array(Ytrain)
        model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=3,C = 0.5,tol=0.0001,cache_size=5000) )
        model.fit(Xtrain,Ytrain)
        print model.score(Xtest, Ytest)
        details.update({"score":model.score(Xtest, Ytest)})
        # Persist metadata and the trained model for subsequent runs.
        dictDump = open("datadetails.pkl", "wb")
        cPickle.dump(details, dictDump, -1)
        joblib.dump(model, "./SpeechTextModels/SVM_SpeechText_Model.pkl")
    else:
        # Cached path: reload model, metadata and vocabulary from disk.
        detailsFile = open("./datadetails.pkl", 'rb')
        dictFile = open("./dictionary.pkl", 'rb')
        model = joblib.load("./SpeechTextModels/SVM_SpeechText_Model.pkl")
        details = cPickle.load(detailsFile)
        finalDict = cPickle.load(dictFile)
    ########## End of If Loop MOdel TRAINED ##############################
    # Encode the input file with the trained vocabulary and predict.
    dataList=[]
    testDataList = getTestData(inputFileName)
    dataList.append(testDataList)
    finalData, maxFeatures = getIntDataFormat(dataList, finalDict)
    modelTrainFeatures = details.get("maxfeature")
    # Pad (or truncate) the encoded sample to the width the model was trained on.
    if modelTrainFeatures > maxFeatures:
        Xtest = np.array(datapreprocessing(finalData, modelTrainFeatures))
    else:
        Xtest = np.array(finalData[0:modelTrainFeatures])
    return model.predict(Xtest)
# Evaluate a multi-label attribute classifier: one LinearSVC per attribute column.
pc=0   # count of correctly predicted attribute bits
nc=0   # count of mispredicted attribute bits
classifier = OneVsRestClassifier(LinearSVC(C=2.0,random_state=0))
classifier.fit(TRAIN_FEATURES,TRAIN_ATTRIBUTE)
decision = classifier.decision_function(TEST_FEATURES)
prediction = classifier.predict(TEST_FEATURES)
# Per-bit accuracy over the 22 attribute columns of each test sample
for i in range(0,len(TEST_ATTRIBUTE)):
    for j in range(22):
        if prediction[i][j]==TEST_ATTRIBUTE[i][j]:
            pc+=1
        else:
            nc+=1
        # print prediction[i],TEST_ATTRIBUTE[i],TEST_LABELS[i], decision[i]
print pc,nc
# For multi-label targets, .score() is subset (exact-match) accuracy — stricter
# than the per-bit counts above.
print classifier.score(TEST_FEATURES,TEST_ATTRIBUTE)
# Reset the containers and rebuild a test set restricted to the novel classes.
TRAIN_LABELS = []
TRAIN_FEATURES = []
TRAIN_ATTRIBUTE = []
TEST_LABELS = []
TEST_FEATURES = []
TEST_ATTRIBUTE = []
for feature,label in zip(features['win_feature'],labels['vlabels'][0]):
    if mapping[int(label)]['action'] in NovelClass:
        # Predicted attribute vector for this window's features
        attributes = classifier.predict(feature.reshape(1, -1))
        TEST_LABELS.append(mapping[int(label)]['action'])
        TEST_FEATURES.append(numpy.array(attributes[0]))
        # Pairwise distance between the predicted attributes and the first novel
        # class's attribute prototype (loop body continues beyond this chunk).
        A = dist.pairwise([attributes[0],numpy.array(attribute_mapping[NovelClass[0]])])[0][1]
class speechSVM:
    """Speech-emotion classifier: loudness+pitch features fed to a one-vs-rest
    RBF SVM, with per-feature models persisted under Models/ via joblib."""

    # Initializing the SVM Model
    def __init__(self):
        self.model = OneVsRestClassifier(svm.SVC(kernel='rbf',gamma=2,C = 0.9,tol=0.0001,cache_size=5000) )
        #self.model = OneVsRestClassifier(LinearSVC(random_state=0))
        self.working_directory = os.getcwd()+"/"
        # feature_name -> score recorded by predict()
        self.model_prediction_score = {}

    # Function to read the emotion prediction probability
    def get_Model_Score(self):
        """Load and return the persisted score dict from Models/scorefile.txt."""
        filename = self.working_directory + "Models/scorefile.txt"
        return pickle.load(open(filename, "rb"))

    # Function to save Emotion prediction probability
    def set_Model_Score(self):
        """Pickle the in-memory score dict to Models/scorefile.txt."""
        filename = self.working_directory+"Models/scorefile.txt"
        pickle.dump(self.model_prediction_score, open(filename, "wb"))

    # Function to load the wav dataset and extract the features from it
    def load_data_file(self):
        """Return [[loudness, pitch, emotion], ...] for every training wav file."""
        outputdata = [] # Variable to store the speech features and emotions
        # Looping all the wave files present in the path
        for f in gb.glob(self.working_directory+"AudioData/*/*.wav"):
            frate, inputdata = sc.read(f)
            # Extracting the pitch from the wav file using Aubio speech API
            pitch=lp.getPitch(f,frate)
            # Extracting loudness of the voice from the Wave file
            loudness = abs(an.loudness(inputdata))
            # Extracting the emotion type from the wave file only for training stage
            filename = f.split("/")[-1].split(".")[0]
            # Condition to differentiate the various types of emotions:
            # names starting with "s" use a two-letter emotion code, else one letter
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            # Creating the dataset consisting of list of features and corresponding emotion type
            outputdata.append(list([loudness,pitch, emotion]))
        return outputdata

    # Function to split test and train data
    def get_train_test_data(self, data, percent_split):
        """Split a DataFrame into (train_X, train_y, test_X, test_y).

        Rows [0, n) form the test portion (n = len(data)*(1-percent_split));
        the remainder is training. Columns 0-1 are features, column 2 the label.
        """
        noOfSamples = len(data)*(1-percent_split)
        test = data.iloc[0:int(noOfSamples), 2:]
        testsample=[]
        # Flatten the test-label column into a 1-D array
        for i in range(len(test)):
            testsample = np.append(testsample,test.iloc[i].values[0])
        return data.iloc[int(noOfSamples):, 0:2], data.iloc[int(noOfSamples):, 2:],data.iloc[0:int(noOfSamples), 0:2],np.array(testsample)

    # Function to fit the SVM Model
    def trainNNet(self, data, label, feature_name):
        """Fit (or reload) the per-feature SVM; persists to Models/SVM_<name>.pkl."""
        filenamelist = gb.glob(self.working_directory+"Models/*")
        filename = "Models/SVM_" + feature_name + ".pkl"
        #print filenamelist.count(self.working_directory+"Models/SVM_"+feature_name+".pkl")
        # Train only when no pickled model exists for this feature yet
        if filenamelist.count(self.working_directory+"Models/SVM_"+feature_name+".pkl") == 0:
            X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, label.astype(str), test_size = 0.045, random_state = 0)
            self.model.fit(X_train,y_train)
            print("score",self.model.score(X_test,y_test))
            #print (cross_validation.cross_val_score(self.model, data, label.astype(str), cv =4))
            joblib.dump(self.model, self.working_directory+filename)
        else:
            self.model = joblib.load(self.working_directory+filename)
            print "model already exists for feature "+feature_name+" !! training exiting"

    # Function h to predict batch input
    def predict(self, ftest, ltest, data, feature_name):
        """Predict each test row and record the model's score under feature_name."""
        predicted_data = []
        # Loop to traverse through the Test data and predict the corresponding
        for i in range(len(ftest)):
            predicted_data.append(self.model.predict(ftest.iloc[i].values.reshape(1,-1)))
        score = self.model.score(ftest, ltest)
        self.model_prediction_score.update({feature_name:score})

    # Function to predict single input data
    def predict_emotion(self, data):
        """Run every persisted model on one sample; collect non-'NA' predictions."""
        emotion_list=[]
        for modelfilepath in gb.glob(self.working_directory+"Models/*.pkl"):
            print modelfilepath
            emotion = modelfilepath.split("/")[-1].split(".")[0]
            model = joblib.load(modelfilepath)
            modelprediction = model.predict(data.values.reshape(1,-1))
            print modelprediction
            if modelprediction[0] !='NA':
                emotion_list.append(modelprediction[0])
        print emotion_list
        return emotion_list

    # converting a single wave file into a List of speech properties
    def load_data(self, filename):
        """Return [[loudness, pitch, emotion], ...] for the wav path(s) in filename."""
        outputdata=[]
        # Loop to traverse through the input data file path
        for f in gb.glob(filename):
            frate, inputdata = sc.read(f)
            pitch = lp.getPitch(f,frate)
            loudness = abs(an.loudness(inputdata))
            # NOTE(review): rebinding the `filename` parameter here shadows the
            # argument on later loop iterations — confirm intent.
            filename = f.split("/")[-1].split(".")[0]
            if filename[0] == "s":
                emotion = filename[0:2]
            else:
                emotion = filename[0]
            outputdata.append(list([loudness, pitch, emotion]))
        return outputdata