def run_test(**kwargs):
    b = fetch_sw_orl()
    tic = time.time()
    # split the data into train and test sets
    X_train, X_test, y_train, y_true = train_test_split(
        b.data, b.target, test_size=0.2, stratify=b.target)
    hog_train = []
    for img_array in X_train:
        fd, _ = hog(img_array.reshape(b.shape), orientations=8,
                    pixels_per_cell=(PPC, PPC), cells_per_block=(1, 1),
                    visualize=True, multichannel=False)
        hog_train.append(fd)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2)
    clf.fit(hog_train, y_train)
    tok = time.time()
    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape), orientations=8,
                    pixels_per_cell=(PPC, PPC), cells_per_block=(1, 1),
                    visualize=True, multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)
    return tok - tic, accuracy_score(y_true, y_pred)
def test_ecoc_fit_predict():
    # A classifier which implements decision_function.
    ecoc = OutputCodeClassifier(LinearSVC(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)

    # A classifier which implements predict_proba.
    ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)
def train(corpus):
    start = datetime.datetime.now()
    logging.info('Static Embedding Oracle')
    Y, X_dic = EmbeddingOracle.parseCorpus(corpus.trainingSents, EmbeddingOracle)
    vec = DictVectorizer()
    X = vec.fit_transform(X_dic)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(X, Y)
    logging.info('Training time (minutes): ' +
                 str(int((datetime.datetime.now() - start).seconds / 60.)))
    return clf, vec
def test_ecoc_float_y():
    # Test that the OCC errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ecoc = OutputCodeClassifier(LinearSVC())
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        ecoc.fit(X, y)

    ecoc = OutputCodeClassifier(LinearSVC(), code_size=-1)
    msg = "code_size should be greater than 0, got -1"
    with pytest.raises(ValueError, match=msg):
        ecoc.fit(X, y)
def train_svm(labels, array, num_folds, num_jobs, params=2):
    # obtain the best parameter settings for an svm outputcode classifier
    if len(labels) > 2:
        print("outputcodeclassifier")
        # param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.1, 0.5, 1, 2.5, 5, 10, 15, 25, 50, 75, 100, 500, 1000],
        #               'estimator__kernel': ['linear', 'rbf', 'poly'],
        #               'estimator__gamma': [0.0005, 0.001, 0.002, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048],
        #               'estimator__degree': [1, 2, 3, 4]}
        param_grid = {'estimator__C': [0.001, 0.005],
                      'estimator__kernel': ['linear', 'rbf'],
                      'estimator__gamma': [0.0005, 0.001],
                      'estimator__degree': [1]}
        model = OutputCodeClassifier(svm.SVC(probability=True))
    else:
        print("svc model")
        param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                      'kernel': ['linear', 'rbf', 'poly'],
                      'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                      'degree': [1, 2, 3, 4]}
        model = svm.SVC(probability=True)
    paramsearch = RandomizedSearchCV(model, param_grid, cv=num_folds, verbose=2,
                                     n_iter=params, n_jobs=num_jobs)
    print("Grid search...")
    paramsearch.fit(array, numpy.asarray(labels))
    print("Prediction...")
    parameters = paramsearch.best_params_
    for parameter in parameters.keys():
        print(parameter + ": " + str(parameters[parameter]) + "\n")
    print("best score: " + str(paramsearch.best_score_) + "\n\n")
    # for score in paramsearch.grid_scores_:
    #     print('mean score:', score.mean_validation_score)
    #     print('list scores:', score.cv_validation_scores)

    # train an svm outputcode classifier using the best parameters
    if len(labels) > 2:
        best_svc = svm.SVC(probability=True, C=parameters['estimator__C'],
                           kernel=parameters['estimator__kernel'],
                           gamma=parameters['estimator__gamma'],
                           degree=parameters['estimator__degree'])
        clf = OutputCodeClassifier(best_svc, n_jobs=1)
    else:
        clf = svm.SVC(probability=True, C=parameters['C'], kernel=parameters['kernel'],
                      gamma=parameters['gamma'], degree=parameters['degree'])
    clf.fit(array, labels)
    return clf
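The `estimator__` prefixes above follow scikit-learn's nested-parameter naming for wrapped estimators; a minimal sketch (an illustrative addition, not part of the snippet) listing the names that RandomizedSearchCV can address:

from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import SVC

occ = OutputCodeClassifier(SVC(probability=True))
# Parameters of the wrapped SVC are exposed as 'estimator__<name>',
# which is why the multiclass grid above uses 'estimator__C', etc.
print(sorted(p for p in occ.get_params() if p.startswith('estimator__')))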
def ECOC():
    print('Applying the ERROR CORRECTING OUTPUT CODES multiclass method')
    for indice in lista_datasets:
        print('Dataset: ' + str(indice))
        dataset = arff.loadarff('./datasets/' + str(indice))
        df = pd.DataFrame(dataset[0])
        features = df.iloc[:, df.columns != 'class']
        target = pd.factorize(df['class'])[0]
        X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.25)
        clf = OutputCodeClassifier(KNeighborsClassifier(n_neighbors=5),
                                   code_size=2, random_state=0)
        clf.fit(X_train, Y_train)
        print('Fraction correctly classified, ERROR CORRECTING OUTPUT CODES')
        print(clf.score(X_test, Y_test))
        print('--------------------------')
class SVMClf:
    def __init__(self, labels, data, load=False, save=False):
        if load:
            with open(clfData, 'rb') as input_file:
                self.classifier = pickle.load(input_file)
            with open(vecData, 'rb') as input_file:
                self.vectorizer = pickle.load(input_file)
            return
        self.vectorizer = DictVectorizer()
        featureVec = self.vectorizer.fit_transform(data)
        self.classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                               code_size=2, random_state=0)
        # self.classifier = LogisticRegression(solver='sag')
        self.classifier.fit(featureVec, labels)
        if save:
            with open(clfData, 'wb') as output_file:
                pickle.dump(self.classifier, output_file, pickle.HIGHEST_PROTOCOL)
            with open(vecData, 'wb') as output_file:
                pickle.dump(self.vectorizer, output_file, pickle.HIGHEST_PROTOCOL)
def evaluateOutputCode(X, Y, printReport=False):
    start = datetime.datetime.now()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    if printReport:
        print('Training time: ' + str(datetime.datetime.now() - start))
        print('Evaluation result, OutputCode: ' + str(score))
    Y_pred = clf.predict(X_test)
    if printReport:
        print('0: ' + str((Y_pred == 0).sum()))
        print('1: ' + str((Y_pred == 1).sum()))
        print('2: ' + str((Y_pred == 2).sum()))
    return [score, (Y_pred == 1).sum(), clf]
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    # kernelRBF = 1.0 * RBF(1.0)  # unused leftover from a Gaussian-process variant
    clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" %
          (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    X, y = iris.data, iris.target
    X_sp = sp.csc_matrix(X)

    # create an estimator that does not support sparse input
    base_estimator = CheckingClassifier(
        check_X=check_array,
        check_X_params={"ensure_2d": True, "accept_sparse": False},
    )
    ecoc = OutputCodeClassifier(base_estimator, random_state=0)

    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sp, y)

    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sp)

    # smoke test to check when sparse input should be supported
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(X_sp, y).predict(X_sp)
    assert len(ecoc.estimators_) == 4
def output_code_classifier(data, label, pred_data, pred_last):
    '''
    0.76473194506
    Number of mislabeled points out of a total 841 points : 211
    0.749108204518
    The features should be normalized first.
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(data, label)
    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    return pred_result
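The docstring above notes that the features should be normalized first; a minimal sketch of that preprocessing step, assuming scikit-learn's StandardScaler (the scaler choice and the stand-in arrays are assumptions, not from the original snippet):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
data = rng.rand(100, 5) * 100       # stand-in training features
pred_data = rng.rand(20, 5) * 100   # stand-in prediction features

# Fit the scaler on the training features only, then reuse the same
# transform on the prediction features so both live on the same scale.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
pred_data_scaled = scaler.transform(pred_data)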
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 20:38:46 2019

@author: pathouli
"""

import pandas as pd
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

the_path = 'C:/Users/pathouli/myStuff/academia/torhea/projects/groupC/'
allstate_data = pd.read_csv(the_path + 'train.csv', sep=",")

clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)

label_cols = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
X_cols = allstate_data.columns.difference(label_cols)
X = allstate_data[X_cols][1:10000]
y = allstate_data[label_cols][1:10000]  # small sample to test

clf.fit(X, y).predict(X)

# https://www.kaggle.com/c/allstate-purchase-prediction-challenge/data
""" from sklearn import datasets from sklearn.multiclass import OutputCodeClassifier from sklearn.svm import LinearSVC from sklearn.metrics import accuracy_score #数据获取 iris = datasets.load_iris() x, y = iris.data, iris.target print('样本数量,%d,特征数量%d' % x.shape) #模型对象创建 #code_size 指定最终使用多少个子模型,实际的子模型数量=code_size*label_number clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=30, random_state=0) #模型构建 clf.fit(x, y) #输出预测结果值 print(clf.predict(x)) print('准确率%.3f' % accuracy_score(y, clf.predict(x))) #模型属性输出 k = 1 for item in clf.estimators_: print('第%d个模型' % k) print(item) k += 1 print(clf.classes_)
def main():
    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))
    with open(filenameLB) as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0, 1000), 100))
    training = []
    test = []
    trainingLB = []
    testLB = []
    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)
    '''
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)

    feats_shuf = []
    labels_shuf = []
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(n_splits=10)
    # rf = RandomForestClassifier(n_estimators=50, max_features='log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    # svc = svm.SVC(kernel='linear')
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)

    cm_all = np.zeros((10, 10), dtype=int)
    cb = np.zeros((10, 20))
    losses = []
    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
        scores = 0.0
        for train, test in kf.split(X):
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss
            # print(y_test)
            # print(predictions)
            cb = cla.code_book_
            np.savetxt('codebook.csv', cb, delimiter=',')
            # Compute confusion matrix
            cm = confusion_matrix(
                y_test, predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            # print(cm_all)
            cm_all = np.add(cm_all, cm)
            # make ECOC coding matrix 0-1 binary
            cb[cb <= 0] = 0
        wrtest.writerow(losses)
    print(cb)
    print(scores / 10)
    row = []
    for (top_left, bottom_right) in rectangles:
        row += get_haar_features(im, top_left, bottom_right)
    train_ecoc_table[ind] = row

test_ecoc_table = np.zeros(shape=(np.shape(test_images)[0], 200))
for ind, im in enumerate(test_images):
    row = []
    for (top_left, bottom_right) in rectangles:
        row += get_haar_features(im, top_left, bottom_right)
    test_ecoc_table[ind] = row

clf = OutputCodeClassifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                              n_estimators=200),
                           code_size=5, random_state=0)
clf.fit(train_ecoc_table, labels)

train_pred = np.array(clf.predict(train_ecoc_table))
print("Digits Training Accuracy: %f" %
      (np.sum(train_pred == np.array(labels)).astype(float) / np.shape(train_pred)[0]))
test_pred = np.array(clf.predict(test_ecoc_table))
print("Digits Testing Accuracy: %f" %
      (np.sum(test_pred == np.array(test_labels)).astype(float) / np.shape(test_pred)[0]))

# ecoc_table = []
# for im in images:
#     im_preprocess = np.matrix([[np.sum(im[:i, :j]) for i in range(1, 29)] for j in range(1, 29)])
#
#     def get_black_rectangle(top_left, bottom_right):
#         x1, y1 = top_left
#         x2, y2 = bottom_right
# @author: 凯风

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X, Y = iris_data.data, iris_data.target
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

# Error-correcting output codes (ECOC): unlike One-vs-One and One-vs-Rest,
# each class is represented as a point in a Euclidean code space.
# For a textbook explanation, see the multiclass chapter of Zhou's
# "Machine Learning".
# ECOC needs a base estimator with decision_function or predict_proba,
# so a LinearSVC classifier is used (a regressor such as LinearSVR has
# neither and would fail at fit time).
clf = LinearSVC(random_state=0)
ecoc = OutputCodeClassifier(clf, code_size=1.5, random_state=None, n_jobs=1)
ecoc.fit(trainX, trainY)
ecoc.predict(testX)
ecoc.code_book_

# estimator      base classifier
# code_size      size of the code space, relative to the number of classes
# random_state   random seed
# n_jobs         number of CPU jobs
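The code_book_ attribute inspected above is the ECOC matrix itself; a small sketch (an illustrative addition, on the same iris data) of its shape and contents:

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

iris = load_iris()
ecoc = OutputCodeClassifier(LinearSVC(random_state=0), code_size=1.5, random_state=0)
ecoc.fit(iris.data, iris.target)
# one code word per class: 3 rows, int(3 * 1.5) == 4 columns; entries are
# {-1, 1} for a decision_function estimator such as LinearSVC
print(ecoc.code_book_.shape)   # (3, 4)
print(ecoc.code_book_)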
x_train, x_test, y_train, y_test = train_test_split(breast.data, breast.target, test_size=0.2)

# create the classifiers
clf_1 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42)
clf_2 = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=42)

# train the classifiers with training data
clf_1.fit(x_train, y_train)
clf_2.fit(x_train, y_train)

# find y_pred predictions based on x_test data
y_pred_1 = clf_1.predict(x_test)
y_pred_2 = clf_2.predict(x_test)

# calculate accuracy of y_pred using y_test
print(f'accuracy {accuracy_score(y_test, y_pred_1)}')
print(f'accuracy {accuracy_score(y_test, y_pred_2)}')

# use the classification_report function to print more information
print(
    f'\n\nClassification report for MLPClassifier is\n {classification_report(y_test, y_pred_1)}'
)
print(
    f'\n\nClassification report for OutputCodeClassifier is\n {classification_report(y_test, y_pred_2)}'
)
X_train, X_test, y_train, y_test = train_test_split(features_minmax, labels,
                                                    test_size=0.2, random_state=42)
samples_num = y_test.shape[0]

predictions_one_vs_rest = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    X_train, y_train).predict(X_test)
predictions_one_vs_one = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
    X_train, y_train).predict(X_test)
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
prediction_outputCode = clf.fit(X_train, y_train).predict(X_test)

correct_onevsone = 0
correct_onevsrest = 0
correct_output = 0
y_test = np.array(y_test)
for i in range(samples_num):
    if predictions_one_vs_rest[i] == y_test[i]:
        correct_onevsrest = correct_onevsrest + 1
    if predictions_one_vs_one[i] == y_test[i]:
        correct_onevsone = correct_onevsone + 1
    if prediction_outputCode[i] == y_test[i]:
        correct_output = correct_output + 1

print("Accuracy for one vs one classifier")
acc_oneVsone = float(correct_onevsone) / samples_num
# apply HoG to all the images in b.data
hog_train = []
for img_array in X_train:
    img = img_array.reshape(b.shape)
    fd, _ = hog(img, orientations=8, pixels_per_cell=(PPC, PPC),
                cells_per_block=(1, 1), visualize=True, multichannel=False)
    hog_train.append(fd)

clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=42)
clf.fit(hog_train, y_train)
tok = time.time()

if control[1]:
    # create the HoG features for X_test
    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape), orientations=8,
                    pixels_per_cell=(PPC, PPC), cells_per_block=(1, 1),
                    visualize=True, multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)
def ml_models(train, test, lab, labt):
    # Random Forest
    forest = RandomForestClassifier(n_estimators=200, max_leaf_nodes=50, criterion="entropy")
    forest = forest.fit(train, lab)
    output_rf = forest.predict(test).astype(int)
    suc_rf = 0
    totals_rf = [0 for m in range(num)]
    preds_rf = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_rf[labt[i]] += 1
        if output_rf[i] == labt[i]:
            suc_rf = suc_rf + 1
            preds_rf[labt[i]] += 1
    accuracy_rf = suc_rf / len(labt)

    # K-Nearest Neighbours
    neigh = KNeighborsClassifier(n_neighbors=7)
    neigh.fit(train, lab)
    output_kn = neigh.predict(test)
    suc_kn = 0
    totals_kn = [0 for m in range(num)]
    preds_kn = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_kn[labt[i]] += 1
        if output_kn[i] == labt[i]:
            suc_kn = suc_kn + 1
            preds_kn[labt[i]] += 1
    accuracy_kn = suc_kn / len(labt)

    # Logistic Regression
    model = LogisticRegression()
    model.fit(train, lab)
    output_lr = model.predict(test)
    suc_lr = 0
    totals_lr = [0 for m in range(num)]
    preds_lr = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_lr[labt[i]] += 1
        if output_lr[i] == labt[i]:
            suc_lr = suc_lr + 1
            preds_lr[labt[i]] += 1
    accuracy_lr = suc_lr / len(labt)

    # Naive Bayes
    model = GaussianNB()
    model.fit(train, lab)
    # print(model)
    # make predictions
    # expected = y
    output_nb = model.predict(test)
    suc_nb = 0
    totals_nb = [0 for m in range(num)]
    preds_nb = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_nb[labt[i]] += 1
        if output_nb[i] == labt[i]:
            suc_nb = suc_nb + 1
            preds_nb[labt[i]] += 1
    accuracy_nb = suc_nb / len(labt)

    # Decision Tree Classifier
    model = DecisionTreeClassifier()
    model.fit(train, lab)
    output_dt = model.predict(test)
    suc_dt = 0
    totals_dt = [0 for m in range(num)]
    preds_dt = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_dt[labt[i]] += 1
        if output_dt[i] == labt[i]:
            suc_dt = suc_dt + 1
            preds_dt[labt[i]] += 1
    accuracy_dt = suc_dt / len(labt)

    # Support Vector Machine (ECOC over LinearSVC)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(train, lab)
    output_sv = clf.predict(test)
    suc_sv = 0
    totals_sv = [0 for m in range(num)]
    preds_sv = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_sv[labt[i]] += 1
        if output_sv[i] == labt[i]:
            suc_sv = suc_sv + 1
            preds_sv[labt[i]] += 1
    accuracy_sv = suc_sv / len(labt)

    # Majority voting
    def Most_Common(lst):
        data = Counter(lst)
        return data.most_common(1)[0][0]

    output_mv = []
    for i in range(0, len(labt)):
        c = [output_dt[i], output_rf[i], output_lr[i]]
        output_mv.append(Most_Common(c))
    suc_mv = 0
    totals_mv = [0 for m in range(num)]
    preds_mv = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_mv[labt[i]] += 1
        if output_mv[i] == labt[i]:
            suc_mv = suc_mv + 1
            preds_mv[labt[i]] += 1
    accuracy_mv = suc_mv / len(labt)

    return accuracy_rf, accuracy_kn, accuracy_lr, accuracy_nb, accuracy_dt, accuracy_sv, accuracy_mv, \
        preds_rf, preds_kn, preds_lr, preds_nb, preds_dt, preds_sv, preds_mv, \
        totals_rf, totals_kn, totals_lr, totals_nb, totals_dt, totals_sv, totals_mv
x = np.array(np.zeros(15050), ndmin=1)  # label 0 for benign
y = np.array(np.ones(15050), ndmin=1)   # label 1 for malignant
y_train = np.concatenate((x, y), axis=0)

# labeling y_test
x1 = np.array(np.zeros(50), ndmin=1)  # label 0 for benign
y1 = np.array(np.ones(50), ndmin=1)   # label 1 for malignant
y_test = np.concatenate((x1, y1), axis=0)

# ############### Using LinearSVC
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 5000, 10000]}
clf = OutputCodeClassifier(LinearSVC(random_state=0, verbose=5), code_size=3, random_state=0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# ########## Using GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 5000, 10000],
              'gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}
model_grid = GridSearchCV(SVC(), param_grid, verbose=5, cv=10)

ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
Cs = [0.0001, 0.001, 0.01, 0.5, 0.8, 0.1, 1, 10, 100, 1000, 5000, 10000]
cv = GridSearchCV(ecoc, {'estimator__C': Cs}, verbose=5, cv=10)
cv.fit(X_train, y_train)
def train_by_OutputCodeClassifier(X, y):
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    return clf.fit(X, y)
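A short usage sketch for the helper above; the iris data is an assumption, purely for illustration, and the function is taken from the snippet it follows:

from sklearn.datasets import load_iris

iris = load_iris()
clf = train_by_OutputCodeClassifier(iris.data, iris.target)
print(clf.predict(iris.data[:5]))   # five predictions from the fitted ECOC model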
# Train
threshold_train = np.where((y_train == 0) | (y_train == 1) | (y_train == 7) | (y_train == 8))
y_train_thres, x_train_thres = y_train[threshold_train], x_train[threshold_train]

# Test
threshold_test = np.where((y_test == 0) | (y_test == 1) | (y_test == 7) | (y_test == 8))
y_test_thres, x_test_thres = y_test[threshold_test], x_test[threshold_test]

###################################################################################################
# ################################ Training a classifier (4 numbers) #############################
num_iter = 5
start_time_OCC = time.time()
OCC = OutputCodeClassifier(Perceptron(max_iter=num_iter, random_state=0))
OCC.fit(x_train_thres, y_train_thres)
predictionsOCC = OCC.predict(x_test_thres)
scoreOCC = OCC.score(x_test_thres, y_test_thres)
cmOCC = metrics.confusion_matrix(y_test_thres, predictionsOCC)

plt.figure(figsize=(9, 9))
sns.heatmap(cmOCC, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'OCC - Accuracy Score: {0}'.format(scoreOCC)
plt.title(all_sample_title, size=15)
print(welfare_word)
welfare_data = tfidf_vectorizer.fit_transform(welfare_word).toarray()
print(welfare_data)

# split the dataset
welfare_data_train, welfare_data_test, welfare_target_train, welfare_target_test = \
    train_test_split(welfare_data, welfare_target, test_size=0.2, random_state=666)

# standardize the data
# stdScaler = StandardScaler().fit(welfare_data_train)
# welfare_data_train_std = stdScaler.transform(welfare_data_train)
# welfare_data_test_std = stdScaler.transform(welfare_data_test)

# build an SVM model with a linear kernel
model = OutputCodeClassifier(LinearSVC())
model = model.fit(welfare_data_train, welfare_target_train)
# save the model
joblib.dump(model, 'welfare_predict.pkl')

welfare_target_predict = model.predict(welfare_data_test)
print('First 20 predictions:\n', welfare_target_predict[:20])
print('SVM prediction accuracy:',
      accuracy_score(welfare_target_test, welfare_target_predict))
print('SVM prediction precision:',
      precision_score(welfare_target_test, welfare_target_predict, average='micro'))
print('SVM prediction recall:',
      recall_score(welfare_target_test, welfare_target_predict, average='micro'))
print('SVM prediction F1 score:',
      f1_score(welfare_target_test, welfare_target_predict, average='micro'))
print("SVM prediction Cohen's Kappa:",
      cohen_kappa_score(welfare_target_test, welfare_target_predict))
print('KNN score: %f' %
      knn.fit(train_ft, train_label).score(test_ft, test_label))
print('LogisticRegression score: %f' %
      logistic.fit(train_ft, train_label).score(test_ft, test_label))

# SVM
list_of_acc = list()
accur = 0
c = 1000
# for c in np.logspace(-2, 10, 5):
for c in [100, 1000, 10000, 100000]:
    for g in np.logspace(-9, 3, 13):
        clf = OutputCodeClassifier(svm.SVC(random_state=0, gamma=g, C=c),
                                   code_size=10, random_state=0)
        accur_temp = clf.fit(svmtrain, svmtrainlabel).score(svmtest, svmtestlabel)
        if accur < accur_temp:
            accur = accur_temp
            gamma = g
        print(c, g, accur)
    list_of_acc.append(accur)
print(np.mean(list_of_acc))
        y_train = labels[100:172, i]
        X_test = sample2
        y_test = labels[272:, i]
    else:
        X_train = training
        y_train = labels[:172, i]
        X_test = sampletest
        y_test = labels[172:, i]

    box = np.zeros([6, 6])
    accuracy = np.zeros(100)
    for m in range(0, 100):
        posterior = np.empty([100, 72, 6])
        gbc = GradientBoostingClassifier(n_estimators=60, max_depth=3)
        occ = OutputCodeClassifier(gbc)
        y_pred = occ.fit(X_train, y_train).predict(X_test)
        n = 0
        for i in range(0, len(y_pred)):
            if y_pred[i] == y_test[i]:
                # print(i, y_pred[i], y_test[i])
                n = n + 1
                accuracy[m] = accuracy[m] + 1
                box[y_test[i] - 1, y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1
        # posterior[m] = knc.predict_proba(X_test)
    print(np.mean(accuracy) / 0.72, np.std(accuracy) / 0.72)
    # print(sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0,
    #       sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0,
    #       sum(accuracy)/72.0)
'''
means = np.empty([72, 6])
stds = np.empty([72, 6])
grid = np.empty([6, 6])
# -*- coding: utf-8 -*-
# Problem 8, Python code
# 1530200066 Zhao Yiqin
# SVM classification code

import h5py
import numpy as np
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OutputCodeClassifier

# Load the data
f = h5py.File('./pca_data.mat', 'r')
data = {}
for k in f.keys():
    data[k] = f[k][:]
test_data = data['test_data'].transpose()
test_label = np.ravel(data['test_label'].transpose())
train_data = data['train_data'].transpose()
train_label = np.ravel(data['train_label'].transpose())

# Train the classifier
clf = OutputCodeClassifier(SVC(kernel='rbf'))
model = clf.fit(train_data, train_label)
train_acc = model.score(train_data, train_label)
test_acc = model.score(test_data, test_label)

# Print training and test accuracy
print('[Train accuracy]: %s, [Test accuracy]: %s' % (train_acc, test_acc))
# random_search = RandomizedSearchCV(estimator=svc,
#                                    param_distributions=random_grid,
#                                    n_iter=10,
#                                    scoring='accuracy',
#                                    cv=3,
#                                    verbose=1,
#                                    random_state=12)
##----------------------End of Uncomment block for applying random search grid to find best parameters

##----------------------Uncomment block for using multiclass learning using output-codes
occ = OutputCodeClassifier(svc, code_size=2, random_state=8)
##----------------------End of Uncomment block for using multiclass learning using output-codes

## Fit your chosen model by changing the variable before the period to either svc, random_search, grid_search or occ
occ.fit(features_train, labels_train)

##----------------------Uncomment required block for finding out best parameters if using the random search or grid search for best accuracy
# print("The best hyperparameters from Random Search are:")
# print(random_search.best_params_)
# print("")
# print("The mean accuracy of a model with these hyperparameters is:")
# print(random_search.best_score_)
##----------------------End of Uncomment block for finding out best parameters if using the random search for best accuracy

def get_key(val):
    identifiedKey = [k for k, v in category_codes.items() if v == val]
    if len(identifiedKey) == 0:
        return "No value"
    return identifiedKey[0]
    train_ingredients.append(' '.join(ings))

# construct test_ingredients
for entry in test_set:
    ings = [WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w))
            for w in entry['ingredients']]
    test_ingredients.append(' '.join(ings))

# used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()
# encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

# used to create bag-of-ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

output = pd.DataFrame(data={'id': test_ids, 'cuisine': le.inverse_transform(result)})
# force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)
print()

data = loadmat('ex3data1.mat')
X = data['X']
y = data['y']
y = y.T
y = y[0]

# n_classes = 10
# code_size = np.log2(n_classes) / n_classes  # yields .332
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
ind2 = clf.fit(X, y).predict(X)

error = []
count = 0
for i in range(0, len(y)):
    if y[i] == ind2[i]:
        count += 1           # Good - increment count
    else:
        error.append(i)      # Record index of bad read

print('The number predicted correctly = ', count)
print('The percentage accuracy is ', '{:.2%}'.format(count / len(y)))
print()

# Display a selection of the mis-classified
m = 0  # Display size
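The commented-out lines above hint at the smallest useful code size; a sketch (an illustrative addition, relying on scikit-learn's rule that the number of binary learners is int(n_classes * code_size)) of what that 0.332 corresponds to:

import numpy as np

n_classes = 10
code_size = np.log2(n_classes) / n_classes    # ~0.332, matching the comment
# scikit-learn trains int(n_classes * code_size) binary learners:
print(int(n_classes * code_size))             # 3 learners at code_size ~ 0.332
print(int(np.ceil(np.log2(n_classes))))       # 4 = minimum bits to distinguish 10 classes
# compare with code_size=2 as used above: 10 * 2 = 20 learners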
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
import numpy as np

TEST_SPLIT = .2

X, Y = load_svmlight_file("ablated_features.txt")
num_instances = len(Y)
num_train = int((1 - TEST_SPLIT) * num_instances)

indices = np.arange(num_instances)
np.random.shuffle(indices)
X = X[indices]
Y = Y[indices]

X_train = X[:num_train]
Y_train = Y[:num_train]
X_test = X[num_train:]
Y_test = Y[num_train:]
# print(X_train.shape[0], X_test.shape[0])

clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=20, random_state=0)
preds = clf.fit(X_train, Y_train).predict(X_test)
print(accuracy_score(Y_test, preds))
def oc_classify(X, Y):
    # number of distinct non-zero labels; since code_size is relative to
    # the number of classes, this requests roughly n_classes^2 binary learners
    size = np.count_nonzero(np.unique(Y))
    clf = OutputCodeClassifier(LinearSVC(), code_size=size)
    clf.fit(X, Y)
    return clf
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OutputCodeClassifier

iris = datasets.load_iris()
print(iris)
X, y = iris.data, iris.target
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
print(X)
print(y)
print(clf.fit(X, y).predict(X))
class Classifier():
    def __init__(self, trainlist, testlist, scaling="binary", jobs=16, directory=False,
                 features=False, feature_info=False):
        self.training = trainlist
        self.test = testlist  # self.test should be a list with multiple lists for each testset
        self.scaling = scaling
        self.jobs = jobs
        self.directory = directory
        self.feature_status = {}
        self.outstring = False
        self.features = features
        self.feature_info = feature_info

    def count_feature_frequency(self):
        def ff(instances, queue):
            feature_frequency = defaultdict(int)
            for i, instance in enumerate(instances):
                for feature in instance["ngrams"]:
                    feature_frequency[feature] += 1
            queue.put(feature_frequency)

        print(len(self.training))
        q = multiprocessing.Queue()
        chunks = gen_functions.make_chunks(self.training, self.jobs)
        for chunk in chunks:
            p = multiprocessing.Process(target=ff, args=[chunk, q])
            p.start()
        ds = []
        while True:
            l = q.get()
            ds.append(l)
            if len(ds) == len(chunks):
                break
        self.feature_frequency = defaultdict(int)
        for d in ds:
            for k in d:
                self.feature_frequency[k] += d[k]
        self.features = sorted(self.feature_frequency, key=self.feature_frequency.get,
                               reverse=True)

    def make_feature_labellist(self):
        feature_labellist = defaultdict(list)
        for instance in self.training:
            try:
                label = int(instance["label"])
                for feature in instance["ngrams"]:
                    feature_labellist[feature].append(label)
            except:
                continue
        self.feature_labellist = feature_labellist

    def prune_features(self):
        for instance in self.training:
            new_features = []
            # print(feature_status)
            for f in instance["ngrams"]:
                try:
                    if self.feature_status[f]:
                        new_features.append(f)
                except:
                    continue
            instance["ngrams"] = new_features
            # queue.put(instance)

    def convert_features(self, convert_list):
        for instance in self.training:
            # print(feature_status)
            # print(instance["features"])
            for i, f in enumerate(instance["ngrams"]):
                if f in convert_list.keys():
                    instance["ngrams"][i] = convert_list[f]
            # print(instance["features"])

    def filter_stdev(self, threshold, prop):
        self.make_feature_labellist()
        feature_convert = {}
        new_features = []
        for feature in self.feature_labellist.keys():
            if re.search(r"^" + prop, feature):
                if gen_functions.return_standard_deviation(self.feature_labellist[feature]) > threshold \
                        or len(self.feature_labellist[feature]) <= 2:
                    self.feature_status[feature] = False
                else:
                    new_feature = str(abs(int(numpy.median(self.feature_labellist[feature])))) + "_days"
                    feature_convert[feature] = new_feature
                    new_features.append(new_feature)
                    self.feature_status[new_feature] = True
            else:
                self.feature_status[feature] = True
                new_features.append(feature)
        self.convert_features(feature_convert)
        self.prune_features()
        self.features = list(set(new_features))

    def prune_features_topfrequency(self, n):
        # generate feature_frequency dict
        for f in self.features[:n]:
            self.feature_status[f] = True
        for f in self.features[n:]:
            self.feature_status[f] = False
        self.features = self.features[:n]
        self.prune_features()

    def balance_data(self):
        label_instances = defaultdict(list)
        new_training = []
        for instance in self.training:
            label = instance["label"]
            label_instances[label].append(instance)
        if len(label_instances.keys()) > 2:
            median = int(numpy.median(numpy.array([len(label_instances[x]) for
                                                   x in label_instances.keys()])))
            for label in label_instances.keys():
                if len(label_instances[label]) == median:
                    new_training.extend(label_instances[label])
                else:
                    instances = lineconverter.Lineconverter(label_instances[label])
                    if len(instances.lines) < median:
                        instances.sample(median - len(instances.lines), sample_type="up")
                    else:
                        instances.sample(len(instances.lines) - median)
                    new_training.extend(instances.lines)
            self.training = new_training

    def index_features(self, ind=0):
        feature_frequency = defaultdict(int)
        self.feature_info = {}
        # print(self.features)
        for i, feature in enumerate(self.features):
            self.feature_info[feature] = i + ind

        def sparsify(instances, writelist):
            for instance in instances:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features
                writelist.append(instance)

        new_instances = []
        sparsify(self.training, new_instances)
        self.training = new_instances
        for tset in self.test:
            for instance in tset["instances"]:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features

    def vectorize(self, instances):
        zerolist = [float(0)] * len(self.feature_info.keys())
        matrix = []
        for instance in instances:
            featurev = zerolist[:]
            for feature in instance["sparse"].keys():
                if self.scaling == "binary":
                    featurev[feature] = float(1)
                elif self.scaling == "log":
                    featurev[feature] = math.log(instance["sparse"][feature], 10)
                elif self.scaling == "tfidf":
                    featurev[feature] = instance["sparse"][feature] * self.idf[feature]
            for feat in instance["features"]:
                featurev.append(feat)
            matrix.append(featurev)
        return matrix

    def model_necessities(self):
        # generate scipy libsvm input
        self.trainlabels_raw = [x["label"] for x in self.training]
        self.labels = set(self.trainlabels_raw)
        labeldict = dict(zip(self.labels, range(len(self.labels))))
        self.labeldict_back = dict(zip(range(len(self.labels)), self.labels))
        if self.scaling == "tfidf":
            self.idf = weight_features.return_idf(self.training)
        self.trainingvectors = self.vectorize(self.training)
        self.training_csr = csr_matrix(self.trainingvectors)
        self.trainlabels = [labeldict[x["label"]] for x in self.training]

    def predict(self, ts):
        testvectors = self.vectorize(ts)
        predictions = []
        for i, t in enumerate(testvectors):
            classification = self.clf.predict(t)
            proba = self.clf.predict_proba(t)
            classification_label = self.labeldict_back[classification[0]]
            if len(ts[0]["meta"]) == 6:
                predictions.append([ts[i]["meta"][5],
                                    ts[i]["label"] + " " + classification_label,
                                    " ".join([str(round(x, 2)) for x in proba.tolist()[0]])])
            else:
                predictions.append([" ".join([x for x in ts[i]["ngrams"] if not re.search("_", x)]),
                                    ts[i]["label"] + " " + classification_label,
                                    " ".join([str(round(x, 2)) for x in proba.tolist()[0]])])
        return predictions

    def train_svm(self, params=10):
        # obtain the best parameter settings for an svm outputcode classifier
        if len(self.labels) > 2:
            print("outputcodeclassifier")
            param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                          'estimator__kernel': ['linear', 'rbf', 'poly'],
                          'estimator__gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                          'estimator__degree': [1, 2, 3, 4]}
            model = OutputCodeClassifier(svm.SVC(probability=True))
        else:
            print("svc model")
            param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                          'kernel': ['linear', 'rbf', 'poly'],
                          'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                          'degree': [1, 2, 3, 4]}
            model = svm.SVC(probability=True)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,
                                         n_iter=params, n_jobs=self.jobs)
        print("Grid search...")
        paramsearch.fit(self.training_csr, numpy.asarray(self.trainlabels))
        print("Prediction...")
        # print the best parameters to the file
        parameters = paramsearch.best_params_
        self.outstring = "best parameter settings:\n"
        for parameter in parameters.keys():
            self.outstring += (parameter + ": " + str(parameters[parameter]) + "\n")
        self.outstring += ("best score: " + str(paramsearch.best_score_) + "\n\n")
        # train an svm outputcode classifier using the best parameters
        if len(self.labels) > 2:
            clf = svm.SVC(probability=True, C=parameters['estimator__C'],
                          kernel=parameters['estimator__kernel'],
                          gamma=parameters['estimator__gamma'],
                          degree=parameters['estimator__degree'])
            self.clf = OutputCodeClassifier(clf, n_jobs=self.jobs)
            self.clf.fit(self.training_csr, self.trainlabels)
        else:
            self.clf = svm.SVC(probability=True, C=parameters['C'], kernel=parameters['kernel'],
                               gamma=parameters['gamma'], degree=parameters['degree'])
            self.clf.fit(self.training_csr, self.trainlabels)

    def train_nb(self):
        self.clf = naive_bayes.MultinomialNB()
        self.clf.fit(self.training_csr, self.trainlabels)

    def train_decisiontree(self):
        self.clf = tree.DecisionTreeClassifier()
        self.clf.fit(self.training_csr.toarray(), self.trainlabels)

    def tenfold_train(self, voting, classifiers=[], p=10):
        kf = cross_validation.KFold(len(self.training), n_folds=10)
        training = deepcopy(self.training)
        feat = deepcopy(self.features)
        fi = deepcopy(self.feature_info)
        if voting == "weighted":
            self.feature_info = {}
            self.features = []
            for instance in self.training:
                instance["sparse"] = defaultdict(int)
                instance["ngrams"] = []
        len_features = len(self.features)
        for i, fn in enumerate(classifiers):
            featurename = "___" + fn
            self.feature_info[featurename] = len_features + i
            self.features.append(featurename)
        for train_index, test_index in kf:
            train = deepcopy([training[x] for x in train_index])
            test = deepcopy([training[y] for y in test_index])
            cl = Classifier(train, test, features=feat, feature_info=fi)
            cl.model_necessities()
            if "svm" in classifiers:
                cl.train_svm(params=p)
                predictions = cl.predict(test)
                for i, j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___svm"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___svm")
            if "nb" in classifiers:
                cl.train_nb()
                predictions = cl.predict(test)
                for i, j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___nb"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___nb")
            if "dt" in classifiers:
                cl.train_decisiontree()
                predictions = cl.predict(test)
                for i, j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___dt"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___dt")

    def return_classification_features(self):
        prediction_features_testset = []
        for tset in self.test:
            prediction_features = []
            predictions = self.predict(tset["instances"])
            for i, prediction in enumerate(predictions):
                prediction_features.append(int(float(predictions[i][1].split()[1])))
            prediction_features_testset.append(prediction_features)
        return prediction_features_testset

    def add_classification_features(self, featuredict, featurenames, voter):
        if voter == "majority":
            self.feature_info = {}
        len_features = len(self.feature_info.keys())
        for i, fn in enumerate(featurenames):
            self.feature_info[fn] = len_features + i
            self.features.append(fn)
        for i, tset in enumerate(self.test):
            for j, instance in enumerate(tset["instances"]):
                if voter != "arbiter":
                    tset["instances"][j]["sparse"] = defaultdict(int)
tset["instances"][j]["ngrams"] = [] for fn in featurenames: tset["instances"][j]["sparse"][self.feature_info[fn]] = featuredict[i][j][fn] tset["instances"][j]["ngrams"].append(fn) def append_classifier_labelings(self): len_features = len(self.feature_info.keys()) self.feature_info["___append"] = len_features self.features.append("___append") for instance in self.training: instance["sparse"][self.feature_info["___append"]] = instance["append"] if instance["append"] == 1: instance["features"].append("___append") for tset in self.test: for instance in tset["instances"]: instance["sparse"][self.feature_info["___append"]] = instance["append"] if instance["append"] == 1: instance["features"].append("___append") def output_data(self): if re.search(".txt",self.test[0]["out"]): outdir = self.test[0]["out"][:-4] + "_" else: outdir = self.test[0]["out"] #output features #featureout = codecs.open(outdir + "features.txt","w","utf-8") featureout = open(outdir + "features.txt", "w", encoding = "utf-8") for feature in sorted(self.feature_info, key=self.feature_info.get): featureout.write(feature + "\t" + str(self.feature_info[feature]) + "\n") featureout.close() #output trainfile #trainout = codecs.open(outdir + "train.txt","w","utf-8") trainout = open(outdir + "train.txt", "w", encoding = "utf-8") for instance in self.training: trainout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + ",".join([str(x) for x in instance["sparse"].keys()]) + "\n") trainout.close() #output testfile #testout = codecs.open(outdir + "test.txt","w","utf-8") testout = open(outdir + "test.txt", "w", encoding = "utf-8") for i,tset in enumerate(self.test): #testout = codecs.open(outdir + "test" + str(i) + ".txt","w","utf-8") for instance in tset["instances"]: testout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + ",".join([str(x) for x in instance["sparse"].keys()]) + "\n") def test_model(self): for tset in self.test: testresults = self.predict(tset["instances"]) #outfile = codecs.open(tset["out"] + "predictions.txt","w","utf-8") if re.search(".txt",tset["out"]): outstring = tset["out"][:-4] + "_predictions.txt" else: outstring = tset["out"] + "predictions.txt" # outfile = codecs.open(outstring,"w","utf-8") outfile = open(outstring, "w", encoding = "utf-8") if self.outstring: outfile.write(self.outstring) for instance in testresults: outfile.write("\t".join(instance) + "\n") outfile.close() def save_model(self): for tset in self.test: outfile = tset["out"][:-4] + "_model.joblib.pkl" #with open(outfile, 'wb') as fid: # cPickle.dump(self.clf, fid) with open(outfile, 'wb') as fid: pickle.dump(self.clf, fid) #_ = joblib.dump(, outfile, compress=9) #outvocabulary = codecs.open(tset["out"] + "vocabulary.txt","w","utf-8") outstring = tset["out"][:-4] + "_vocabulary.txt" #outvocabulary = codecs.open(outstring,"w","utf-8") outvocabulary = open(outstring, "w", encoding = "utf-8") for feature in self.features: outvocabulary.write(feature + "\n") outvocabulary.close() #outidf = codecs.open(tset["out"][:-4] + "_idfs.txt","w","utf-8") outidf = open(tset["out"][:-4] + "_idfs.txt", "w", encoding = "utf-8") for key in self.idf.keys(): outidf.write(str(key) + "\t" + str(self.idf[key]) + "\n") outidf.close()
    for w in words:
        for i, word in enumerate(vocab):
            if word == w:
                bag_vector[i] += 1
    print("{0} \n{1}\n".format(sentence, numpy.array(bag_vector)))

allsentences = ["Joe waited for the train",
                "The train was late",
                "Mary and Samantha took the bus",
                "I looked for Mary and Samantha at the bus station",
                "Mary and Samantha arrived at the bus station early but waited until noon for the bus"]

generate_bow(allsentences)

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X)
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
clf.fit(X, y)
m = clf.predict(X)
print(m)