def expert_training(self):
    """Fit the two expert classifiers on simulated history.

    The contexts and actions returned by ``self.data_simulation()`` are used
    to fit a one-vs-rest logistic regression and a one-vs-rest multinomial
    naive Bayes model.

    Returns:
        list: ``[logreg, mnb]`` -- the two fitted experts, in that order.
    """
    contexts, actions = self.data_simulation()
    experts = [
        OneVsRestClassifier(LogisticRegression()),
        OneVsRestClassifier(MultinomialNB()),
    ]
    for expert in experts:
        expert.fit(contexts, actions)
    return experts
def _calculate(self, X, y, categorical):
    """Return the mean 10-fold cross-validated accuracy of an LDA classifier.

    A 1-d or single-column ``y`` is split with ``StratifiedKFold`` and fitted
    with a plain ``LinearDiscriminantAnalysis``; a wider ``y`` is split with
    ``KFold`` and the LDA is wrapped in a ``OneVsRestClassifier``.

    Args:
        X: Feature matrix, indexed with the fold index arrays.
        y: Target array, 1-d or 2-d.
        categorical: Not used by this computation.

    Returns:
        float: Accuracy averaged over the folds, or ``nan`` when LDA raises
        ``LinAlgError`` or ``ValueError`` (the failure is logged as a warning).
    """
    import sklearn.discriminant_analysis

    n_splits = 10
    single_target = len(y.shape) == 1 or y.shape[1] == 1
    if single_target:
        kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
    else:
        kf = sklearn.model_selection.KFold(n_splits=n_splits)

    accuracy = 0.
    try:
        for train, test in kf.split(X, y):
            lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
            if not single_target:
                lda = OneVsRestClassifier(lda)
            lda.fit(X[train], y[train])
            predictions = lda.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / n_splits
    except (scipy.linalg.LinAlgError, ValueError) as e:
        # The value handed back is NaN (np.NaN was removed in NumPy 2.0, so
        # the lowercase spelling is used); the message now says so.
        self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
        return np.nan
def experienceSVMTrain(trainData, testData, testCounts, classifierNumber = 0):
    """Train one of four classifiers and print its accuracy on the test data.

    Args:
        trainData: Mapping with feature matrix ``'X'`` and labels ``'Y'``.
        testData: Mapping with the same two keys for the test set.
        testCounts: Per-genre number of test samples, keyed by genre.
        classifierNumber: 0 -> one-vs-rest SVC (default), 1 -> plain SVC,
            2 -> random forest (1000 trees, 4 jobs), anything else -> 3-NN.

    The function only prints its results; it returns ``None``.
    """
    if classifierNumber == 0:
        classifier = OneVsRestClassifier(svm.SVC())
        algorithmName = 'OneVsRestClassifier'
    elif classifierNumber == 1:
        classifier = svm.SVC()
        algorithmName = 'SupportVectorClassifier'
    elif classifierNumber == 2:
        classifier = RandomForestClassifier(n_estimators= 1000, n_jobs = 4)
        algorithmName = 'RandomForestClassifier'
    else:
        classifier = KNeighborsClassifier(n_neighbors=3)
        algorithmName = 'KNeighborsClassifier'

    # Standardize with the statistics of the training set only and reuse them
    # for the test set. Scaling each set independently puts the two in
    # different feature spaces whenever their means/variances differ.
    scaler = preprocessing.StandardScaler().fit(trainData['X'])

    print_(algorithmName, 'has been started to train the data by', nowStr())
    classifier.fit(scaler.transform(trainData['X']), trainData['Y'])

    print_(algorithmName, 'has been started to predict the test data by', nowStr())
    predictions = classifier.predict(scaler.transform(testData['X']))

    truePositives = 0
    truePositiveCounts = {genre: 0 for genre in genreSet}
    predictionCount = len(predictions)
    for i in range(predictionCount):
        actual = testData['Y'][i]
        if predictions[i] == actual:
            truePositives += 1
            # Labels index into genreSet to recover the genre name.
            truePositiveCounts[genreSet[actual]] += 1

    print_(algorithmName, 'Experiment has been finished by', nowStr())
    print_('\nGeneral Test Accuracy = %.3f' % (truePositives / float(predictionCount)))
    print('\nTotal Number of predictions:', predictionCount)
    print('Number of true predictions: ', truePositives)
    print('Number of false predictions: ', predictionCount-truePositives)
    print_('\nTesting distribution: ', {genre: testCounts[genre] for genre in genreSet})
    print_('Distribution of true predictions: ', truePositiveCounts)
    falseNegativeCounts = {genre: testCounts[genre]-truePositiveCounts[genre] for genre in genreSet}
    print_('Distribution of false predictions:', falseNegativeCounts, '\n')
def main():
    """Classify the images in ``images/`` with PCA + one-vs-rest LinearSVC.

    The label of each image is the part of its file name before the first
    underscore. Roughly 70% of the samples (chosen at random) are used for
    training; a confusion table for the remaining samples is printed.
    """
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    # Ids follow the order in which each label first appears.
    label2ids = {v: i for i, v in enumerate(sorted(set(labels), key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = np.array([flatten_image(img_to_matrix(image_file)) for image_file in images])

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    # NOTE(review): RandomizedPCA comes from the file's imports; recent
    # scikit-learn releases no longer ship it -- confirm the pinned version.
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[~is_train], y[~is_train]
    test_X = pca.transform(test_X)
    print(pd.crosstab(test_y, multi_svm.predict(test_X), rownames=['Actual'], colnames=['Predicted']))
def test_ovr_multilabel_predict_proba():
    """Check ``predict_proba`` of a multilabel one-vs-rest classifier.

    Base estimators without a usable ``predict_proba`` must raise
    ``AttributeError``; for a probabilistic base estimator the predicted
    labels must match thresholding the probabilities at 0.5.
    """
    naive_bayes = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50,
            allow_unlabeled=allow_unlabeled, return_indicator=True,
            random_state=0)
        X_train, X_test = X[:80], X[80:]
        Y_train = Y[:80]

        clf = OneVsRestClassifier(naive_bayes).fit(X_train, Y_train)

        # decision function only estimator. Fails in current implementation.
        without_proba = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, without_proba.predict_proba, X_test)

        # Estimator with predict_proba disabled, depending on parameters.
        without_proba = OneVsRestClassifier(svm.SVC(probability=False))
        without_proba.fit(X_train, Y_train)
        assert_raises(AttributeError, without_proba.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        assert_array_equal(Y_proba > .5, Y_pred)
class ACMClassificator(BaseACMClassificator):
    """Multi-label tag classifier for problem statements.

    Statements are turned into token counts, tag sets are binarized, and a
    one-vs-rest ensemble of extra-tree classifiers is fitted on the result.
    """

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        base_tree = ExtraTreeClassifier(
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            class_weight=None,
        )
        self.classificator = OneVsRestClassifier(base_tree, n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` of every problem with the fitted vectorizer."""
        statements = [p.statement for p in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit vectorizer, label binarizer and classifier on problems and their tags."""
        nltk.download('punkt', quiet=True)
        statements = [p.statement for p in problems]
        self.vectorizer.fit(statements)
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        # The count matrix is densified before it is handed to the classifier.
        dense_features = features.toarray()
        self.classificator.fit(dense_features, self.mlb.transform(tags))

    def predict(self, problems):
        """Return the predicted tag tuples for ``problems``."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
def main():
    """Fit a linear one-vs-rest SVM and print 10-fold cross-validation metrics.

    The print statements are written as ``print()`` calls so the function
    parses under Python 3, like the rest of the file.
    """
    dataTuples = getDataInFormat()
    print("Length of dataTuples is: ", len(dataTuples))
    shuffle(dataTuples)
    ids, labels, vectors = getLabelsAndVectors(dataTuples)
    # Drop the raw tuples early; only the derived structures are needed below.
    del dataTuples

    followerCountsList = loadFollowerCountsFromFile()
    space = getSpace(vectors)
    reducedSpace = getReducedSpace(vectors, space)
    # emotionFeatures is expected to be defined at module level.
    spaceWithMetaFeatures = augmentSpace(reducedSpace, emotionFeatures)
    print("Total # of features in your space is: ", len(space))
    print("Total # of features in your reducedSpace is: ", len(reducedSpace))

    oneHotVectors = getOneHotVectors(ids, labels, vectors, spaceWithMetaFeatures, followerCountsList)
    trainVectors, trainLabels = getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors

    clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear',gamma=0.1, verbose= False, probability=False))
    clf.fit(trainVectors, trainLabels)
    print("\nDone fitting classifier on training data...\n")
    print("\nDone fitting classifier on training data...\n")
    print("="*50, "\n")
    print("Results with 10-fold cross validation:\n")
    print("="*50, "\n")

    predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=10)
    print("*"*20)
    print("\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted))
    print("*"*20)
    print("precision_score\t", metrics.precision_score(trainLabels, predicted))
    print("recall_score\t", metrics.recall_score(trainLabels, predicted))
    print("\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted))
    print("\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted))
def test_ovr_partial_fit():
    """Incremental fitting must give the same predictions as a single fit."""
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    incremental_pred = incremental.predict(X)

    batch = OneVsRestClassifier(MultinomialNB())
    batch_pred = batch.fit(X, y).predict(X)

    assert_almost_equal(incremental_pred, batch_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == incremental_pred), 0.65)

    # Test when mini batches doesn't have all classes
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    incremental.partial_fit(iris.data[60:], iris.target[60:])
    incremental_pred = incremental.predict(iris.data)

    batch = OneVsRestClassifier(MultinomialNB())
    batch_pred = batch.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(incremental_pred, batch_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == incremental_pred), 0.65)
def test_ovr_partial_fit():
    """Check ``partial_fit`` against ``fit`` and its availability rule."""

    def make_sgd_ovr():
        # Same deterministic SGD configuration for both fitting routes.
        return OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                                 shuffle=False, random_state=0))

    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    incremental_pred = incremental.predict(X)

    batch = OneVsRestClassifier(MultinomialNB())
    batch_pred = batch.fit(X, y).predict(X)

    assert_almost_equal(incremental_pred, batch_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == incremental_pred), 0.65)

    # Test when mini batches doesn't have all classes
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    incremental = make_sgd_ovr()
    incremental.partial_fit(X[:7], y[:7], np.unique(y))
    incremental.partial_fit(X[7:], y[7:])
    incremental_pred = incremental.predict(X)

    batch = make_sgd_ovr()
    batch_pred = batch.fit(X, y).predict(X)
    assert_equal(np.mean(incremental_pred == y), np.mean(batch_pred == y))

    # test partial_fit only exists if estimator has it:
    no_partial = OneVsRestClassifier(SVC())
    assert_false(hasattr(no_partial, "partial_fit"))
def main():
    """Train a one-vs-rest LinearSVC on GloVe tweet vectors and print results.

    The first 70% of the samples are used for training and the rest for
    testing. The test accuracy is printed, followed by one
    ``"<predicted> <actual>"`` line per test sample.
    """
    word_vec_dict = readGloveData("./glove.twitter.27B/glove.twitter.27B.25d.txt")
    tweets = readTweets("./dataset_raw/semeval2016-task6-trainingdata.txt")
    # The last element of ``tweets`` holds the labels; everything before it
    # is passed on for vectorization.
    tweetVectors = getTweetVectors(tweets[0 : len(tweets) - 1], word_vec_dict)
    print(tweets[0])
    print(getSumVectors(tweets[0], word_vec_dict))

    # NOTE(review): "favor" and "against" both map to 1, so this is a binary
    # stance/no-stance task -- confirm that this is intended.
    mapping = {"favor": 1, "none": 0, "against": 1}
    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print(tweetClasses.shape)
    print(tweetData.shape)

    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC())
    # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543)

    x_split = int(0.7 * len(X))
    y_split = int(0.7 * len(Y))
    X_train, X_test = X[:x_split], X[x_split:]
    y_train, y_test = Y[:y_split], Y[y_split:]

    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

    y_pred = clf.predict(X_test)
    for predicted, actual in zip(y_pred, y_test):
        print(str(predicted) + " " + str(actual))
def svm():
    """Train a one-vs-rest RBF SVC on svmlight data and write its predictions.

    Reads ``12trainset`` and ``12testdata``, prints univariate feature
    selection diagnostics, fits the classifier, writes one prediction per
    line to ``result.txt`` and prints the time elapsed since the
    module-level ``start_time``.
    """
    # load data
    x_train, y_train = load_svmlight_file("12trainset")
    # Pin the test matrix to the training width; load_svmlight_file otherwise
    # infers the column count from each file separately.
    x_test, y_test = load_svmlight_file("12testdata", n_features=x_train.shape[1])

    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # classifier
    # NOTE(review): the model is trained on the full feature set, not on the
    # 9 selected features printed above -- confirm whether that is intended.
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # write result
    with open("result.txt", "w") as fw:
        fw.writelines(str(st) + '\n' for st in y_pred.tolist())

    print(np.array(y_pred).shape)
    print(time.time() - start_time)
def train_svm(X, y):
    """Build a one-vs-rest RBF support vector machine and fit it on ``X``, ``y``.

    Returns the fitted ``OneVsRestClassifier``.
    """
    base_estimator = SVC(C=1000000.0, gamma='auto', kernel='rbf')
    model = OneVsRestClassifier(base_estimator)
    model.fit(X, y)
    return model
def test_classifier_chain_vs_independent_models():
    """A classifier chain should beat independent one-vs-rest models.

    Both are fitted on the first 2000 yeast samples and compared by Jaccard
    similarity on the remaining ones.
    """
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()

    n_train = 2000
    X_train, X_test = X[:n_train, :], X[n_train:, :]
    Y_train, Y_test = Y[:n_train, :], Y[n_train:, :]

    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    Y_pred_ovr = independent.predict(X_test)

    # Even label indices first, then the odd ones.
    label_order = np.array([0, 2, 4, 6, 8, 10, 12,
                            1, 3, 5, 7, 9, 11, 13])
    chain = ClassifierChain(LogisticRegression(), order=label_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    chain_score = jaccard_similarity_score(Y_test, Y_pred_chain)
    ovr_score = jaccard_similarity_score(Y_test, Y_pred_ovr)
    assert_greater(chain_score, ovr_score)
def train(self, trainfile_name):
    """Train a linear one-vs-rest SVM on a tab-separated training file.

    Each line of ``trainfile_name`` is ``label<TAB>clause``. Labels are
    lower-cased, clauses are featurized through ``self.fp`` and the fitted
    classifier, the feature index and the tag set are pickled to
    ``self.trained_model_name``, ``self.feat_index_name`` and
    ``self.stored_tagset``. When ``self.cv`` is set, weighted-F1
    cross-validation scores over ``self.folds`` folds are printed first.

    Progress messages go to standard error.
    """
    print("Reading data..", file=sys.stderr)
    with codecs.open(trainfile_name, "r", "utf-8") as train_file:
        train_data = [tuple(x.strip().split("\t")) for x in train_file]
    shuffle(train_data)
    filter_feature = get_filter()
    train_labels, train_clauses = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]

    print("Indexing features..", file=sys.stderr)
    self.fp.index_data(train_clauses, filter_feature)
    X = numpy.asarray([self.fp.featurize(clause, filter_feature) for clause in train_clauses])
    tagset = list(set(train_labels))
    tag_index = {l: i for (i, l) in enumerate(tagset)}
    # Labels are stored as a single-column matrix.
    Y = numpy.asarray([[tag_index[label]] for label in train_labels])

    classifier = OneVsRestClassifier(SVC(kernel='linear'))
    if self.cv:
        print("Starting Cross-validation for %d folds.." % (self.folds), file=sys.stderr)
        y = [l[0] for l in Y]
        scores = cross_validation.cross_val_score(classifier, X, y, cv=self.folds, scoring='f1_weighted')
        print("Scores:", scores, file=sys.stderr)
        print("Average: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2), file=sys.stderr)

    print("Starting training..", file=sys.stderr)
    classifier.fit(X, Y)

    # Context managers make sure every artifact is flushed and closed.
    with open(self.trained_model_name, "wb") as model_file:
        pickle.dump(classifier, model_file)
    with open(self.feat_index_name, "wb") as index_file:
        pickle.dump(self.fp.feat_index, index_file)
    with open(self.stored_tagset, "wb") as tagset_file:
        pickle.dump(tagset, tagset_file)
    print("Done", file=sys.stderr)
def trainAndPredictLR(trainX, trainY, testX):
    """Predict the 10 most probable labels for every test sample.

    A one-vs-rest logistic regression is fitted on the training data, the
    per-label probabilities of each test sample are computed, and the labels
    with the 10 highest probabilities are returned in decreasing order.
    Indices are translated to labels through the module-level ``classOrder``.

    Input (shapes as stated by the original documentation):
        1. trainX: ntrainingSamples * 2000 numpy matrix of training features
        2. trainY: ntrainingSamples * 185 numpy matrix of training labels
        3. testX: ntestSamples * 2000 numpy matrix of test features

    Output:
        testY: ntestSamples * 10 numpy array with the labels of the test data
    """
    model = OneVsRestClassifier(LogisticRegression(C = 1.0))
    model.fit(trainX, trainY)
    probabilities = model.predict_proba(testX)

    def top_labels(row):
        # Repeatedly take the current maximum and knock it out with -1 so
        # the next iteration finds the runner-up.
        picked = []
        for _ in range(10):
            best = np.argmax(row, axis=0)
            picked.append(classOrder[best])
            row[best] = -1
        return picked

    return np.array([top_labels(row) for row in probabilities])
def benchmark(clf_current):
    """Wrap ``clf_current`` in one-vs-rest, time fit/predict and report F1.

    Uses the module-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test``. Predictions come from ``df_to_preds`` when the base
    estimator has a ``decision_function`` and from ``probs_to_preds``
    otherwise, both with ``k = 5``.

    Returns:
        tuple: ``(description, f1_score, train_time, test_time)``.
    """
    separator = '_' * 80
    print(separator)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)

    fit_start = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    if hasattr(clf_current, "decision_function"):
        predicted = df_to_preds(classif.decision_function(X_test), k = 5)
    else:
        predicted = probs_to_preds(classif.predict_proba(X_test), k = 5)
    score = metrics.f1_score(Y_test.toarray(), predicted)
    test_time = time() - predict_start

    print("f1-score: %0.7f" % score)
    print("test time: %0.3fs" % test_time)
    print(separator)
    return clf_descr, score, train_time, test_time
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
    """Train a multi-label sentence classifier and write one XML file per document.

    Args:
        sentences: Training sentences, vectorized with ``tf_idf_fit_transform``.
        labels: One iterable of class labels per training sentence.
        test_doc_list: Documents to classify.
        output_file_path_list: Output path for each document, in the same order.

    Every output file has a ``<doc>`` root with one ``<Sent>`` element per
    sentence; its ``classes`` attribute lists the predicted labels joined by
    ``", "``.
    """
    # Imports are done once up front instead of on every loop iteration.
    from lxml import etree
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import LinearSVC

    train_matrix, tfidf = tf_idf_fit_transform(sentences)
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)

    for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
        test_sentences = doc2sentences([test_doc])
        sentence_matrix = tfidf.transform(test_sentences)
        print("Shape of sentence matrix : ", sentence_matrix.shape)
        predictions = classifier.predict(sentence_matrix)

        document = etree.Element('doc')
        doc_tree = etree.ElementTree(document)
        for sentence, indicator_row in zip(test_sentences, predictions):
            curr_pred = [name for name, flag in zip(mlb.classes_, indicator_row) if flag == 1]
            etree.SubElement(document, "Sent", classes=", ".join(curr_pred)).text = sentence
        doc_tree.write(output_file_path)
class ClassDistanceMapper(TransformerMixin):
    """Map samples to their distances from the per-class decision boundaries.

    A one-vs-rest classifier is fitted for each sentiment class (against all
    others combined); ``transform`` returns its decision-function values. The
    transformation can therefore be seen as a dimensionality reduction from
    #words to #sentiment_classes (=5).
    """

    def __init__(self):
        """Create the one-vs-rest classifier around a ``LogisticRegression``.

        The choice of base classifier is arbitrary; any other classifier
        that provides a decision function might work as well.
        """
        base_classifier = LogisticRegression()
        self.clf = OneVsRestClassifier(base_classifier)

    def fit(self, X, y):
        """Fit the multiclass classifier and return ``self``."""
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        """Return, per sample, the decision-function value for each class."""
        distances = self.clf.decision_function(X)
        return distances
def fit(self, df_X, df_y):
    """Compute signed random-forest feature importances per class.

    Args:
        df_X: DataFrame of features, one row per region.
        df_y: DataFrame with exactly one label column and the same number
            of rows as ``df_X``.

    Raises:
        ValueError: If the row counts differ or ``df_y`` does not have
            exactly one column.

    The result is stored in ``self.act_``: a DataFrame indexed by the feature
    names with one column per class. For more than two classes the
    importances come from the per-class forests of a one-vs-rest wrapper;
    for two classes the single forest's importances are used for both
    columns. Each importance is negated when the 75% quantile of the feature
    inside the class is lower than outside it.
    """
    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")

    le = LabelEncoder()
    y = le.fit_transform(df_y.iloc[:,0].values)
    clf = RandomForestClassifier(n_estimators=100)

    # Multiclass
    if len(le.classes_) > 2:
        orc = OneVsRestClassifier(clf)
        orc.fit(df_X.values, y)
        importances = np.array([c.feature_importances_ for c in orc.estimators_]).T
    else:
        # Only two classes
        clf.fit(df_X.values, y)
        importances = np.array([
            clf.feature_importances_,
            clf.feature_importances_
        ]).T

    for i in range(len(le.classes_)):
        # ``y`` holds encoded labels (0..n-1), so the mask has to compare
        # against the encoded index, not the original class label.
        in_class = y == i
        diff = df_X.loc[in_class].quantile(q=0.75) - df_X.loc[~in_class].quantile(q=0.75)
        sign = (diff >= 0) * 2 - 1
        importances[:, i] *= sign.values

    # create output DataFrame
    self.act_ = pd.DataFrame(importances,
                             columns=le.classes_,
                             index=df_X.columns)
def setUp(self):
    """Fit one-vs-rest SVCs on binarized iris and cache reference outputs.

    Stores the raw data, the binarized target, the ``ModelFrame`` and the
    prediction / probability / decision-function outputs of a separately
    fitted reference estimator on ``self``.
    """
    import sklearn.svm as svm
    import sklearn.preprocessing as pp
    from sklearn.multiclass import OneVsRestClassifier

    def make_estimator():
        # Fresh, identically configured estimator for each fitting route.
        return OneVsRestClassifier(
            svm.SVC(probability=True, random_state=self.random_state))

    # iris target binarized into one indicator column per class
    iris = datasets.load_iris()
    self.data = iris.data
    self.target = pp.LabelBinarizer().fit_transform(iris.target)
    self.df = pdml.ModelFrame(self.data, target=self.target)
    self.assertEqual(self.df.shape, (150, 7))

    frame_estimator = make_estimator()
    self.df.fit(frame_estimator)
    self.df.predict(frame_estimator)
    self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

    reference = make_estimator()
    reference.fit(self.data, self.target)
    self.pred = reference.predict(self.data)
    self.proba = reference.predict_proba(self.data)
    self.decision = reference.decision_function(self.data)

    # argument for classification reports
    self.labels = np.array([2, 1, 0])
def one_vs_all(X, y, test_size=0.2, run_num = 100, svm_type='linear'):
    """Repeatedly train a one-vs-all SVM on random train/test splits.

    Args:
        X: Feature matrix.
        y: Label array; it is flattened with ``ravel()`` before use.
        test_size: Fraction of the data held out in every run.
        run_num: Number of random splits to evaluate.
        svm_type: ``'linear'`` selects ``LinearSVC``; any other value selects
            ``SVC`` with its default (RBF) kernel.

    Returns:
        tuple: ``(ovr, acc_tr, acc_tst)`` -- the classifier as fitted in the
        last run and the lists of training and test accuracies per run.
    """
    # The parameter has to be tested here: comparing the builtin ``type``
    # with a string is always False and silently selected the RBF kernel.
    if svm_type == 'linear':
        estimator = LinearSVC()
    else:
        # This will automatically use RBF functions
        estimator = SVC()
    ovr = OneVsRestClassifier(estimator = estimator)

    acc_tr = []
    acc_tst = []
    for _ in range(run_num):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        # Train the classifier
        ovr.fit(X_train, y_train.ravel())
        # Nothing is optimised on these scores; they only give an idea of
        # the training vs. test accuracy.
        acc_tr.append(ovr.score(X_train, y_train.ravel()))
        acc_tst.append(ovr.score(X_test, y_test.ravel()))
    # All the data isn't used here as it tends to overtrain the classifier.
    return ovr, acc_tr, acc_tst
def train_linear(X, Y, splits, model_config, results_dir, best_k=10, validation_score='f1', threshold_score='f1', threshold_criterion='zack', fn_prefix='', label_idx=None):
    """Grid-search ``C`` for a one-vs-rest logistic regression.

    Twenty values of ``C`` between 1e-3 and 1e3 (log-spaced) are tried. Each
    model is fitted on ``splits[0]`` and scored on ``splits[1]`` with
    ``compute_micro_evaluations``; the model with the highest
    ``validation_score`` is kept. A ``KeyboardInterrupt`` during fitting
    stops the search early and keeps the best model found so far.

    Args:
        X, Y: Features and label indicator matrix.
        splits: Sequence whose first two items index the train and
            validation samples.
        model_config: Dict updated in place with the selected ``'C'``.
        results_dir: Directory receiving ``<fn_prefix>-model.pkl``.
        best_k, threshold_score, threshold_criterion: Forwarded to
            ``compute_micro_evaluations``.
        validation_score: Key of the evaluation result used for selection.
        fn_prefix: Prefix of the pickle file name.
        label_idx: Label columns to evaluate; defaults to all columns.

    Returns:
        tuple: ``(best_model, model_config)``.
    """
    label_idx = np.arange(Y.shape[1]) if label_idx is None else label_idx
    best_perf = None
    best_C = None
    best_model = None
    for C in np.logspace(-3,3, num=20):
        sys.stdout.write('Training Ridge Regression with C={0}...'.format(C))
        sys.stdout.flush()
        model = OneVsRestClassifier(LogisticRegression(C=C))
        try:
            model.fit(X[splits[0]], Y[splits[0]])
        except KeyboardInterrupt:
            sys.stdout.write('training interrupted...')
            break

        Yp = model.predict_proba(X[splits[1]])
        perf = compute_micro_evaluations(Y[splits[1]][:,label_idx], Yp[:,label_idx], k=best_k, threshold_score=threshold_score, criterion=threshold_criterion)
        sys.stdout.write(' {0}={1:.4f}'.format(validation_score, perf[validation_score]))
        sys.stdout.flush()
        if best_perf is None or perf[validation_score] > best_perf[validation_score]:
            best_perf = perf
            best_model = model
            best_C = C
            sys.stdout.write(' *BEST')
        sys.stdout.write('\n')

    model_config['C'] = best_C
    model_path = os.path.join(results_dir, fn_prefix + '-model.pkl')
    # The context manager closes the file even if pickling fails.
    with open(model_path, 'wb') as model_file:
        cPickle.dump(best_model, model_file)
    return best_model, model_config
def make_classifier():
    """Train a one-vs-rest linear SVC for chords and pickle it.

    The data from ``make_X_Y()`` is cast to ``int`` and split with
    ``test_size=0``. For any held-out samples the two most probable chords
    are printed next to the real one, followed by the top-1 accuracy. The
    fitted classifier is written to ``classifier.bin``.
    """
    test_size=0
    X, y = make_X_Y()
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size)
    X_train = X_train.astype(int)
    X_test = X_test.astype(int)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    # NOTE(review): class_weight='auto' is only accepted by old scikit-learn
    # releases ('balanced' is the current spelling) -- confirm the version.
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight='auto', probability=True))
    clf.fit(X_train, y_train)

    try:
        y_suggest = clf.predict_proba(X_test)
        nn = 0
        n = 0
        for y_s, y_t in zip(y_suggest, y_test):
            s1 = chords_Y[np.argmax(y_s)]
            # Zero the best score so the next argmax yields the runner-up.
            y_s[np.argmax(y_s)]=0
            s2 = chords_Y[np.argmax(y_s)]
            t = chords_Y[np.argmax(y_t)]
            print('Suggest: ' + s1 + ' or ' + s2 + ' Real: ' + t)
            n = n+1
            if s1==t:
                nn = nn+1
        if n>0:
            print('Accuracy is ' + str(float(nn)/n))
    except ValueError:
        # Presumably tolerates an empty test split (test_size is 0); the
        # evaluation is best-effort and must not prevent saving the model.
        pass

    with open("classifier.bin", "wb") as model_file:
        pickle.dump(clf, model_file)
def main():
    """Train a one-vs-rest LinearSVC on GloVe tweet vectors and print its accuracy.

    The first 70% of the samples are used for training, the rest for testing.
    """
    word_vec_dict = readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')
    # The last element of ``tweets`` holds the labels; everything before it
    # is passed on for vectorization.
    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print(tweets[0])
    print(getSumVectors(tweets[0], word_vec_dict))

    # NOTE(review): 'favor' and 'against' both map to 1, so this is a binary
    # stance/no-stance task -- confirm that this is intended.
    mapping = {'favor': 1, 'none': 0, 'against': 1}
    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print(tweetClasses.shape)
    print(tweetData.shape)

    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))

    x_split = int(0.7 * len(X))
    y_split = int(0.7 * len(Y))
    X_train, X_test = X[:x_split], X[x_split:]
    y_train, y_test = Y[:y_split], Y[y_split:]

    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
def train_data_SVC(X, y):
    """Fit a one-vs-rest linear support vector classifier on ``X``, ``y``.

    Returns the fitted ``OneVsRestClassifier``.
    """
    linear_svc = LinearSVC()
    model = OneVsRestClassifier(linear_svc)
    model.fit(X, y)
    return model
def prepare_multiclass_clf(X, y):
    """Fit a one-vs-rest classifier of grid-searched L1 logistic regressions.

    For every class a 5-fold grid search over ten log-spaced values of ``C``
    in [1e-4, 1e2] selects the regularization strength by accuracy.

    Returns the fitted ``OneVsRestClassifier``.
    """
    # The L1 penalty needs a solver that supports it; the solver is named
    # explicitly so this does not depend on the library's default solver.
    base = LogisticRegression(penalty='l1', solver='liblinear')
    clf = GridSearchCV(base, {'C': np.logspace(-4, 2, 10)}, scoring='accuracy', cv=5)
    multi_clf = OneVsRestClassifier(clf)
    multi_clf.fit(X, y)
    return multi_clf
def ml_train(datasetFilePath, falsePredictionsFilePath, unknownPredictionsFilePath, confusionMatricesDir, classifierFilePath):
    """Train, evaluate and export the one-vs-rest linear SVC.

    Args:
        datasetFilePath: Data set to load with ``load_dataset``.
        falsePredictionsFilePath: Passed to ``produce_output``.
        unknownPredictionsFilePath: Passed to ``produce_output``.
        confusionMatricesDir: Directory handed to ``plot_confusion_matrices``.
        classifierFilePath: Destination of the dumped classifier.
    """
    logger.info("start of training and testing phase")
    classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True), n_jobs=NUMBER_OF_CPUS_TO_USE)

    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)
    # limit_dataset(dataset) is deliberately not applied; the full set is used.
    ml_dataset = split_dataset(dataset, len(features_names))

    logger.info("fitting training set X_train - %s, y_train - %s" % (ml_dataset.X_train.shape, ml_dataset.y_train.shape))
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)

    logger.info("predicting test set X_test - %s, y_test - %s" % (ml_dataset.X_test.shape, ml_dataset.y_test.shape))
    y_pred = classifier.predict(ml_dataset.X_test)
    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)
    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = process_prediction_vector(ml_dataset.y_test, y_pred, y_pred_probabilities)

    def class_names():
        # A fresh list per call: the fitted classes plus the extra "unknown" class.
        return list(classifier.classes_) + ["unknown"]

    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls, y_pred_fictive, class_names())

    confusion_inputs = (
        (y_pred, "1"),
        (y_pred_with_unknown_cls, "2"),
        (y_pred_fictive, "3"),
    )
    for predicted, tag in confusion_inputs:
        plot_confusion_matrices(ml_dataset.y_test, predicted, class_names(), confusionMatricesDir, tag)

    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs, ml_dataset.test_terms_name, falsePredictionsFilePath, unknownPredictionsFilePath)

    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)
    logger.info("end of training and testing phase")
def run_naive_bayes(cls, train, test, binarizer, labels, alpha):
    """Fit a one-vs-rest multinomial naive Bayes and score it on the test set.

    Args:
        train: ``(data, labels)`` pair used for fitting.
        test: ``(data, labels)`` pair used for evaluation.
        binarizer: Label binarizer forwarded to ``cls.predict``.
        labels: Iterable of label indicator rows; the indices of their
            non-zero entries form the set of possible labels.
        alpha: Smoothing parameter of ``MultinomialNB``.

    Returns:
        Sample-averaged precision of the test predictions.
    """
    logger = configure_log(__file__)
    logger.info("alpha = %s" % (str(alpha)))
    logger.info("Fitting Naive Bayes...")

    train_data, train_labels = train
    test_data, test_labels = test

    classifier = OneVsRestClassifier(MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None))
    with warnings.catch_warnings():
        # FIXME: split the data set in a way that the train set has every label
        warnings.simplefilter("ignore")
        classifier.fit(train_data, train_labels)

    # Built eagerly with a set comprehension: map() is lazy on Python 3, so
    # using it for its side effect would leave the set empty.
    possible_labels = {index for label in labels for index in label.nonzero()[0]}

    logger.info("Predicting test set...")
    test_predictions = cls.predict(
        classifier=classifier,
        data=test_data,
        labels=test_labels,
        possible_labels=possible_labels,
        binarizer=binarizer,
    )

    test_precision = precision_score(y_true=test_labels, y_pred=test_predictions, average="samples")
    return test_precision
def run_classifier(sentences, labels, test_docs):
    """Train a multi-label sentence classifier and write predictions to CSV.

    Args:
        sentences: Training sentences, vectorized with ``tf_idf_fit_transform``.
        labels: One iterable of class labels per training sentence.
        test_docs: Documents whose sentences are classified.

    Writes ``classified.csv`` with one ``(sentence, predicted labels)`` row
    per test sentence.
    """
    import csv

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    # The class is spelled LinearSVC; ``linearSVC`` does not exist and made
    # the import fail.
    from sklearn.svm import LinearSVC

    train_matrix, tfidf = tf_idf_fit_transform(sentences)
    test_sentences = doc2sentences(test_docs)
    sentence_matrix = tfidf.transform(test_sentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)

    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)
    predictions = classifier.predict(sentence_matrix)

    # newline="" lets the csv module control line endings itself.
    with open("classified.csv", "w", newline="") as fl:
        writer = csv.writer(fl)
        for sentence, indicator_row in zip(test_sentences, predictions):
            curr_pred = [name for name, flag in zip(mlb.classes_, indicator_row) if flag == 1]
            writer.writerow((sentence, curr_pred))
def svm_fixed(train_X, train_Y):
    """Fit a one-vs-rest SVC with fixed hyper-parameters.

    The base estimator is ``SVC(C=1., kernel='linear', gamma=0.01)``.

    Returns the fitted ``OneVsRestClassifier``.
    """
    svc_params = {'C': 1., 'kernel': 'linear', 'gamma': 0.01}
    model = OneVsRestClassifier(SVC(**svc_params))
    model.fit(train_X, train_Y)
    return model
# Minimal multi-label example: binarize label sets, fit a one-vs-rest SVC and
# print a classification report for the held-out samples.
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC

x = [[1,2,3],[3,3,2],[8,8,7],[3,7,1],[4,5,6]]
y = [['bar','foo'],['bar'],['foo'],['foo','jump'],['bar','fox','jump']]

# Turn the label lists into a binary indicator matrix (one column per label).
mlb = MultiLabelBinarizer()
y_enc = mlb.fit_transform(y)

train_x, test_x, train_y, test_y = train_test_split(x, y_enc, test_size=0.33)

clf = OneVsRestClassifier(SVC(probability=True))
clf.fit(train_x, train_y)
predictions = clf.predict(test_x)

my_metrics = metrics.classification_report(test_y, predictions)
print(my_metrics)
def main():
    """Predict movie genres from plot summaries and print sample results.

    Plot summaries from OMDb are joined with the Wikidata genres on
    ``imdb_id``, cleaned (lower-cased, punctuation and stop words removed),
    vectorized with tf-idf and fed to a one-vs-rest logistic regression.
    A genre is predicted when its probability is at least 0.25. The micro F1
    score on a 20% validation split and ten random example predictions are
    printed.
    """
    # Read JSON files into Pandas DataFrames
    print('Reading data into DataFrames...')
    omdb_filename = "./movies/data/omdb-data.json.gz"
    rotten_filename = "./movies/data/rotten-tomatoes.json.gz"
    wikidata_filename = "./movies/data/wikidata-movies.json.gz"
    genres_filename = "./movies/data/genres.json.gz"
    omdb = pd.read_json(omdb_filename, lines=True)
    # The Rotten Tomatoes data is loaded but not used further below.
    rotten = pd.read_json(rotten_filename, lines=True)
    wikidata = pd.read_json(wikidata_filename, lines=True)
    genres = pd.read_json(genres_filename, lines=True)

    # Convert genres DataFrame to a dictionary of genre_code:genre_label pairs.
    genre_map = pd.Series(genres.genre_label.values, index=genres.wikidata_id).to_dict()

    # Create DataFrame of plot summaries with corresponding imdb id
    plot_summaries = omdb[['imdb_id', 'omdb_plot']]
    plot_summaries = plot_summaries.sort_values(by=['imdb_id'])
    plot_summaries = plot_summaries.set_index('imdb_id')
    wikidata = wikidata.sort_values(by=['imdb_id'])
    wikidata = wikidata.set_index('imdb_id')
    wikidata = wikidata[['genre']]

    # Clean data.
    print('Cleaning data...')
    movies_data = pd.merge(wikidata, plot_summaries, on='imdb_id')
    # Remove movies with no plot summary. The explicit copy keeps the column
    # assignments below from writing into a slice of the merged frame.
    movies_data = movies_data[movies_data['omdb_plot'] != 'N/A'].copy()
    # Convert plot summaries to lowercase.
    movies_data['omdb_plot'] = movies_data['omdb_plot'].str.lower()
    # Remove all punctuations in plot summaries.
    movies_data['omdb_plot'] = movies_data['omdb_plot'].apply(remove_punctuations)
    # Tokenize strings
    movies_data['omdb_plot'] = movies_data['omdb_plot'].apply(tokenize)

    # Remove stop words. A set gives constant-time membership tests per token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(('platform', 'film'))
    movies_data['omdb_plot'] = movies_data['omdb_plot'].apply(
        lambda x: [word for word in x if word not in stop_words])
    movies_data['clean_summary'] = movies_data['omdb_plot'].apply(
        lambda x: ' '.join(x))

    # Get the distinct genres over all movies.
    distinct_genres = list({code for codes in movies_data['genre'] for code in codes})

    # Add one column per distinct genre, counting how often each movie is
    # associated with that genre. ``movies_data2`` is the same object as
    # ``movies_data``, not a copy.
    movies_data2 = movies_data
    for genre_code in distinct_genres:
        movies_data2[genre_code] = 0
    for index, row in movies_data2.iterrows():
        for genre_code in row['genre']:
            movies_data2.loc[index, genre_code] = movies_data2.loc[index, genre_code] + 1

    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(movies_data2['genre'])

    # Extract features from cleaned plot summaries by using tf-idf.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=500)

    # Split data into train and validation data sets.
    X = movies_data2['clean_summary']
    y = multilabel_binarizer.transform(movies_data2['genre'])
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=9)

    # Create tf-idf features.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

    # Build the genre prediction model.
    print('Building prediction model...')
    lr = LogisticRegression()
    model = OneVsRestClassifier(lr)

    # Train the model.
    print('Training model...')
    model.fit(X_train_tfidf, y_train)

    # Predict on validation data set.
    print('Making predictions...')
    y_prediction = model.predict_proba(X_valid_tfidf)
    y_prediction = (y_prediction >= 0.25).astype(int)
    predictions = multilabel_binarizer.inverse_transform(y_prediction)
    print('\nPredicted genre codes: ')
    res = pd.Series(predictions)
    print(res)
    print('\nf1 score: {}\n'.format(
        f1_score(y_valid, y_prediction, average="micro")))

    # Show 10 genre predictions, and compare it with the actual genres.
    def make_predictions(data):
        """Return the predicted genre-code tuples for one cleaned summary."""
        data_tfidf = tfidf_vectorizer.transform([data])
        data_prediction = model.predict_proba(data_tfidf)
        data_prediction = (data_prediction >= 0.25).astype(int)
        return multilabel_binarizer.inverse_transform(data_prediction)

    for _ in range(10):
        data = X_valid.sample(1).index[0]
        predicted_genre = make_predictions(X_valid[data])
        actual_genre = movies_data2['genre'][data]
        predicted_genre_labels = [
            genre_map.get(code)
            for code_set in predicted_genre
            for code in code_set
        ]
        actual_genre_labels = [genre_map.get(code) for code in actual_genre]
        print('IMDB ID: {}'.format(data))
        print('\tReal genre(s): {}'.format(actual_genre_labels))
        print('\tPredicted genre(s): {}\n'.format(predicted_genre_labels))
# Train and evaluate two classifiers on the encoded picture features.
# ``data`` and ``ground_truth`` are expected to be defined before this point.
y = data[["Encodings"]]
X = data.drop(["Encodings", "Pictures"], axis=1)

# One-vs-rest SVC evaluated with 5-fold cross-validation on features that
# were min-max scaled to [-1, 1].
clf = OneVsRestClassifier(SVC(gamma="auto", probability=False, C=400))
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X)
X_scaled = scaling.transform(X)
print("CV =", np.mean(cross_val_score(clf, X_scaled, y.values.ravel(), cv=5)))


def accuracy(pred, actual):
    """Return the fraction of predictions that equal the actual values."""
    # Calculate the accuracy of the predicted values (a fraction in [0, 1])
    return sum(pred == actual) / len(actual)


# Refit the same model on all training data and score it on the test set,
# reusing the scaler that was fitted on the training features.
clf = OneVsRestClassifier(SVC(gamma="auto", probability=False, C=400))
clf.fit(X_scaled, y.values.ravel())
X_test = pd.read_pickle("test_with_feature.pkl").drop(["Pictures"], axis=1)
X_test_scaled = scaling.transform(X_test)
pred = clf.predict(X_test_scaled)
test_acc = accuracy(pred, ground_truth)
print(test_acc)
# Bare expression: it only displays a value in an interactive session.
pred

# Random Forest (trained and evaluated on the unscaled features)
clf = RandomForestClassifier(n_estimators=1000, max_depth=18, random_state=42)
# print("CV =", np.mean(cross_val_score(clf, X, y.values.ravel(), cv=5)))
clf.fit(X, y.values.ravel())
pred = clf.predict(X_test)
test_acc = accuracy(pred, ground_truth)
print(test_acc)
def model1(self):
    """SVM model.

    Fits a one-vs-rest polynomial (degree 2) SVC on the data returned by
    ``self._split_data()``, plots per-class, micro-average and macro-average
    ROC curves, saves the figure under ``self.path_to_file`` and prints
    ROC AUC summaries.
    """
    X, y = self._split_data()

    # Binarize the output
    y = label_binarize(y, classes=[0, 1, 2, 3])
    n_classes = y.shape[1]

    # shuffle and split training and test sets
    # (done once; the original repeated the binarize + split with the same
    # seed after fitting, which only recomputed the same split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_size, random_state=self.random_seed)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(
        svm.SVC(kernel='poly', degree=2, probability=True, tol=1e-6,
                random_state=self.random_seed))  # , gamma= 0.1))
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                              y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points.
    # np.interp is used directly: the bare `interp` name was SciPy's
    # deprecated alias for the same NumPy function.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    colors = cycle(['cyan', 'magenta', 'yellow', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                       ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.savefig(self.path_to_file +
                'Figures/Funk-OneVsRestClassifier-SVM-poly-ROC_.png')
    plt.show()

    # NOTE(review): y_test is the binarized indicator matrix here, so
    # roc_auc_score presumably treats it as a multilabel target, in which
    # case `multi_class` would have no effect and the "ovo"/"ovr" numbers
    # would coincide - confirm against the scikit-learn documentation.
    y_prob = classifier.predict_proba(X_test)
    macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                      average="macro")
    weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
                                         average="weighted")
    macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                      average="macro")
    weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr",
                                         average="weighted")
    print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)".format(macro_roc_auc_ovo,
                                            weighted_roc_auc_ovo))
    print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)".format(macro_roc_auc_ovr,
                                            weighted_roc_auc_ovr))
# Excerpt note: the statements up to the second `return` read `cat` and
# `titanic_df` and use `return`, so they appear to be the tail of a function
# whose `def` line is outside this excerpt - presumably `transform_data`,
# which is called further down with (path, 1) and (path, 0); confirm.
# With cat == 1 the "Survived" column is split off as Y and (X, Y) is
# returned; otherwise the whole frame is returned as X. Both are cast to int.
# The script part then fits a one-vs-rest DTC(max_depth=10) on the training
# data, predicts on X_test and writes the predictions to "y_score.csv".
# NOTE(review): `as_matrix()` was removed in pandas 1.0 and the `np.int`
# alias in NumPy 1.24; on current versions these need `.to_numpy()` / `int`.
# NOTE(review): `forest` is created but not used anywhere in this excerpt.
print(titanic_df.head()) if (cat == 1): X = titanic_df.drop("Survived", axis=1) Y = titanic_df["Survived"] X = X.as_matrix().astype(np.int) Y = Y.as_matrix().astype(np.int) return X, Y else: X = titanic_df X = X.as_matrix().astype(np.int) return X train_location = "C:/Users/Oliver Crosbie Higgs/Documents/personal projects/train.csv" test_location = "C:/Users/Oliver Crosbie Higgs/Documents/personal projects/test.csv" X_train, Y_train = transform_data(train_location, 1) X_test = transform_data(test_location, 0) forest = RandomForestClassifier(n_estimators=250, random_state=0) model1a = DTC(max_depth=10) classifier = OneVsRestClassifier(model1a) y_score = classifier.fit(X_train, Y_train).predict(X_test.astype(int)) np.savetxt("y_score.csv", y_score, delimiter=",")
# Restore a previously captured RNG state before shuffling the labels.
# NOTE(review): presumably `data` was shuffled earlier from the same state so
# that rows and labels stay aligned - confirm against the preceding code.
np.random.set_state(state)
np.random.shuffle(label)

# First `train_num` rows are used for training, the next `test_num` for testing.
train_num = 1500
test_num = 1500
data_train = data[0:train_num, ]
label_train = label[0:train_num, ]
data_test = data[train_num:train_num + test_num, ]
label_test = label[train_num:train_num + test_num, ]

## multi classification
model_0 = OneVsRestClassifier(
    SVC(kernel='linear', probability=True, gamma='scale'))
model_0.fit(data_train, label_train)

# Test set: turn the per-class probabilities into a one-hot matrix with a
# single 1 at the most probable class of every row.
pre_0 = model_0.predict_proba(data_test)
max_ind = np.argmax(pre_0, axis=1)
pre = np.zeros_like(pre_0)
pre[np.arange(pre.shape[0]), max_ind] = 1

# Training set: same conversion. The one-hot matrix is sized from the
# training probabilities themselves; it was previously sized from the test
# probabilities (pre_0), which only worked because train_num == test_num.
pre_train0 = model_0.predict_proba(data_train)
max_ind_train = np.argmax(pre_train0, axis=1)
pre_train = np.zeros_like(pre_train0)
pre_train[np.arange(max_ind_train.shape[0]), max_ind_train] = 1
# Binarize the three class labels so the task is posed like a multi-label one.
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Hold out half of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.5, random_state=random_state)

# OneVsRestClassifier provides the multi-label prediction.
from sklearn.multiclass import OneVsRestClassifier

# Fit on the training half, then score the held-out half.
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)

###############################################################################
# The average precision score in multi-label settings
# ....................................................
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Per-class precision/recall curves, keyed by class index.
precision, recall, average_precision = {}, {}, {}
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(
        Y_test[:, i],
        y_score[:, i],
    )
# Full feature matrix: numeric columns followed by the categorical block.
x_all = np.hstack((x_num_all, fac_x_cat_all))

# train-test split (75% of the rows are held out for testing)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=.75, random_state=24)

# NOTE: change classifier here
# max_features='sqrt' is what 'auto' meant for RandomForestClassifier; the
# 'auto' spelling is deprecated in newer scikit-learn releases.
clf1 = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=250, max_features='sqrt', n_jobs=4, max_depth=5))
clf2 = OneVsRestClassifier(AdaBoostClassifier(n_estimators=250,
                                              algorithm='SAMME'))
clf3 = OneVsRestClassifier(GaussianNB())
clf4 = OneVsRestClassifier(DecisionTreeClassifier())
# clf5 = OneVsRestClassifier(svm.SVC(gamma=2))

# training
# Single-argument, parenthesised print produces the same output under
# Python 2 and is also valid Python 3.
st = time.time()
print("training started")
clf1.fit(x_train, y_train)
clf2.fit(x_train, y_train)
clf3.fit(x_train, y_train)
clf4.fit(x_train, y_train)
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

# predictions
pred1 = clf1.predict(x_test)
pred2 = clf2.predict(x_test)
pred3 = clf3.predict(x_test)
pred4 = clf4.predict(x_test)

# NOTE: change to decision_function or predict_proba depending on the
# classifier
pred = pred2
class DialectIdentifier(object):
    """A class for training, evaluating and running the dialect identification
    model described by Salameh et al. After initializing an instance, you must
    run the train method once before using it.

    Args:
        labels (:obj:`set` of :obj:`str`, optional): The set of dialect labels
            used in the training data in the main model.
            If None, the default labels are used.
            Defaults to None.
        labels_extra (:obj:`set` of :obj:`str`, optional): The set of dialect
            labels used in the training data in the extra features model.
            If None, the default labels are used.
            Defaults to None.
        char_lm_dir (:obj:`str`, optional): Path to the directory containing
            the character-based language models. If None, use the language
            models that come with this package.
            Defaults to None.
        word_lm_dir (:obj:`str`, optional): Path to the directory containing
            the word-based language models. If None, use the language models
            that come with this package.
            Defaults to None.
    """

    def __init__(self, labels=None,
                 labels_extra=None,
                 char_lm_dir=None,
                 word_lm_dir=None):
        if labels is None:
            labels = _DEFAULT_LABELS
        if labels_extra is None:
            labels_extra = _DEFAULT_LABELS_EXTRA
        if char_lm_dir is None:
            char_lm_dir = _CHAR_LM_DIR
        if word_lm_dir is None:
            word_lm_dir = _WORD_LM_DIR

        self._labels = labels
        self._labels_extra = labels_extra
        # Sorted copies fix the order of the LM feature columns and of the
        # scores reported by predict().
        self._labels_sorted = sorted(labels)
        self._labels_extra_sorted = sorted(labels_extra)

        self._char_lms = collections.defaultdict(kenlm.Model)
        self._word_lms = collections.defaultdict(kenlm.Model)
        self._load_lms(char_lm_dir, word_lm_dir)

        self._is_trained = False

    def _load_lms(self, char_lm_dir, word_lm_dir):
        """Load one character LM and one word LM ('<label>.arpa') per label."""
        config = kenlm.Config()
        config.show_progress = False
        config.arpa_complain = kenlm.ARPALoadComplain.NONE

        for label in self._labels:
            char_lm_path = Path(char_lm_dir, '{}.arpa'.format(label))
            word_lm_path = Path(word_lm_dir, '{}.arpa'.format(label))
            self._char_lms[label] = kenlm.Model(str(char_lm_path), config)
            self._word_lms[label] = kenlm.Model(str(word_lm_path), config)

    def _get_char_lm_scores(self, txt):
        """Score `txt` with every character LM, in sorted-label order."""
        chars = _word_to_char(txt)
        return np.array([
            self._char_lms[label].score(chars, bos=True, eos=True)
            for label in self._labels_sorted
        ])

    def _get_word_lm_scores(self, txt):
        """Score `txt` with every word LM, in sorted-label order."""
        return np.array([
            self._word_lms[label].score(txt, bos=True, eos=True)
            for label in self._labels_sorted
        ])

    def _get_lm_feats(self, txt):
        """Return one row of normalized word-LM scores followed by
        normalized character-LM scores for `txt`."""
        word_lm_scores = self._get_word_lm_scores(txt).reshape(1, -1)
        word_lm_scores = _normalize_lm_scores(word_lm_scores)
        char_lm_scores = self._get_char_lm_scores(txt).reshape(1, -1)
        char_lm_scores = _normalize_lm_scores(char_lm_scores)
        feats = np.concatenate((word_lm_scores, char_lm_scores), axis=1)
        return feats

    def _get_lm_feats_multi(self, sentences):
        """Stack the LM feature rows of all `sentences` into one matrix.

        The width follows the rows produced by `_get_lm_feats` (word scores
        plus character scores, one of each per label) instead of the
        previously hard-coded 52, which was only right for 26 labels.
        """
        feats_list = [self._get_lm_feats(sentence) for sentence in sentences]
        if not feats_list:
            # No sentences: keep returning an empty matrix of the right width.
            return np.empty((0, 2 * len(self._labels_sorted)))
        return np.concatenate(feats_list, axis=0)

    def _prepare_sentences(self, sentences):
        """Build the final feature matrix: n-gram TF-IDF features, LM
        features and the extra classifier's class probabilities."""
        tokenized = [
            ' '.join(simple_word_tokenize(dediac_ar(s))) for s in sentences
        ]
        sent_array = np.array(tokenized)
        x_trans = self._feat_union.transform(sent_array)
        x_trans_extra = self._feat_union_extra.transform(sent_array)
        x_predict_extra = self._classifier_extra.predict_proba(x_trans_extra)
        x_lm_feats = self._get_lm_feats_multi(sentences)
        x_final = sp.sparse.hstack((x_trans, x_lm_feats, x_predict_extra))
        return x_final

    def train(self, data_path=None,
              data_extra_path=None,
              char_ngram_range=(1, 3),
              word_ngram_range=(1, 1),
              n_jobs=None):
        """Trains the model on a given data set.

        Args:
            data_path (:obj:`str`, optional): Path to main training data.
                If None, use the provided training data.
                Defaults to None.
            data_extra_path (:obj:`str`, optional): Path to extra features
                training data. If None, use the provided training data.
                Defaults to None.
            char_ngram_range (:obj:`tuple`, optional): The n-gram ranges to
                consider in the character-based language models.
                Defaults to (1, 3).
            word_ngram_range (:obj:`tuple`, optional): The n-gram ranges to
                consider in the word-based language models.
                Defaults to (1, 1).
            n_jobs (:obj:`int`, optional): The number of parallel jobs to use
                for computation. If None, then only 1 job is used.
                If -1 then all processors are used.
                Defaults to None.
        """
        if data_path is None:
            data_path = _TRAIN_DATA_PATH
        if data_extra_path is None:
            data_extra_path = _TRAIN_DATA_EXTRA_PATH

        # Load training data and extract
        train_data = pd.read_csv(data_path, sep='\t', index_col=0)
        train_data_extra = pd.read_csv(data_extra_path, sep='\t', index_col=0)

        x = train_data['ar'].values
        y = train_data['dialect'].values
        x_extra = train_data_extra['ar'].values
        y_extra = train_data_extra['dialect'].values

        # Build and train extra classifier
        self._label_encoder_extra = LabelEncoder()
        self._label_encoder_extra.fit(y_extra)
        y_trans = self._label_encoder_extra.transform(y_extra)

        word_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=word_ngram_range,
                                          analyzer='word',
                                          tokenizer=lambda x: x.split(' '))
        char_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=char_ngram_range,
                                          analyzer='char',
                                          tokenizer=lambda x: x.split(' '))
        self._feat_union_extra = FeatureUnion([('wordgrams', word_vectorizer),
                                               ('chargrams', char_vectorizer)])
        x_trans = self._feat_union_extra.fit_transform(x_extra)

        self._classifier_extra = OneVsRestClassifier(MultinomialNB(),
                                                     n_jobs=n_jobs)
        self._classifier_extra.fit(x_trans, y_trans)

        # Build and train main classifier
        self._label_encoder = LabelEncoder()
        self._label_encoder.fit(y)
        y_trans = self._label_encoder.transform(y)

        word_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=word_ngram_range,
                                          analyzer='word',
                                          tokenizer=lambda x: x.split(' '))
        char_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=char_ngram_range,
                                          analyzer='char',
                                          tokenizer=lambda x: x.split(' '))
        self._feat_union = FeatureUnion([('wordgrams', word_vectorizer),
                                         ('chargrams', char_vectorizer)])
        self._feat_union.fit(x)

        # The main classifier is trained on the combined features, which
        # include the (already trained) extra classifier's probabilities.
        x_prepared = self._prepare_sentences(x)

        self._classifier = OneVsRestClassifier(MultinomialNB(), n_jobs=n_jobs)
        self._classifier.fit(x_prepared, y_trans)

        self._is_trained = True

    def eval(self, data_path=None, data_set='VALIDATION'):
        """Evaluate the trained model on a given data set.

        Args:
            data_path (:obj:`str`, optional): Path to an evaluation data set.
                If None, use one of the provided data sets instead.
                Defaults to None.
            data_set (:obj:`str`, optional): Name of the provided data set to
                use. This is ignored if data_path is not None. Can be either
                'VALIDATION' or 'TEST'. Defaults to 'VALIDATION'.

        Returns:
            :obj:`dict`: A dictionary mapping an evaluation metric to its
            computed value. The metrics used are accuracy, f1_micro, f1_macro,
            recall_micro, recall_macro, precision_micro and precision_macro.

        Raises:
            UntrainedModelError: If the model has not been trained.
            InvalidDataSetError: If `data_path` is None and `data_set` is
                neither 'VALIDATION' nor 'TEST'.
        """
        if not self._is_trained:
            raise UntrainedModelError('Can\'t evaluate an untrained model.')

        if data_path is None:
            if data_set == 'VALIDATION':
                data_path = _VAL_DATA_PATH
            elif data_set == 'TEST':
                data_path = _TEST_DATA_PATH
            else:
                raise InvalidDataSetError(data_set)

        # Load eval data
        eval_data = pd.read_csv(data_path, sep='\t', index_col=0)
        x = eval_data['ar'].values
        y_true = eval_data['dialect'].values

        # Generate predictions
        x_prepared = self._prepare_sentences(x)
        y_pred = self._classifier.predict(x_prepared)
        y_pred = self._label_encoder.inverse_transform(y_pred)

        # Get scores
        scores = {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_micro': f1_score(y_true, y_pred, average='micro'),
            'f1_macro': f1_score(y_true, y_pred, average='macro'),
            'recall_micro': recall_score(y_true, y_pred, average='micro'),
            'recall_macro': recall_score(y_true, y_pred, average='macro'),
            'precision_micro': precision_score(y_true, y_pred,
                                               average='micro'),
            'precision_macro': precision_score(y_true, y_pred,
                                               average='macro')
        }

        return scores

    def predict(self, sentences):
        """Predict the dialect probability scores for a given list of
        sentences.

        Args:
            sentences (:obj:`list` of :obj:`str`): The list of sentences.

        Returns:
            :obj:`list` of :obj:`DIDPred`: A list of prediction results,
            each corresponding to its respective sentence.

        Raises:
            UntrainedModelError: If the model has not been trained.
        """
        if not self._is_trained:
            raise UntrainedModelError(
                'Can\'t predict with an untrained model.')

        x_prepared = self._prepare_sentences(sentences)
        predicted_scores = self._classifier.predict_proba(x_prepared)

        result = []
        for scores in predicted_scores:
            # Scores are paired with the sorted labels column by column.
            score_tups = list(zip(self._labels_sorted, scores))
            predicted_dialect = _max_score(score_tups)
            dialect_scores = dict(score_tups)
            result.append(DIDPred(predicted_dialect, dialect_scores))

        return result

    @staticmethod
    def pretrained():
        """Load the default pre-trained model provided with camel-tools.

        Raises:
            :obj:`PretrainedModelError`: When a pre-trained model compatible
                with the current Python version isn't available.

        Returns:
            :obj:`DialectIdentifier`: The loaded model.
        """
        suffix = '{}{}'.format(sys.version_info.major, sys.version_info.minor)
        model_file_name = 'did_pretrained_{}.dill'.format(suffix)
        model_path = Path(_DATA_DIR, model_file_name)

        if not model_path.is_file():
            raise PretrainedModelError(
                'No pretrained model for current Python version found.')

        with model_path.open('rb') as model_fp:
            model = dill.load(model_fp)

            # We need to reload LMs since they were set to None when
            # serialized.
            model._char_lms = collections.defaultdict(kenlm.Model)
            model._word_lms = collections.defaultdict(kenlm.Model)
            model._load_lms(_CHAR_LM_DIR, _WORD_LM_DIR)

            return model
def test_ovr_fit_predict_svc():
    """One-vs-rest SVC on iris: three fitted estimators and a training
    accuracy above 0.9."""
    features, labels = iris.data, iris.target
    classifier = OneVsRestClassifier(svm.SVC())
    classifier.fit(features, labels)

    n_fitted = len(classifier.estimators_)
    assert_equal(n_fitted, 3)

    training_accuracy = classifier.score(features, labels)
    assert_greater(training_accuracy, .9)
def temporal_holdout(X, y, indx, bootstrap, fname, goterms=None,
                     go_fname=None):
    """Perform temporal holdout validation.

    C and gamma of a precomputed-RBF one-vs-rest SVM are selected on the
    validation split (by micro-AUPR), the model is refit on the training
    split, and test performance is summarised over the bootstrap samples.

    Args:
        X: Feature matrix, indexed by the 'train'/'test'/'valid' entries of
            `indx`.
        y: Mapping with 'train', 'test' and 'valid' label entries.
            NOTE(review): the `.tolist()` results are used like arrays
            (`.shape`, `[:, i]`, fancy indexing), so the entries presumably
            wrap the label matrices - confirm against the caller.
        indx: Mapping with 'train', 'test' and 'valid' index entries.
        bootstrap: Iterable of index sets into the test split.
        fname: Path of the per-bootstrap results file.
        goterms: Optional mapping whose 'terms' entry lists the GO term ids.
            When None, no per-term results are collected.
        go_fname: Optional path of the per-GO-term results file.

    Returns:
        dict: Mean and spread (via `std`) of micro/macro AUPR, F1 and
        accuracy over the bootstrap samples.

    Raises:
        ValueError: If no hyperparameter combination produced a usable
            validation score.
    """
    X_train = X[indx['train'].tolist()]
    X_test = X[indx['test'].tolist()]
    X_valid = X[indx['valid'].tolist()]
    y_train = y['train'].tolist()
    y_test = y['test'].tolist()
    y_valid = y['valid'].tolist()
    # `goterms` is optional; the per-term loops below previously called
    # len(None) when it was omitted.
    goterms = goterms['terms'].tolist() if goterms is not None else []

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels
    print("### Pregenerating kernels...")
    K_rbf_train = {}
    K_rbf_test = {}
    K_rbf_valid = {}
    for gamma in gamma_range:
        K_rbf_train[gamma] = rbf_kernel(X_train, gamma=gamma)
        K_rbf_test[gamma] = rbf_kernel(X_test, X_train, gamma=gamma)
        K_rbf_valid[gamma] = rbf_kernel(X_valid, X_train, gamma=gamma)
    print("### Done.")
    print("Train samples=%d; #Test samples=%d" % (y_train.shape[0],
                                                  y_test.shape[0]))

    # parameter fitting
    C_opt = None
    gamma_opt = None
    # Start below any attainable score so that the first combination is
    # always accepted, even when every AUPR is 0.
    max_aupr = -np.inf
    for C in C_range:
        for gamma in gamma_range:
            # Multi-label classification
            clf = OneVsRestClassifier(svm.SVC(C=C, kernel='precomputed',
                                              probability=False), n_jobs=-1)
            clf.fit(K_rbf_train[gamma], y_train)
            y_score_valid = clf.decision_function(K_rbf_valid[gamma])
            y_pred_valid = clf.predict(K_rbf_valid[gamma])
            perf = evaluate_performance(y_valid,
                                        y_score_valid,
                                        y_pred_valid)
            micro_aupr = perf['m-aupr']
            print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" % (gamma, C,
                                                                  micro_aupr))
            if micro_aupr > max_aupr:
                C_opt = C
                gamma_opt = gamma
                max_aupr = micro_aupr
    if C_opt is None:
        raise ValueError("No hyperparameter setting produced a valid AUPR.")
    print("### Optimal parameters: ")
    print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
    print("### Train dataset: AUPR = %0.3f" % (max_aupr))
    print("### Computing performance on test dataset...")
    clf = OneVsRestClassifier(svm.SVC(C=C_opt, kernel='precomputed',
                                      probability=False), n_jobs=-1)
    clf.fit(K_rbf_train[gamma_opt], y_train)

    # Compute performance on test set
    y_score = clf.decision_function(K_rbf_test[gamma_opt])
    y_pred = clf.predict(K_rbf_test[gamma_opt])

    # performance measures for bootstrapping
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    # individual goterms
    pr_goterms = {term: [] for term in goterms}

    for ind in bootstrap:
        perf_ind = evaluate_performance(y_test[ind],
                                        y_score[ind],
                                        y_pred[ind])
        pr_micro.append(perf_ind['m-aupr'])
        pr_macro.append(perf_ind['M-aupr'])
        fmax.append(perf_ind['F1'])
        acc.append(perf_ind['acc'])
        # Per-term results are looked up by the term's position.
        for i, term in enumerate(goterms):
            pr_goterms[term].append(perf_ind[i])

    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = std(acc)

    # trials: one comma-separated row per bootstrap sample.
    # file.write() takes a single string; the original passed the four
    # values as separate arguments, which raises TypeError.
    with open(fname, 'w') as fout:
        fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
        for row in zip(pr_micro, pr_macro, fmax, acc):
            fout.write("{}, {}, {}, {}\n".format(*row))

    # write performance on individual GO terms: the term id, the fraction of
    # training samples annotated with it, then its AUPR per bootstrap sample
    # (space-separated, as the former `print >> fout` statements produced).
    if go_fname is not None:
        with open(go_fname, 'w') as fout:
            fout.write("GO_id, AUPRs\n")
            for i, term in enumerate(goterms):
                fields = [term,
                          sum(y_train[:, i]) / float(y_train.shape[0])]
                fields.extend(pr_goterms[term])
                fout.write(" ".join(str(field) for field in fields) + "\n")

    return perf
def test_ovr_coef_():
    """The coef_ of a one-vs-rest LinearSVC on iris has one row per class
    and one column per feature."""
    classifier = OneVsRestClassifier(LinearSVC())
    classifier.fit(iris.data, iris.target)

    coef_shape = classifier.coef_.shape
    expected_shape = (n_classes, iris.data.shape[1])
    for axis, expected_size in enumerate(expected_shape):
        assert_equal(coef_shape[axis], expected_size)
def _build_xy(instances):
    """Build dense (features, labels) arrays from (row_index, label) pairs.

    Each pair selects a row of the module-level `fea` matrix by its first
    element and contributes its second element as the label.
    """
    features = np.zeros((len(instances), fea.shape[1]))
    labels = np.zeros((len(instances)))
    for row, pair in enumerate(instances):
        features[row, :] = fea[pair[0], :]
        labels[row] = pair[1]
    return features, labels


# Training-set fractions 0.1, 0.2, ..., 0.9.
percentage = np.arange(0.1, 1, 0.1)
classif = OneVsRestClassifier(lr)
for p in percentage:
    # Re-shuffle, then use the first fraction `p` of the labelled instances
    # for training and the remainder for testing.
    random.shuffle(lbl)
    n_train = int(len(lbl) * p)
    test_ins = lbl[n_train:]
    train_ins = lbl[0:n_train]

    X, Y = _build_xy(train_ins)
    X_test, Y_test = _build_xy(test_ins)

    classif.fit(X, Y)
    Y_pred = classif.predict(X_test)
    f1_a = f1_score(Y_test, Y_pred, average='macro')
    f1_i = f1_score(Y_test, Y_pred, average='micro')
    # Single-argument print keeps the "Macro <score>" output of the former
    # Python 2 print statements and is also valid Python 3.
    print('Macro {}'.format(f1_a))
    print('Micro {}'.format(f1_i))
def cross_validation(X, y, n_trials=5, trial_splits=None, fname=None):
    """Perform model selection via 5-fold cross validation.

    For every trial, C and gamma of a precomputed-RBF one-vs-rest SVM are
    selected by nested cross-validation on the training part (median
    micro-AUPR over the splits returned by `ml_split`), the model is refit
    on the full training part and evaluated on the test part.

    Args:
        X: Feature matrix (one row per sample).
        y: Label matrix; samples without any annotation are dropped first.
        n_trials: Number of train/test trials to run.
        trial_splits: Optional sequence of (train_idx, test_idx) pairs. When
            None, `n_trials` random 80/20 splits are generated. At most the
            first `n_trials` entries are used.
        fname: Optional path of the per-trial results file.

    Returns:
        dict: Mean and spread (via `std`) of micro/macro AUPR, F1 and
        accuracy over the trials.

    Raises:
        ValueError: If no hyperparameter combination produced a usable
            cross-validation score.
    """
    # filter samples with no annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    y = np.delete(y, del_rid, axis=0)
    X = np.delete(X, del_rid, axis=0)

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels
    print("### Pregenerating kernels...")
    K_rbf = {}
    for gamma in gamma_range:
        K_rbf[gamma] = rbf_kernel(X, gamma=gamma)
    print("### Done.")

    # performance measures
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    if trial_splits is None:
        # shuffle and split training and test sets
        trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                              random_state=None)
        trial_splits = list(trials.split(X))

    # Iterate over the splits that actually exist; indexing up to n_trials
    # raised IndexError when fewer splits were supplied.
    for it, split in enumerate(trial_splits[:n_trials], start=1):
        train_idx = split[0]
        test_idx = split[1]
        y_train = y[train_idx]
        y_test = y[test_idx]
        print("### [Trial %d] Perfom cross validation...." % (it))
        print("Train samples=%d; #Test samples=%d" % (y_train.shape[0],
                                                      y_test.shape[0]))
        # setup for neasted cross-validation
        splits = ml_split(y_train)

        # parameter fitting
        C_opt = None
        gamma_opt = None
        # Start below any attainable score so that the first combination is
        # always accepted, even when every AUPR is 0.
        max_aupr = -np.inf
        for C in C_range:
            for gamma in gamma_range:
                # Multi-label classification
                cv_results = []
                for train, valid in splits:
                    clf = OneVsRestClassifier(
                        svm.SVC(C=C, kernel='precomputed',
                                probability=False), n_jobs=-1)
                    K_train = K_rbf[gamma][
                        train_idx[train], :][:, train_idx[train]]
                    K_valid = K_rbf[gamma][
                        train_idx[valid], :][:, train_idx[train]]
                    y_train_t = y_train[train]
                    y_train_v = y_train[valid]
                    y_score_valid = np.zeros(y_train_v.shape, dtype=float)
                    y_pred_valid = np.zeros_like(y_train_v)
                    # Only labels with at least one positive training
                    # example are fitted; the other columns stay at zero.
                    idx = np.where(y_train_t.sum(axis=0) > 0)[0]
                    clf.fit(K_train, y_train_t[:, idx])
                    y_score_valid[:, idx] = clf.decision_function(K_valid)
                    y_pred_valid[:, idx] = clf.predict(K_valid)
                    perf_cv = evaluate_performance(y_train_v,
                                                   y_score_valid,
                                                   y_pred_valid)
                    cv_results.append(perf_cv['m-aupr'])
                cv_aupr = np.median(cv_results)
                print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" % (
                    gamma, C, cv_aupr))
                if cv_aupr > max_aupr:
                    C_opt = C
                    gamma_opt = gamma
                    max_aupr = cv_aupr
        if C_opt is None:
            raise ValueError(
                "No hyperparameter setting produced a valid AUPR.")
        print("### Optimal parameters: ")
        print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
        print("### Train dataset: AUPR = %0.3f" % (max_aupr))
        print("### Using full training data...")
        clf = OneVsRestClassifier(svm.SVC(C=C_opt, kernel='precomputed',
                                          probability=False), n_jobs=-1)
        y_score = np.zeros(y_test.shape, dtype=float)
        y_pred = np.zeros_like(y_test)
        idx = np.where(y_train.sum(axis=0) > 0)[0]
        clf.fit(K_rbf[gamma_opt][train_idx, :][:, train_idx],
                y_train[:, idx])

        # Compute performance on test set
        K_test = K_rbf[gamma_opt][test_idx, :][:, train_idx]
        y_score[:, idx] = clf.decision_function(K_test)
        y_pred[:, idx] = clf.predict(K_test)
        perf_trial = evaluate_performance(y_test, y_score, y_pred)
        pr_micro.append(perf_trial['m-aupr'])
        pr_macro.append(perf_trial['M-aupr'])
        fmax.append(perf_trial['F1'])
        acc.append(perf_trial['acc'])
        print(
            "### Test dataset: AUPR['micro'] = %0.3f, AUPR['macro'] = %0.3f, F1 = %0.3f, Acc = %0.3f"
            % (perf_trial['m-aupr'], perf_trial['M-aupr'], perf_trial['F1'],
               perf_trial['acc']))

    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = std(acc)

    if fname is not None:
        # One comma-separated row per trial. file.write() takes a single
        # string; the original passed four separate arguments (TypeError)
        # and wrote no line terminator.
        with open(fname, 'w') as fout:
            fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
            for row in zip(pr_micro, pr_macro, fmax, acc):
                fout.write("{}, {}, {}, {}\n".format(*row))

    return perf
# With split data in hand, you're only a few lines away from training a model.
#
# In this exercise, you will import the logistic regression and one versus
# rest classifiers in order to fit a multi-class logistic regression model to
# the NUMERIC_COLUMNS of your feature data.
#
# Then you'll test and print the accuracy with the .score() method to see the
# results of training.
#
# Before you train! Remember, we're ultimately going to be using logloss to
# score our model, so don't worry too much about the accuracy here. Keep in
# mind that you're throwing away all of the text data in the dataset - that's
# by far most of the data! So don't get your hopes up for a killer performance
# just yet. We're just interested in getting things up and running at the
# moment.
#
# All data necessary to call multilabel_train_test_split() has been loaded
# into the workspace.

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Create the DataFrame: numeric_data_only
# (missing values are replaced by the sentinel -1000)
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

# Create training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    numeric_data_only,
    label_dummies,
    size=0.2,
    seed=123)

# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Print the accuracy
print("Accuracy: {}".format(clf.score(X_test, y_test)))
from sklearn.multiclass import OneVsRestClassifier # from sklearn.preprocessing import MultiLabelBinarizer # mlb = MultiLabelBinarizer(classes=np.unique(y)) # y_train = mlb.fit_transform([[el] for el in y_train]) # y_test = mlb.fit_transform([[el] for el in y_test]) # pickle.dump(tag_classifier, open('mlb.pkl', 'wb')) ###################################### ######### YOUR CODE HERE ############# ###################################### tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0)) tag_classifier.fit(X_train_tfidf, y_train) # print(mlb.classes_) # print(mlb.inverse_transform(y_test_pred[:5, :])) # Check test accuracy. y_test_pred = tag_classifier.predict(X_test_tfidf) test_accuracy = accuracy_score(y_test, y_test_pred) print('Test accuracy = {}'.format(test_accuracy)) """Dump the classifier to use it in the running bot.""" pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb')) """## Part II. Ranking questions with embeddings
# Names of the feature columns ('0' ... '4095') and the full list of class
# ids (0 ... 192) used to binarize the labels.
FEATURE_COLUMNS = [str(i) for i in range(4096)]
CLASS_LABELS = list(range(193))

Y = np.array(train['label'].values, dtype=np.int32)

test = pd.read_csv('ftest.csv')
X_test = test[FEATURE_COLUMNS].values
Y_test = np.array(test['label'].values, dtype=np.int32)

# One indicator column per class id, for both the test and training labels.
Y_test = label_binarize(Y_test, classes=CLASS_LABELS)
Y = label_binarize(Y, classes=CLASS_LABELS)
n_classes = Y.shape[1]

random_state = np.random.RandomState(0)
n_samples, n_features = X.shape

# NOTE(review): only decision_function is used within this excerpt;
# probability=True is presumably needed by later code - confirm, since it
# makes fitting each of the per-class SVCs more expensive.
classifier = OneVsRestClassifier(svm.SVC(kernel='poly', probability=True,
                                         random_state=random_state))
y_score = classifier.fit(X, Y).decision_function(X_test)

# ROC curve and area under it for each class.
fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

lw = 2

# Accumulate every class's TPR on the union of all false positive rates.
# np.interp is used directly: the bare `interp` name was SciPy's deprecated
# alias for the same NumPy function.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
from src.utils.initialize import *
from sklearn.model_selection import train_test_split
import pickle

from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Training targets.
with open('data/processed/target_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)
print(
    "Loaded the training target variable Y from data/processed/target_train.pkl."
)

# Training features (raw counts).
with open('data/processed/raw_count_features_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
print("Loaded X from data/processed/raw_count_features_train.pkl.\n")

train_shape = X_train.shape
print("Shape of X_train is {X_train}.\n".format(X_train=train_shape))

###### Naive Bayes ########
# One-vs-rest multinomial naive Bayes, fitted and then persisted to disk.
classifnb = OneVsRestClassifier(MultinomialNB())
classifnb.fit(X_train, Y_train)
print("Trained using Multinomial Naive Bayes.")

with open('models/classifier_nb.pkl', 'wb') as f:
    pickle.dump(classifnb, f)
# Excerpt note: this starts inside feature-extraction code whose enclosing
# statements (which define `i`, `l`, `x_luv` and `x_test`) are outside this
# view, so the original nesting cannot be reconstructed from here.
# Visible steps: for each of 7 column positions a 32x32 block of `x_luv` is
# taken and its mean and variance over all axes but the last are appended to
# the feature vector `l`, which is added to `x_test`. The feature/label lists
# are then converted to arrays, a one-vs-rest 3-nearest-neighbour classifier
# is fitted and pickled, and three match statistics are printed.
# NOTE(review): 'at least one match' counts rows with no false positive and
# subtracts rows with an all-zero prediction - a row with one correct and one
# wrong label is not counted; confirm this matches the label's intent.
# NOTE(review): 'binary' divides by 5 * len(y_test), i.e. it presumably
# assumes exactly 5 label columns - confirm against y_test.shape.
for j in range(7): block = x_luv[32 * i:32 * (i + 1), 32 * j:32 * (j + 1)] mean, var = np.mean(block, axis=tuple(range(block.ndim - 1))), np.var( block, axis=tuple(range(block.ndim - 1))) l = np.concatenate((l, mean)) l = np.concatenate((l, var)) x_test.append(l) x_train = np.asarray(x_train).astype(np.float32) x_test = np.asarray(x_test).astype(np.float32) y_test = np.asarray(y_test) y_train = np.asarray(y_train) print(x_train.shape, y_train.shape, x_test.shape, y_test.shape) classifier = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3)) classifier.fit(x_train, y_train) with open('onevsrest-knn-3-luv.pkl', 'wb') as f: pickle.dump(classifier, f) ''' with open('onevsrest-knn-3-luv.pkl', 'rb') as f: classifier = pickle.load(f) ''' predictions = classifier.predict(x_test) print('all match:', np.sum(np.all(predictions == y_test, axis=1)) / len(y_test)) print('at least one match:', (np.sum(np.all(predictions - y_test <= 0, axis=1)) - np.sum(np.all(predictions == 0, axis=1))) / len(y_test)) print('binary :', np.sum(predictions == y_test) / (5 * len(y_test)))
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)

# Learn to predict each class against the other
# Alternative base estimator kept for reference:
# classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
#                                          random_state=random_state))
classifier = OneVsRestClassifier(knn)

# ROC analysis needs a continuous score per class. Hard 0/1 labels from
# predict() collapse every curve to a single operating point, so the
# per-class probabilities are used instead.
# NOTE(review): assumes `knn` (defined outside this excerpt) provides
# predict_proba, as a k-nearest-neighbours classifier does - confirm.
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot ROC
plt.figure()
lw = 2
class Model:
    """One-vs-rest classifier wrapper with an adjustable decision threshold.

    Attributes:
        estimator: The fitted ``OneVsRestClassifier``; ``None`` until
            ``fit`` or ``load`` has been called.
        estimator_file: Path of a pickled model used by ``load``.
        estimator_name: Class name of the wrapped base estimator.
        threshold: Probability at or above which a label is predicted
            (starts at 0.5).
        binarizer: Stored and restored by ``load``; not used by this class.
        vectorizer: Stored and restored by ``load``; not used by this class.
    """

    def __init__(self, estimator_file=None):
        self.estimator = None
        self.estimator_file = estimator_file
        self.estimator_name = None
        self.threshold = 0.5
        self.binarizer = None
        self.vectorizer = None

    def load(self):
        """Restore the model state from the pickle at ``estimator_file``.

        Raises:
            ValueError: If no ``estimator_file`` was given.
        """
        if self.estimator_file is None:
            raise ValueError(
                'Specify an estimator_file for loading a pre-trained model.')

        # SECURITY: pickle.load executes arbitrary code contained in the
        # file; only load model files that come from a trusted source.
        # The context manager closes the handle even if unpickling fails.
        with open(self.estimator_file, "rb") as file:
            model = pickle.load(file)

        # Set variables from loaded model
        self.estimator = model.estimator
        self.estimator_name = model.estimator_name
        self.binarizer = model.binarizer
        self.vectorizer = model.vectorizer
        # Keep a threshold tuned before saving; objects pickled without the
        # attribute leave the current value untouched.
        self.threshold = getattr(model, 'threshold', self.threshold)

    def fit(self, X, y, estimator='logistic', n_jobs=-1, **estimator_params):
        """Fit a one-vs-rest classifier on ``X`` and ``y``.

        Args:
            X: Training features, passed straight to the estimator.
            y: Training targets, passed straight to the estimator.
            estimator: ``'logistic'`` for ``LogisticRegressionCV`` or any
                callable that builds an estimator from ``estimator_params``.
            n_jobs: Forwarded to ``OneVsRestClassifier``.
            **estimator_params: Keyword arguments for the base estimator.

        Raises:
            NotImplementedError: If ``estimator`` is neither ``'logistic'``
                nor callable.
        """
        # Select estimator
        if estimator == 'logistic':
            estimator = LogisticRegressionCV(**estimator_params)
        elif callable(estimator):
            estimator = estimator(**estimator_params)
        else:
            raise NotImplementedError(
                f'Estimator "{estimator}" not yet implemented!')

        # Build into OneVsRestClassifier and fit
        self.estimator = OneVsRestClassifier(estimator=estimator,
                                             n_jobs=n_jobs)
        self.estimator.fit(X, y)
        self.estimator_name = type(estimator).__name__

    def predict(self, X=None, probas=None, t=None):
        """Return 0/1 predictions by thresholding probabilities.

        Args:
            X: Features to score; only used when ``probas`` is ``None``.
            probas: Precomputed probabilities (array or nested sequence).
            t: Decision threshold; defaults to ``self.threshold``.

        Returns:
            Integer array that is 1 where the probability is >= ``t``.

        Raises:
            TypeError: If neither ``X`` nor ``probas`` is provided.
        """
        if X is None and probas is None:
            raise TypeError("Either X or probas must be provided.")
        if t is None:
            t = self.threshold

        # Get probabilities matrix
        if probas is None:
            probas = self.estimator.predict_proba(X)

        # asarray lets callers pass plain nested lists as well as arrays.
        return (np.asarray(probas) >= t).astype(int)

    def score(self, y, X=None, probas=None, t=None):
        """Return the micro-averaged F1 score of the predictions against ``y``."""
        preds = self.predict(X, probas, t)
        return f1_score(y, preds, average='micro')

    def set_best_threshold(self, X, y, precision=0.01, max_t=0.5, min_t=None,
                           bias=0):
        """Search for the threshold with the best micro-F1 on ``X`` and ``y``.

        Candidates are ``np.arange(min_t, max_t, precision)``, so ``max_t``
        itself is not tried.  A candidate replaces the current threshold
        when its score is at least the best seen so far, starting from the
        score at the current threshold; ties therefore favour the larger
        candidate.  ``bias`` is added to the stored threshold only.

        Args:
            X: Features used to compute probabilities.
            y: True targets.
            precision: Step between candidate thresholds.
            max_t: Exclusive upper bound of the search.
            min_t: Lower bound of the search; defaults to ``precision``.
            bias: Offset added to the winning threshold before storing it.

        Raises:
            ValueError: If ``min_t`` is greater than ``max_t``.
        """
        if min_t is None:
            min_t = precision
        if min_t > max_t:
            raise ValueError(
                "Minimum threshold needs to be less than maximum.")

        # Probabilities are computed once and reused for every candidate.
        probas = self.estimator.predict_proba(X)
        best_score = self.score(y, probas=probas)

        # Loop to try to find a better threshold
        for t in np.arange(min_t, max_t, precision):
            score = self.score(y, probas=probas, t=t)
            if score >= best_score:
                best_score = score
                self.threshold = t + bias
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Synthetic multilabel problem with three labels; samples without any label
# are allowed.
data, target = make_multilabel_classification(n_samples=1000,
                                              n_classes=3,
                                              n_labels=3,
                                              allow_unlabeled=True,
                                              random_state=1)

# Hold out 20% of the samples for evaluation.
traing_data, test_data, traing_target, test_target = train_test_split(
    data, target, test_size=0.2)

# One linear SVC per label.
classif = OneVsRestClassifier(SVC(kernel='linear'))
classif.fit(traing_data, traing_target)
predicted = classif.predict(test_data)

# target_names expects a sequence with one display name per label.  The
# original passed the bare string "tes", which only worked by being iterated
# character by character; spell the same three names out as a real list.
print(metrics.classification_report(test_target, predicted,
                                    target_names=list("tes")))
# Number of parallel jobs; may be overridden by a single command-line
# argument.
cpu_count = 1
if len(argv) == 2:
    script, cpu_count = argv
    try:
        cpu_count = int(cpu_count)
    except ValueError:
        # int() on a string only fails with ValueError; report it and stop
        # with a non-zero status so callers can detect the failure.
        print("Cpu count should be a number")
        raise SystemExit(1)

# genfromtxt opens and closes the files itself when given a path.  The first
# row is dropped by the [1:] slice (presumably the CSV header -- confirm
# against the data files).
dataset = numpy.genfromtxt('../Data/train.csv', delimiter=',',
                           dtype='f8')[1:]
target = dataset[:, 0]  # first column: label
train = dataset[:, 1:]  # remaining columns: features
test = numpy.genfromtxt('../Data/test.csv', delimiter=',', dtype='f8')[1:]

# Each bagged SVM is trained on 1/number_of_svms of the training rows.
number_of_svms = 40
svm_bagging_classifier = OneVsRestClassifier(
    BaggingClassifier(svm.SVC(C=0.01, gamma=1e-8),
                      max_samples=1.0 / number_of_svms,
                      n_estimators=number_of_svms,
                      n_jobs=cpu_count))
svm_bagging_classifier.fit(train, target)
predictions = svm_bagging_classifier.predict(test)

# Two-column output: 1-based row id and predicted label.
numpy.savetxt('../Predictions/svm_predictions.csv',
              numpy.c_[numpy.arange(1, len(test) + 1), predictions],
              delimiter=',',
              header='ImageId,Label',
              comments='',
              fmt='%d')
def run_test(filename, results_dir, models, random_state, external_split,
             internal_split, optimization_iterations):
    """Benchmark every model on one dataset and write the metrics to CSV.

    The dataset is read from ``directory + '/' + filename`` and split with a
    shuffled ``StratifiedKFold``.  For each fold and each model the
    hyper-parameters are tuned with ``RandomizedSearchCV``, a fresh model is
    fitted with the best parameters, and one row of metrics is added to the
    module-level ``df_results`` frame.  After all folds the frame is written
    to ``results_dir + '/' + filename`` and then emptied.

    Relies on module-level names defined elsewhere in the file:
    ``data_dict``, ``df_results``, ``directory``, ``fix_dataset`` and
    ``wprb``.

    Args:
        filename: Name of the CSV file inside ``directory``.
        results_dir: Directory that receives the results CSV.
        models: Iterable of ``(model_name, model_class, model, model_dict)``
            tuples; ``model`` is the estimator that is searched,
            ``model_dict`` its parameter distributions and ``model_class``
            builds the final model from the best parameters.
        random_state: Seed for the fold split and the random search.
        external_split: Number of outer cross-validation folds.
        internal_split: ``cv`` argument of the random search.
        optimization_iterations: ``n_iter`` argument of the random search.
    """
    global df_results
    print(filename)
    data_dict['Dataset Name'] = filename.replace('.csv', '')
    df = pd.read_csv(directory + '/' + filename)
    X, Y = fix_dataset(df)

    # Whether the task is binary depends on the full label set, not on the
    # labels of a single fold, so compute it once.
    unique_labels = np.unique(Y.values)

    kf = StratifiedKFold(n_splits=external_split,
                         random_state=random_state,
                         shuffle=True)
    for fold_index, (train_index, test_index) in enumerate(kf.split(X, Y)):
        data_dict['Cross Validation[1-10]'] = fold_index
        print("fold index =", fold_index)
        x_train = X.iloc[train_index]
        y_train = Y.iloc[train_index]
        x_test = X.iloc[test_index]
        y_test = Y.iloc[test_index]
        y_train_flat = y_train.values.ravel()
        test_labels = np.unique(y_test)

        for model_name, model_class, model, model_dict in models:
            print('Model:', model_name)
            data_dict['Algorithm Name'] = model_name
            distributions = model_dict

            # Training time covers the search plus the final refit.
            start_training_time = time.time()
            randomSearcher = RandomizedSearchCV(
                model,
                distributions,
                random_state=random_state,
                cv=internal_split,
                n_iter=optimization_iterations,
                scoring=make_scorer(accuracy_score))
            randomSearcher.fit(x_train, y_train_flat)

            if model_class is wprb:
                # The wprb search space is keyed with an "estimator__"
                # prefix; strip it so the parameters can be passed to the
                # class directly, then wrap the result one-vs-rest.
                params = {
                    k.replace("estimator__", ""): v
                    for k, v in randomSearcher.best_params_.items()
                }
                best_model = OneVsRestClassifier(model_class(**params))
            else:
                params = randomSearcher.best_params_
                best_model = model_class(**params)
            data_dict['Hyper-Parameters Values'] = params
            best_model.fit(x_train, y_train_flat)
            data_dict['Training Time'] = time.time() - start_training_time

            print("best params:", params)
            print(
                "train accuracy:",
                round(accuracy_score(y_train, best_model.predict(x_train)),
                      4))

            # Elapsed seconds per test row, times 1000; includes both
            # predict and predict_proba.
            start_inference_time = time.time()
            test_pred = best_model.predict(x_test)
            test_pred_proba = best_model.predict_proba(x_test)
            data_dict['Inference Time'] = (
                time.time() - start_inference_time) / (len(x_test)) * 1000

            print("test accuracy:", round(accuracy_score(y_test, test_pred),
                                          4))
            print()

            data_dict['Accuracy'] = accuracy_score(y_test, test_pred)
            # Restricting to predicted labels avoids averaging over classes
            # that were never predicted.
            data_dict['Precision'] = precision_score(
                y_test,
                test_pred,
                average='macro',
                labels=np.unique(test_pred))

            if len(unique_labels) == 2:  # multiclass vs binary classification
                data_dict['AUC'] = roc_auc_score(
                    y_true=y_test, y_score=test_pred_proba[:, 1])
            else:
                data_dict['AUC'] = roc_auc_score(y_true=y_test,
                                                 y_score=test_pred_proba,
                                                 multi_class='ovr',
                                                 labels=test_labels)

            # One-vs-rest rates and PR-curve area per class, averaged below.
            all_TPR = []
            all_FPR = []
            all_PR_CURVE = []
            for index, class_label in enumerate(test_labels):
                tn, fp, fn, tp = confusion_matrix(
                    y_test == class_label,
                    test_pred == class_label).ravel()
                all_FPR.append(fp / (fp + tn))
                all_TPR.append(tp / (tp + fn))
                precision, recall, _ = precision_recall_curve(
                    y_test == class_label, test_pred_proba[:, index])
                all_PR_CURVE.append(auc(recall, precision))
            data_dict['FPR'] = np.mean(all_FPR)
            data_dict['TPR'] = np.mean(all_TPR)
            data_dict['PR Curve'] = np.mean(all_PR_CURVE)

            # DataFrame.append was removed in pandas 2.0; concatenating a
            # one-row frame adds the same record.
            df_results = pd.concat(
                [df_results, pd.DataFrame([data_dict])], ignore_index=True)

    df_results.to_csv(results_dir + '/' + filename, index=False)
    # Keep the columns but drop all rows so the next dataset starts clean.
    df_results = df_results.iloc[0:0]
# RBF-kernel SVM with every constructor argument spelled out.
classifier = SVC(
    C=100,
    kernel='rbf',  # kernel type
    degree=3,  # default value
    gamma=1,
    coef0=1,
    shrinking=True,
    tol=0.001,
    probability=False,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    # None is not an accepted value in current scikit-learn; 'ovr' is the
    # library default and is ignored for the binary problems that
    # OneVsRestClassifier creates.
    decision_function_shape='ovr',
    random_state=None)

# Train one SVM per class, four in parallel.
model = OneVsRestClassifier(classifier, n_jobs=4)
model.fit(X, y)

# Predict, then map the encoded output back to the original labels with the
# binarizer `lb` created earlier in the file.
y_test = model.predict(X_test)
y_pred = lb.inverse_transform(y_test)

# Write the submission file: one row per test document.
test_id = [doc['id'] for doc in test]
sub = pd.DataFrame({
    'id': test_id,
    'cuisine': y_pred
},
                   columns=['id', 'cuisine'])
sub.to_csv('svm_output.csv', index=False)
# Announce the winning model selected earlier (`maximo` indexes into
# `resultados`).
vencedor = resultados[maximo]
moldura = '**************'
print(moldura)
print("Vencedor: ")
print(vencedor)
print(moldura)

# Train the final (winning) model.
vencedor.fit(treino_dados, treino_marcacoes)

# Save the winning model.
dump(vencedor, arquivo_modelo_salvo)

np.set_printoptions(precision=2)

# Report the total run time in minutes.
tempo_final = time.time()
tempo_total = (tempo_final - tempo_inicial) / 60
linha = '-------------------------------------------------------------------------------------'
print(linha)
print("Tempo total de execução em minutos: %.2f" % tempo_total)
print(linha)

# Real test with the validation data: first the winner, then each candidate
# model refitted on the training data.
teste_real(vencedor, validacao_dados, validacao_marcacoes)
for modelo in (modeloOneVsRest, modeloMultinomial, modeloAdaBoost):
    modelo.fit(treino_dados, treino_marcacoes)
    teste_real(modelo, validacao_dados, validacao_marcacoes)
def multiclass_classifier(X_train, X_test, y_train, y_test, model,
                          list_of_classes, class_labels):
    """Fit a one-vs-rest classifier, plot its ROC curves and print metrics.

    The targets are binarized with ``label_binarize``, ``model`` is wrapped
    in ``OneVsRestClassifier`` and fitted, and per-class, micro-average and
    macro-average ROC curves are plotted from the predicted probabilities.
    The macro and prevalence-weighted ROC AUC scores and the multilabel
    confusion matrix are printed.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels (not yet binarized).
        y_test: Test labels (not yet binarized).
        model: Base estimator; must support ``predict_proba``.
        list_of_classes: Class values passed to ``label_binarize``.
        class_labels: Its length sets the number of ROC curves, and it is
            passed as ``labels`` to ``multilabel_confusion_matrix``.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    # Binarize the output
    y_train = label_binarize(y_train, classes=list_of_classes)
    y_test = label_binarize(y_test, classes=list_of_classes)
    n_classes = len(class_labels)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(model)
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                              y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points.  np.interp is used
    # explicitly instead of a bare `interp` alias imported from elsewhere.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(figsize=(12, 12))
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(
                 roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)
    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(
                 roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    colors = cycle([
        'aqua', 'darkorange', 'cornflowerblue', 'green', 'purple', 'red',
        'blue'
    ])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(
                     i + 1, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    # plt.show() returns None, so its result is neither kept nor printed.
    plt.show()

    # y_score already holds predict_proba(X_test); reuse it instead of
    # scoring the test set a second time.
    macro_roc_auc_ovr = roc_auc_score(y_test, y_score, average="macro")
    weighted_roc_auc_ovr = roc_auc_score(y_test, y_score,
                                         average="weighted")

    y_pred = classifier.predict(X_test)
    # NOTE(review): y_test is an indicator matrix here, so `labels` is
    # matched against its columns -- confirm class_labels holds valid
    # column indices.
    mcm = multilabel_confusion_matrix(y_test, y_pred, labels=class_labels)

    print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)".format(macro_roc_auc_ovr,
                                            weighted_roc_auc_ovr))
    print(mcm)
    return classifier
def before_trading_start(context, data):
    """Count the trading day and retrain the models when a refresh is due.

    ``context.days_traded`` is incremented on every call.  When
    ``context.model`` is empty or its ``'refresh_date'`` has been reached,
    a k-means model is fitted for every combination of
    ``context.ret_windows`` and ``context.window_lengths``, then one random
    forest per return window is fitted on the features derived from those
    clusters: a one-vs-rest classifier over return buckets when
    ``context.use_classifier`` is true, a regressor otherwise.  The result
    is stored in ``context.model['clusters']`` and progress messages are
    sent through the module-level ``ws``.

    Args:
        context: Strategy state object; the attributes listed above are
            read and ``days_traded`` / ``model`` are updated.
        data: Passed through to ``create_kmeans_features``.
    """
    # The bucket edges are published as a module-level name.
    global ret_buckets

    context.days_traded += 1

    refresh_due = (context.model == {}
                   or context.model['refresh_date'] <= context.days_traded)
    if not refresh_due:
        return

    context.model['refresh_date'] = (context.days_traded
                                     + context.refresh_frequency)
    clusters = {}

    ws.send(msg_placeholder % "Retraining the clustering ML model")
    for ret_window in context.ret_windows:
        clusters[ret_window] = {'windows': {}}
        for window_length in context.window_lengths:
            cluster_data = create_kmeans_features(context, data,
                                                  window_length, ret_window)
            window_length_str = str(window_length)
            ws.send(msg_placeholder %
                    ("Feature set for k-means with a look back of %s days created"
                     % window_length_str))
            cluster_data.dropna(inplace=True)
            X = cluster_data.drop('rets', axis=1)
            y = cluster_data['rets']
            # The former precompute_distances argument is no longer
            # accepted by scikit-learn's KMeans and is therefore omitted.
            kmeans = KMeans(n_clusters=context.n_clusters,
                            n_init=100,
                            max_iter=500,
                            random_state=42)
            kmeans.fit(X)
            ws.send(msg_placeholder %
                    ("K-means cluster for look back of %s days trained"
                     % window_length_str))
            clusters[ret_window]['windows'][window_length] = {
                "kmeans": kmeans,
                "regimes": kmeans.predict(X),
                "rets": y
            }

    ws.send(msg_placeholder % "Retraining the Random Forest ML model")
    panel = create_rand_forest_features(clusters)
    ws.send(msg_placeholder % "Feature set for Random Forest created")

    for ret_window in clusters:
        df = panel[ret_window]
        ret = df['rets']
        X = df.drop('rets', axis=1)
        X_train = X.values
        if context.use_classifier:
            # Window-specific bucket edges, falling back to the generic set.
            try:
                ret_buckets = context.ret_buckets[ret_window]
            except KeyError:
                ret_buckets = context.ret_buckets['gen']
            clf = OneVsRestClassifier(
                RandomForestClassifier(n_estimators=1000, random_state=42))
            # Label each return with the lowest bucket index whose edge it
            # is below; returns not below any edge keep len(ret_buckets).
            # Walking the edges from last to first lets lower indices
            # overwrite higher ones.
            y = np.full(len(ret), len(ret_buckets), dtype=int)
            for i in range(len(ret_buckets) - 1, -1, -1):
                y[ret.values < ret_buckets[i]] = i
            y_train = y
            clf.fit(X_train, y_train)
            clusters[ret_window]['clf'] = clf
            ws.send(msg_placeholder % "Random Forest Classifier trained")
        else:
            rfr = RandomForestRegressor(n_estimators=1000, random_state=42)
            rfr.fit(X_train, ret.values)
            clusters[ret_window]['reg'] = rfr
            ws.send(msg_placeholder % "Random Forest Regression trained")

    context.model['clusters'] = clusters
# Mean 10-fold cross-validation score of a one-vs-rest SVM with C=10 ...
svm_clf = OneVsRestClassifier(SVC(C=10))
cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10).mean()

# ... and with C=100.
svm_clf = OneVsRestClassifier(SVC(C=100))
cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10).mean()

# In[93]:

# Model kept for the final evaluation.
svm_optimized = OneVsRestClassifier(SVC(C=10))

# In[94]:

# Fit on the training split and score on the held-out split.
svm_optimized.fit(X_train, y_train)
svm_optimized.score(X_test, y_test)

# In[95]:

plot_learning_curve(svm_optimized,
                    X=X_train,
                    y=y_train,
                    cv=10,
                    title='SVM learning curve')
plt.show()

# ### Artificial Neural Networks

# In[50]:

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
# train_test_split is provided by sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

random_state = np.random.RandomState(0)
n_classes = 2

# Hold out roughly one third of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

# Run classifier
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=random_state))
# The signed decision values serve as the ranking score for the ROC curve.
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.savefig(
    '/home//askrey/Dropbox/Project_step_by_step/3_create_database/SVM.png')