def train_custom_one_vs_all(X_train, X_test, Y_train, topk):
    """Train one binary classifier per label and keep the top-k labels per test row.

    For every column ``j`` of ``Y_train`` a fresh ``LogisticRegression`` is
    fitted on ``X_train`` against that column and scored on ``X_test`` with
    ``decision_function``.  Each test row owns a bounded min-heap holding
    the ``topk`` largest ``(decision value, label index)`` pairs seen so far,
    so neither the fitted classifiers nor a full rows-by-labels score matrix
    is retained.

    Args:
        X_train: Training features, passed unchanged to ``clf.fit``.
        X_test: Test features; ``X_test.shape[0]`` is the number of test rows.
        Y_train: 2-D label matrix providing ``tocsc()`` (e.g. a SciPy sparse
            matrix); its second dimension is the number of labels.
        topk: Maximum number of labels kept per test row.  Values <= 0 yield
            empty lists.

    Returns:
        A list with one entry per test row; each entry is a list of at most
        ``topk`` label (column) indices.  The indices come out in internal
        heap order, NOT ranked by decision value.
    """
    # Labels are read one column at a time below, so use the
    # column-compressed layout.
    Y_train = Y_train.tocsc()
    num_classes = Y_train.shape[1]
    num_test_examples = X_test.shape[0]

    # One min-heap per test row holding its best (decision, label) pairs.
    topk_heaps = [[] for _ in range(num_test_examples)]

    for j in range(num_classes):
        # Dense target vector for label j over all training examples.
        y = numpy.ravel(Y_train.getcol(j).todense())
        clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.8,
                                 fit_intercept=True, intercept_scaling=1)
        clf.fit(X_train, y)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Trained for class %s" % j)

        # No densify() call: the model was just fitted and never sparsified,
        # so coef_ is already dense.
        decision = clf.decision_function(X_test)

        for heap, score in zip(topk_heaps, decision):
            entry = (score, j)
            if len(heap) < topk:
                heapq.heappush(heap, entry)
            else:
                # Heap is full: add the new entry and evict the smallest in
                # one step, so the topk largest entries survive.
                heapq.heappushpop(heap, entry)
        print("Predicted for class %s" % j)

    # Drop the decision values and keep only the label indices.
    return [[label for _, label in heap] for heap in topk_heaps]
def train_custom_one_vs_all(X_train, X_test, Y_train, topk):
    """Train one binary classifier per label and keep the top-k labels per test row.

    For every column ``j`` of ``Y_train`` a fresh ``LogisticRegression`` is
    fitted on ``X_train`` against that column and scored on ``X_test`` with
    ``decision_function``.  Each test row owns a bounded min-heap holding
    the ``topk`` largest ``(decision value, label index)`` pairs seen so far,
    so neither the fitted classifiers nor a full rows-by-labels score matrix
    is retained.

    Args:
        X_train: Training features, passed unchanged to ``clf.fit``.
        X_test: Test features; ``X_test.shape[0]`` is the number of test rows.
        Y_train: 2-D label matrix providing ``tocsc()`` (e.g. a SciPy sparse
            matrix); its second dimension is the number of labels.
        topk: Maximum number of labels kept per test row.  Values <= 0 yield
            empty lists.

    Returns:
        A list with one entry per test row; each entry is a list of at most
        ``topk`` label (column) indices.  The indices come out in internal
        heap order, NOT ranked by decision value.
    """
    # Labels are read one column at a time below, so use the
    # column-compressed layout.
    Y_train = Y_train.tocsc()
    num_classes = Y_train.shape[1]
    num_test_examples = X_test.shape[0]

    # One min-heap per test row holding its best (decision, label) pairs.
    topk_heaps = [[] for _ in range(num_test_examples)]

    for j in range(num_classes):
        # Dense target vector for label j over all training examples.
        y = numpy.ravel(Y_train.getcol(j).todense())
        clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.8,
                                 fit_intercept=True, intercept_scaling=1)
        clf.fit(X_train, y)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Trained for class %s" % j)

        # No densify() call: the model was just fitted and never sparsified,
        # so coef_ is already dense.
        decision = clf.decision_function(X_test)

        for heap, score in zip(topk_heaps, decision):
            entry = (score, j)
            if len(heap) < topk:
                heapq.heappush(heap, entry)
            else:
                # Heap is full: add the new entry and evict the smallest in
                # one step, so the topk largest entries survive.
                heapq.heappushpop(heap, entry)
        print("Predicted for class %s" % j)

    # Drop the decision values and keep only the label indices.
    return [[label for _, label in heap] for heap in topk_heaps]
# Random forest: fit on the training split and report accuracy on
# X_test / y_test.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print("Accuracy ", clf.score(X_test, y_test))

# Model building: logistic regression on the same split.
print("TRAINING PHASE")
logit = LogisticRegression()
logit.fit(X_train, y_train)
print("Accuracy ", logit.score(X_test, y_test))
print("coefficient :\n", logit.coef_)
print("Intercept:\n", logit.intercept_)
# NOTE(review): per the scikit-learn docs densify()/sparsify() convert
# coef_ in place and return the estimator, so these two lines print the
# estimator and the model pickled below is the sparsified one -- confirm
# that is intended.
print(logit.densify())
print(logit.sparsify())

# Materialise url_list (defined outside this fragment) as a list.
url_list = list(url_list)

print("TESTING PHASE")
X_predict = ["8.8.8.8"]

# Persist the fitted model and the vectorizer (defined outside this
# fragment) with pickle protocol 2.
with open('logit.pickle', 'wb') as handle:
    pickle.dump(logit, handle, protocol=2)
with open('vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=2)
solver='liblinear', # for use with small datasets multi_class='ovr') # stating this is a binary problem) # training the model clf.fit(x_train, y_train) # attributes classes = clf.classes_ # list of class labels coeff = clf.coef_ # coefficients of the model intercept = clf.intercept_ # the intercept of the model n_iter = clf.n_iter_ # the number of iterations for each class - in the binary case it only returns one value # now having a look at the methods dec_func = clf.decision_function( x_test) # the confidence score for each test data density = clf.densify() # returns the coeffient matrix in densy array format get_param = clf.get_params() # returns the hyper-parameters predicted_array = clf.predict( x_test ) # running the test dataset through the model, giving an array of predicted values predic_log_proba = clf.predict_log_proba( x_test) # log of probability estimate for each class predic_prob = clf.predict_proba(x_test) # the probability for each class mean_accuracy = clf.score(x_test, y_test) # returns the mean accuracy of the test set sparsify = clf.sparsify() # returns the coeffient matrix in sparse format print('The mean accuracy of the test set is: %.3f' % mean_accuracy) # now findng the confusion matrix for the data # we first need to convert the 1 and 2 to 'female' and 'male'