def experimentDefaultSetting(self, trainset, testset):
    """Fit a linear SVM with default settings and print dev-set metrics.

    The corpus at ``trainset`` is read, vectorized and split into
    train/dev/test parts; the dev part is then split once more. The
    classifier is fitted on the training part and evaluated (accuracy and
    macro F1) on the test portion of the dev split. ``testset`` is accepted
    but not used by this experiment.
    """
    print("Reading data")
    documents, raw_labels = DataService().read_corpus(trainset)
    classifier = SVM().construct_classifier("linear", 1.0)

    # Vectorize the text data into an (n_samples, n_features) matrix and
    # convert the labels through labels_string_to_float.
    matrix = DataService().vectorize_input(documents)
    _label_map, labels = DataService().labels_string_to_float(raw_labels)

    (x_train, y_train,
     x_dev, y_dev,
     _x_test, _y_test) = DataService().test_dev_train_split(matrix, labels)
    (_x_dev_train, _y_dev_train,
     x_dev_test, y_dev_test) = DataService().test_train_split(x_dev, y_dev)

    fit_started = datetime.utcnow()
    print('Fitting training data on', len(x_train), 'Samples')
    classifier.fit(x_train, y_train)
    elapsed = datetime.utcnow() - fit_started
    print("Training took", elapsed.seconds, 'seconds..')

    predictions = classifier.predict(x_dev_test)
    accuracy = accuracy_score(y_pred=predictions, y_true=y_dev_test)
    print("Accuracy score:", accuracy)
    macro_f1 = f1_score(y_pred=predictions, y_true=y_dev_test,
                        average='macro')
    print("F1 score (macro):", macro_f1)
def experimentCombinatorialCrossValidation(self, trainset, testset):
    """Grid-search ``gamma`` and ``C`` for an RBF SVM using cross-validation.

    The corpus at ``trainset`` is read, vectorized and split into
    train/dev/test parts. The training part is divided into folds by
    ``cross_validation_split``; for every (gamma, C) pair a classifier is
    fitted on the union set built for each fold and validated on that fold
    with binary F1. The rounded mean fold score drives model selection, and
    the selected classifier is finally scored (macro F1) on the test part.

    Note that the classifier kept for a grid point is the one fitted for the
    last fold of that grid point. ``testset`` is accepted but not used.

    Raises:
        RuntimeError: if no grid point produced a usable score, so there is
            no classifier to evaluate on the test split.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    _conversion_dict, y = DataService().labels_string_to_float(y)

    x_train, y_train, _x_dev, _y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    dev_sets = DataService().cross_validation_split(x_train, y_train)

    best_accuracy = -inf
    best_classifier = None
    cv_results = {}

    for gamma in arange(0.5, 1.4, 0.15):
        for C in arange(0.5, 2.1, 0.25):
            print("\nProcessing Gamma:", gamma, "C:", C)
            fold_scores = []
            clf = None

            for fold in dev_sets:
                clf = SVM().construct_rbf_classifier(
                    kernel='rbf', gamma=gamma, C=C)
                union_set = DataService().construct_union_set(
                    fold.copy(), dev_sets.copy())

                # fit on the rest of the data
                clf.fit(union_set[0], union_set[1])

                # validate on the current fold
                y_pred = clf.predict(fold[0])
                fold_scores.append(
                    f1_score(y_true=fold[1], y_pred=y_pred,
                             average='binary'))

            if not fold_scores:
                # No folds means nothing was fitted for this grid point.
                continue

            score = round(mean(fold_scores), 3)
            # Dict keys must be hashable: use a tuple, not a list.
            cv_results[(float(C), float(gamma))] = score
            print("Average F1 score for C:", str(C) + ".", score)

            # save the best model and use that to classify the testset
            if score > best_accuracy:
                best_accuracy = score
                best_classifier = clf

    if best_classifier is None:
        raise RuntimeError(
            "cross-validation produced no classifier to evaluate")

    y_pred = best_classifier.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
def experimentLinearKernel(self, trainset, testset):
    """Cross-validate the ``C`` parameter of an L2-penalised linear classifier.

    The corpus at ``trainset`` is read, vectorized and split into
    train/dev/test parts. The training part is divided into folds by
    ``cross_validation_split``; for each ``C`` a classifier is fitted on the
    union set built for each fold and validated on that fold with binary F1.
    The rounded mean of the fold scores drives model selection, and the
    selected classifier is finally scored (macro F1) on the test part.

    Note that the classifier kept for a value of ``C`` is the one fitted for
    the last fold of that value. ``testset`` is accepted but not used.

    Raises:
        RuntimeError: if no value of ``C`` produced a usable score, so there
            is no classifier to evaluate on the test split.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    _conversion_dict, y = DataService().labels_string_to_float(y)

    x_train, y_train, _x_dev, _y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    dev_sets = DataService().cross_validation_split(x_train, y_train)

    best_accuracy = -inf
    best_classifier = None
    cv_results1 = {}

    for C in arange(0.5, 2.25, 0.25):
        print("\nProcessing C:", C)
        average_score1 = []
        clf2 = None

        for fold in dev_sets:
            clf2 = SVM().construct_linear_classifier(penalty='l2', C=C)
            union_set = DataService().construct_union_set(
                fold.copy(), dev_sets.copy())

            # fit on the rest of the data
            clf2.fit(union_set[0], union_set[1])

            # validate on the current fold
            y_pred = clf2.predict(fold[0])
            average_score1.append(
                f1_score(y_true=fold[1], y_pred=y_pred, average='binary'))

        if not average_score1:
            # No folds means nothing was fitted for this value of C.
            continue

        fold_mean = mean(average_score1)
        cv_results1[C] = fold_mean

        # Select on the scores that were actually collected; averaging a
        # list that is never filled can never select a classifier.
        score = round(fold_mean, 3)
        print("Average F1 score for CLF1:", score)

        # save the best model and use that to classify the testset
        if score > best_accuracy:
            best_accuracy = score
            best_classifier = clf2

    if best_classifier is None:
        raise RuntimeError(
            "cross-validation produced no classifier to evaluate")

    y_pred = best_classifier.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
class EnsembleClassifier:
    """Ensemble of a Logistic Regression model and an SVM model.

    Both models are fitted on the same data. At prediction time the two
    predictions are compared sample by sample; where they disagree, the
    prediction of the model with the higher ``score`` on the supplied
    ``(X, y)`` is used.
    """
    # SVM and LR classifiers are combined to make the final model more robust
    # Scratch implementation of SVM : SupportVectorMachine.py
    # Scratch implementation of Logistic Regression : LogisticRegression.py

    # Code by: Yashitha Agarwal (20230091)
    def __init__(self, lrAlpha=0.01, svmAlpha=0.01, iterations=1000):
        """Store the hyperparameters for the two underlying models.

        Args:
            lrAlpha: first argument passed to ``LogisticRegression``.
            svmAlpha: first argument passed to ``SVM``.
            iterations: second argument passed to both models.
        """
        self.lrAlpha = lrAlpha
        self.svmAlpha = svmAlpha
        self.iterations = iterations
        # The models are created in fit(); until then they are None.
        self.lrModel = None
        self.svmModel = None

    # Code by: Yashitha Agarwal (20230091)
    def fit(self, X, y):
        """Create both models from the stored hyperparameters and fit them.

        Args:
            X: independent features, passed unchanged to both models.
            y: dependent feature (labels), passed unchanged to both models.
        """
        self.lrModel = LogisticRegression(self.lrAlpha, self.iterations)
        self.svmModel = SVM(self.svmAlpha, self.iterations)
        self.lrModel.fit(X, y)
        self.svmModel.fit(X, y)

    # Code by: Prakhar Gurawa (20231064)
    def predict(self, X, y):
        """Predict a class for every sample in ``X``.

        Args:
            X: samples to classify.
            y: labels for ``X``; used only to score both models so that
                disagreements can be resolved in favour of the better one.

        Returns:
            A list with one prediction per sample.
        """
        lrScore = self.lrModel.score(X, y)
        svmScore = self.svmModel.score(X, y)
        lrPrediction = self.lrModel.predict(X)
        svmPrediction = self.svmModel.predict(X)

        # The tie-breaker does not depend on the sample, so decide it once.
        # On equal scores the SVM prediction wins.
        preferLr = lrScore > svmScore

        # Case 1: both models agree -> take that class.
        # Case 2: they disagree -> take the class of the higher-scoring model.
        # Future work: with more than two algorithms this becomes a voting
        # classifier where the majority prediction is the final prediction.
        return [
            lrPred if (lrPred == svmPred or preferLr) else svmPred
            for lrPred, svmPred in zip(lrPrediction, svmPrediction)
        ]

    # Code by: Prakhar Gurawa (20231064)
    def score(self, X, y):
        """Return the fraction of samples whose prediction matches ``y``.

        The comparison is done element by element, so ``y`` may be a plain
        list as well as an array.

        Raises:
            ValueError: if ``y`` is empty.
        """
        size = len(y)
        if size == 0:
            raise ValueError("cannot compute a score without any labels")
        predictions = self.predict(X, y)
        matches = sum(
            1 for predicted, actual in zip(predictions, y)
            if predicted == actual
        )
        return matches / size
def experimentBestModel(self, trainset, testset):
    """Fit the pre-configured best classifier and print its test macro F1.

    The corpus at ``trainset`` is read, vectorized and split into
    train/dev/test parts. The classifier returned by
    ``SVM().construct_best_classifier()`` is fitted on the training part and
    scored on the test part. ``testset`` is accepted but not used.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_best_classifier()

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    _conversion_dict, y = DataService().labels_string_to_float(y)

    # Only the train and test parts are needed; no model selection happens
    # in this experiment.
    x_train, y_train, _x_dev, _y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
def experimentFeatures(self, trainset, testset):
    """Compare a linear SVM on all features with one on a trimmed feature set.

    Stage 1 reads the corpus at ``trainset``, vectorizes it, fits a linear
    classifier on the training part and prints accuracy and macro F1 on the
    test portion of the dev split.

    Stage 2 pairs the first row of the fitted coefficients with the feature
    names of a ``TfidfVectorizer`` fitted on the raw data, keeps the features
    with the largest and the smallest coefficients (up to 200 each), rebuilds
    the data with ``get_features_from_data`` and repeats the experiment,
    this time fitting on the train portion of the dev split.

    ``testset`` is accepted but not used.
    """
    top_k = 200        # features kept from each end of the ranking
    preview = 100      # ranked entries printed from each end

    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_classifier("linear", 1.0)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    _conversion_dict, y = DataService().labels_string_to_float(y)

    x_train, y_train, x_dev, y_dev, _x_test, _y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    _x_dev_train, _y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)

    start_time = datetime.utcnow()
    # Report the size of the set that is actually fitted on the next line.
    print('Fitting training data on', len(x_train), 'Samples')
    clf.fit(x_train, y_train)
    training_time = (datetime.utcnow() - start_time).seconds
    print("Training took", training_time, 'seconds..')

    y_pred = clf.predict(x_dev_test)
    print("Accuracy score:",
          accuracy_score(y_pred=y_pred, y_true=y_dev_test))
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_dev_test, average='macro'))

    coef = clf.coef_

    def identity(tokens):
        return tokens

    # Only the fitted vocabulary is needed, not the transformed matrix.
    vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    vec.fit(x)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vec, "get_feature_names_out"):
        names = vec.get_feature_names_out()
    else:
        names = vec.get_feature_names()

    coefs_and_features = list(zip(coef[0], names))
    list_sorted_pos = sorted(coefs_and_features, key=lambda pair: pair[0],
                             reverse=True)
    list_sorted_neg = sorted(coefs_and_features, key=lambda pair: pair[0])

    # Slicing keeps this safe when fewer than top_k features exist.
    features = [name for _, name in list_sorted_pos[:top_k]]
    features.extend(name for _, name in list_sorted_neg[:top_k])

    print("\nneg", list_sorted_neg[:preview],
          "\npos", list_sorted_pos[:preview])

    new_data = DataService().get_features_from_data(x, features)
    clf2 = SVM().construct_classifier("linear", 1.0)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(new_data)
    # NOTE(review): y was already converted above; this second conversion is
    # kept from the original flow - confirm the helper tolerates it.
    _conversion_dict, y = DataService().labels_string_to_float(y)

    _x_train, _y_train, x_dev, y_dev, _x_test, _y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)

    start_time = datetime.utcnow()
    print("\nTRIMMED DATA SET\n----------")
    print('Fitting training data on', len(x_dev_train), 'Samples')
    clf2.fit(x_dev_train, y_dev_train)
    training_time = (datetime.utcnow() - start_time).seconds
    print("Training took", training_time, 'seconds..')

    y_pred = clf2.predict(x_dev_test)
    print("Accuracy score:",
          accuracy_score(y_pred=y_pred, y_true=y_dev_test))
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_dev_test, average='macro'))
from SupportVectorMachine import SVM
import numpy as np

# Toy two-dimensional training set: three samples labelled -1 followed by
# three samples labelled +1.
features = np.array([
    [1, 7],
    [2, 8],
    [3, 8],
    [5, 1],
    [6, -1],
    [7, 3],
])
labels = np.array([-1, -1, -1, 1, 1, 1])

# Fit the scratch SVM on the toy data.
clf = SVM()
clf.fit(features, labels)

# Classify a few unseen points and print each one with its prediction.
predict_us = [
    [0, 10],
    [1, 3],
    [3, 4],
    [3, 5],
]
for p in predict_us:
    prediction = clf.predict(p)
    print(p, prediction)
from SupportVectorMachine import SVM

# Load the iris data file; it has no header row, so columns are numbered.
df = pd.read_csv("../datasets/iris.data", header=None)

# Take the class label (column 4) of the first 100 rows and encode it as
# -1 for 'Iris-setosa' and +1 for everything else.
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', -1, 1)

# Column index legend for the data file (kept as a bare string literal).
""" 0 = sepal length 1 = sepal width 2 = petal length 3 = petal width """

# Use columns 0 and 3 of the same 100 rows as the two input features.
X = df.iloc[0:100, [0, 3]].values

# Fit the scratch SVM on the two selected features.
svm = SVM()
svm.fit(X, y)


def hyperplane(x, w, b, offset):
    """Return the second coordinate of a point on a line of the model.

    Solves ``w[0] * x + w[1] * y - b = offset`` for ``y``.

    Args:
        x: first coordinate (scalar or array).
        w: weights; only ``w[0]`` and ``w[1]`` are read.
        b: bias term.
        offset: value the expression above should equal.
    """
    return (-w[0] * x + b + offset) / w[1]


# Scatter the two classes: rows 0-49 in red, rows 50-99 in blue.
plt.scatter(X[:50, 0], X[:50, 1], color='red', marker='o', label='setosa')
plt.scatter(X[50:100, 0], X[50:100, 1], color='blue', marker='x', label='versicolor')

# Range of the first feature.
x_max = np.amax(X[:, 0])
x_min = np.amin(X[:, 0])