def main(arguments):
    # load the dataset once: features and their corresponding labels
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    labels = dataset.target

    # standardize the features
    features = StandardScaler().fit_transform(features)

    # get the number of features
    num_features = features.shape[1]

    # transform the labels to {-1, +1}
    labels[labels == 0] = -1

    # split the dataset into a 70/30 partition: 70% train, 30% test
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels
    )

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # truncate each split to a multiple of the batch size
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    # instantiate the SVM class
    model = SVM(
        alpha=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        svm_c=arguments.svm_c,
        num_classes=NUM_CLASSES,
        num_features=num_features,
    )

    # train the instantiated model
    model.train(
        epochs=arguments.num_epochs,
        log_path=arguments.log_path,
        train_data=[train_features, train_labels],
        train_size=train_features.shape[0],
        validation_data=[test_features, test_labels],
        validation_size=test_features.shape[0],
        result_path=arguments.result_path,
    )

    # report the confusion matrix and accuracy on the test split
    test_conf, test_accuracy = utils.plot_confusion_matrix(
        phase="testing",
        path=arguments.result_path,
        class_names=["benign", "malignant"],
    )

    print("True negatives : {}".format(test_conf[0][0]))
    print("False negatives : {}".format(test_conf[1][0]))
    print("True positives : {}".format(test_conf[1][1]))
    print("False positives : {}".format(test_conf[0][1]))
    print("Testing accuracy : {}".format(test_accuracy))
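# A minimal sketch of how main(arguments) might be invoked from the command
# line. The flag names mirror the attributes the function reads (svm_c,
# num_epochs, log_path, result_path); the defaults and help strings are
# assumptions, not the repository's actual parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Train an SVM on the Wisconsin breast cancer dataset")
    parser.add_argument("--svm_c", type=float, default=1.0,
                        help="penalty parameter C of the SVM")
    parser.add_argument("--num_epochs", type=int, default=1000,
                        help="number of training epochs")
    parser.add_argument("--log_path", type=str, required=True,
                        help="directory where training logs are written")
    parser.add_argument("--result_path", type=str, required=True,
                        help="directory where results are written")
    main(parser.parse_args())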
def main():
    reviews = retrieve_reviews(5000)

    # Split reviews into a training and a testing portion
    train_reviews = reviews[:4500]
    test_reviews = reviews[4500:]

    # Separate text and label to use during the training process
    text, labels = zip(*train_reviews)

    vector = FeatureVector()

    # Add features into the feature vector
    vector.append(sentiment.SentimentAnalysis())
    vector.append(tfidf.TfIdf())
    vector.append(readability.Readability())
    vector.append(food_sophistication.FoodSophistication())
    vector.append(average_word_length.AverageWordLength())
    vector.append(rarity.Rarity())
    vector.append(spelling.Spelling())
    vector.append(sentence_topic.SentenceTopic())

    # Train all of the features individually
    vector.train(text, labels)

    model = SVM(vector)
    model.train(text, labels)

    # Separate text and label to use during the testing process
    text, labels = zip(*test_reviews)

    matches = 0
    distances = {}
    for i in range(len(labels)):
        predicted_score = model.predict(text[i])
        actual_score = labels[i]

        # count how many predicted scores match the actual ones
        if predicted_score == actual_score:
            matches += 1

        # build a histogram of how far predicted scores differ from the actual
        dist = abs(predicted_score - actual_score)
        distances[dist] = distances.get(dist, 0) + 1

    print('Matches = {:.2%}'.format(matches / len(labels)))
    for dist, count in distances.items():
        print("{} : {}".format(dist, count))
def main():
    reviews = retrieve_reviews(5000)

    # Split reviews into a training and a testing portion
    train_reviews = reviews[:4000]
    test_reviews = reviews[4000:]

    # Separate text and label to use during the training process
    text, labels = zip(*train_reviews)

    vector = FeatureVector()

    # Add features into the feature vector
    vector.append(average_word_length.AverageWordLength())
    vector.append(sentiment_analysis.SentimentAnalysis())
    vector.append(rarity_analysis.Rarity())
    vector.append(tfidf.TfIdf())
    vector.append(readability.Readability())
    vector.append(spelling.Spelling())

    # Train all of the features individually
    vector.train(text, labels)

    model = SVM(vector)
    model.train(text, labels)

    # Separate text and label to use during the testing process
    text, labels = zip(*test_reviews)

    matches = 0
    distances = {}
    for i in range(len(labels)):
        predicted_score = model.predict(text[i])
        actual_score = labels[i]

        # count how many predicted scores match the actual ones
        if predicted_score == actual_score:
            matches += 1

        # build a histogram of how far predicted scores differ from the actual
        dist = abs(predicted_score - actual_score)
        distances[dist] = distances.get(dist, 0) + 1

    print('Matches = {0:.2f}%'.format((matches / len(labels)) * 100))
    for dist, count in distances.items():
        print("{} : {}".format(dist, count))
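# Both mains above assume retrieve_reviews(n) yields a sequence of
# (text, score) pairs that zip(*reviews) can unpack. This hypothetical stub
# reproduces only that shape, so the pipeline can be exercised without the
# real data source; the actual function's output may differ.
def retrieve_reviews_stub(n):
    sample = [
        ("The food was excellent and the service friendly.", 5),
        ("Mediocre at best; I would not come back.", 2),
    ]
    # cycle through the canned reviews to produce n (text, score) pairs
    return [sample[i % len(sample)] for i in range(n)]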
def svm_experiment(train_data, validation_data, test_data):
    log.write("Preparing data training")

    # merge train and validation, then build the feature matrix: one row of
    # hand-crafted attributes per sentence, paired with its gold label
    train_data = flatten([train_data, validation_data])
    train_feature_matrix = []
    train_label_vector = []
    for doc in train_data:
        for idx, sentence in enumerate(flatten(doc["paragraphs"])):
            sentence_feature = []
            for attr in feature_attr_name:
                sentence_feature.append(doc[attr][idx])
            train_feature_matrix.append(sentence_feature)
            train_label_vector.append(flatten(doc["gold_labels"])[idx])

    # keep one randomly chosen twentieth of the training data
    n_data = len(train_feature_matrix)
    split_length = n_data // 20
    offset = np.random.randint(20) * split_length
    length = offset + split_length
    train_feature_matrix = np.array(train_feature_matrix[offset:length])
    train_label_vector = np.array(train_label_vector[offset:length])

    # rebalance the classes before fitting
    train_feature_matrix, train_label_vector = negative_sampling(
        train_feature_matrix, train_label_vector)

    log.write("Preparing data testing")
    test_feature_matrix = []
    for doc in test_data:
        for idx, sentence in enumerate(flatten(doc["paragraphs"])):
            sentence_feature = []
            for attr in feature_attr_name:
                sentence_feature.append(doc[attr][idx])
            test_feature_matrix.append(sentence_feature)
    test_feature_matrix = np.array(test_feature_matrix)

    log.write("Training SVM")
    conf = {"kernel": "rbf_kernel", "sigma": 1, "C": 100}
    log.write(conf)
    svm_clf = SVM(kernel=conf["kernel"], C=conf["C"], sigma=conf["sigma"])
    svm_clf.fit(train_feature_matrix, train_label_vector)

    log.write("Testing SVM")
    t1 = time.time()
    predicted_labels, val = svm_clf.predict(test_feature_matrix)
    t2 = time.time()
    print('Elapsed time: {}'.format(timedelta(seconds=t2 - t1)))

    # map the SVM's {-1, +1} outputs back to {0, 1} labels
    predicted_labels = [1 if i > -1 else 0 for i in predicted_labels]
    return predicted_labels
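# negative_sampling is called above but not defined in this file. A plausible
# reading of the name is that it undersamples the majority class so the SVM
# trains on a balanced set. This sketch is an illustration, not the
# repository's actual implementation; it assumes binary 0/1 labels in a numpy
# array and that numpy is imported as np, as in svm_experiment above.
def negative_sampling(features, labels):
    pos_idx = np.where(labels == 1)[0]
    neg_idx = np.where(labels == 0)[0]
    # keep every positive and an equally sized random subset of negatives
    kept_neg = np.random.choice(
        neg_idx, size=min(len(pos_idx), len(neg_idx)), replace=False)
    keep = np.concatenate([pos_idx, kept_neg])
    np.random.shuffle(keep)
    return features[keep], labels[keep]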
def test_train_improper_arguments(self):
    fv = FeatureVector()
    fv.append(TestSVM.BogusFeature())
    fv.append(TestSVM.BogusFeature())
    fv.append(TestSVM.BogusFeature())

    # training must reject a label matrix with fewer rows than reviews
    with pytest.raises(TypeError):
        SVM(feature_vector=fv).train(reviews=TestSVM.sample_reviews,
                                     labels=TestSVM.sample_labels[:-1, :])
def test_svm_predict(self):
    fv = FeatureVector()
    fv.append(TestSVM.BogusFeature())
    fv.append(TestSVM.BogusFeature())
    fv.append(TestSVM.BogusFeature())

    svm = SVM(feature_vector=fv)
    svm.train(reviews=TestSVM.sample_reviews, labels=TestSVM.sample_labels)

    # predictions must fall in the binary label set {0, 1}
    assert svm.predict(['HI']) in (0, 1)
    assert svm.predict(['earth']) in (0, 1)
def test_train(self):
    fv = FeatureVector()
    fv.append(TestSVM.BogusFeature())
    fv.append(TestSVM.BogusFeature())
    fv.append(TestSVM.BogusFeature())

    # training on well-formed inputs should not raise
    try:
        SVM(feature_vector=fv).train(reviews=TestSVM.sample_reviews,
                                     labels=TestSVM.sample_labels)
    except Exception:
        pytest.fail('SVM training failed')
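# The three tests above rely on a TestSVM.BogusFeature fixture that is not
# shown here. A minimal sketch of what such a stand-in could look like,
# assuming the FeatureVector interface expects a train() method and a
# per-review scoring method; the real fixture's interface may differ.
class BogusFeature:
    def train(self, reviews, labels):
        # a bogus feature has nothing to learn
        pass

    def score(self, review):
        # a constant output keeps the tests deterministic
        return 0.0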
def __init__(self, classifier_type, classifier_params, kernel_name,
             kernel_params, id_model=0):
    # define the kernel; the tag is used when saving/loading kernel matrices
    self.kernel = Kernel(kernel_name, kernel_params)
    self.tag_kernel = str(id_model) + "_kernel_" + kernel_name

    # define the classifier
    if classifier_type == "svm":
        self.classifier = SVM(classifier_params)
    elif classifier_type == "l_regression":
        self.classifier = LogisticRegression(lambda_regularisation=0.01)

    # kernel matrices
    self.kernel_mat_train = None
    self.kernel_mat_val = None
    self.kernel_mat_test = None

    # save/load settings for the kernel matrices
    self.do_save_kernel = kernel_params["save_kernel"]
    self.save_name = kernel_params["save_name"]
    if kernel_params["load_kernel"]:
        self.kernel_mat_train = load_object(
            "train_" + self.tag_kernel + "_" + kernel_params["load_name"])
        self.kernel_mat_val = load_object(
            "val_" + self.tag_kernel + "_" + kernel_params["load_name"])
        self.kernel_mat_test = load_object(
            "test_" + self.tag_kernel + "_" + kernel_params["load_name"])
        if self.kernel_mat_train is None:
            print("## kernel load failed: kernel not found")
        else:
            print("## kernel matrix loaded")
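# A usage sketch for the constructor above, assuming the enclosing class is
# named Model (hypothetical; the source does not show the class name). The
# kernel_params keys follow what __init__ reads (save_kernel, save_name,
# load_kernel, load_name); the kernel name and classifier parameters are
# illustrative guesses.
kernel_params = {
    "save_kernel": False,
    "save_name": "demo",
    "load_kernel": False,
    "load_name": "demo",
}
model = Model(
    classifier_type="svm",
    classifier_params={"C": 1.0},  # assumed shape of the SVM parameters
    kernel_name="gaussian",        # illustrative kernel name
    kernel_params=kernel_params,
    id_model=0,
)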