import numpy as np

def test_svm():
    from models.svm import SVM
    # class -1 is spread over 40 clusters on a radius-5 ring;
    # class +1 stays a Gaussian blob at the origin
    x, y = np.random.randn(2, 400, 2), np.zeros([2, 400], dtype=int)
    y[0] = -1
    y[1] = 1
    for i, theta in enumerate(np.linspace(0, 2 * np.pi, 40)):
        x[0, (10 * i):(10 * i + 10)] += 5 * np.array([np.cos(theta), np.sin(theta)])
    x = x.reshape(-1, 2)
    y = y.flatten()
    # plot_scatter: see the helper sketch after this function
    plot_scatter([x[y == i] for i in [-1, 1]], 'Real')
    # train
    svm = SVM(C=10, sigma=1, kernel='rbf', max_iter=100)
    svm.fit(x, y)
    pred = np.array(svm.predict(x))
    plot_scatter([x[pred == i] for i in [-1, 1]], 'Pred')
    acc = np.sum(pred == y) / len(pred)
    print(f'Acc = {100 * acc:.2f}%')
    print(svm.support_vectors)
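# The test above relies on a plot_scatter(groups, title) helper that is not
# defined in this snippet. A minimal sketch of what such a helper could look
# like, assuming matplotlib; the signature is taken from the call sites above,
# the body is an assumption.
import matplotlib.pyplot as plt

def plot_scatter(groups, title):
    # groups: list of (n_i, 2) point arrays, one entry per class
    for pts in groups:
        plt.scatter(pts[:, 0], pts[:, 1], s=8)
    plt.title(title)
    plt.show()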
import numpy as np

def main():
    # trainLabelFile = '/tmp2/yucwang/data/mongo/train.csv'
    # trainPrefix = '/tmp2/yucwang/data/mongo/C1-P1_Train/'
    # validLabelFile = '/tmp2/yucwang/data/mongo/dev.csv'
    # validPrefix = '/tmp2/yucwang/data/mongo/C1-P1_Dev/'
    #
    # trainX, trainY = extractFeatures(trainLabelFile, trainPrefix)
    # validX, validY = extractFeatures(validLabelFile, validPrefix)
    #
    # np.save('./train_x.npy', trainX)
    # np.save('./train_y.npy', trainY)
    # np.save('./val_x.npy', validX)
    # np.save('./val_y.npy', validY)

    trainX = np.load('./bin/exp2/train_x.npz.npy')
    trainY = np.load('./bin/exp2/train_y.npz.npy')
    validX = np.load('./bin/exp2/val_x.npz.npy')
    validY = np.load('./bin/exp2/val_y.npz.npy')

    model = SVM(penalty='l2', loss='squared_hinge', C=0.85, maxIter=2000)
    print("SVM: Training started.")
    model.train(trainX, trainY)
    print("SVM: Validation started.")
    acc, metrics = model.valid(validX, validY, classNum=3)
    print(acc)
    print(metrics)
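# model.valid above returns an overall accuracy plus a per-class metrics
# object whose exact layout is not shown. A minimal sketch of the kind of
# computation it presumably performs, assuming integer labels 0..classNum-1;
# the helper name and the returned dict format are assumptions.
import numpy as np

def per_class_metrics(yTrue, yPred, classNum=3):
    acc = float(np.mean(yTrue == yPred))
    metrics = []
    for c in range(classNum):
        tp = np.sum((yPred == c) & (yTrue == c))
        precision = tp / max(np.sum(yPred == c), 1)
        recall = tp / max(np.sum(yTrue == c), 1)
        metrics.append({'class': c, 'precision': precision, 'recall': recall})
    return acc, metrics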
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# SVM, utils, and the LEARNING_RATE/BATCH_SIZE/NUM_CLASSES constants are
# project-local and assumed to be defined elsewhere in this module.

def main(arguments):
    # load the dataset once: features, then the corresponding labels
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    # standardize the features
    features = StandardScaler().fit_transform(features)
    # get the number of features
    num_features = features.shape[1]
    labels = dataset.target
    # transform the labels to {-1, +1}
    labels[labels == 0] = -1
    # split the dataset into a 70/30 partition: 70% train, 30% test
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)
    train_size = train_features.shape[0]
    test_size = test_features.shape[0]
    # truncate each split to a multiple of the batch size
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]
    # instantiate the SVM class
    model = SVM(
        alpha=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        svm_c=arguments.svm_c,
        num_classes=NUM_CLASSES,
        num_features=num_features,
    )
    # train the instantiated model
    model.train(
        epochs=arguments.num_epochs,
        log_path=arguments.log_path,
        train_data=[train_features, train_labels],
        train_size=train_features.shape[0],
        validation_data=[test_features, test_labels],
        validation_size=test_features.shape[0],
        result_path=arguments.result_path,
    )
    test_conf, test_accuracy = utils.plot_confusion_matrix(
        phase="testing", path=arguments.result_path,
        class_names=["benign", "malignant"])
    print("True negatives : {}".format(test_conf[0][0]))
    print("False negatives : {}".format(test_conf[1][0]))
    print("True positives : {}".format(test_conf[1][1]))
    print("False positives : {}".format(test_conf[0][1]))
    print("Testing accuracy : {}".format(test_accuracy))
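# The SVM class above takes a learning rate (alpha), a batch size, and a
# soft-margin constant (svm_c), which suggests minibatch gradient descent on
# an L2-regularized hinge loss. A minimal NumPy sketch of one such update for
# binary labels in {-1, +1}; this illustrates the objective only and is not
# the implementation of the SVM class used above.
import numpy as np

def hinge_sgd_step(w, b, xb, yb, alpha, svm_c):
    # objective: 0.5 * ||w||^2 + svm_c * mean_i max(0, 1 - y_i * (x_i.w + b))
    margins = yb * (xb @ w + b)
    viol = margins < 1  # only margin violators contribute a gradient
    grad_w = w - svm_c * (yb[viol, None] * xb[viol]).sum(axis=0) / len(xb)
    grad_b = -svm_c * yb[viol].sum() / len(xb)
    return w - alpha * grad_w, b - alpha * grad_b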
def get_optimal_polarity_classifier():
    """
    Trains and returns the optimal polarity classifier.
    """
    tweets = utils.get_pickles(3)
    tweets, targets = utils.make_polarity_targets(tweets)
    vect_options = {
        'ngram_range': (1, 1),
        'max_df': 0.5
    }
    tfidf_options = {
        'sublinear_tf': False,
        'use_idf': True,
        'smooth_idf': True,
    }
    clf = SVM(tweets, targets, vect_options, tfidf_options)
    clf.set_feature_set('PC2', features.get_google_sentiment_values(3))
    clf.train_on_feature_set()
    return clf
def train_svm():
    data_helper = DataHelper()
    train_text, train_labels, ver_text, ver_labels, test_text, test_labels = \
        data_helper.get_data_and_labels()
    stopwords = data_helper.get_stopwords()
    svm = SVM(train_text, train_labels, ver_text, ver_labels,
              test_text, test_labels, stopwords)
    svm.train()
    svm.verification()
    print('ver_acc: {:.3}'.format(svm.ver_acc))
    svm.test()
    print('test_acc: {:.3}'.format(svm.test_acc))
def run_model(args, X, y, ensembler=False):
    model = None
    if args['model'] == 'logistic':
        logistic = Logistic(X, y, model)
        model = logistic.train_model()
    elif args['model'] == 'knn':
        knn = KNN(X, y, model)
        model = knn.train_model()
    elif args['model'] == 'svm':
        svm = SVM(X, y, model)
        model = svm.train_model()
    elif args['model'] == 'rfa':
        rfa = RandomForest(X, y, model)
        model = rfa.train_model(ensembler)
    elif args['model'] == 'xgb':
        xgb = XGB(X, y, model)
        model = xgb.train_model(ensembler)
    elif args['model'] == 'lgbm':
        lgbm = LightGBM(X, y, model)
        model = lgbm.train_model(ensembler)
    elif args['model'] == 'catboost':
        catboost = CatBoost(X, y, model)
        model = catboost.train_model(ensembler)
    elif len(args['models']) > 1:
        # recursively train each named base model, then ensemble them
        models = [('', None)] * len(args['models'])
        for i in range(len(args['models'])):
            model_name = args['models'][i]
            temp_args = copy.deepcopy(args)
            temp_args['model'] = model_name
            models[i] = (model_name, run_model(temp_args, X, y, True))
        # renamed locally to avoid shadowing the boolean `ensembler` parameter
        ensemble = Ensembler(X, y, model, args['ensembler_type'])
        model = ensemble.train_model(models)
        return model
    else:
        print('\nInvalid model name :-|\n')
        exit()
    return model
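# Example calls, assuming the dict layout run_model reads; the names and
# values below are placeholders, not values from the original project.
#
#   # single model:
#   model = run_model({'model': 'svm', 'models': []}, X, y)
#
#   # ensemble over several base models (any non-matching 'model' value
#   # falls through to the ensemble branch when 'models' lists > 1 name):
#   model = run_model({'model': 'ensemble',
#                      'models': ['rfa', 'xgb', 'lgbm'],
#                      'ensembler_type': 'stacking'}, X, y)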
def get_model(args, parallel=True, ckpt_path=False):
    if args.clf == 'fcn':
        print('Initializing FCN...')
        model = FCN(args.input_size, args.output_size)
    elif args.clf == 'mlp':
        print('Initializing MLP...')
        model = MLP(args.input_size, args.output_size)
    elif args.clf == 'svm':
        print('Initializing SVM...')
        model = SVM(args.input_size, args.output_size)
    elif args.clf == 'cnn':
        print('Initializing CNN...')
        model = CNN(nc=args.num_channels, fs=args.cnn_view)
    elif args.clf == 'resnet18':
        print('Initializing ResNet18...')
        model = resnet.resnet18(num_channels=args.num_channels,
                                num_classes=args.output_size)
    elif args.clf == 'vgg19':
        print('Initializing VGG19...')
        model = VGG(vgg_name=args.clf, num_channels=args.num_channels,
                    num_classes=args.output_size)
    elif args.clf == 'unet':
        print('Initializing UNet...')
        model = UNet(in_channels=args.num_channels,
                     out_channels=args.output_size)

    num_params, num_layers = get_model_size(model)
    print("# params: {}\n# layers: {}".format(num_params, num_layers))

    if ckpt_path:
        model.load_state_dict(torch.load(ckpt_path))
        print('Load init: {}'.format(ckpt_path))

    if parallel:
        model = nn.DataParallel(model.to(get_device(args)),
                                device_ids=args.device_id)
    else:
        model = model.to(get_device(args))

    loss_type = 'hinge' if args.clf == 'svm' else args.loss_type
    print("Loss: {}".format(loss_type))
    return model, loss_type
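# For the 'svm' branch above, the loss type is forced to 'hinge'. A minimal
# sketch of a linear SVM module paired with a multiclass hinge loss in
# PyTorch (nn.MultiMarginLoss is PyTorch's multiclass hinge); the module
# layout here is an assumption, not the SVM class used by get_model.
import torch
import torch.nn as nn

class LinearSVM(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)  # one margin score per class

    def forward(self, x):
        return self.fc(x)

# usage sketch with dummy shapes
svm = LinearSVM(784, 10)
criterion = nn.MultiMarginLoss()  # multiclass hinge loss
loss = criterion(svm(torch.randn(8, 784)), torch.randint(0, 10, (8,)))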
def get_model(config, dataset):
    '''
    Add a branch here if you have added a model to the models folder.
    '''
    if config.model_type == 'linreg':
        model = LinReg(config, dataset)
    elif config.model_type == 'knnclass':
        model = KNNClassifier(config, dataset)
    elif config.model_type == 'knnreg':
        model = KNNRegressor(config, dataset)
    elif config.model_type == 'svm':
        model = SVM(config, dataset)
    elif config.model_type == 'logreg':
        model = LogisticRegression(config, dataset)
    elif config.model_type == 'mlpclass':
        model = MLPClassifier(config, dataset)
    elif config.model_type == 'mlpreg':
        model = MLPRegressor(config, dataset)
    else:
        # fail loudly instead of hitting UnboundLocalError on `model` below
        raise ValueError('Unknown model_type: {}'.format(config.model_type))
    return model
dp = DecayingPerceptron()
dp.train(learning_rates)
dp.report()
dp.evaluate()

ap = AveragedPerceptron()
ap.train(learning_rates)
ap.report()
ap.evaluate()

############################################
######            Part II         ##########
############################################

svm = SVM(verbose=True)
svm.train(epochs=20)
hm.report(svm)
hm.evaluate(svm)

lr = LogisticRegression(verbose=True)
lr.train(epochs=20)
hm.report(lr)
hm.evaluate(lr)

nb = NaiveBayes()
nb.train(epochs=1)
hm.report(nb)
hm.evaluate(nb)

# Logistic regression using sklearn
def perform_grid_search_on_featureset_SA_and_PA():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # subjectivity analysis (SA)
    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = \
        utils.make_subjectivity_train_and_test_and_targets(tweets, sentimentvalues)
    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")

    # polarity analysis (PA)
    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = \
        utils.make_polarity_train_and_test_and_targets(tweets, sentimentvalues)
    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
args = vars(ap.parse_args())
args = Struct(**args)

if 'fcn' in args.models:
    print("Initializing FCN...")
    model = FCN(cfg.input_sizes[args.dataset], cfg.output_sizes[args.dataset])
    print('input_size: {}, output_size: {}'.format(
        model.input_size, model.output_size))
    init_path = '../ckpts/init/{}_fcn.init'.format(args.dataset)
    torch.save(model.state_dict(), init_path)
    print('Save init: {}'.format(init_path))

if 'svm' in args.models:
    print("Initializing SVM...")
    model = SVM(cfg.input_sizes[args.dataset], cfg.output_sizes[args.dataset])
    print('input_size: {}, output_size: {}'.format(
        model.n_feature, model.n_class))
    init_path = '../ckpts/init/{}_svm.init'.format(args.dataset)
    torch.save(model.state_dict(), init_path)
    print('Save init: {}'.format(init_path))

if 'resnet18' in args.models:
    print("Initializing ResNet18...")
    model = resnet.resnet18(num_channels=cfg.num_channels[args.dataset],
                            num_classes=cfg.output_sizes[args.dataset])
    print('num_channels: {}, output_size: {}'.format(
        model.num_channels, model.num_classes))
    init_path = '../ckpts/init/{}_resnet18.init'.format(args.dataset)
    torch.save(model.state_dict(), init_path)
    print('Save init: {}'.format(init_path))
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_blobs
from models.svm import SVM
import matplotlib.pyplot as plt
from validation.classification import A_micro_average
from preprocessing.features_enginering import normalize_dataset
from preprocessing.split import train_test_split

# X, y = load_wine(return_X_y=True)
X, y = make_blobs(1000, centers=3)
# normalize_dataset(X)
x_train, y_train, x_test, y_test = train_test_split(X, y, .8)

plt.scatter(x=X[:, 0], y=X[:, 1], c=y)
plt.show()

# %%
svm = SVM(C=1)
svm.fit(x_train, y_train)

# %%
res = svm.predict(x_test)
A = A_micro_average(y_test, res)

# %%
from mlxtend.plotting import plot_decision_regions

plot_decision_regions(X=X, y=y, clf=svm)
plt.show()
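# A_micro_average above is a project-local metric. For single-label
# multiclass predictions, micro-averaged precision, recall, and accuracy all
# reduce to the overall fraction of correct predictions; a minimal sketch
# under that assumption (the function name here is illustrative):
import numpy as np

def micro_average_accuracy(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(y_true == y_pred)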
dataset_texts = list(training_texts)
dataset_texts.extend(test_texts)

splits = StratifiedKFold(num_folds).split(dataset_embeddings, dataset_labels)

test_step = 0
bestScore = 0
bestTestSet = None
bestTestInput = None
bestTP = None
bestTN = None
bestFP = None
bestFN = None
bestTexts = None
proportions = []

for train_index, val_index in splits:
    model = SVM()
    training_dataset_embeddings = np.asarray(
        [dataset_embeddings[i] for i in train_index])
    training_ex_emb = np.asarray(
        [dataset_ex_embeddings[i] for i in train_index])
    training_embeddings_bert = []
    if use_bert:
        training_embeddings_bert = np.asarray(
            [dataset_bert_vectors[i] for i in train_index])
    test_dataset_embeddings = np.asarray(
        [dataset_embeddings[i] for i in val_index])
    test_ex_emb = np.asarray(
        [dataset_ex_embeddings[i] for i in val_index])
    test_embeddings_bert = []
    if use_bert:
        test_embeddings_bert = np.asarray(
            [dataset_bert_vectors[i] for i in val_index])
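# The fold loop above assembles word-embedding features and optional BERT
# vectors, but the snippet ends before the model is fitted. A hedged sketch
# of the step that plausibly follows: concatenating the per-sample feature
# blocks before training. The helper and the concatenation order are
# assumptions, not code from the original project.
import numpy as np

def build_fold_features(emb, ex_emb, bert_vectors, use_bert):
    blocks = [emb, ex_emb]
    if use_bert:
        blocks.append(bert_vectors)
    return np.concatenate(blocks, axis=1)  # one row of stacked features per sample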
def main():
    # read file names
    parser = argparse.ArgumentParser()
    parser.add_argument("xTrain",
                        help="filename for features of the training data")
    parser.add_argument("yTrain",
                        help="filename for labels associated with training data")
    parser.add_argument("xTest",
                        help="filename for features of the test data")
    args = parser.parse_args()

    # load the train and test data (assumes numpy-compatible CSVs)
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    colNames = list(xTrain.keys())
    # visualize(xTrain, yTrain, colNames)

    models = {
        'boost': Boost(5, .2, 5),
        'dt': DT(25, 1, 'entropy'),
        'knn': KNN(1),
        'nb': NB(),
        'rf': RF(51, 25, 'gini', 25, 1),
        'svm': SVM(.1, 'poly', 3, .01)
    }
    X = xTrain.to_numpy()
    Y = yTrain.to_numpy()

    # train each base model and collect its training-set predictions,
    # one column per model (note: basePreds is not consumed below;
    # see the stacking sketch after this function)
    basePreds = []
    for k in models:
        models[k].train(X, Y)
        basePreds.append(list(models[k].predict(X)))
    basePreds = np.transpose(np.array(basePreds))

    # cross-validate the metalearner on the raw features
    metalearner = Boost(5, .2, 5)
    nfolds = 3
    kf = KFold(nfolds)
    trIndices = []
    tsIndices = []
    for tr, ts in kf.split(X):
        trIndices.append(tr)
        tsIndices.append(ts)
    total = 0
    for i in range(nfolds):
        metalearner.train(X[trIndices[i], :], Y[trIndices[i], :])
        acc = metalearner.predAcc(X[tsIndices[i], :], Y[tsIndices[i], :])
        total += acc / nfolds
    print("ACC: ", total)

    # retrain on the full training set and predict the test set
    metalearner.train(X, Y)
    testPreds = metalearner.predict(xTest.to_numpy())
    finalPreds = np.array([list(range(len(xTest))), testPreds]).transpose()
    finalPreds = pd.DataFrame(finalPreds, columns=['Id', 'Cover_Type'])
    finalPreds.to_csv('finalPredictions.csv', index=False)
    # print(finalPreds)

    # plot the class frequencies of the predictions
    freq = Counter(list(testPreds))
    labelMap = {
        1: 'Spruce/Fir',
        2: 'Lodgepole Pine',
        3: 'Ponderosa Pine',
        4: 'Cottonwood/Willow',
        5: 'Aspen',
        6: 'Douglas-fir',
        7: 'Krummholz'
    }
    label = [labelMap[k] for k in freq.keys()]
    no_trees = [freq[k] for k in freq.keys()]
    index = np.arange(len(label))
    plt.bar(index, no_trees)
    plt.xlabel('Cover type', fontsize=12)
    plt.ylabel('Number of samples', fontsize=12)
    plt.xticks(index, label, fontsize=12, rotation=30)
    plt.title('Class Frequency in prediction')
    plt.show()
    return
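# main() above builds basePreds (one column of training-set predictions per
# base model) but then trains the metalearner on the raw features X. A
# minimal sketch of the usual stacking alternative, feeding the base models'
# predictions to the metalearner instead; whether that was the original
# intent is an assumption.
import numpy as np

def stack_predictions(models, X):
    # one column per base model, one row per sample
    return np.transpose(np.array([models[k].predict(X) for k in models]))

# usage sketch:
#   metaX = stack_predictions(models, X)
#   metalearner.train(metaX, Y)
#   testPreds = metalearner.predict(stack_predictions(models, xTest.to_numpy()))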