Example #1
def test_svm():
    import numpy as np
    from models.svm import SVM

    # two classes of 400 2-D points each: class 1 stays in a Gaussian
    # blob at the origin, class -1 is pushed out onto a ring of radius 5
    x, y = np.random.randn(2, 400, 2), np.zeros([2, 400], dtype=int)
    y[0] = -1
    y[1] = 1
    for i, theta in enumerate(np.linspace(0, 2 * np.pi, 40)):
        offset = 5 * np.array([np.cos(theta), np.sin(theta)])
        x[0, 10 * i:10 * i + 10] += offset

    x = x.reshape(-1, 2)
    y = y.flatten()

    plot_scatter([x[y == i] for i in [-1, 1]], 'Real')

    # train
    svm = SVM(C=10, sigma=1, kernel='rbf', max_iter=100)
    svm.fit(x, y)

    pred = np.array(svm.predict(x))
    plot_scatter([x[pred == i] for i in [-1, 1]], 'Pred')
    acc = np.sum(pred == y) / len(pred)
    print(f'Acc = {100 * acc:.2f}%')
    print(svm.support_vectors)
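
The call to plot_scatter above isn't defined in the snippet. A minimal sketch of a compatible helper, assuming it takes a list of per-class (n, 2) point arrays and a window title (both inferred from the call sites):

import matplotlib.pyplot as plt

def plot_scatter(class_points, title):
    # draw one scatter layer per class so each class gets its own color
    for points in class_points:
        plt.scatter(points[:, 0], points[:, 1], s=10)
    plt.title(title)
    plt.show()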
Example #2
def main():
    #    trainLabelFile = '/tmp2/yucwang/data/mongo/train.csv'
    #    trainPrefix = '/tmp2/yucwang/data/mongo/C1-P1_Train/'
    #    validLabelFile = '/tmp2/yucwang/data/mongo/dev.csv'
    #    validPrefix = '/tmp2/yucwang/data/mongo/C1-P1_Dev/'
    #
    #    trainX, trainY = extractFeatures(trainLabelFile, trainPrefix)
    #    validX, validY = extractFeatures(validLabelFile, validPrefix)
    #
    #    np.save('./train_x.npy', trainX)
    #    np.save('./train_y.npy', trainY)
    #    np.save('./val_x.npy', validX)
    #    np.save('./val_y.npy', validY)

    trainX = np.load('./bin/exp2/train_x.npz.npy')
    trainY = np.load('./bin/exp2/train_y.npz.npy')
    validX = np.load('./bin/exp2/val_x.npz.npy')
    validY = np.load('./bin/exp2/val_y.npz.npy')

    model = SVM(penalty='l2', loss='squared_hinge', C=0.85, maxIter=2000)
    print("SVM: Training get started.")
    model.train(trainX, trainY)

    print("SVM: Validation get started.")
    acc, metrics = model.valid(validX, validY, classNum=3)
    print(acc)
    print(metrics)
Example #3
def main(arguments):
    # load the features of the dataset
    features = datasets.load_breast_cancer().data

    # standardize the features
    features = StandardScaler().fit_transform(features)

    # get the number of features
    num_features = features.shape[1]

    # load the corresponding labels for the features
    labels = datasets.load_breast_cancer().target

    # transform the labels to {-1, +1}
    labels[labels == 0] = -1

    # split the dataset to 70/30 partition: 70% train, 30% test
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset as per the batch size
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    # instantiate the SVM class
    model = SVM(
        alpha=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        svm_c=arguments.svm_c,
        num_classes=NUM_CLASSES,
        num_features=num_features,
    )

    # train the instantiated model
    model.train(
        epochs=arguments.num_epochs,
        log_path=arguments.log_path,
        train_data=[train_features, train_labels],
        train_size=train_features.shape[0],
        validation_data=[test_features, test_labels],
        validation_size=test_features.shape[0],
        result_path=arguments.result_path,
    )

    test_conf, test_accuracy = utils.plot_confusion_matrix(
        phase="testing",
        path=arguments.result_path,
        class_names=["benign", "malignant"])

    print("True negatives : {}".format(test_conf[0][0]))
    print("False negatives : {}".format(test_conf[1][0]))
    print("True positives : {}".format(test_conf[1][1]))
    print("False positives : {}".format(test_conf[0][1]))
    print("Testing accuracy : {}".format(test_accuracy))
Example #5
def get_optimal_polarity_classifier():
    """
    Trains and returns the optimal polarity classifier.
    """
    tweets = utils.get_pickles(3)
    tweets, targets = utils.make_polarity_targets(tweets)
    vect_options = {
        'ngram_range': (1, 1),
        'max_df': 0.5,
    }
    tfidf_options = {
        'sublinear_tf': False,
        'use_idf': True,
        'smooth_idf': True,
    }
    clf = SVM(tweets, targets, vect_options, tfidf_options)
    clf.set_feature_set('PC2', features.get_google_sentiment_values(3))
    clf.train_on_feature_set()
    return clf
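
The vect_options and tfidf_options dicts mirror parameters of scikit-learn's CountVectorizer and TfidfTransformer. The wrapping SVM class isn't shown, so as an assumption about what it builds internally, here is the equivalent scikit-learn pipeline with a linear SVM:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1), max_df=0.5)),
    ('tfidf', TfidfTransformer(sublinear_tf=False, use_idf=True,
                               smooth_idf=True)),
    ('clf', LinearSVC()),
])
# pipeline.fit(texts, targets) then trains end to end on raw strings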
Example #6
def train_svm():
    data_helper = DataHelper()
    train_text, train_labels, ver_text, ver_labels, test_text, test_labels = data_helper.get_data_and_labels()
    stopwords = data_helper.get_stopwords()

    svm = SVM(train_text, train_labels, ver_text, ver_labels, test_text, test_labels, stopwords)

    svm.train()
    svm.verification()
    print('ver_acc: {:.3}'.format(svm.ver_acc))
    svm.test()
    print('test_acc: {:.3}'.format(svm.test_acc))
Example #7
def run_model(args, X, y, ensembler=False):
    model = None
    if args['model'] == 'logistic':
        logistic = Logistic(X, y, model)
        model = logistic.train_model()
    elif args['model'] == 'knn':
        knn = KNN(X, y, model)
        model = knn.train_model()
    elif args['model'] == 'svm':
        svm = SVM(X, y, model)
        model = svm.train_model()
    elif args['model'] == 'rfa':
        rfa = RandomForest(X, y, model)
        model = rfa.train_model(ensembler)
    elif args['model'] == 'xgb':
        xgb = XGB(X, y, model)
        model = xgb.train_model(ensembler)
    elif args['model'] == 'lgbm':
        lgbm = LightGBM(X, y, model)
        model = lgbm.train_model(ensembler)
    elif args['model'] == 'catboost':
        catboost = CatBoost(X, y, model)
        model = catboost.train_model(ensembler)
    elif len(args['models']) > 1:
        models = [('', None)] * len(args['models'])
        for i in range(len(args['models'])):
            model_name = args['models'][i]
            temp_args = copy.deepcopy(args)
            temp_args['model'] = model_name
            models[i] = (model_name, run_model(temp_args, X, y, True))

        ensembler = Ensembler(X, y, model, args['ensembler_type'])
        model = ensembler.train_model(models)
        return model
    else:
        print('\nInvalid model name :-|\n')
        exit()
    return model
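
A hedged usage sketch for run_model. The dict keys come from the branches above; note that the ensemble branch is only reachable when args['model'] matches none of the single-model names, so a placeholder name like 'stack' is needed to trigger it ('stack' and the 'voting' ensembler_type are guesses, not values confirmed by the snippet):

# single model
args = {'model': 'svm', 'models': []}
model = run_model(args, X, y)

# ensemble of several base models
args = {'model': 'stack', 'models': ['rfa', 'xgb', 'lgbm'],
        'ensembler_type': 'voting'}
model = run_model(args, X, y)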
Example #8
def get_model(args, parallel=True, ckpt_path=False):
    if args.clf == 'fcn':
        print('Initializing FCN...')
        model = FCN(args.input_size, args.output_size)
    elif args.clf == 'mlp':
        print('Initializing MLP...')
        model = MLP(args.input_size, args.output_size)
    elif args.clf == 'svm':
        print('Initializing SVM...')
        model = SVM(args.input_size, args.output_size)
    elif args.clf == 'cnn':
        print('Initializing CNN...')
        model = CNN(nc=args.num_channels, fs=args.cnn_view)
    elif args.clf == 'resnet18':
        print('Initializing ResNet18...')
        model = resnet.resnet18(num_channels=args.num_channels,
                                num_classes=args.output_size)
    elif args.clf == 'vgg19':
        print('Initializing VGG19...')
        model = VGG(vgg_name=args.clf,
                    num_channels=args.num_channels,
                    num_classes=args.output_size)
    elif args.clf == 'unet':
        print('Initializing UNet...')
        model = UNet(in_channels=args.num_channels,
                     out_channels=args.output_size)
    else:
        # an unrecognized clf would otherwise leave `model` undefined
        # and crash at get_model_size below
        raise ValueError('Unknown clf: {}'.format(args.clf))

    num_params, num_layers = get_model_size(model)
    print("# params: {}\n# layers: {}".format(num_params, num_layers))

    if ckpt_path:
        model.load_state_dict(torch.load(ckpt_path))
        print('Load init: {}'.format(ckpt_path))

    if parallel:
        model = nn.DataParallel(model.to(get_device(args)),
                                device_ids=args.device_id)
    else:
        model = model.to(get_device(args))

    loss_type = 'hinge' if args.clf == 'svm' else args.loss_type
    print("Loss: {}".format(loss_type))

    return model, loss_type
Example #9
def get_model(config, dataset):
    '''
    ADD BRANCH HERE, IF YOU HAVE ADDED A MODEL INTO models FOLDER
    '''
    if config.model_type == 'linreg':
        model = LinReg(config, dataset)
    elif config.model_type == 'knnclass':
        model = KNNClassifier(config, dataset)
    elif config.model_type == 'knnreg':
        model = KNNRegressor(config, dataset)
    elif config.model_type == 'svm':
        model = SVM(config, dataset)
    elif config.model_type == 'logreg':
        model = LogisticRegression(config, dataset)
    elif config.model_type == 'mlpclass':
        model = MLPClassifier(config, dataset)
    elif config.model_type == 'mlpreg':
        model = MLPRegressor(config, dataset)
    else:
        # fail fast instead of returning an undefined `model`
        raise ValueError('Unknown model_type: {}'.format(config.model_type))

    return model
Example #10
    dp = DecayingPerceptron()
    dp.train(learning_rates)
    dp.report()
    dp.evaluate()

    ap = AveragedPerceptron()
    ap.train(learning_rates)
    ap.report()
    ap.evaluate()

    ############################################
    ###### Part II                   ###########
    ############################################

    svm = SVM(verbose=True)
    svm.train(epochs=20)
    hm.report(svm)
    hm.evaluate(svm)

    lr = LogisticRegression(verbose=True)
    lr.train(epochs=20)
    hm.report(lr)
    hm.evaluate(lr)

    nb = NaiveBayes()
    nb.train(epochs=1)
    hm.report(nb)
    hm.evaluate(nb)

    # Logistic regression using sklearn
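
The snippet cuts off at the comment above; a minimal sketch of the scikit-learn counterpart it announces, aliased to avoid clashing with the custom LogisticRegression class, with placeholder train/test arrays:

from sklearn.linear_model import LogisticRegression as SkLogisticRegression

sk_lr = SkLogisticRegression(max_iter=1000)
sk_lr.fit(X_train, y_train)
print('accuracy:', sk_lr.score(X_test, y_test))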
Example #11
def perform_grid_search_on_featureset_SA_and_PA():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_subjectivity_train_and_test_and_targets(
        tweets, sentimentvalues
    )

    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")

    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_polarity_train_and_test_and_targets(
        tweets, sentimentvalues
    )

    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
Example #12
args = vars(ap.parse_args())
args = Struct(**args)

if 'fcn' in args.models:
    print("Initializing FCN...")
    model = FCN(cfg.input_sizes[args.dataset],
                cfg.output_sizes[args.dataset])
    print('input_size: {}, output_size: {}'.format(
        model.input_size, model.output_size))
    init_path = '../ckpts/init/{}_fcn.init'.format(args.dataset)
    torch.save(model.state_dict(), init_path)
    print('Save init: {}'.format(init_path))

if 'svm' in args.models:
    print("Initializing SVM...")
    model = SVM(cfg.input_sizes[args.dataset],
                cfg.output_sizes[args.dataset])
    print('input_size: {}, output_size: {}'.format(
        model.n_feature, model.n_class))
    init_path = '../ckpts/init/{}_svm.init'.format(args.dataset)
    torch.save(model.state_dict(), init_path)
    print('Save init: {}'.format(init_path))

if 'resnet18' in args.models:
    print("Initializing SVM...")
    model = resnet.resnet18(num_channels=cfg.num_channels[args.dataset],
                            num_classes=cfg.output_sizes[args.dataset])
    print('num_channels: {}, output_size: {}'.format(
        model.num_channels, model.num_classes))
    init_path = '../ckpts/init/{}_resnet18.init'.format(args.dataset)
    torch.save(model.state_dict(), init_path)
    print('Save init: {}'.format(init_path))
Example #13
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_blobs
from models.svm import SVM
import matplotlib.pyplot as plt
from validation.classification import A_micro_average
from preprocessing.features_enginering import normalize_dataset
from preprocessing.split import train_test_split

#X, y = load_wine(return_X_y=True)
X, y = make_blobs(1000, centers=3)
#normalize_dataset(X)
x_train, y_train, x_test, y_test = train_test_split(X, y, .8)
plt.scatter(x=X[:, 0], y=X[:, 1], c=y)
plt.show()

# %%
svm = SVM(C=1)
svm.fit(x_train, y_train)

# %%

res = svm.predict(x_test)
A = A_micro_average(y_test, res)
print('micro-averaged accuracy:', A)

# %%

from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X=X, y=y, clf=svm)
plt.show()
Example #14
            dataset_texts = list(training_texts)
            dataset_texts.extend(test_texts)

            splits = StratifiedKFold(num_folds).split(dataset_embeddings, dataset_labels)
            test_step = 0
            bestScore = 0
            bestTestSet = None
            bestTestInput = None
            bestTP = None
            bestTN = None
            bestFP = None
            bestFN = None
            bestTexts = None
            proportions = []
            for train_index, val_index in splits:
                model = SVM()

                training_dataset_embeddings = np.asarray([dataset_embeddings[i] for i in train_index])
                training_ex_emb = np.asarray([dataset_ex_embeddings[i] for i in train_index])

                training_embeddings_bert = []
                if use_bert:
                    training_embeddings_bert = np.asarray([dataset_bert_vectors[i] for i in train_index])

                test_dataset_embeddings = np.asarray([dataset_embeddings[i] for i in val_index])
                test_ex_emb = np.asarray([dataset_ex_embeddings[i] for i in val_index])

                test_embeddings_bert = []
                if use_bert:
                    test_embeddings_bert = np.asarray([dataset_bert_vectors[i] for i in val_index])
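
For reference, a self-contained version of the cross-validation pattern this fragment uses, with scikit-learn's SVC standing in for the custom SVM class (an assumption, since that class isn't shown):

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

X = np.random.randn(100, 8)
y = np.random.randint(0, 2, 100)

for train_index, val_index in StratifiedKFold(n_splits=5).split(X, y):
    clf = SVC()
    clf.fit(X[train_index], y[train_index])
    print(clf.score(X[val_index], y[val_index]))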
Example #15
def main():
    # Read file names
    parser = argparse.ArgumentParser()
    parser.add_argument("xTrain",
                        help="filename for features of the training data")
    parser.add_argument("yTrain",
                        help="filename for labels associated with training data")
    parser.add_argument("xTest",
                        help="filename for features of the test data")

    args = parser.parse_args()

    # load the train and test data as pandas DataFrames (converted to numpy below)
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    colNames = list(xTrain.keys())

    # visualize(xTrain, yTrain, colNames)

    models = {
        'boost': Boost(5, .2, 5),
        'dt': DT(25, 1, 'entropy'),
        'knn': KNN(1),
        'nb': NB(),
        'rf': RF(51, 25, 'gini', 25, 1),
        'svm': SVM(.1, 'poly', 3, .01)
    }

    X = xTrain.to_numpy()
    Y = yTrain.to_numpy()

    # collect each base model's predictions on the training set
    # (note: not consumed by the metalearner below)
    basePreds = []
    for k in models:
        models[k].train(X, Y)
        basePreds.append(list(models[k].predict(xTrain.to_numpy())))
    basePreds = np.array(basePreds)
    basePreds = np.transpose(basePreds)

    metalearner = Boost(5, .2, 5)

    nfolds = 3
    kf = KFold(nfolds)
    trIndices = []
    tsIndices = []
    for tr, ts in kf.split(X):
        trIndices.append(tr)
        tsIndices.append(ts)

    total = 0

    for i in range(nfolds):
        metalearner.train(X[trIndices[i], :], Y[trIndices[i], :])
        acc = metalearner.predAcc(X[tsIndices[i], :], Y[tsIndices[i], :])
        total += acc / nfolds

    print("ACC: ", total)

    metalearner.train(X, Y)
    testPreds = metalearner.predict(xTest.to_numpy())
    finalPreds = np.array([list(range(len(xTest))), testPreds]).transpose()
    finalPreds = pd.DataFrame(finalPreds, columns=['Id', 'Cover_Type'])
    finalPreds.to_csv('finalPredictions.csv', index=False)
    # print(finalPreds)

    freq = Counter(list(testPreds))
    labelMap = {
        1: 'Spruce/Fir',
        2: 'Lodgepole Pine',
        3: 'Ponderosa Pine',
        4: 'Cottonwood/Willow',
        5: 'Aspen',
        6: 'Douglas-fir',
        7: 'Krummholz'
    }

    label = [labelMap[k] for k in freq.keys()]
    no_trees = [freq[k] for k in freq.keys()]

    index = np.arange(len(label))
    plt.bar(index, no_trees)
    plt.xlabel('Cover type', fontsize=12)
    plt.ylabel('Number of samples', fontsize=12)
    plt.xticks(index, label, fontsize=12, rotation=30)
    plt.title('Class Frequency in prediction')
    plt.show()

    return
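
Note that basePreds is assembled above but never reaches the metalearner, which is trained and cross-validated on the raw features instead. In a conventional stacking setup the metalearner consumes the base models' predictions; a minimal sketch of that variant, reusing the snippet's own names:

# conventional stacking: the metalearner learns from base-model outputs
metalearner.train(basePreds, Y)
stackedPreds = metalearner.predict(basePreds)

A production version would generate basePreds out-of-fold to avoid leaking the base models' training fit into the metalearner.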