Example #1
def mlPreddict(label):
    df = pd.read_csv('classification.csv')
    df_names = df
    Xfeatures = df_names['Names']
    cv = CountVectorizer()
    x = cv.fit_transform(Xfeatures)
    y = df_names.Classes
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=12)

    knn = KNeighborsClassifier(n_neighbors=158)
    r_c = RidgeClassifier()
    m_nb = MultinomialNB()
    dt_c = DecisionTreeClassifier()
    svm_c = svm.SVC()

    knn.fit(x_train, y_train)
    r_c.fit(x_train, y_train)
    m_nb.fit(x_train, y_train)
    dt_c.fit(x_train, y_train)
    svm_c.fit(x_train, y_train)

    knn.score(x_test, y_test)
    r_c.score(x_test, y_test)
    m_nb.score(x_test, y_test)
    dt_c.score(x_test, y_test)
    svm_c.score(x_test, y_test)
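    # (the five held-out accuracies above are computed but not stored in this snippet)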

    sample_name = [label]
    vect = cv.transform(sample_name).toarray()
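    # transform() reuses the vocabulary learned by fit_transform above; unseen tokens are simply ignored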

    prediction = svm_c.predict(vect)
    predict = ''.join(map(str, prediction))
    print(predict)

    return predict
def fit_ridge(l2_reg, train_random_features, y_train, test_random_features, y_test):
    clf = RidgeClassifier(alpha=l2_reg)
    clf.fit(train_random_features, y_train.ravel())
    train_accuracy = clf.score(train_random_features, y_train)
    test_accuracy = clf.score(test_random_features, y_test)
    print("Train accuracy:", train_accuracy)
    print("Test accuracy:", test_accuracy)
    return train_accuracy, test_accuracy
    def fit_logistic_regression(self, X, y):
        X = review_train_logged

        y = review_train['has_reviewed'].to_numpy().astype(int)

        model = RidgeClassifier().fit(X, y)
        model.score(X, y)  # 0.68 accuracy

        model.get_params()
Example #4
def scikit_ridgec_test(size):
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=size,
                                        n_informative=2,
                                        n_redundant=2)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)
    model = RidgeClassifier(random_state=42)
    model.fit(X_train, y_train)
    model.score(X_test, y_test)
Example #5
def main():
    # load data
    file_lssvm = "hw2_lssvm_all.dat"
    size_train = 400
    with open(file_lssvm, 'r') as fr:
        list_line = fr.readlines()
        x_train, y_train = load_xy(list_line[:size_train])
        x_test, y_test = load_xy(list_line[size_train:])
    list_lambda = [0.05, 0.5, 5, 50, 500]

    # Q9, Q10: run regression
    ein = []
    eout = []
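    # sweep the regularization strengths; Ein/Eout are 0/1 errors on the training and test sets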
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        rcf.fit(x_train, y_train)
        Ein = 1 - rcf.score(x_train, y_train)
        Eout = 1 - rcf.score(x_test, y_test)
        ein.append(Ein)
        eout.append(Eout)
    print("Ridge Classifier:")
    print("argminEin = {}, minEin_lambda = {}".format(
        min(ein), list_lambda[ein.index(min(ein))]))
    print("argminEout = {}, minEout_lambda = {}".format(
        min(eout), list_lambda[eout.index(min(eout))]))
    print("==========")
    # Q11, Q12: Bagging
    ein.clear()
    eout.clear()
    num_iter = 250
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        bcf = BaggingClassifier(base_estimator=rcf,
                                n_estimators=num_iter,
                                n_jobs=-1,
                                random_state=0)
        bcf.fit(x_train, y_train)
        Ein = 1 - bcf.score(x_train, y_train)
        Eout = 1 - bcf.score(x_test, y_test)
        ein.append(Ein)
        eout.append(Eout)
    print("Bagging Ridge Classifier:")
    print("argminEin = {}, minEin_lambda = {}".format(
        min(ein), list_lambda[ein.index(min(ein))]))
    print("argminEout = {}, minEout_lambda = {}".format(
        min(eout), list_lambda[eout.index(min(eout))]))

    return
def train_linear_classificator(challenge, new=False):
    if new:
        unigram_tagger, st = spamFilter.prepare_tagger()
        idealist = list(
        importDataHelper.readcsvdata(variables.ideadbpath + challenge + '.csv'))
        featurelist = {}
        for idea in idealist:
            idea['TRIGGERED'] = []
            idea['PREDICTION'] = "Ham"
            idea, ideafeatures = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
            if "unusable" in idea["STATUS"] or 'spam' in idea.get("SPAM", ""):
                ideafeatures["Spam"] = 1
            else:
                ideafeatures["Spam"] = 0
            for key in ideafeatures.keys():
                featurelist[key] = featurelist.get(key, [])
                featurelist[key].append(ideafeatures[key])
    else:
        if challenge == "all":
            idealist = []
            for file in listdir(variables.linclasstrainingsdatapath):
                if isfile(join(variables.linclasstrainingsdatapath, file)):
                    idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
        else:
            idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
        featurelist = {}
        for key in idealist[0].keys():
            featurelist[key] = [int(x) for x in idealist[0][key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    importDataHelper.writecsvfiledict(variables.linclasstrainingsdatapath + challenge + ".csv", featurelist.keys(), featurelist)
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))
    return clf, clf.coef_
def train_linear_classifier(featurelist):
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))
    return clf, clf.coef_
Example #8
def ridgeClass(features, response):
    from sklearn.linear_model import RidgeClassifier
    X_train, X_test, y_train, y_test = train_test_split(
        features, response, test_size=0.4)  # , random_state=4)
    ridgeC = RidgeClassifier()
    ridgeC.fit(X_train, y_train)
    print(ridgeC.score(X_test, y_test))
Example #9
def ridge_class_model(X_train2, X_validate2, X_test2, y_train2, y_validate2,
                      y_test2):
    '''Creates a ridge classifier model and
    shows its accuracy on train, validate and test'''
    # create the model object
    clf2 = RidgeClassifier(random_state=123)
    # fit to train only
    clf2.fit(X_train2, y_train2)
    y_pred = clf2.predict(X_train2)
    # evaluate with score, returns the mean accuracy on the given test data and labels
    print('Accuracy of Ridge Classifier Model on Train: \n',
          round(clf2.score(X_train2, y_train2), 4))
    print('Accuracy of Ridge Classifier Model on Validate: \n',
          round(clf2.score(X_validate2, y_validate2), 4))
    print('Accuracy of Ridge Classifier Model on Test: \n',
          round(clf2.score(X_test2, y_test2), 4))
Example #10
def train(train_data, train_target, test_data, test_target, alphas):

    # model = RidgeClassifier()
    # parameters = {'alpha':alphas}
    # clf = GridSearchCV(model, parameters, cv=2, n_jobs=len(alphas))
    warned = False

    start_time = time.time()

    test_scores = []
    val_scores = []

    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.simplefilter("always")
        # clf.fit(train_data, train_target)
        # val_scores = clf.cv_results_['mean_test_score']

        for alpha in alphas:
            # validation score
            end_index = int(len(train_data) * 0.8)

            clf = RidgeClassifier(alpha=alpha, fit_intercept=False)
            # clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False, selection='random')
            clf.fit(train_data[:end_index], train_target[:end_index])

            val_scores.append(
                clf.score(train_data[end_index:], train_target[end_index:]))
            # val_scores.append(clf_score(clf, train_data[end_index:],train_target[end_index:]))

        for alpha in alphas:
            clf = RidgeClassifier(alpha=alpha, fit_intercept=False)
            # clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False, selection='random')
            clf.fit(train_data, train_target)

            test_scores.append(clf.score(test_data, test_target))
            # test_scores.append(clf_score(clf, test_data, test_target))


#         for warning in caught_warnings:
#             # if warning.category == UnsupportedWarning:
#             print(str(warning.message))
#             warned = True
    train_time = time.time() - start_time

    return val_scores, test_scores, caught_warnings, train_time
Example #11
class RidgeModel(ccobra.CCobraModel):
    def __init__(self, name='Ridge', k=1):
        super(RidgeModel, self).__init__(name, ["moral"], ["single-choice"])

        self.clf = RidgeClassifier(alpha=7)

        self.n_epochs = 1

    def pre_train(self, dataset):

        x = []
        y = []

        for subj_train_data in dataset:
            for seq_train_data in subj_train_data:

                seq_train_data['task'] = seq_train_data['item'].task
                inp = create_input(seq_train_data)

                target = float(output_mppng[seq_train_data['response'][0][0]])

                x.append(inp)

                y.append(target)
        x = np.array(x)
        y = np.array(y)

        self.train_x = x
        self.train_y = y

        self.train_network(self.train_x,
                           self.train_y,
                           self.n_epochs,
                           verbose=True)

    def train_network(self, train_x, train_y, n_epochs, verbose=False):
        print('Starting training...')
        for epoch in range(self.n_epochs):

            # Shuffle the training data
            perm_idxs = np.random.permutation(np.arange(len(train_x)))
            train_x = train_x[perm_idxs]
            train_y = train_y[perm_idxs]

            self.clf.fit(train_x, train_y)

            print('Mean accuracy:')
            print(self.clf.score(train_x, train_y))

    def predict(self, item, **kwargs):
        input = {'task': item.task}
        input['aux'] = kwargs
        x = np.array(create_input(input)).reshape(1, -1)
        output = self.clf.predict(x)

        self.prediction = output_mppngREV[output[0]]
        return self.prediction
def train(train_data, train_target, test_data, test_target, alphas):

    warned = False
    
    start_time = time.time()
    
    test_scores = []
    
    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.simplefilter("always")
        
#         model = RidgeClassifier()
#         parameters = {'alpha':alphas}
#         if train_data.shape[1] <= 5000:
#             clf = GridSearchCV(model, parameters, cv=2, n_jobs=len(alphas))
#         else:
#             # decrease memory usage for 5K+ dimensions
#             clf = GridSearchCV(model, parameters, cv=2, n_jobs=1)
        
#         clf.fit(train_data, train_target)
#         val_scores = clf.cv_results_['mean_test_score']
        
        val_scores = []
        
        for alpha in alphas:
            # validation score
            end_index = int(len(train_data) * 0.8)
            
            clf = RidgeClassifier(alpha=alpha)
            clf.fit(train_data[:end_index], train_target[:end_index])
            val_scores.append(clf.score(train_data[end_index:], train_target[end_index:]))
        
        for alpha in alphas:
            clf = RidgeClassifier(alpha=alpha)
            clf.fit(train_data, train_target)
            test_scores.append(clf.score(test_data, test_target))

#         for warning in caught_warnings:
#             # if warning.category == UnsupportedWarning:
#             print(str(warning.message))
#             warned = True
    train_time = time.time() - start_time
    
    return val_scores, test_scores, caught_warnings, train_time
Example #13
def linear_ridge(M, labels, seed, split=0.8):
    """
    linear ridge algorithm for input M and output labels
    Inputs:
        M : matrix m*n where each row is a different example and the columns are composed of the features
        labels : vector m*1 where each row is the corresponding class of the row of M
        seed : random seed to do the split between test/validation/training
        split: number between 0 and 1. Split between training and testing set. Default : 0.8
    Outputs:
        roc_auc_train: AUC score on the train set
        roc_auc_val: AUC score on the validation set
        roc_auc_test: AUC score on the test set
    """
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M)
    M_train_val, M_val, M_test, labels_train_val, labels_val, labels_test = preprocessing_dataset.split_train_val_test(
        M_float, seed, labels, nb_val=3, split=0.8)
    X_train = M_train_val
    Y_train = labels_train_val
    X_test = M_test
    Y_train = np.reshape(Y_train, (Y_train.shape[0], ))
    # Create our imputer to replace missing values with the mean e.g.
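    # (note: this legacy Imputer API was removed from newer scikit-learn; sklearn.impute.SimpleImputer is its replacement)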
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    clf = RidgeClassifier()
    clf = clf.fit(X_train_imp, Y_train)

    # Impute each test item, then predict
    X_test_imp = imp.transform(X_test)
    X_val_imp = imp.transform(M_val)

    # Compute the accuracy
    lin_acc = clf.score(X_test_imp, labels_test)
    # Compute the AUC
    pred_train = clf.decision_function(X_train_imp)
    pred = clf.decision_function(X_test_imp)
    pred_val = clf.decision_function(X_val_imp)

    fpr_svm, tpr_svm, thresholds_train = roc_curve(Y_train, pred_train)
    roc_auc_train = auc(fpr_svm, tpr_svm)
    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_val, pred_val)
    roc_auc_val = auc(fpr_svm, tpr_svm)
    fpr_svm, tpr_svm, thresholds_train = roc_curve(labels_test, pred)
    roc_auc_test = auc(fpr_svm, tpr_svm)
    print(
        'linear ridge: train set: %0.5f, validation: %0.5f, test set: %0.5f' %
        (roc_auc_train, roc_auc_val, roc_auc_test))
    return roc_auc_train, roc_auc_val, roc_auc_test
Example #14
def run_candidate(genome, X_train, X_test, y_train, y_test):

    clf = RidgeClassifier(alpha=1.0)
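    # ridge readout; the reservoir states computed below are sub-sampled at the indices given by genome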
    para_return = Parallel(n_jobs=-1)(
        delayed(run_case)(train_case_X, train_case_y)
        for train_case_X, train_case_y in zip(X_train, y_train))

    states = []
    trains = []
    files = []

    for X, y in para_return:
        states.append(X)
        trains.append(y)

    trains = np.asarray(trains, dtype=np.float64)
    #selection_indices = np.asarray(range(1000,1500)) #np.random.random_integers(0, 1000, size=(500,))
    #selection_indices = [i * period for i in range(10)] #np.random.random_integers(0, len(X_train[0]-1), size=(int(len(X_train[0])/2),))
    X_train_pc = [
        np.asarray(s['sp'])[np.asarray(genome, dtype=int)] for s in states
    ]
    print("RESERVOIR:", X_train_pc)
    clf.fit(X_train_pc, trains)

    #TESTING
    states = []
    test_labels = []
    test_para_return = Parallel(n_jobs=-1)(delayed(run_case)(X, y)
                                           for X, y in zip(X_test, y_test))

    for X, y in test_para_return:
        states.append(X)
        test_labels.append(y)

    test_labels = np.asarray(test_labels, dtype=np.float64)
    X_test_pc = [
        np.asarray(s['sp'])[np.asarray(genome, dtype=int)] for s in states
    ]

    #for prediction, true_label in zip(clf.predict(X_test_pc), test_labels):
    #    print("Prediction: ", prediction, "True label ", true_label)
    #print("Score:", clf.score(X_test_pc, test_labels))
    return clf.score(X_test_pc, test_labels)
Example #15
File: ml.py Project: hafsa-kk/Ai-project
def ml_predict(label):
    df = pd.read_csv('classification.csv')
    df_names = df
    Xfeatures = df_names['Names']
    cv = CountVectorizer()
    x = cv.fit_transform(Xfeatures)
    y = df_names.Classes
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=12)

    knn = KNeighborsClassifier(n_neighbors=158)
    r_c = RidgeClassifier()
    m_nb = MultinomialNB()
    dt_c = DecisionTreeClassifier()
    svm_c = svm.SVC()

    knn.fit(x_train, y_train)
    r_c.fit(x_train, y_train)
    m_nb.fit(x_train, y_train)
    dt_c.fit(x_train, y_train)
    svm_c.fit(x_train, y_train)

    k = knn.score(x_test, y_test)
    r = r_c.score(x_test, y_test)
    m = m_nb.score(x_test, y_test)
    d = dt_c.score(x_test, y_test)
    s = svm_c.score(x_test, y_test)

    score_list = [k, r, m, d, s]
    sorted(score_list, reverse=True)

    sample_name = ['Sara']
    vect = cv.transform(sample_name).toarray()

    predict_w = r_c.predict(vect)

    prdct = ''.join(map(str, predict_w))

    #print(predict_w)
    return prdct
Example #16
def Classification_Test(N=400, eta=0.35, gamma=0.05, tau=400, bits=np.inf, num_waves=1000, test_size=0.1,
						preload=False, write=False, mask=0.1, activate='mg', beta=1.0, power=7, t=1, theta=0.2):
	"""
	Args:
		N: number of virtual high_nodes
		gamma: input gain
		eta: oscillation strength
		tau: loop delay length
		bits: bit precision
		preload: preload time-series data
		write: save created time-series data
		mask: amplitude of mask values
		activate: activation function to be used (sin**2,tanh,mg)
		beta: driver gain
		t: timestep used to solve diffeq
		power: exponent for MG equation
		theta: distance between virtual high_nodes


	Returns:
		Accuracy of Ridge Model on Testing Data
	"""
	X_train, X_test, y_train, y_test = make_training_testing_set(num_waves=num_waves, test_percent=test_size,
																 preload=preload, write=write)

	clf = RidgeClassifier(alpha=0)
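	# alpha=0 disables the L2 penalty, so the readout is an unregularized least-squares fit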
	m = np.array([random.choice([-mask, mask]) for i in range(N)])

	# Instantiate Reservoir, feed in training and prediction data sets
	r1 = DelayReservoir(N=N, eta=eta, gamma=gamma, theta=theta, beta=beta, tau=tau, power=power)
	Xs = [X_train, X_test]
	new_Xs = [[], []]
	for k, data in enumerate(Xs):
		for idx in tqdm(range(len(data))):
			new_Xs[k].append(np.array(r1.calculate(data[idx], m, bits, t, activate))[:, -1])
	new_Xs = np.array(new_Xs)
	clf.fit(new_Xs[0], y_train)

	return [clf.score(new_Xs[1], y_test), np.mean(margin(clf, X_test))]
Example #17
        print('Features shape: {}'.format(conv_features['train'].shape))

        print('Determining binarization threshold...')
        best_score = 0
        best_threshold = 0
        for threshold in np.arange(0, 3, 0.2):
            clf = RidgeClassifier(alpha=1.0, fit_intercept=False)
            # clf = SGDClassifier(loss='squared_loss', penalty='elasticnet', alpha=1.0, l1_ratio=0.5, fit_intercept=False)
            # clf = ElasticNet(alpha=0.0001, l1_ratio=l1_ratio, fit_intercept=False, selection='random')
            features_train = threshold_binarize(conv_features['train'],
                                                threshold)
            features_test = threshold_binarize(conv_features['test'],
                                               threshold)

            clf.fit(features_train, labels_train)
            score = clf.score(features_test, labels_test)

            # score = clf_score(clf, features_test, labels_test)

            print('Threshold', threshold, 'Score', score)

            if score > best_score:
                best_score = score
                best_threshold = threshold
        print(
            'Finished threshold determination. Best threshold: {}. Best Score: {}.'
            .format(best_threshold, best_score))

        features_train = threshold_binarize(conv_features['train'],
                                            best_threshold)
        features_test = threshold_binarize(conv_features['test'],
                                           best_threshold)
plt.legend(loc="lower right")

plt.show()

#RidgeClassifier Algorithm

from sklearn.linear_model import RidgeClassifier
RC = RidgeClassifier()
RC = RC.fit(X_train, y_train)
RC

#accuracy of RidgeClassifier Algorithm

y_pred1 = RC.predict(X_test)
print('Accuracy score= {:.2f}'.format(RC.score(X_test, y_test)))

#ROC curve of RidgeClassifier Algorithm

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y_test, y_pred1)
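# note: y_pred1 holds hard 0/1 predictions; RC.decision_function(X_test) would give continuous scores and a smoother ROC curve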

roc_auc = auc(fpr, tpr)

plt.figure()

plt.plot(fpr,
         tpr,
         color='darkorange',
Example #19
# np.savetxt(base_dir+"classifications/train_test_split.csv", idxs, delimiter=",")
idxs = np.loadtxt(base_dir + "classifications/train_test_split.csv",
                  delimiter=",").astype(int)
cnt = 0
for idx in idxs:
    ntrain = int(np.round(0.7 * ndata))
    idx_train = idx[0:ntrain]
    idx_test = idx[ntrain:]

    # use 'class2' here to train machine on two classes
    # only, aurora and non-aurora (instead of 'class6')
    X_train = features[idx_train, :]
    y_train = df["class6"][idx_train]
    X_test = features[idx_test, :]
    y_test = df["class6"][idx_test]

    clf = RidgeClassifier(random_state=10 * cnt, normalize=False, alpha=alpha)
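    # note: the `normalize` argument was removed in newer scikit-learn releases; standardize the features beforehand if needed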
    clf.fit(X_train, y_train)
    avgscore[cnt] = clf.score(X_test, y_test)
    cnt += 1
print("==========================")
print("follow are testing results")
print("==========================")
print("average accuracy: ", np.mean(avgscore))
print("standard deviation of accuracy: ", np.std(avgscore))
#print( np.mean(avgscore), np.std(avgscore) )
y_pred = clf.predict(X_test)
mat = confusion_matrix(y_test, y_pred)
print("confusion matrix is:")
print(mat)
Example #20
def general_model(df, training_data):
    ml_names, ml_genders = [], []
    for k, v in training_data.items():
        k = re.sub('[^a-z]+', '?', k)
        ml_names.append(k)
        ml_genders.append(v)

    training_df = pd.DataFrame()
    training_df['name'] = ml_names
    training_df['gender'] = ml_genders
    training_df['-3'] = training_df['name'].str[-3]
    training_df['-2'] = training_df['name'].str[-2]
    training_df['-1'] = training_df['name'].str[-1]

    columns = ['-3', '-2', '-1']
    X = [pd.get_dummies(training_df[i]) for i in columns]

    modelX = pd.concat([i for i in X], axis=1)

    def map_genders(x):
        if x == 'F':
            return 1
        return 0

    modelY = training_df['gender'].apply(map_genders)

    X_train, X_test, y_train, y_test = train_test_split(modelX,
                                                        modelY,
                                                        test_size=0.2,
                                                        random_state=42)

    def to_gender_class(x):
        if x:
            return 'F'
        return 'M'

    model = RidgeClassifier(fit_intercept=False, solver='lsqr')
    model.fit(X_train, y_train)
    training_df['pred'] = model.predict(modelX)
    training_df['pred'] = training_df['pred'].apply(to_gender_class)
    print("Model Scores (Train/Test):")
    print(model.score(X_train, y_train))
    print(model.score(X_test, y_test))
    # training_df[training_df['gender'] != training_df['pred']]

    # Predict using model
    vectorized_df = pd.DataFrame()
    vectorized_df['cleaned_name'] = df['cleaned_name']

    vectorized_df['-3'] = df['cleaned_name'].str[-3]
    vectorized_df['-2'] = df['cleaned_name'].str[-2]
    vectorized_df['-1'] = df['cleaned_name'].str[-1]

    columns = ['-3', '-2', '-1']
    tempX = [pd.get_dummies(vectorized_df[i], ) for i in columns]

    necessary_columns = ['?'] + [chr(i) for i in range(97, 123)]
    for i in tempX:
        for j in necessary_columns:
            if j not in i.columns:
                i[j] = [0 for k in range(len(df))]

    actualX = pd.concat([i for i in tempX], axis=1)

    def map_to_gender(x):
        if x:
            return 'F'
        return 'M'

    predicted_values = model.predict(actualX)
    df['model_prediction'] = predicted_values
    df['model_prediction'] = df['model_prediction'].apply(map_to_gender)

    return df
    plt.savefig('gmm_images/{}_h1_cder_n_{}.png'.format(n, num_dgms))
    plt.close()

    # -----------------------------------------------------------------------------
    # ------------------------------ CDER features --------------------------------
    # -----------------------------------------------------------------------------

    X_train_features_1 = get_all_features(X_train, ellipses, f_ellipse)

    X_test_features_1 = get_all_features(X_test, ellipses, f_ellipse)

    # -----------------------------------------------------------------------------
    # ------------------------------ Ridge Classification  ------------------------
    # -----------------------------------------------------------------------------

    t0 = time.time()
    X_train_features = np.column_stack(
        (X_train_features_0, X_train_features_1))

    X_test_features = np.column_stack((X_test_features_0, X_test_features_1))

    ridge_model = RidgeClassifier().fit(X_train_features, F_train)

    score_train.append(ridge_model.score(X_train_features, F_train))
    score_test.append(ridge_model.score(X_test_features, F_test))
    t1 = time.time()

    print('Ridge Classification: {}'.format(t1 - t0))

print(np.mean(score_train), np.std(score_train))
print(np.mean(score_test), np.std(score_test))
Example #22
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data,
               params, subject_IDs):
    """
        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures num_subjects x num_subjects
        features        : feature vectors num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : average accuracy over the test samples using GCNs
        test_auc    : average area under curve over the test samples using GCNs
        lin_acc     : average accuracy over the test samples using the linear classifier
        lin_auc     : average area under curve over the test samples using the linear classifier
        fold_size   : number of test samples
    """

    print(len(train_ind))

    # selection of a subset of data if running experiments with a subset of the training set
    labeled_ind = Reader.site_percentage(train_ind, params['num_training'],
                                         subject_IDs)

    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind,
                                      params['num_features'])

    fold_size = len(test_ind)

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')
    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(-dist**2 / (2 * sigma**2))
    final_graph = graph_feat * sparse_graph

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs
    test_acc, test_auc = Train.run_training(final_graph,
                                            sparse.coo_matrix(x_data).tolil(),
                                            y_data, train_ind, val_ind,
                                            test_ind, params)

    print(test_acc)

    # return number of correctly classified samples instead of percentage
    test_acc = int(round(test_acc * len(test_ind)))
    lin_acc = int(round(lin_acc * len(test_ind)))

    return test_acc, test_auc, lin_acc, lin_auc, fold_size
Example #23
def train_fold(train_ind, test_ind, val_ind, graph_feat, graph_feat2, features, y, y_data, idx, lr, params, subject_IDs,
               pathToSave, i):
    """
        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures num_subjects x num_subjects
        features        : feature vectors num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : average accuracy over the test samples using GCNs
        test_auc    : average area under curve over the test samples using GCNs
        lin_acc     : average accuracy over the test samples using the linear classifier
        lin_auc     : average area under curve over the test samples using the linear classifier
        fold_size   : number of test samples
    """
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()
    print(len(train_ind))
    # selection of a subset of data if running experiments with a subset of the training set
    #labeled_ind = Reader.site_percentage(train_ind, params['num_training'], subject_IDs)
    labeled_ind = reader.site_percentage(train_ind,1.0)
    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind,  params['num_features'])
    fold_size = len(test_ind)

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')
    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    num_nodes = 662
    final_graph = graph_feat * sparse_graph # Gender

    final_graph2 = graph_feat2 * sparse_graph # Age



    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))
    # Classification with GCNs
    test_acc, test_auc, weights= Train.run_training(final_graph, final_graph2, sparse.coo_matrix(x_data).tolil(), y_data,
                                            train_ind, val_ind,
                                            test_ind, idx, lr, params, pathToSave, i)
    # return number of correctly classified samples instead of percentage
    # test_acc = int(round(test_acc * len(test_ind)))
    # lin_acc = int(round(lin_acc * len(test_ind)))
    scores_acc = [test_acc]
    scores_auc = [test_auc]
    scores_lin = [lin_acc]
    scores_auc_lin = [lin_auc]
    fold_size = [fold_size]
    weights_0 = weights[0]
    weights_1 = weights[1]

    scores_lin_ = np.sum(scores_lin)
    scores_auc_lin_ = np.mean(scores_auc_lin)
    scores_acc_ = np.sum(scores_acc)
    scores_auc_ = np.mean(scores_auc)

    if not os.path.exists(pathToSave + 'excel/'):
        os.makedirs(pathToSave + 'excel/')
    pathToSave2 = pathToSave + 'excel/'
    result_name = 'ABIDE_classification.mat'
    sio.savemat(pathToSave2 + str(trial) + result_name,
                {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                 'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes, 'weights_0': weights_0, 'weights_1': weights_1})
    df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                       'scores_auc_lin': [scores_auc_lin_], 'weights_0': weights_0, 'weights_1': weights_1})

    prediction.append(df)

    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer_n = pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer_n, sheet_name='Sheet1')
    # Close the Pandas Excel writer and output the Excel file.
    writer_n.save()

    test_acc = int(round(test_acc * len(test_ind)))
    lin_acc = int(round(lin_acc * len(test_ind)))
    scores_acc = [test_acc]
    scores_auc = [test_auc]
    scores_lin = [lin_acc]
    scores_auc_lin = [lin_auc]
    fold_size = [fold_size]

    # test_acc and lin_acc above are already counts of correctly classified samples
    return test_acc, test_auc, lin_acc, lin_auc, fold_size
Example #24
# print(accuracy_score(y_val, ypred))
# print(confusion_matrix(y_val, ypred))

# # Linreg Using sklearn
# clf = LinearRegression()
# clf.fit(X, y_train)
# print(clf.score(X_val[selected_features], y_val))
# print(accuracy_score(y_val, clf.predict(X_val[selected_features])))
# print(confusion_matrix(y_val , clf.predict(X_val[selected_features])))

# # Polynomial features ?
# from sklearn.preprocessing import PolynomialFeatures
# polynomial_features= PolynomialFeatures(degree=3)
# Xp = polynomial_features.fit_transform(X)
# model = sm.OLS(y_train,Xp)
# results = model.fit()
# print(results.summary())

#%% Feature Selection - Ridge Regression
estimator = RidgeClassifier(class_weight='balanced')
n_features = 5
selector = RFE(estimator, n_features_to_select=n_features, step=1, verbose=0)
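# RFE ranks features using the RidgeClassifier's coefficients and keeps the top n_features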
selector = selector.fit(X_train, y_train)
selected_features = np.take(features, np.where(selector.support_)[0])
clf = RidgeClassifier(class_weight='balanced')
clf.fit(X_train[selected_features], y_train)

print(clf.score(X_val[selected_features], y_val))
print(balanced_accuracy_score(y_val, clf.predict(X_val[selected_features])))
print(confusion_matrix(y_val, clf.predict(X_val[selected_features])))
Example #25
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt

X, y = load_breast_cancer(return_X_y=True)

# use a single feature
X = X[:, 2][:, np.newaxis]

# train,test
X_train, X_test = X[:-30], X[-30:]
y_train, y_test = y[:-30], y[-30:]

clf = RidgeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_test)
print("Train Set score: {}".format(clf.score(X_train, y_train)))
print("Test Set score: {}".format(clf.score(X_test, y_test)))
Example #26
         X_train_features_Z1_tent, X_train_features_H1_tent,
         X_train_features_S1_tent, X_train_features_V1_tent))
    X_test_features = np.column_stack(
        (X_test_features_R0_tent, X_test_features_G0_tent,
         X_test_features_B0_tent, X_test_features_X0_tent,
         X_test_features_Y0_tent, X_test_features_Z0_tent,
         X_test_features_H0_tent, X_test_features_S0_tent,
         X_test_features_V0_tent, X_test_features_R1_tent,
         X_test_features_G1_tent, X_test_features_B1_tent,
         X_test_features_X1_tent, X_test_features_Y1_tent,
         X_test_features_Z1_tent, X_test_features_H1_tent,
         X_test_features_S1_tent, X_test_features_V1_tent))

    ### Ridge Model
    ridge_model = RidgeClassifier().fit(X_train_features, y_train)
    tent_train_accuracy_ridge[k] = ridge_model.score(X_train_features, y_train)
    tent_test_accuracy_ridge[k] = ridge_model.score(X_test_features, y_test)

    ### SVM Model
    c = 5
    svm_model = SVC(kernel='rbf', C=c).fit(X_train_features, y_train)
    tent_train_accuracy_svm[k] = svm_model.score(X_train_features, y_train)
    tent_test_accuracy_svm[k] = svm_model.score(X_test_features, y_test)

    ### CDER Adaptive
    X_train_features_R0_cder, X_test_features_R0_cder = adaptive_features(
        R0_train_sample, R0_test_sample, "cder", y_train)
    X_train_features_G0_cder, X_test_features_G0_cder = adaptive_features(
        G0_train_sample, G0_test_sample, "cder", y_train)
    X_train_features_B0_cder, X_test_features_B0_cder = adaptive_features(
        B0_train_sample, B0_test_sample, "cder", y_train)
Example #27
print(precision_score(y_test,log_predict,average='binary'))

#%% Ridge classifier
from sklearn.linear_model import RidgeClassifier
#We can set up a grid search here to find the optimal value of the
#regularization strength, alpha

alpharange = np.arange(start=0.05,stop=1.0,step=0.05)
ridge_trainS = []
ridge_testS = []


for a in alpharange:
    ridge = RidgeClassifier(alpha=a)
    ridge.fit(X_train,y_train)
    ridge_trainS.append(ridge.score(X_train,y_train))
    ridge_testS.append(ridge.score(X_test,y_test))

plt.plot(alpharange, ridge_trainS, label="Training Accuracy")
plt.plot(alpharange, ridge_testS, label="Test Accuracy")
plt.title("Ridge Scores")
plt.ylabel("Accuracy")
plt.xlabel("Alpha")
plt.grid()
plt.legend()
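
#%% Hedged sketch (not part of the original script): the grid search mentioned in the
# comment above could be set up with GridSearchCV over the same alpha range.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(RidgeClassifier(), {'alpha': alpharange}, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print('best alpha:', grid.best_params_['alpha'], 'CV accuracy:', grid.best_score_)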

#%% KNN Classifier


neighbors = np.arange(10)+1
knn_trainS = []
Example #28
        return cs


if __name__ == '__main__':
    # Add the AutoBinning component to auto-sklearn.
    autosklearn.pipeline.components.feature_preprocessing.add_preprocessor(
        AutoBinning)

    # Create dataset.
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    from sklearn.linear_model import RidgeClassifier
    clf = RidgeClassifier().fit(X_train, y_train)
    print(clf.score(X_test, y_test))

    print('binning')
    pp = AutobinningTransform(binning_method='xgb')
    X_train = pp.fit_transform(X_train, y_train)
    X_test = pp.transform(X_test)
    clf = RidgeClassifier().fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    print('=' * 100)
    # Configuration space.
    cs = AutoBinning.get_hyperparameter_search_space()
    print(cs)

    # Fit the model using AutoBinning as the preprocessor.
    clf = autosklearn.classification.AutoSklearnClassifier(
        include_preprocessors=['AutoBinning'], )
Example #29
# %% feature sets
prefeatures = data.drop(columns=['loyal','subsequent_purchases','insert_num','distance_missing','purchase_make','purchase_model','purchase_make_cat'])
postfeatures = prefeatures.drop(columns=['gender_missing','income_missing','customer_age','distance_binned'])
labels = data['loyal']

# %%
features = postfeatures
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# %%
model = RidgeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(model.score(X_val, y_val),metrics.accuracy_score(y_test, predictions))
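# the first value is validation accuracy from model.score, the second is test accuracy from the explicit predictions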

# %%
model = RandomForestClassifier(n_estimators=50,max_depth=10,class_weight='balanced')
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(model.score(X_val, y_val),metrics.accuracy_score(y_test, predictions))
df = pd.DataFrame()
df['feat'] = X_train.columns
df['score'] = model.feature_importances_
print(df.sort_values(by='score',ascending=False))
print(metrics.confusion_matrix(predictions,y_test))
print(metrics.classification_report(predictions,y_test))

# %%
effect = []
Example #30
f = np.frompyfunc(mysub, 1, 1)
# Defining the numeric feature columns explicitly
NumCols = [0, 2, 5, 6, 7, 9]
# Picking Object features column list
StrCols = [i for i in range(data.shape[1]) if i not in NumCols]
# Encoding data by applying the label encoder for the object features train data
EncData = np.hstack([
    f(data[:, NumCols]),
    np.apply_along_axis(le.fit_transform, 1, data[:, StrCols])
])
# Converting the Encoded object features data to numpy float data type
EncData = np.array(EncData, dtype=np.float64)
# Picking the predicted variable from the train data
PredVariable = data[:, 1].astype(int)

#Obtain mean of columns as you need, nanmean is just convenient.
col_mean = np.nanmean(EncData, axis=0)
#Find indices that need to be replaced
inds = np.where(np.isnan(EncData))
#Place column means in the indices. Align the arrays using take
EncData[inds] = np.take(col_mean, inds[1])
# Splitting the data to train and test
X_train, X_test, y_train, y_test = train_test_split(EncData,
                                                    PredVariable,
                                                    test_size=0.33)
# Defining and applying a Ridge classifier to the data
clf = RidgeClassifier().fit(X_train, y_train)
clf.score(X_train, y_train)
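# training accuracy (the returned value is not captured here); the report below uses the held-out test split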

# Print the classification report of predictions
print(classification_report(y_test, clf.predict(X_test)))