예제 #1
0
def linear_ridge(M, labels, seed, split=0.8):
    """
    Train a ridge classifier on M / labels and report the AUC on each data split.

    Inputs:
        M : matrix m*n where each row is a different example and the columns are composed of the features
        labels : vector m*1 where each row is the corresponding class of the row of M
        seed : random seed to do the split between test/validation/training
        split: number between 0 and 1, forwarded to the project splitter as the
               train/test split. Default : 0.8
    Outputs:
        roc_auc_train: AUC score on the train set
        roc_auc_val: AUC score on the validation set
        roc_auc_test: AUC score on the test set
    """
    def _auc(y_true, scores):
        # Area under the ROC curve for one set of decision scores.
        fpr, tpr, _ = roc_curve(y_true, scores)
        return auc(fpr, tpr)

    # Fix: preprocess the matrix that was actually passed in (the original
    # read an undefined name `M_str`).
    M_float = preprocessing_dataset.preprocessing_nan_normalization(M)
    # Fix: honour the caller's `split` instead of a hard-coded 0.8.
    (M_train_val, M_val, M_test,
     labels_train_val, labels_val, labels_test) = preprocessing_dataset.split_train_val_test(
        M_float, seed, labels, nb_val=3, split=split)

    X_train = M_train_val
    # The classifier expects a flat label vector, not an m*1 column.
    Y_train = np.reshape(labels_train_val, (labels_train_val.shape[0], ))

    # Fit the imputer on the training data only, so that the validation and
    # test sets are filled with training-set column means.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)

    # Impute our data, then train
    X_train_imp = imp.transform(X_train)
    clf = RidgeClassifier()
    clf = clf.fit(X_train_imp, Y_train)

    # Impute the held-out sets with the same imputer
    X_test_imp = imp.transform(M_test)
    X_val_imp = imp.transform(M_val)

    # Compute the AUC of the decision scores on each split
    roc_auc_train = _auc(Y_train, clf.decision_function(X_train_imp))
    roc_auc_val = _auc(labels_val, clf.decision_function(X_val_imp))
    roc_auc_test = _auc(labels_test, clf.decision_function(X_test_imp))
    print(
        'linear ridge: train set: %0.5f, validation: %0.5f, test set: %0.5f' %
        (roc_auc_train, roc_auc_val, roc_auc_test))
    return roc_auc_train, roc_auc_val, roc_auc_test
예제 #2
0
def RidgeReg(file1, file2):
    """Fit a RidgeClassifier on the data in file1 and evaluate it on file2.

    Both files are loaded with ``file2matrix``, which yields a
    (features, labels) pair.

    Returns:
        (y_true, y_score, y_pred): the labels read from file2, the decision
        scores for file2's features and the predicted labels for them.
    """
    train_features, train_labels = file2matrix(file1)
    model = RidgeClassifier()
    model.fit(train_features, train_labels)

    eval_features, eval_labels = file2matrix(file2)
    decision_scores = model.decision_function(eval_features)
    predicted_labels = model.predict(eval_features)
    return eval_labels, decision_scores, predicted_labels
예제 #3
0
class RidgeC(BaseClassifier):
    """Ridge classifier wrapper built on top of BaseClassifier."""

    def __init__(self, TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
                 TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
                 TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
                 UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
                 TEST_MERGE, TEST, name='ridge', USE_TINY=False, RANDOMSTATE=2018):
        """Forward every dataset argument to BaseClassifier and create the model."""
        super(RidgeC, self).__init__(
            TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
            TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
            TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
            UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
            TEST_MERGE, TEST, name, USE_TINY, RANDOMSTATE)
        # In Ridge, only the 'sag' solver can currently fit the intercept
        # when X is sparse. Not normalizing gave better results.
        self.clf = RidgeClassifier(tol=1e-2, solver="sag", normalize=False)

    def trainWithEva(self, trainval_x):
        """Fit on 90% of the data and return the ROC AUC on the held-out 10%."""
        split_parts = train_test_split(
            trainval_x, self.trainval['label'],
            test_size=0.1, random_state=self.randomstate)
        fit_x, holdout_x, fit_y, holdout_y = split_parts
        self.clf.fit(fit_x, fit_y)
        holdout_scores = self.clf.decision_function(holdout_x)
        score = metrics.roc_auc_score(holdout_y, holdout_scores)
        print("%s on valid set accuracy:   %0.5f" % (self.name, score))
        return score

    def predict(self, test_x=None, model_path=None):
        """Return the test CSV as a DataFrame with an added 'score' column.

        When ``model_path`` is given the model is loaded from it first; when
        ``test_x`` is omitted the test features come from
        ``feature_engineering()``. Scores are kept to six decimal places.
        """
        def six_decimals(value):
            return float('%.6f' % value)

        if model_path is not None:
            self.load_model(model_path)
        if test_x is None:
            _, test_x = self.feature_engineering()
        frame = pd.read_csv(self.ds.TEST)
        frame['score'] = self.clf.decision_function(test_x)
        frame['score'] = frame['score'].apply(six_decimals)
        return frame
예제 #4
0
class RidgeClassifierImpl:
    """Thin adapter that delegates fit/predict/decision_function to ``Op``."""

    def __init__(self, **hyperparams):
        """Store the hyperparameters and build the wrapped ``Op`` model from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when it is given; return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for X."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def decision_function(self, X):
        """Return the wrapped model's decision scores for X."""
        wrapped = self._wrapped_model
        return wrapped.decision_function(X)
예제 #5
0
    def set_forward(self, support_images, support_labels, query_images):
        """
        Overwrites method set_forward in AbstractMetaLearner.

        Extracts features for the support and query images chunk by chunk,
        optionally passes them through the transportation module, fits a
        RidgeClassifier(alpha=0.1) on the support features and returns the
        classifier's decision scores for the query features as a tensor
        passed through set_device.
        """
        # One more chunk than the number of full groups of 32 support images.
        n_chunks = len(support_images) // 32 + 1

        support_parts = []
        query_parts = []
        paired_chunks = zip(support_images.chunk(n_chunks),
                            query_images.chunk(n_chunks))
        for support_batch, query_batch in paired_chunks:
            raw_support, raw_query = self.extract_features(
                set_device(support_batch), set_device(query_batch))
            # Keep only detached CPU copies of each chunk's features.
            support_parts.append(raw_support.detach().cpu())
            query_parts.append(raw_query.detach().cpu())

        z_support = torch.cat(support_parts, dim=0)
        del support_parts
        z_query = torch.cat(query_parts, dim=0)
        del query_parts

        # If a transportation method in the feature space has been defined, use it
        if self.transportation_module:
            transported = self.transportation_module(
                set_device(z_support), set_device(z_query))
            z_support, z_query = [z.cpu() for z in transported]

        support_array = z_support.numpy()
        query_array = z_query.numpy()
        label_array = support_labels.cpu().numpy()

        ridge = RidgeClassifier(alpha=0.1)
        ridge.fit(support_array, label_array)

        query_scores = torch.tensor(ridge.decision_function(query_array))
        return set_device(query_scores)
예제 #6
0
class Ridge:
    """RidgeClassifier (no intercept) exposing a predict_proba-style output."""

    def __init__(self, alpha, class_weight, random_state):
        """Create the underlying RidgeClassifier with ``fit_intercept=False``."""
        self.ridge = RidgeClassifier(alpha,
                                     fit_intercept=False,
                                     class_weight=class_weight,
                                     random_state=random_state)

    def __repr__(self):
        return 'Ridge'

    def fit(self, X_train, y_train):
        """Fit the underlying classifier and return self."""
        self.ridge.fit(X_train, y_train)
        return self

    def predict_proba(self, Z):
        """Return an array of ``[1 - score, score]`` rows, one per decision score.

        The values are raw decision scores, not calibrated probabilities.
        """
        scores = self.ridge.decision_function(Z)
        return np.array([[1 - score, score] for score in scores])
예제 #7
0
                (predictions == y_test).astype(int)) / predictions.shape[0]
        else:

            temp_classifier = classifier
            x_train, x_test, y_train, y_test = train_test_split(X_train_all,
                                                                Y_train_all,
                                                                test_size=0.1)
            temp_classifier.fit(x_train, y_train)
            dev_accuracy = temp_classifier.score(x_test, y_test)
            predictions = temp_classifier.predict(x_test)
            # if ridge_option:
            # 	print(temp_classifier.decision_function(x_test))
            try:
                y_proba[i] = classifier.predict_proba(x_test)
            except:
                scores = classifier.decision_function(x_test)
                y_proba[i] = scores / (1 + scores)
            y_mistake[i] = np.mean(
                y_proba[i][y_proba[i].argmax(axis=1) != y_test].max(axis=1))
            testArray = np.array([
                np.mean(y_proba[i][y_test == j][y_proba[i][y_test == j].argmax(
                    axis=1) != j].max(axis=1)) for j in range(4)
            ])
            y_mistake_perClass[i] = testArray

        confusion_matrices[i] = confusion_matrix(y_test, predictions)
        print('Fold N°', str(i))
        print('SCORE : ', dev_accuracy)

    if not (ensemble_option):
예제 #8
0
def main():
    """Compare a plain ridge classifier with CoIRLS domain adaptation on toy blobs.

    Generates a source and a target two-class blob dataset, plots them, trains
    a RidgeClassifier on the source domain and reports its accuracy on the
    target domain, then repeats the evaluation with CoIRLS using a one-hot
    domain covariate. Decision-score distributions are plotted for both.
    """
    np.random.seed(29118)
    # Generate toy data
    n_samples = 200

    xs, ys = make_blobs(n_samples,
                        centers=[[0, 0], [0, 2]],
                        cluster_std=[0.3, 0.35])
    xt, yt = make_blobs(n_samples,
                        centers=[[2, -2], [2, 0.2]],
                        cluster_std=[0.35, 0.4])

    # visualize toy data: one colour per domain, one marker per class
    domains = (
        (xs, ys, "c", "source"),
        (xt, yt, "m", "Target"),
    )
    class_styles = ((1, "o", " positive"), (0, "x", " negative"))
    plt.figure(figsize=(8, 5))
    for points, targets, color, domain_name in domains:
        for class_value, marker, suffix in class_styles:
            rows = np.where(targets == class_value)
            plt.scatter(
                points[rows, 0],
                points[rows, 1],
                c=color,
                marker=marker,
                alpha=0.4,
                label=domain_name + suffix,
            )
    plt.legend()
    plt.title("Source domain and target domain blobs data",
              fontsize=14,
              fontweight="bold")
    plt.show()

    # baseline: train on the source domain only
    baseline = RidgeClassifier(alpha=1.0)
    baseline.fit(xs, ys)

    baseline_pred = baseline.predict(xt)
    print("Accuracy on target domain: {:.2f}".format(
        accuracy_score(yt, baseline_pred)))

    # visualize decision scores of non-adaptation classifier
    title_kwargs = {"fontsize": 14, "fontweight": "bold"}
    hist_kwargs = {"kde": True, "alpha": 0.7}
    plt_labels = ["Source", "Target"]
    baseline_scores = [baseline.decision_function(xs),
                       baseline.decision_function(xt)]
    distplot_1d(
        baseline_scores,
        labels=plt_labels,
        xlabel="Decision Scores",
        title="Ridge classifier decision score distribution",
        title_kwargs=title_kwargs,
        hist_kwargs=hist_kwargs,
    ).show()

    # domain adaptation
    adapted = CoIRLS(lambda_=1)
    # encoding one-hot domain covariate matrix (1 = source, 0 = target)
    domain_flags = np.concatenate((np.ones(n_samples), np.zeros(n_samples)))
    encoder = OneHotEncoder(handle_unknown="ignore")
    covariates_mat = encoder.fit_transform(domain_flags.reshape(-1, 1)).toarray()

    x_both = np.concatenate((xs, xt))
    adapted.fit(x_both, ys, covariates_mat)
    adapted_pred = adapted.predict(xt)
    print("Accuracy on target domain: {:.2f}".format(
        accuracy_score(yt, adapted_pred)))

    adapted_scores = [
        adapted.decision_function(xs).detach().numpy().reshape(-1),
        adapted.decision_function(xt).detach().numpy().reshape(-1),
    ]
    distplot_1d(
        adapted_scores,
        labels=plt_labels,
        xlabel="Decision Scores",
        title="Domain adaptation classifier decision score distribution",
        title_kwargs=title_kwargs,
        hist_kwargs=hist_kwargs,
    ).show()
예제 #9
0
def get_ridge_plot(best_param_, experiment_,
                   param_keys_, param_vals_,
                   png_folder,
                   png_fname,
                   score_threshold=0.8):
    """Save a figure with the ridge regularization path and score distributions.

    The figure has three panels: coefficient values against alpha (left),
    a histogram of the test-set decision scores (top right) and their
    empirical CDF (bottom right).

    Args:
        best_param_: dict of RidgeClassifier parameters; must contain
            'alpha' and 'fit_intercept'. It is no longer modified.
        experiment_: project object providing get_train_data(),
            get_test_data(), get_proba() and get_data_col_name().
        param_keys_, param_vals_: parallel sequences describing the searched
            grid; must contain 'model_type' and 'alpha' (the alphas to sweep).
        png_folder, png_fname: output location below Config 'data.path'.
        score_threshold: unused; kept for interface compatibility.

    Returns:
        True once the figure has been written.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']
    alphas = parameters['alpha']

    # Work on a copy so the caller's dict does not end up holding the last
    # alpha of the sweep.
    params = dict(best_param_)
    best_alpha = params['alpha']
    fit_intercept = params['fit_intercept']

    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    n_features = len(X_train.columns.values)
    clf.set_params(**params)
    clf.fit(X_train, y_train)

    result = {'alphas': [],
              'coefs': np.zeros((len(alphas), n_features + 1)),
              'scores': [],
              'score': None}

    for i, alpha in enumerate(alphas):
        result['alphas'].append(alpha)
        params['alpha'] = alpha
        clf.set_params(**params)
        clf.fit(X_train, y_train)

        # regularization path: column 0 is the intercept, the rest the weights
        row = np.zeros(n_features + 1, dtype=np.float32)
        if fit_intercept:
            row = np.append(clf.intercept_, clf.coef_)
        else:
            # Fix: without an intercept the weights still come from coef_
            # (the original copied intercept_ here, leaving the path all zero).
            row[1:] = np.ravel(clf.coef_)
        result['coefs'][i, :] = row
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train

    # plot layout: tall left panel, two stacked right panels
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 3.1 regularization path, one line per coefficient
    labels = ['intercept'] + list(experiment_.get_data_col_name())
    alpha_axis = np.array(result['alphas'])
    for ncol in range(result['coefs'].shape[1]):
        ax1.plot(alpha_axis, result['coefs'][:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    # NOTE(review): clf is the model from the LAST alpha of the sweep, not
    # the one fitted with best_param_ - confirm this is intended.
    X_test, _ = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 3.3 CDF
    num_bins = 100
    # `density` replaces the `normed` keyword that NumPy has removed.
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, density=True)
    except Exception:
        # Best-effort fallback to NumPy's default binning.
        counts, bin_edges = np.histogram(result['score'], density=True)

    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
def get_ridge_plot(best_param_, experiment_,
                   param_keys_, param_vals_,
                   png_folder,
                   png_fname,
                   score_threshold=0.8):
    """Save a figure with the ridge regularization path and score distributions.

    The figure has three panels: coefficient values against alpha (left),
    a histogram of the test-set decision scores (top right) and their
    empirical CDF (bottom right).

    Args:
        best_param_: dict of RidgeClassifier parameters; must contain
            'alpha' and 'fit_intercept'. It is no longer modified.
        experiment_: project object providing get_train_data(),
            get_test_data(), get_proba() and get_data_col_name().
        param_keys_, param_vals_: parallel sequences describing the searched
            grid; must contain 'model_type' and 'alpha' (the alphas to sweep).
        png_folder, png_fname: output location below Config 'data.path'.
        score_threshold: unused; kept for interface compatibility.

    Returns:
        True once the figure has been written.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']
    alphas = parameters['alpha']

    # Work on a copy so the caller's dict does not end up holding the last
    # alpha of the sweep.
    params = dict(best_param_)
    best_alpha = params['alpha']
    fit_intercept = params['fit_intercept']

    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    n_features = len(X_train.columns.values)
    clf.set_params(**params)
    clf.fit(X_train, y_train)

    result = {'alphas': [],
              'coefs': np.zeros((len(alphas), n_features + 1)),
              'scores': [],
              'score': None}

    for i, alpha in enumerate(alphas):
        result['alphas'].append(alpha)
        params['alpha'] = alpha
        clf.set_params(**params)
        clf.fit(X_train, y_train)

        # regularization path: column 0 is the intercept, the rest the weights
        row = np.zeros(n_features + 1, dtype=np.float32)
        if fit_intercept:
            row = np.append(clf.intercept_, clf.coef_)
        else:
            # Fix: without an intercept the weights still come from coef_
            # (the original copied intercept_ here, leaving the path all zero).
            row[1:] = np.ravel(clf.coef_)
        result['coefs'][i, :] = row
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train

    # plot layout: tall left panel, two stacked right panels
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 3.1 regularization path, one line per coefficient
    labels = ['intercept'] + list(experiment_.get_data_col_name())
    alpha_axis = np.array(result['alphas'])
    for ncol in range(result['coefs'].shape[1]):
        ax1.plot(alpha_axis, result['coefs'][:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    # NOTE(review): clf is the model from the LAST alpha of the sweep, not
    # the one fitted with best_param_ - confirm this is intended.
    X_test, _ = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 3.3 CDF
    num_bins = 100
    # `density` replaces the `normed` keyword that NumPy has removed.
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, density=True)
    except Exception:
        # Best-effort fallback to NumPy's default binning.
        counts, bin_edges = np.histogram(result['score'], density=True)

    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
예제 #11
0
# Python 2 script fragment: a bare `print` statement emits an empty line.
print


# # predict by simply apply the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
#     print


####################################
# Multi-label prediction using Ridge
# decision_function
# Show the fitted classifier, then the raw decision scores for the new docs.
print clf_rdg
pred_decision = clf_rdg.decision_function(X_new)
print pred_decision
print

# filtering using threshold
# label_filtering is defined elsewhere; it is called with a 0.1 threshold and
# its result is iterated per document below.
pred_decision_filtered = label_filtering(pred_decision, 0.1)
print pred_decision_filtered
print

# predict and print
# One section per document: the document, then each retained label.
for doc, labels in zip(docs_new, pred_decision_filtered):
    print doc
    for label in labels:
            # label[0]: score; label[1]: index into data_train.target_names
            print data_train.target_names[label[1]], label[0]
    print
예제 #12
0
                        learning_rate=0.09,objective="multi:softmax").fit(x_train, y_train)
# Score the gradient-boosting model fitted above and record the elapsed time.
prediction_gbm = gbm.predict(x_test)
gmbscore = accuracy_score(y_test, prediction_gbm)
interval=time.time()-start_time
#eta=0.3 max_depth=25 obj=mult num_class=20 

# Ensemble members
svc_clf8 = LinearSVC(C=0.8)
svc_clf8.fit(np.log(x_train+1), y_train)
# Fix: the SVC is trained on log(x + 1) features, so the test features must go
# through the same transform (the original scored the raw x_test).
x_test_log = np.log(x_test+1)
decision_svc=svc_clf8.decision_function(x_test_log)
prediction_svc8=svc_clf8.predict(x_test_log)
svc_score8 = accuracy_score(y_test, prediction_svc8)

Ridge_clf = RidgeClassifier(alpha=1)
Ridge_clf.fit(x_train, y_train)
decision_ridge=Ridge_clf.decision_function(x_test)
prediction_ridge=Ridge_clf.predict(x_test)
Ridge_clf_score = accuracy_score(y_test, prediction_ridge)

PAC_clf = PassiveAggressiveClassifier(C=0.1)
PAC_clf.fit(x_train, y_train)
decision_pac=PAC_clf.decision_function(x_test)
prediction_PAC=PAC_clf.predict(x_test)
PAC_clf_score = accuracy_score(y_test, prediction_PAC)

# NOTE(review): RandomizedLogisticRegression appears to be a feature selector
# that later scikit-learn releases dropped - confirm that it provides
# predict() in the version this script targets.
from sklearn.linear_model import RandomizedLogisticRegression
RandomizedLogisticRegression_clf = RandomizedLogisticRegression(C=5,n_jobs=-1)
RandomizedLogisticRegression_clf.fit(x_train, y_train)

prediction_RandomizedLogisticRegression=RandomizedLogisticRegression_clf.predict(x_test)
RandomizedLogisticRegression_clf_score = accuracy_score(y_test, prediction_RandomizedLogisticRegression)
예제 #13
0
class IntentRidgeClassifier:
    """Bag-of-words intent classifier backed by scikit-learn's RidgeClassifier.

    Typical flow: ``set_data_train`` (tokenizes, builds the vocabulary and
    vectorizes), ``build_model``, ``train_model``, then ``predict``.
    """

    def __init__(self):
        self.model = None
        self.x_train = None
        self.y_train = None
        self.language = None
        self.word2index = None
        self.dict_labels = None

    def set_word2index(self, word2index: dict):
        """Install a pre-built word -> index vocabulary."""
        self.word2index = word2index

    def set_dict_labels(self, dict_labels: dict):
        """Install a pre-built label -> index mapping."""
        self.dict_labels = dict_labels

    def set_data_train(self, x_raw, y_raw, language):
        """Store the raw training data and preprocess it.

        Prints a message and does nothing when any argument is None.
        """
        if x_raw is None or y_raw is None or language is None:
            print('Data train is None')
            return
        self.x_train = x_raw
        self.y_train = y_raw
        self.language = language
        self.process_data()

    def process_data(self):
        """Tokenize, build the vocabulary if needed, and vectorize x/y in place."""
        # tokenize
        if self.language == constant.LANG_JP:
            self.x_train = [
                data_processor.japanese_segment(sentence)
                for sentence in self.x_train
            ]

        # build vocab (only when none was supplied through set_word2index)
        if self.word2index is None:
            unique_words = list({
                word for sentence in self.x_train
                for word in sentence.split(' ')
            })
            self.word2index = {
                word: index
                for index, word in enumerate(unique_words)
            }

        # Convert data to vector
        vector_size = len(self.word2index) + 1
        self.x_train = [
            data_processor.sentence_2_vec(sentence, self.word2index,
                                          vector_size)
            for sentence in self.x_train
        ]

        # NOTE(review): unlike word2index, dict_labels is always rebuilt here,
        # replacing anything given to set_dict_labels - confirm this is intended.
        self.dict_labels = data_processor.make_dict_labels(self.y_train)
        self.y_train = data_processor.label2vec(self.y_train, self.dict_labels)

    def build_model(self):
        """Create the (unfitted) RidgeClassifier."""
        self.model = RidgeClassifier(alpha=0.5,
                                     class_weight=None,
                                     copy_X=True,
                                     fit_intercept=True,
                                     solver='svd',
                                     tol=1)

    def train_model(self):
        """Fit the model on the preprocessed training data."""
        self.model.fit(self.x_train, self.y_train)

    @staticmethod
    def _scores_to_probs(scores):
        """Turn decision scores into a softmax distribution over label indices.

        A single score (what a two-class model yields) is expanded to the
        pair ``[-score, score]`` so that index 1 wins when the score is
        positive. The maximum is subtracted before exponentiating so that
        large scores cannot overflow.
        """
        scores = np.atleast_1d(np.asarray(scores, dtype=float))
        if scores.size == 1:
            scores = np.array([-scores[0], scores[0]])
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / np.sum(exp_scores)

    def predict(self, x_raw):
        """Predict the intent of one sentence.

        Returns:
            None when no word of the sentence is in the vocabulary, otherwise
            ``{'intent': label, 'prob': probability}`` for the best label.
        """
        x_to_predict = data_processor.sentence_2_vec(x_raw, self.word2index,
                                                     len(self.word2index) + 1)
        if sum(x_to_predict) == 0:
            return None
        # The factor 5 sharpens the softmax.
        scores = self.model.decision_function([x_to_predict])[0] * 5
        probs = self._scores_to_probs(scores)

        index_to_label = {
            index: label
            for label, index in self.dict_labels.items()
        }

        # Fix: with fewer than three labels the original always returned the
        # label at index 0; the best-scoring label is now chosen in every case.
        # Assumes the indices in dict_labels follow the classifier's class
        # order - TODO confirm against data_processor.label2vec.
        best_index = int(np.argmax(probs))
        return {
            'intent': index_to_label[best_index],
            'prob': probs[best_index]
        }

    def save_word2index(self, file_path):
        """Write the vocabulary to ``file_path``."""
        io_utils.save_dict_to_file(self.word2index, file_path)

    def save_model(self, model_path):
        """Pickle the model to ``model_path``; print a message if there is none."""
        if self.model is not None:
            with open(model_path, 'wb') as fw:
                pickle.dump(self.model, fw)
        else:
            print('Model is None.')

    def load_model(self, model_path):
        """Load a pickled model; on failure set the model to None and print the error."""
        try:
            # SECURITY: pickle.load executes arbitrary code from the file;
            # only load model files from trusted sources.
            with open(model_path, 'rb') as fr:
                self.model = pickle.load(fr)
        except Exception:
            self.model = None
            print('Error when load model \n', traceback.format_exc())

    def save_dict_labels(self, file_path):
        """Write the label -> index mapping to ``file_path``."""
        io_utils.save_dict_to_file(self.dict_labels, file_path)
예제 #14
0
def train_fold(train_ind, test_ind, val_ind, graph_feat, graph_feat2, features, y, y_data, idx, lr, params, subject_IDs,
               pathToSave, i):
    """
    Train and evaluate one cross-validation fold with a ridge baseline and the GCN.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures num_subjects x num_subjects
        graph_feat2     : second population graph, combined with the feature affinity in the same way
        features        : feature vectors num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        idx, lr, i      : forwarded unchanged to Train.run_training
        params          : dictionary of GCNs parameters
        subject_IDs     : list of subject IDs (not used in this function)
        pathToSave      : output prefix; results are written below pathToSave + 'excel/'

    Side effects: writes a .mat file and an .xlsx file, and appends the
    per-fold DataFrame to the module-level `prediction` list.

    returns:

        test_acc    : number of correctly classified test samples using GCNs
        test_auc    : area under curve over the test samples using GCNs
        lin_acc     : number of correctly classified test samples using the linear classifier
        lin_auc     : area under curve over the test samples using the linear classifier
        fold_size   : number of test samples, wrapped as [[n]] (shape kept for existing callers)
    """
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()
    print(len(train_ind))
    # selection of a subset of data if running experiments with a subset of the training set
    # NOTE(review): this calls lowercase `reader` while the next line uses `Reader` - confirm both exist.
    labeled_ind = reader.site_percentage(train_ind,1.0)
    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind,  params['num_features'])
    n_test = len(test_ind)

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')
    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    # NOTE(review): hard-coded; saved under the 'folds' key of the .mat file.
    num_nodes = 662
    final_graph = graph_feat * sparse_graph # Gender

    final_graph2 = graph_feat2 * sparse_graph # Age

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))
    # Classification with GCNs
    test_acc, test_auc, weights= Train.run_training(final_graph, final_graph2, sparse.coo_matrix(x_data).tolil(), y_data,
                                            train_ind, val_ind,
                                            test_ind, idx, lr, params, pathToSave, i)
    weights_0 = weights[0]
    weights_1 = weights[1]

    # Per-fold summary values as stored on disk (accuracies still fractions here).
    scores_lin_ = np.sum([lin_acc])
    scores_auc_lin_ = np.mean([lin_auc])
    scores_acc_ = np.sum([test_acc])
    scores_auc_ = np.mean([test_auc])

    pathToSave2 = pathToSave + 'excel/'
    if not os.path.exists(pathToSave2):
        os.makedirs(pathToSave2)
    result_name = 'ABIDE_classification.mat'
    # `trial` is a module-level name used to prefix the result file.
    sio.savemat(pathToSave2 + str(trial) + result_name,
                {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                 'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes, 'weights_0': weights_0, 'weights_1': weights_1})
    df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                       'scores_auc_lin': [scores_auc_lin_], 'weights_0': weights_0, 'weights_1': weights_1})

    prediction.append(df)

    # Write the fold summary to Excel; the context manager closes the file
    # (ExcelWriter.save() is no longer available in current pandas).
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx', engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # return number of correctly classified samples instead of percentage
    # Fix: scale exactly once (the original multiplied test_acc by the fold
    # size twice).
    test_acc = int(round(test_acc * n_test))
    lin_acc = int(round(lin_acc * n_test))
    # NOTE(review): the fold size has always been returned double-wrapped;
    # the shape is preserved for callers.
    fold_size = [[n_test]]
    return test_acc, test_auc, lin_acc, lin_auc, fold_size
예제 #15
0
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data, params, subject_IDs, pathToSave, i, subject_labels, idx):
    """
    Train and evaluate one cross-validation fold with a ridge baseline and the GCN.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures num_subjects x num_subjects
                          (only its first dimension is read here)
        features        : feature vectors num_subjects x num_features
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCNs parameters
        subject_IDs     : list of subject IDs (not used in this function)
        pathToSave      : output prefix; results are written below pathToSave + 'excel/'
        i               : forwarded unchanged to Train.run_training
        subject_labels  : not used in this function
        idx             : forwarded to Reader.get_affinity

    Side effects: writes x_data.csv, a .mat file and an .xlsx file, and
    appends the per-fold DataFrame to the module-level `prediction` list.

    returns:

        test_acc    : number of correctly classified test samples using GCNs
        test_auc    : area under curve over the test samples using GCNs
        lin_acc     : number of correctly classified test samples using the linear classifier
        lin_auc     : area under curve over the test samples using the linear classifier
        fold_size   : number of test samples, wrapped as [[n]] (shape kept for existing callers)
        n_test      : number of test samples as a plain int
    """

    print(len(train_ind))
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    num_nodes = np.size(graph_feat, 0)
    n_test = len(test_ind)

    # Round the features to 4 decimals in one vectorized step.
    # Fix: the original element-wise loops reused the name `i`, so the value
    # handed to Train.run_training was the last row index, not the caller's.
    x_rounded = np.round(features.astype(float), 4)
    # NaNs are replaced by 0 for every computation below.
    x_data = np.where(np.isnan(x_rounded), 0.0, x_rounded)

    # Pairwise correlation distances -> square matrix -> Gaussian affinity
    distv = distance.pdist(x_data, metric='correlation')
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    graph = Reader.get_affinity(sparse_graph, idx)

    # As before, the dump holds the rounded features BEFORE NaN replacement.
    np.savetxt("x_data.csv", x_rounded, delimiter=',')
    print(np.where(np.isnan(x_data)))

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])

    # NOTE(review): the number of classes is hard-coded to 3.
    y_one_hot = label_binarize(y[test_ind], classes=np.arange(3))
    lin_auc = sklearn.metrics.roc_auc_score(y_one_hot, pred)

    # Classification with GCNs
    test_acc, test_auc, weights, confusion = Train.run_training(graph, sparse.coo_matrix(x_data).tolil(), y_data, train_ind, val_ind,
                                            test_ind, params, pathToSave, i)

    is_cheby = FLAGS.model == 'gcn_cheby'
    if is_cheby:
        weights_0 = weights[0]
        weights_1 = weights[1]
        weights_2 = weights[2]

    scores_lin_ = np.sum([lin_acc])
    scores_auc_lin_ = np.mean([lin_auc])
    # Fix: round instead of truncating, so the saved count agrees with the
    # returned test_acc below.
    scores_acc_ = int(round(np.sum([test_acc]) * n_test))
    scores_auc_ = np.mean([test_auc])

    pathToSave2 = pathToSave + 'excel/'
    if not os.path.exists(pathToSave2):
        os.makedirs(pathToSave2)
    result_name = 'ABIDE_classification.mat'
    # `trial` is a module-level name used to prefix the result file.
    mat_payload = {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                   'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes}
    if is_cheby:
        mat_payload.update({'weights_0': weights_0, 'weights_1': weights_1,
                            'weights_2': weights_2})
        df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                           'scores_auc_lin': [scores_auc_lin_], 'weights_0': weights_0, 'weights_1': weights_1,
                           'weights_2':weights_2, 'confusion_matrix': [confusion]})
    else:
        df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                           'scores_auc_lin': [scores_auc_lin_], 'confusion_matrix': [confusion]})
    sio.savemat(pathToSave2 + str(trial) + result_name, mat_payload)

    prediction.append(df)

    # Write the fold summary to Excel; the context manager closes the file
    # (ExcelWriter.save() is no longer available in current pandas).
    with pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx', engine='xlsxwriter') as writer_n:
        df.to_excel(writer_n, sheet_name='Sheet1')

    # return number of correctly classified samples instead of percentage
    lin_acc = int(round(lin_acc * n_test))
    test_acc = int(round(test_acc * n_test))
    # NOTE(review): the fold size has always been returned double-wrapped;
    # the shape is preserved for callers.
    fold_size = [[n_test]]
    return test_acc, test_auc, lin_acc, lin_auc, fold_size, n_test
# Predict labels for the training and validation features with the fitted model.
pred_train_label = model.predict(feature_train_)
pred_val_label = model.predict(feature_validation_)

# Model validation, and hyper-parameter tuning based on the validation results
acc_train = metrics.accuracy_score(label_train, pred_train_label)
f1score_train = metrics.f1_score(label_train, pred_train_label)
acc_validation = metrics.accuracy_score(label_validation, pred_val_label)
f1score_validation = metrics.f1_score(label_validation, pred_val_label)
print(
    f"acc_train = {acc_train:.3f}; f1score_train = {f1score_train}\nacc_validation = {acc_validation:.8f}; f1score_validaton = {f1score_validation}"
)

#%% ============================ Final test ============================
# Preferably use an external test set
pred_test_label = model.predict(feature_test_)
# Despite the name, this holds the output of decision_function.
pred_test_prob = model.decision_function(feature_test_)
acc_test = metrics.accuracy_score(label_test, pred_test_label)
f1score_test = metrics.f1_score(label_test, pred_test_label)
print(f"acc_test = {acc_test:.8f}; f1score_test = {f1score_test}\n")

#%% ============================ Result visualization ============================
# Get the weights
wei = model.coef_
# Standardize the weights (zero mean, unit standard deviation).
wei = (wei - wei.mean()) / wei.std()
# Map the weights back through the feature selector and then the PCA.
wei = selector.inverse_transform(wei)
wei = pca.inverse_transform(wei)
# Scatter the first weight row into a matrix shaped like `mask`, then add the
# transpose (presumably `mask` selects one triangle of a symmetric matrix - TODO confirm).
weight = np.zeros(mask.shape)
weight[mask] = wei[0]
weight = weight + weight.T

# Show only the top 0.2% of the weights
예제 #17
0
def classify(granularity=10):
    """Train a ridge classifier on the clustered GeoText data and report location error.

    Loads the training documents from
    ``processed_data/<granularity>_clustered/`` and the test documents from
    ``processed_data/test`` (both under ``GEOTEXT_HOME``), vectorizes them with
    TF-IDF, fits a ``RidgeClassifier`` and predicts a class for every test
    document.  For each test user the predicted class is mapped to the class
    median/mean coordinates and compared with the user's coordinates taken
    from ``userLocation``.  Progress, timings and the distance summaries are
    printed, and a confidence-vs-distance figure is saved as
    ``confidence.png`` under ``GEOTEXT_HOME``.

    Args:
        granularity: value used to build the name of the training directory.

    Returns:
        None.
    """
    trainDir = path.join(
        GEOTEXT_HOME,
        'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    data_test = load_files(testDir, encoding=encoding)

    categories = data_train.target_names

    def size_mb(docs):
        """Return the total encoded size of ``docs`` in megabytes."""
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True,
                                 norm='l2',
                                 binary=False,
                                 sublinear_tf=True,
                                 min_df=2,
                                 max_df=1.0,
                                 ngram_range=(1, 1),
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print(
        "Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # Optional chi-squared feature selection; disabled by default.
    chi = False
    if chi:
        k = 500000
        # Report the number of features actually requested (was hard-coded 0).
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

        print("done in %fs" % (time() - t0))
        print()

    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []

    for filename, label, row in zip(data_test.filenames, pred, scores):
        # The test file name is the key into the user -> "lat,lon" mapping.
        user = path.basename(filename)
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[label]
        # Confidence: score of the predicted class relative to the mean score.
        confidence = row[label] - mean(row)
        randomConfidence = row[random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        # Compute each distance once and reuse it for the list and the sums.
        medianDistance = distance(lat, lon, classLatMedian[prediction],
                                  classLonMedian[prediction])
        meanDistance = distance(lat, lon, classLatMean[prediction],
                                classLonMean[prediction])
        distances.append(medianDistance)
        sumMedianDistance = sumMedianDistance + medianDistance
        sumMeanDistance = sumMeanDistance + meanDistance
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print("Average mean distance is " + str(averageMeanDistance))
    print("Average median distance is " + str(averageMedianDistance))
    print("Median distance is " + str(median(distances)))
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)

    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
예제 #18
0
    # Fit both classifiers on the same training data.
    naive_bayes.fit(X, y)
    
    ridge = RidgeClassifier(random_state=rng)
    ridge.fit(X, y)
    
    
    #%% Testing;
    
    # Create some random inputs;
    
    num_test_docs = 100
    X_test = rng.randint(max_occurrence_of_ngram, size=(num_test_docs, num_features))
    
    # Per-class outputs of each model, plus the labels each model predicts on its own.
    nb_scores = naive_bayes.predict_proba(X_test)
    print(naive_bayes.predict(X_test))
    ridge_scores = ridge.decision_function(X_test)
    print(ridge.predict(X_test))
        
    # Ensemble prediction: sum of the softmax of both models' outputs, then argmax per row.
    # NOTE(review): nb_scores comes from predict_proba and is passed through softmax again -- confirm this is intended.
    print(np.argmax(softmax(nb_scores) + softmax(ridge_scores), axis=1))
    
    
    #%% Testing, using hand-made functions;
    
    # Same predictions computed with the hand-written helpers fed with the fitted parameters.
    nb_res_2 = naive_bayes_predict(X_test, naive_bayes.feature_log_prob_, naive_bayes.class_log_prior_)
    print(np.argmax(nb_res_2, axis=1))

    ridge_pred_2 = ridge_pred(X_test, ridge.coef_, ridge.intercept_)
    print(np.argmax(ridge_pred_2, axis=1))

    # Ensemble of the hand-made outputs, combined the same way as above.
    print(np.argmax(softmax(nb_res_2) + softmax(ridge_pred_2), axis=1)) 
    
예제 #19
0
파일: fit.py 프로젝트: antcc/proyecto-aa
class RBFNetworkClassifier(BaseEstimator, ClassifierMixin):
    """Classifier built as a network of (Gaussian) radial basis functions.

    Internally a linear RidgeClassifier is used to fit the weights of the
    final model on top of the RBF-transformed data."""
    def __init__(self, k=7, alpha=1.0, batch_size=100, random_state=None):
        """Build a classifier with the required parameters:
             - k: number of centers to choose.
             - alpha: value of the regularization constant.
             - batch_size: batch size for the unsupervised clustering.
             - random_state: random seed."""

        self.k = k
        self.alpha = alpha
        self.batch_size = batch_size
        self.random_state = random_state
        self.centers = None
        self.r = None

    def _choose_centers(self, X):
        """Pick the k centers of the data with mini-batch k-means."""

        # Only force an explicit init_size when three batches would not
        # cover the requested number of centers.
        if 3 * self.batch_size <= self.k:
            initial_size = 3 * self.k
        else:
            initial_size = None

        clusterer = MiniBatchKMeans(n_clusters=self.k,
                                    batch_size=self.batch_size,
                                    init_size=initial_size,
                                    random_state=self.random_state)
        self.centers = clusterer.fit(X).cluster_centers_

    def _choose_radius(self, X):
        """Choose the radius used by the radial transformation."""

        # "Diameter" of the data: the largest pairwise Euclidean distance
        diameter = np.max(euclidean_distances(X, X))
        shrink = self.k**(1 / self.n_features_in_)

        self.r = diameter / shrink

    def _transform_rbf(self, X):
        """Transform the data with the RBF kernel against the stored centers."""

        gamma = 1 / (2 * self.r**2)
        return rbf_kernel(X, self.centers, gamma=gamma)

    def fit(self, X, y):
        """Train the model and return the fitted estimator."""

        # Underlying linear model
        self.model = RidgeClassifier(alpha=self.alpha,
                                     random_state=self.random_state)

        # Remember the classes and the number of features seen during training
        self.classes_ = unique_labels(y)
        self.n_features_in_ = X.shape[1]

        # Centers first, then the radius, then the RBF features
        self._choose_centers(X)
        self._choose_radius(X)
        rbf_features = self._transform_rbf(X)

        # Fit the resulting linear model and expose its coefficients
        self.model.fit(rbf_features, y)
        self.intercept_ = self.model.intercept_
        self.coef_ = self.model.coef_

        return self

    def score(self, X, y=None):
        """Return the linear model's score on the RBF-transformed data."""
        return self.model.score(self._transform_rbf(X), y)

    def predict(self, X):
        """Return the linear model's predictions on the RBF-transformed data."""
        return self.model.predict(self._transform_rbf(X))

    def decision_function(self, X):
        """Return the linear model's decision function on the RBF-transformed data."""
        return self.model.decision_function(self._transform_rbf(X))
예제 #20
0
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data,
               params, subject_IDs):
    """
    Run one fold: fit a ridge baseline and a GCN, and score both on the test samples.

        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph, multiplied element-wise with the feature affinity
        features        : feature vectors passed to the feature-selection step
        y               : ground truth labels (indexed by sample, flattened for the ridge model)
        y_data          : ground truth labels in the representation passed to the GCN training
        params          : dictionary of parameters ('num_training', 'num_features', ...)
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : number of correctly classified test samples using GCNs
        test_auc    : area under curve on the test samples using GCNs
        lin_acc     : number of correctly classified test samples using the linear classifier
        lin_auc     : area under curve on the test samples using the linear classifier
        fold_size   : number of test samples
    """

    print(len(train_ind))

    # Subset of the training samples used as labelled data, followed by the
    # feature selection / dimensionality reduction step
    labeled_ind = Reader.site_percentage(train_ind, params['num_training'],
                                         subject_IDs)
    x_data = Reader.feature_selection(features, y, labeled_ind,
                                      params['num_features'])

    fold_size = len(test_ind)

    # Square matrix of pairwise correlation distances between samples
    pairwise = distance.squareform(distance.pdist(x_data,
                                                  metric='correlation'))
    sigma = np.mean(pairwise)
    # Gaussian affinity derived from the distances, combined with the graph
    affinity = np.exp(-pairwise**2 / (2 * sigma**2))
    final_graph = graph_feat * affinity

    # Linear baseline: accuracy and AUC on the test samples
    x_test = x_data[test_ind, :]
    baseline = RidgeClassifier()
    baseline.fit(x_data[train_ind, :], y[train_ind].ravel())
    lin_acc = baseline.score(x_test, y[test_ind].ravel())
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1,
                                            baseline.decision_function(x_test))

    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs
    gcn_features = sparse.coo_matrix(x_data).tolil()
    test_acc, test_auc = Train.run_training(final_graph, gcn_features, y_data,
                                            train_ind, val_ind, test_ind,
                                            params)

    print(test_acc)

    # Accuracies are converted from fractions to counts of correct samples
    n_correct_gcn = int(round(test_acc * fold_size))
    n_correct_lin = int(round(lin_acc * fold_size))

    return n_correct_gcn, test_auc, n_correct_lin, lin_auc, fold_size
예제 #21
0
def classify(granularity=10):
    """Train a ridge classifier on the clustered GeoText data and report location error.

    Loads the training documents from
    ``processed_data/<granularity>_clustered/`` and the test documents from
    ``processed_data/test`` (both under ``GEOTEXT_HOME``), vectorizes them with
    TF-IDF, fits a ``RidgeClassifier`` and predicts a class for every test
    document.  For each test user the predicted class is mapped to the class
    median/mean coordinates and compared with the user's coordinates taken
    from ``userLocation``.  Progress, timings and the distance summaries are
    printed, and a confidence-vs-distance figure is saved as
    ``confidence.png`` under ``GEOTEXT_HOME``.

    Args:
        granularity: value used to build the name of the training directory.

    Returns:
        None.
    """
    trainDir = path.join(GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    data_test = load_files(testDir, encoding=encoding)

    categories = data_train.target_names

    def size_mb(docs):
        """Return the total encoded size of ``docs`` in megabytes."""
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=1.0, ngram_range=(1, 1), stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # Optional chi-squared feature selection; disabled by default.
    chi = False
    if chi:
        k = 500000
        # Report the number of features actually requested (was hard-coded 0).
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

        print("done in %fs" % (time() - t0))
        print()

    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(scores.shape)
    print(pred.shape)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []

    for filename, label, row in zip(data_test.filenames, pred, scores):
        # The test file name is the key into the user -> "lat,lon" mapping.
        user = path.basename(filename)
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[label]
        # Confidence: score of the predicted class relative to the mean score.
        confidence = row[label] - mean(row)
        randomConfidence = row[random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        # Compute each distance once and reuse it for the list and the sums.
        medianDistance = distance(lat, lon, classLatMedian[prediction], classLonMedian[prediction])
        meanDistance = distance(lat, lon, classLatMean[prediction], classLonMean[prediction])
        distances.append(medianDistance)
        sumMedianDistance = sumMedianDistance + medianDistance
        sumMeanDistance = sumMeanDistance + meanDistance
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print("Average mean distance is " + str(averageMeanDistance))
    print("Average median distance is " + str(averageMedianDistance))
    print("Median distance is " + str(median(distances)))
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)

    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
예제 #22
0
# Fit the vectorizer on the training values and keep the transformed matrix.
X_train = vectorizer.fit_transform(train.values)
# Bare expression: displays the matrix when run as a notebook cell.
X_train

# Let's explain how our model recognizes toxic comments

# In[ ]:

# NOTE(review): this instance is replaced inside the loop below before it is ever fitted.
classifier = RidgeClassifier(solver='sag')

# Target vector: the 'toxic' column of `ys`.
y = ys['toxic'].values

kf = KFold(n_splits=5, shuffle=True, random_state=239)
for train_index, test_index in kf.split(X_train):
    classifier = RidgeClassifier(solver='sag')
    classifier.fit(X_train[train_index], y[train_index])
    # Decision scores on the held-out part are used to compute the ROC AUC.
    predict = classifier.decision_function(X_train[test_index])
    cv_score = roc_auc_score(y[test_index], predict)
    print(cv_score)
    # Only the first of the five folds is evaluated.
    break

# In[ ]:

# Show the weights of the classifier fitted on the first fold.
eli5.show_weights(classifier, vec=vectorizer)

# In[ ]:

# Bare expression: displays one training comment when run as a notebook cell.
train[COMMENT].values[6]

# In[ ]:

# NOTE(review): the cell above reads train[COMMENT].values[6] while this call
# passes train.values[6] -- confirm both refer to the same document.
eli5.show_prediction(classifier, doc=train.values[6], vec=vectorizer)
예제 #23
0
class Level1Model(object):
    """Ridge-classifier level-1 model producing out-of-fold and submission predictions.

    The model reads the train (and optionally test) CSV files, keeps the
    columns listed in ``train_features``, bins four continuous columns,
    one-hot encodes the low-cardinality columns into a sparse matrix, and
    fits a ``RidgeClassifier`` over K folds.  Out-of-fold decision scores are
    evaluated with ``metric``; when ``submit`` is true, sigmoid-transformed
    predictions are written to CSV files.
    """

    # Columns kept from the input files.
    train_features = [
        "ps_car_13",  # : 1571.65 / shadow  609.23
        "ps_reg_03",  # : 1408.42 / shadow  511.15
        "ps_ind_05_cat",  # : 1387.87 / shadow   84.72
        "ps_ind_03",  # : 1219.47 / shadow  230.55
        "ps_ind_15",  # :  922.18 / shadow  242.00
        "ps_reg_02",  # :  920.65 / shadow  267.50
        "ps_car_14",  # :  798.48 / shadow  549.58
        "ps_car_12",  # :  731.93 / shadow  293.62
        "ps_car_01_cat",  # :  698.07 / shadow  178.72
        "ps_car_07_cat",  # :  694.53 / shadow   36.35
        "ps_ind_17_bin",  # :  620.77 / shadow   23.15
        "ps_car_03_cat",  # :  611.73 / shadow   50.67
        "ps_reg_01",  # :  598.60 / shadow  178.57
        "ps_car_15",  # :  593.35 / shadow  226.43
        "ps_ind_01",  # :  547.32 / shadow  154.58
        "ps_ind_16_bin",  # :  475.37 / shadow   34.17
        "ps_ind_07_bin",  # :  435.28 / shadow   28.92
        "ps_car_06_cat",  # :  398.02 / shadow  212.43
        "ps_car_04_cat",  # :  376.87 / shadow   76.98
        "ps_ind_06_bin",  # :  370.97 / shadow   36.13
        "ps_car_09_cat",  # :  214.12 / shadow   81.38
        "ps_car_02_cat",  # :  203.03 / shadow   26.67
        "ps_ind_02_cat",  # :  189.47 / shadow   65.68
        "ps_car_11",  # :  173.28 / shadow   76.45
        "ps_car_05_cat",  # :  172.75 / shadow   62.92
        "ps_calc_09",  # :  169.13 / shadow  129.72
        "ps_calc_05",  # :  148.83 / shadow  120.68
        "ps_ind_08_bin",  # :  140.73 / shadow   27.63
        "ps_car_08_cat",  # :  120.87 / shadow   28.82
        "ps_ind_09_bin",  # :  113.92 / shadow   27.05
        "ps_ind_04_cat",  # :  107.27 / shadow   37.43
        "ps_ind_18_bin",  # :   77.42 / shadow   25.97
        "ps_ind_12_bin",  # :   39.67 / shadow   15.52
        "ps_ind_14",  # :   37.37 / shadow   16.65
    ]

    def __init__(self,
                 strat=True,
                 splits=5,
                 random_state=15,
                 submit=False,
                 mean_sub=False,
                 metric=None):
        # type: (bool, int, int, bool, bool, Callable) -> None
        """Configure the fold splitter and the underlying model.

        Args:
            strat: use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.
            splits: number of folds.
            random_state: seed of the fold splitter.
            submit: when true, the test file is read and predictions are saved.
            mean_sub: when true, submission predictions are the mean of the
                per-fold predictions instead of a refit on the full data.
            metric: callable ``metric(y_true, scores)`` used to score folds.
        """
        self.curr_date = datetime.datetime.now()
        self._submit = submit
        self._id = ""
        self.trn = None
        self.target = None
        self.sub = None
        self.model = None
        self.metric = metric
        self.mean_submission = mean_sub
        self.trn_csr = None
        self.sub_csr = None
        # Per-column statistics of the submission matrix; filled by
        # prepare_data only when a submission is requested.
        self.sub_csr_not_enough = None
        self.sub_csr_occurences = None
        if strat:
            self._folds = StratifiedKFold(n_splits=splits,
                                          shuffle=True,
                                          random_state=random_state)
        else:
            self._folds = KFold(n_splits=splits,
                                shuffle=True,
                                random_state=random_state)
        self.set_model()

    def set_model(self):
        """Create the RidgeClassifier used for every fit."""
        self.model = RidgeClassifier(
            alpha=3000,  # Was 1000
            normalize=False,
            max_iter=1000,
            class_weight="balanced",  # {0: 1, 1: 2},
            random_state=1,
            solver="sag",
            tol=1e-3,
            copy_X=False,
        )
        # self.model.fit()

    @property
    def do_submission(self):
        """Whether submission predictions are computed and saved."""
        return self._submit

    @property
    def id(self):
        """Identifier used as column name and in output file names."""
        return self._get_id()

    @abc.abstractmethod
    def _get_id(self):
        """Set and return the model identifier."""
        self._id = "ridge_dummies"
        if self._id == "":
            raise ValueError("Id is not set for class " + str(type(self)))
        return self._id

    def read_data(self):
        """Read the training file (and the test file when submitting)."""
        self.trn = pd.read_csv("../../input/train.csv", index_col=0)
        self.target = self.trn["target"]
        del self.trn["target"]
        if self.do_submission:
            self.sub = pd.read_csv("../../input/test.csv", index_col=0)

    def prepare_data(self):
        """Bin, encode and stack the features into sparse matrices.

        Fills ``trn_csr`` (and ``sub_csr`` when submitting).  Without a
        submission set, every step touching ``sub``/``sub_csr`` is skipped
        instead of failing on ``None``.
        """
        self.trn = self.trn[self.train_features]
        if self.do_submission:
            self.sub = self.sub[self.train_features]

        # Cut continuous columns into 20 bins; train and test are binned
        # together when a submission set is available so the bins agree.
        n_trn = len(self.trn)
        for f in ["ps_reg_03", "ps_car_12", "ps_car_13", "ps_car_14"]:
            if self.do_submission:
                full_f = pd.concat([self.trn[f], self.sub[f]], axis=0)
            else:
                full_f = self.trn[f]
            full_cut = np.array(pd.cut(full_f, 20, labels=False))
            self.trn[f] = full_cut[:n_trn]
            if self.do_submission:
                self.sub[f] = full_cut[n_trn:]
            del full_f
            del full_cut

        # One-hot encode low-cardinality features; collect the binary and
        # the high-cardinality ones for later.
        high_card_f = []
        binary_f = []
        for f in self.trn.columns:
            card = len(np.unique(self.trn[f]))
            one = OneHotEncoder(handle_unknown='ignore')

            if 2 < card < 110:
                print("Encoding %s" % f)
                # -1 is replaced by 99999 before encoding.
                trn_enc = one.fit_transform(self.trn[[f]].replace(-1, 99999))
                if self.trn_csr is None:
                    self.trn_csr = trn_enc
                else:
                    self.trn_csr = csr_hstack((self.trn_csr, trn_enc))
                if self.do_submission:
                    sub_enc = one.transform(self.sub[[f]].replace(-1, 99999))
                    if self.sub_csr is None:
                        self.sub_csr = sub_enc
                    else:
                        self.sub_csr = csr_hstack((self.sub_csr, sub_enc))
            elif card <= 2:
                binary_f.append(f)
            else:
                high_card_f.append(f)

        # Add binary data
        print("Add binary feats : ", binary_f)
        self.trn_csr = csr_hstack((self.trn_csr, self.trn[binary_f]))
        if self.do_submission:
            self.sub_csr = csr_hstack((self.sub_csr, self.sub[binary_f]))

        # Add High card data
        # We need to scale those features
        # NOTE: the high-cardinality features are only listed here; the code
        # that would scale and add them is disabled below.
        print("Add high card feats : ", high_card_f)
        # skl = StandardScaler()
        # if not self.do_submission:
        #     self.trn_csr = csr_hstack((self.trn_csr, skl.fit_transform(self.trn[high_card_f].values)))
        # else:
        #     skl.fit(np.vstack((self.trn[high_card_f].values, self.sub[high_card_f].values)))
        #     self.trn_csr = csr_hstack((self.trn_csr, skl.transform(self.trn[high_card_f].values)))
        #     self.sub_csr = csr_hstack((self.sub_csr, skl.transform(self.sub[high_card_f].values)))

        print("Transform to csr")
        self.trn_csr = self.trn_csr.tocsr()
        print("CSR shape = ", self.trn_csr.shape)
        if self.do_submission:
            self.sub_csr = self.sub_csr.tocsr()

        print(self.trn_csr.sum(axis=0) < 100)

        if self.do_submission:
            # Columns whose sum over the submission matrix is at most 100.
            self.sub_csr_not_enough = np.array(
                self.sub_csr.sum(axis=0) <= 100)[0, :]
            self.sub_csr_occurences = np.array(self.sub_csr.sum(axis=0))[0, :]
            print(self.sub_csr_occurences.shape)
            print(self.sub_csr_not_enough)
        else:
            self.sub_csr_not_enough = None
            self.sub_csr_occurences = None

    def predict_oof_and_submission(self):
        """Run the K-fold training, score it and optionally save predictions.

        Raises:
            ValueError: if the model or the metric is not set (checked before
                any data is read), or if the target, the training data or the
                requested submission data is missing after loading.
        """
        # Fail fast on configuration errors before reading any data.
        if self.model is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if self.metric is None:
            raise ValueError("Metric is not set for class " + str(type(self)))

        self.read_data()
        self.prepare_data()

        if self.target is None:
            raise ValueError("Target is not set for class " + str(type(self)))
        if self.trn is None:
            raise ValueError("Training data is not set for class " +
                             str(type(self)))
        if (self.sub is None) and self.do_submission:
            raise ValueError("Submission data is not set for class " +
                             str(type(self)))

        coefs = np.zeros((self.trn_csr.shape[1], self._folds.n_splits))

        # Prepare predictors
        oof_preds = np.zeros(len(self.trn))
        if self.do_submission:
            sub_preds = np.zeros(len(self.sub))
        # Go through folds
        start = time.time()
        for i_fold, (trn_idx, val_idx) in enumerate(
                self._folds.split(self.target, self.target)):
            # Fit model
            self.model.fit(self.trn_csr[trn_idx], self.target.values[trn_idx])
            coefs[:, i_fold] = self.model.coef_
            # Diagnostics on rarely-set submission columns; only available
            # when a submission matrix was built.
            if self.sub_csr_not_enough is not None:
                print(self.model.coef_[0, self.sub_csr_not_enough])
                print(self.sub_csr_occurences[self.sub_csr_not_enough])
            # Predict OOF
            oof_preds[val_idx] = self.model.decision_function(
                self.trn_csr[val_idx])

            # Predict SUB if mean is requested
            if (self.sub is not None) and self.mean_submission:
                sub_preds += self.model.decision_function(
                    self.sub_csr) / self._folds.n_splits

            # Print results of current fold
            print(
                "Fold %2d score : %.6f in [%5.1f]" %
                (i_fold + 1,
                 self.metric(self.target.values[val_idx], oof_preds[val_idx]),
                 (time.time() - start) / 60))

        # display OOF result
        oof_score = self.metric(self.target, oof_preds)
        print("Full OOF score : %.6f" % oof_score)

        # Check if we need to fit the model on the full dataset
        if (self.sub is not None) and not self.mean_submission:
            # Fit model
            self.model.fit(self.trn_csr, self.target)
            # Compute prediction for submission
            sub_preds = self.model.decision_function(self.sub_csr)
            # Make sure coefs are not crazy
            coefs = np.abs(np.array(self.model.coef_)[0, :])
            sub_occ = np.array(self.sub_csr.sum(axis=0))[0, :]
            trn_occ = np.array(self.trn_csr.sum(axis=0))[0, :]
            sortation = np.argsort(coefs)[::-1]
            for s in sortation:
                print("%6d %6d %.5f" % (trn_occ[s], sub_occ[s], coefs[s]))

        if self.do_submission:
            filename = "../output_preds/" + self.id + "_"
            filename += str(int(1e6 * oof_score)) + "_"
            filename += self.curr_date.strftime("%Y_%m_%d_%Hh%M")

            # Save OOF predictions for stacking (sigmoid of the decision scores)
            self.trn[self.id] = 1 / (1 + np.exp(-oof_preds))
            self.trn[[self.id]].to_csv(filename + "_oof.csv",
                                       float_format="%.9f")

            # Save submission prediction for stacking or submission
            self.sub["target"] = 1 / (1 + np.exp(-sub_preds))
            self.sub[["target"]].to_csv(filename + "_sub.csv",
                                        float_format="%.9f")
    # Split the data for the current fold using the fold's index arrays.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # NOTE(review): X_den_train / X_den_test are not used in the visible part of this loop body.
    X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train, y_train)
    clf_ridge.fit(X_train, y_train)
    clf_SGD.fit(X_train, y_train)
    clf_lSVC.fit(X_train, y_train)
    clf_SVC.fit(X_train, y_train)

    # get prediction for this fold run
    # NOTE(review): predict_proba outputs and decision_function outputs are
    # summed below as if on the same scale -- confirm this is intended.
    prob_mNB    = clf_mNB.predict_proba(X_test)
    prob_ridge  = clf_ridge.decision_function(X_test)
    prob_SGD    = clf_SGD.decision_function(X_test)
    prob_lSVC   = clf_lSVC.decision_function(X_test)
    prob_SVC    = clf_SVC.predict_proba(X_test)

    # add prob functions into the z 2d-array
    z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
    # Rows for this fold are appended below the rows accumulated so far.
    z = np.append(z, z_temp, axis=0)


# remove the first sub-1d-array of z, due to the creation with 0s
z = np.delete(z, 0, 0)
# the result of z is a 2d array with shape of (n_samples, n_categories)
# the elements are the sum of probabilities of classifiers on each (sample,category) pair
# NOTE(review): the two print statements below use Python 2 syntax.
print z
print 'z shape:     ', z.shape
예제 #25
0
    X_train_train, X_train_test = X_train[train_index], X_train[test_index]
    y_train_train, y_train_test = y_train[train_index], y_train[test_index]

    # X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train_train, y_train_train)
    # clf_kNN.fit(X_train_train, y_train_train)
    clf_ridge.fit(X_train_train, y_train_train)
    clf_lSVC.fit(X_train_train, y_train_train)
    clf_SVC.fit(X_train_train, y_train_train)

    # get prediction for this fold run
    prob_mNB    = clf_mNB.predict_proba(X_train_test)
    # prob_kNN    = clf_kNN.predict_proba(X_train_test)
    prob_ridge  = clf_ridge.decision_function(X_train_test)
    prob_lSVC   = clf_lSVC.decision_function(X_train_test)
    prob_SVC    = clf_SVC.predict_proba(X_train_test)

    # update z array for each model
    # z_temp = prob_lSVC
    # z_temp = (prob_ridge + prob_lSVC)
    z_temp = (prob_mNB + prob_ridge + prob_lSVC + prob_SVC)
    z = np.append(z, z_temp, axis=0)


# remove the first sub-1d-array of z, due to the creation with 0s
z = np.delete(z, 0, 0)
# the result of z is a 2d array with shape of (n_samples, n_categories)
# the elements are the sum of probabilities of classifiers on each (sample,category) pair
# Possible preprocessing on z