Example #1
def SGDGridSearch_OLD():  
    # the value swept here is SGD's alpha (regularization strength); alpha=1 performed best
    cs = 10.0**np.arange(-9,9,1)
    aucs = []
    for c in cs:
        clf = SGDClassifier(penalty='l1',alpha=c).fit(f_train, y_train)
        probs = clf.decision_function(f_test)
        fpr,tpr,_ = roc_curve(y_true=y_test,y_score=probs)
        roc_auc = auc(fpr,tpr)
        cstr = '%0.2e'%c
        myplt = st.plotROC(fpr,tpr,roc_auc,
                    figure=False,
                    show=False,
                    returnplt=True,
                    showlegend=False,
                    title='Grid Search - SGD Classifier ROC Curve')
        aucs.append(roc_auc)
    best = int(np.argmax(aucs))
    c = cs[best]
    clf = SGDClassifier(penalty='l1',alpha=c).fit(f_train, y_train)
    probs = clf.decision_function(f_test)
    fpr,tpr,_ = roc_curve(y_true=y_test,y_score=probs)
    roc_auc = auc(fpr,tpr)  # recompute for the refit model; previously this reused the last loop value
    myplt = st.plotROC(fpr,tpr,roc_auc,
                    legendlabel='Best C = %0.2e' % c,
                    figure=False,
                    show=False,
                    returnplt=True,
                    showlegend=True,
                    title='Grid Search - SGD Classifier ROC Curve')
    myplt.show()
    return clf, aucs
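Example #1 hand-rolls its grid search; the same alpha sweep can be expressed with GridSearchCV. A minimal self-contained sketch, scoring by ROC AUC on synthetic data (the dataset and CV settings below are illustrative assumptions, not from the original repo):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=500, random_state=0)
param_grid = {'alpha': 10.0 ** np.arange(-9, 9)}  # same grid as the loop above
search = GridSearchCV(SGDClassifier(penalty='l1'), param_grid,
                      scoring='roc_auc', cv=3)
search.fit(X, y)
print(search.best_params_, search.best_score_)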
Example #2
def train_kaggle(dataset, alg="rig", data="bow"):
    train_x, train_y, test_x = dataset
    print "shape for training data is", train_x.shape

    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    else:
        raise NotImplementedError

    print "training with %s..." % alg

    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)

    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    else:
        return clf.predict_proba(train_x), clf.predict_proba(test_x)
Example #3
class PlattScaledSVM(BaseEstimator, ClassifierMixin):
    def __init__(self, **svm_kwargs):
        self.svm_kwargs = svm_kwargs
        self.svm = SGDClassifier(loss="hinge", **self.svm_kwargs)
        self.lr = LogisticRegression()

    def fit(self, X, y):
        self.svm.fit(X, y)
        dists = self.svm.decision_function(X)
        self.lr.fit(dists.reshape(-1, 1), y)
        return self

    def predict(self, X, y=None):
        dists = self.svm.decision_function(X)
        preds = self.lr.predict(dists.reshape(-1, 1))
        return preds

    def predict_proba(self, X, y=None):
        dists = self.svm.decision_function(X)
        probs = self.lr.predict_proba(dists.reshape(-1, 1))
        return probs

    def get_params(self, deep=True):
        return self.svm_kwargs

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
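A usage sketch for the class above on synthetic blobs (the data are assumed for illustration). Platt scaling fits a logistic regression on the hinge-loss SVM's margins, so predict_proba yields calibrated probabilities; scikit-learn's CalibratedClassifierCV(method='sigmoid') packages the same idea as a library routine:

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=200, centers=2, random_state=0)
model = PlattScaledSVM(random_state=0).fit(X, y)
print(model.predict_proba(X[:3]))  # rows of [P(class 0), P(class 1)]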
Example #4
class SVM(object):
    """docstring for SVM"""
    def __init__(self, ground_truth):
        self.max_num = 3.0
        self.ground_truth = ground_truth
        self.scale = Scaler((0, 1))

    #scale all features to be between 0-1
    def scaleDataFit(self, data):
        self.scale.fit(data)
        return self.scale.transform(data)

    def fit(self, X, Y, distance=chi2Distance):
        X = self.scaleDataFit(X)
        self.distance = distance
        #compute distances
        match, mis = threadComputeMatrix(X, Y, distance)
        #prepare labels
        labels = [1 for l in match]
        lab = [0 for l in mis]
        #merge matches and mismatches
        match = np.vstack((match, mis))
        del mis
        labels.extend(lab)
        match = np.asarray(match)
        labels = np.asarray(labels)
        #learn
        self.clf = SGDClassifier(loss="hinge",
                                 penalty="l2",
                                 n_jobs=8,
                                 shuffle=True)
        self.clf.fit(match, labels)
        #self.clf = SVC()
        #self.clf.fit(match, labels)
        # print "ACC", self.clf.score( match ,labels)
        # data_match = self.clf.decision_function( np.asarray(match) )
        # data_mis   = self.clf.decision_function( np.asarray(mis  ) )
        # return (data_match, data_mis)

    #transform data for evaluation purposes
    def transform(self, X, Y):
        X = self.scale.transform(X)
        match, mis = threadComputeMatrix(X, Y, self.distance)
        #match, mis = computeDistanceMatrix(X, Y, self.distance)
        data_match = self.clf.decision_function(np.asarray(match))
        data_mis = self.clf.decision_function(np.asarray(mis))
        return (data_match, data_mis)

    def predict(self, X1, X2):
        stack = np.vstack((X1, X2))
        stack = self.scale.transform(stack)
        distance = self.distance(stack[0, :], stack[1, :])
        return self.clf.decision_function(np.asarray(distance))
Example #5
def select_threshold(X, Y, a):
    global tol
    global loss
    global penalty
    skf = StratifiedKFold(n_splits=3)
    model = SGDClassifier(loss=loss,
                          alpha=a,
                          class_weight='balanced',
                          penalty=penalty,
                          n_jobs=-1,
                          tol=tol)
    thld = 0
    mean_f1 = 0
    for train_index, test_index in skf.split(X, Y):
        model.fit(X[train_index], Y[train_index])
        scores = model.decision_function(X[test_index])
        fpr, tpr, thresholds = roc_curve(Y[test_index], scores, pos_label=1)
        f1 = []
        #thld_range = thresholds
        thld_range = np.linspace(thresholds[0], thresholds[-1], 50)
        for t in thld_range:
            f1.append(f1_score(Y[test_index], (scores > t).astype(int)))
        best_f1 = max(f1)
        best_t = thld_range[f1.index(best_f1)]
        thld = thld + best_t / 3
        mean_f1 = mean_f1 + best_f1 / 3
    return thld, mean_f1
Example #6
def get_CV_data(X, Y, Alpha, verbose=False):
    global tol
    global loss
    global penalty
    t_0 = time.time()
    skf = StratifiedKFold(n_splits=3)
    auc_scores = []
    i = 0
    for a in Alpha:
        model = SGDClassifier(loss=loss,
                              alpha=a,
                              class_weight='balanced',
                              penalty=penalty,
                              n_jobs=-1,
                              tol=tol)
        auc_scores.append(0)
        for train_index, test_index in skf.split(X, Y):
            model.fit(X[train_index], Y[train_index])
            scores = model.decision_function(X[test_index])
            roc_auc = roc_auc_score(Y[test_index], scores)
            auc_scores[i] = auc_scores[i] + roc_auc / 3
        if verbose:
            time_str = str(timedelta(seconds=time.time() - t_0))
            print(
                'CV for alpha = {}\n AUC score = {} Time since beginning: {}\n'.
                format(a, auc_scores[i], time_str))
        i += 1
    return auc_scores
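An illustrative call for the function above, assuming the module-level globals it reads (loss, penalty, tol) and a synthetic dataset; the values here are assumptions, not from the source:

import numpy as np
from sklearn.datasets import make_classification

loss, penalty, tol = 'hinge', 'l2', 1e-3  # globals read by get_CV_data (assumed values)
X, Y = make_classification(n_samples=300, random_state=0)
print(get_CV_data(X, Y, 10.0 ** np.arange(-5, 0)))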
Example #7
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50,
                      centers=2,
                      random_state=0,
                      cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge",
                        alpha=0.01,
                        max_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
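The per-point loop above works but is slow; the whole grid can be scored in one vectorized call producing the same Z (a sketch reusing clf, X1, X2 from the function body):

# Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()]).reshape(X1.shape)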
Example #8
class kernelsvm():
    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0
        self.alpha = alpha
        self.loss_metric = loss_metric
    def fit(self, X, y, idx_SR):
        n_SR = len(idx_SR)
        self.feature_map_nystroem = General_Nystroem(kernel='rbf', gamma=self.theta0, n_components=n_SR)
        X_features = self.feature_map_nystroem.fit_transform(X,idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric,alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")
    def predict(self, X):
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        return self.clf.predict(X_transform), X_transform
    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)
    def err_rate(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        err_rate = 1.0-acc
        return err_rate
    def get_params(self):
        return self.clf.get_params()
Example #9
def linear_sgd(data_test, data_train, target_train, proba=False):
    """
    
    :param data_test:
    :param data_train:
    :param target_train:
    :param proba:
    :return:
    """
    logging.info('SGDClassifier')
    sgd = SGDClassifier()

    duration = time.time()
    sgd.fit(data_train, target_train)
    duration = time.time() - duration
    logging.info(f'duration fit: {duration}')

    if proba:
        duration = time.time()
        result = sgd.predict(data_test)
        duration = time.time() - duration
        logging.info(f'duration predict: {duration}')
        proba = sgd.decision_function(data_test)
        return result, proba
    duration = time.time()
    result = sgd.predict(data_test)
    duration = time.time() - duration
    logging.info(f'duration predict: {duration}')
    return result
Example #10
class RBFSamplerSGDClassifierEstimator(BaseEstimator, TransformerMixin):
    def __init__(self,
                 gamma=1.0,
                 n_components=100,
                 random_state=None,
                 **kwargs):
        kwargs['random_state'] = random_state
        self.rbf_sampler = RBFSampler(gamma=gamma,
                                      n_components=n_components,
                                      random_state=random_state)
        self.sgdclassifier = SGDClassifier(**kwargs)

    def fit(self, X, y):
        X = self.rbf_sampler.fit_transform(X)
        self.sgdclassifier.fit(X, y)
        return self

    def transform(self, X, y=None):
        return np.sqrt(self.rbf_sampler.n_components) / np.sqrt(
            2.) * self.rbf_sampler.transform(X)

    def predict(self, X):
        return self.sgdclassifier.predict(self.transform(X))

    def decision_function(self, X):
        return self.sgdclassifier.decision_function(self.transform(X))
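A usage sketch for the estimator above on two-moons data (dataset and hyperparameters are illustrative assumptions). The RBF random-feature map lets the linear SGD model fit a nonlinear boundary; note that the sqrt(n_components / 2) rescale in transform only scales the margins, so predicted labels are unaffected:

from sklearn.datasets import make_moons

X, y = make_moons(n_samples=300, noise=0.1, random_state=0)
est = RBFSamplerSGDClassifierEstimator(gamma=2.0, n_components=200, random_state=0)
est.fit(X, y)
print((est.predict(X) == y).mean())  # training accuracy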
Example #11
def plot_sgd_classifier(num_samples, clt_std):
    #generation of data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    #fitting of data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    #plotting of data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)

    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)

    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])  # decision_function expects a 2-D array
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']

    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    ax.contour(X_, Y_, Z, colors=colors,
               levels=levels, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=y)
Example #12
def multiple_classifier():
    sgd_clfs = SGDClassifier(random_state=42, max_iter=None, tol=None)
    sgd_clfs.fit(X_train, y_train)
    some_digit = X[1]
    sgd_clfs.predict([some_digit])
    some_digit_scores = sgd_clfs.decision_function([some_digit])
    print(some_digit_scores)
Example #13
def plot_sgd_separador():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50,
                      centers=2,
                      random_state=0,
                      cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge",
                        alpha=0.01,
                        n_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the points nearest to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function(np.array([[x1, x2]]))
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
Example #14
def main(feature_pkl):
    print 'Loading data...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    print 'Normalizing data...'
    trainFeatures = sklearn.preprocessing.normalize(trainFeatures.tocsc(), norm='l2', axis=0)
    testFeatures = sklearn.preprocessing.normalize(testFeatures.tocsc(), norm='l2', axis=0)
    #trainSplit, testSplit = splitTuple
    # Best estimator from grid search:
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
       verbose=0, warm_start=False)

    print 'Fitting model...'
    clf.fit(trainFeatures,trainTargets)

    # Use probabilities or decision function to generate a ranking    
    predicted_scores = clf.decision_function(testFeatures)
    with open(os.path.splitext(feature_pkl)[0]+'_testRanking.csv', 'w') as f:
        f.write('id\n')
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
            f.write('%d\n' % (item_id))

    # Turn estimator params into word clouds
    features, indices = zip(*sorted(featureIndex.iteritems(), key=operator.itemgetter(1)))
    coef_tuple = zip(clf.coef_[0],indices)
    coef_sort = sorted(coef_tuple, reverse=True)
    print 'Top 20 for illicit:'
    wordle_print(coef_sort[:20],features)
    print 'Top 20 for licit:'
    wordle_print(coef_sort[-20:],features)
Example #15
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    # e.g.
    # array([-1.        , -0.33333333,  0.33333333,  1.        ,  1.66666667,
    #    2.33333333,  3.        ,  3.66666667,  4.33333333,  5.        ])

    X1, X2 = np.meshgrid(xx, yy) # make 2 lists comprising all 2D co-ordinate pairs
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        decision_function_array = np.array([x1, x2]).reshape(1, -1) # e.g. [[-1.0, -1.0]]
        p = clf.decision_function(decision_function_array)        
        Z[i, j] = p[0] # confidence scores for sample (signed distance to hyperplane for each sample)
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
Example #16
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # decision_function expects a 2-D array
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
Example #17
def plot_sgd_separating_hyperplane():
    """
    =========================================
    SGD: Maximum margin separating hyperplane
    =========================================

    Plot the maximum margin separating hyperplane within a two-class
    separable dataset using a linear Support Vector Machines classifier
    trained using SGD.
    """
    print(__doc__)

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.linear_model import SGDClassifier
    from sklearn.datasets.samples_generator import make_blobs

    # we create 50 separable points
    X, Y = make_blobs(n_samples=50,
                      centers=2,
                      random_state=0,
                      cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge",
                        alpha=0.01,
                        max_iter=200,
                        fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]

    # Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # this one line would be simpler
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    plt.scatter(X[:, 0],
                X[:, 1],
                c=Y,
                cmap=plt.cm.Paired,
                edgecolor='black',
                s=20)

    plt.axis('tight')
    plt.show()
Example #18
def svm_cross_validate(X,y,category,C,penalty,sample_weights):
	
	clf_svm_1 = SGDClassifier(loss=loss, penalty=penalty, alpha=C, shuffle=True)
	clf_svm_2 = SGDClassifier(loss=loss, penalty=penalty, alpha=C, shuffle=True)
	
	#N = len(category)
	#half_data= np.floor(N/2)
	#cv_indices_1= np.repeat([False],N)
	#cv_indices_2= np.repeat([False],N)
	#cv_indices_1[0:half_data] =True
	#cv_indices_2[half_data:N] =True
	#cv_indices= np.concatenate((cv_indices_1,cv_indices_2),axis=1)
	
	cv_indices = generate_cv_indices_unbalanced(category)
	
	train_ids = cv_indices[0:N]
	test_ids = cv_indices[N:2*N]
	
	clf_svm_1.fit(X[train_ids,:], y[train_ids],sample_weight=sample_weights[train_ids])
	clf_svm_2.fit(X[test_ids,:], y[test_ids],sample_weight=sample_weights[test_ids])
	
	score = np.zeros(2)
	score[0] = clf_svm_1.score(X[test_ids,:], y[test_ids])
	score[1] = clf_svm_2.score(X[train_ids,:], y[train_ids])
	mean_score = np.mean(score)
	
	#y_1 = clf_svm_1.predict_proba(X[test_ids,:])
	#y_2 = clf_svm_2.predict_proba(X[train_ids,:])
	y_1 = clf_svm_1.decision_function(X[test_ids,:])
	y_2 = clf_svm_2.decision_function(X[train_ids,:])
	y_1 = sigmoid(y_1)
	y_2 = sigmoid(y_2)
	
	auc = np.zeros(2)
	fpr, tpr, thresholds = metrics.roc_curve(y[test_ids], y_1, pos_label=1)
	auc[0] = metrics.auc(fpr, tpr)

	fpr, tpr, thresholds = metrics.roc_curve(y[train_ids], y_2, pos_label=1)
	auc[1] = metrics.auc(fpr, tpr)	
	
	mean_auc = np.mean(auc,axis=0)
	print("Finished running standard cross validation")
	return mean_auc
Example #19
def Get10SGDClassifiers(X_train, X_test, y_train, y_test):
    sgd_classificator = SGDClassifier(random_state=42, max_iter=5, tol=-np.inf)
    sgd_classificator.fit(X_train, y_train)
    predict = sgd_classificator.predict([X_test[1]])
    array_score = sgd_classificator.decision_function([X_test[1]])
    print("każda cyfra ma swój klasyfikator")
    print("predykcja: ", predict)
    print("target: ", y_test[1])
    print("klasy: ", sgd_classificator.classes_)
    print("macierz punktów: ", array_score)
Example #20
def svm_cross_validate_category(X,y,category,C,penalty,sample_weights):
	
	clf_svm_1 = SGDClassifier(loss=loss, penalty=penalty, alpha=C, shuffle=True)
	clf_svm_2 = SGDClassifier(loss=loss, penalty=penalty, alpha=C, shuffle=True)
	
	cv_indices = generate_cv_indices(category)
	
	train_ids = cv_indices[0:N]
	test_ids = cv_indices[N:2*N]
	
	clf_svm_1.fit(X[train_ids,:], y[train_ids],sample_weight=sample_weights[train_ids])
	clf_svm_2.fit(X[test_ids,:], y[test_ids],sample_weight=sample_weights[test_ids])
	
	score = np.zeros(2)
	score[0] = clf_svm_1.score(X[test_ids,:], y[test_ids])
	score[1] = clf_svm_2.score(X[train_ids,:], y[train_ids])
	mean_score = np.mean(score)
	
#	y_1 = clf_svm_1.predict_proba(X[test_ids,:])
#	y_2 = clf_svm_2.predict_proba(X[train_ids,:])
	y_1 = clf_svm_1.decision_function(X[test_ids,:])
	y_2 = clf_svm_2.decision_function(X[train_ids,:])
	y_1 = sigmoid(y_1)
	y_2 = sigmoid(y_2)
	
	u, indices = np.unique(category, return_inverse=True)
	auc = np.zeros((2,len(u)))
	for i in range(0,len(u)):
		
		i_inds = indices == i
		
		if(np.sum(test_ids & i_inds)!=0):
			# decision_function output is 1-D, so no [:, 1] column index (that was for predict_proba)
			fpr, tpr, thresholds = metrics.roc_curve(y[test_ids & i_inds], y_1[i_inds[test_ids]], pos_label=1)
			auc[0,i] = metrics.auc(fpr, tpr)

		if(np.sum(train_ids & i_inds)!=0):
			fpr, tpr, thresholds = metrics.roc_curve(y[train_ids & i_inds], y_2[i_inds[train_ids]], pos_label=1)
			auc[1,i] = metrics.auc(fpr, tpr)

	mean_auc = np.mean(auc,axis=0)
	print("Finished running category cross-validation")
	return mean_auc
Example #21
def number_classify_ova(X_train, y_train):
    # create a stochastic-gradient-descent multiclass classifier instance
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train)
    # predict a sample
    sample = X_train[100]
    predict = sgd_clf.predict([sample])
    # inspect the sample's score for each class
    digit_scores = sgd_clf.decision_function([sample])
    print('OvA SGD classifier prediction:', predict, 'per-class scores for this sample:', digit_scores)
    return sgd_clf
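A hedged usage sketch with scikit-learn's small digits dataset (an assumption; the original presumably runs on MNIST). Under OvA, decision_function returns one score per class and predict takes the argmax:

from sklearn.datasets import load_digits

digits = load_digits()
clf = number_classify_ova(digits.data, digits.target)
print(clf.decision_function([digits.data[100]]).shape)  # (1, 10): one score per class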
Example #22
def sgd_ova(digit):
    some_digit = X[digit]
    sgd_clf = SGDClassifier(random_state = 34)
    sgd_clf.fit(X_train, y_train)
    prediction = sgd_clf.predict([some_digit])
    some_digit_scores = sgd_clf.decision_function([some_digit])
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    cvs = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
    print(cvs)
    print(some_digit_scores)
    print(prediction)
Example #23
def run():

	x_train,y_train,x_test = load_data()
	X_train,Y_train,X_test,Y_test = split_data(x_train,y_train)

	best_score_cv = 0
	best_algo = ''

	clf = SGDClassifier(loss="hinge", penalty="l2")
	clf.fit(X_train,Y_train)
	Y_pred = clf.decision_function(X_test)
	if best_score_cv<metric(Y_test,Y_pred):
		best_score_cv = metric(Y_test,Y_pred)
		best_algo = 'hinge + l2'

	for alpha in [0.0001,0.001, 0.01, 0.1]:
		clf= Lasso(alpha=alpha)
		clf.fit(X_train,Y_train)
		Y_pred = clf.decision_function(X_test)
		if best_score_cv<metric(Y_test,Y_pred):
			best_score_cv = metric(Y_test,Y_pred)
			best_algo = 'LASSO with alpha='+str(alpha)

	clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=1, random_state=0)
	clf.fit(X_train,Y_train)
	Y_pred = clf.predict_proba(X_test)
	if best_score_cv<metric(Y_test,Y_pred[:,1]):
		best_score_cv = metric(Y_test,Y_pred[:,1])
		best_algo = 'randomforest with 1000 trees'

	print 	
	print 'Thank you for running ML21 futurist meta-algorithm'
	print 
	print '> the best algorithm is : '+best_algo
	print 
	print '> the best cross-validation score is : '+str(best_score_cv)
	print 
	print 'If you want, I can also do your breakfast.'
	print
Example #24
	def sgd_classify(self):
		print "Stochastic Gradient Descent"

		clf = SGDClassifier()
		clf.fit(self.descr, self.target)
		mean = clf.score(self.test_descr, self.test_target)

		print "Mean : %3f" % mean
		print "Probability ", clf.coef_
		print "Mean of each feature per class ", clf.intercept_
		print "Confidence Score ",clf.decision_function(self.descr)
		print "Predict Probability ", clf.predict_proba(self.descr)
		print "Transform ", clf.transform(self.descr)
Example #25
    def sgd_classify(self):
        print "Stochastic Gradient Descent"

        clf = SGDClassifier()
        clf.fit(self.descr, self.target)
        mean = clf.score(self.test_descr, self.test_target)

        print "Mean : %3f" % mean
        print "Probability ", clf.coef_
        print "Mean of each feature per class ", clf.intercept_
        print "Confidence Score ", clf.decision_function(self.descr)
        print "Predict Probability ", clf.predict_proba(self.descr)
        print "Transform ", clf.transform(self.descr)
Example #26
def evaluate(X_train,
             Y_train,
             X_test,
             Y_test,
             a,
             thld,
             plot=False,
             plot_path='../results/Test_ROC.png'):
    global tol
    global loss
    global penalty
    model = SGDClassifier(loss=loss,
                          alpha=a,
                          class_weight='balanced',
                          penalty=penalty,
                          n_jobs=-1,
                          tol=tol)
    model.fit(X_train, Y_train)

    train_scores = model.decision_function(X_train)
    Y_train_pred = (train_scores > thld).astype(int)
    train_report = classification_report(Y_train, Y_train_pred, digits=3)
    print('Train report:\n{}'.format(train_report))

    test_scores = model.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(Y_test, test_scores, pos_label=1)
    Y_test_pred = (test_scores > thld).astype(int)
    test_report = classification_report(Y_test, Y_test_pred, digits=3)
    print('Test report:\n{}'.format(test_report))
    if (plot):
        plt.plot(fpr, tpr)
        plt.plot(fpr, fpr, linestyle=':', color='k')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.savefig(plot_path)
        plt.show()
    ROC_data = (fpr, tpr, thld)
    return train_report, test_report, ROC_data, model
Example #27
class MySGDClassifier(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0, random_state=42):
        self.threshold = threshold
        self.random_state = random_state
        self.classifier = SGDClassifier(random_state=random_state)

    def fit(self, X, y):
        self.classifier.fit(X, y)
        return self

    def predict(self, x):
        Y_score_values = self.classifier.decision_function(x)
        Y_values = Y_score_values > self.threshold
        return Y_values
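A usage sketch for the wrapper above (synthetic imbalanced data assumed): raising threshold above 0 demands a larger positive margin before predicting the positive class, trading recall for precision:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=42)
strict = MySGDClassifier(threshold=1.0).fit(X, y)
print(strict.predict(X).mean())  # fraction of samples predicted positive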
Example #28
def classify(x_train, y_train, x_test):
    """
    Trains logistic regression classifier on training set and then returns the probabilities
    of being the correct answer for points in training and test sets.

    Args:
        x_train: features of training set
        y_train: labels of training set
        x_test: features of test set

    Returns:
        y_train_prob: probabilities of training set points
        y_test_prob: probabilities of testing set points
        lr: classifier
    """
    # train classifier
    lr = SGDClassifier(loss='log', penalty='l2', max_iter=5, tol=None)
    lr.fit(x_train, y_train)

    # obtain signed decision scores from the classifier (not calibrated probabilities)
    y_train_prob = lr.decision_function(x_train)
    y_test_prob = lr.decision_function(x_test)
    return y_train_prob, y_test_prob, lr
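Since the classifier above uses loss='log', it also exposes predict_proba; a short sketch of the difference between margins and probabilities, on assumed synthetic data (recent scikit-learn spells the loss 'log_loss'):

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=200, random_state=0)
clf = SGDClassifier(loss='log', max_iter=5, tol=None).fit(X, y)
print(clf.decision_function(X[:2]))  # signed distances / log-odds
print(clf.predict_proba(X[:2]))      # calibrated class probabilities in [0, 1]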
Example #29
def SGDC(train_x, train_y, test_x, test_y, parameters=None):
    '''
    Creates and fits the SGDClassifier
    :param train_x: train_x
    :param train_y: train_y
    :param test_x: test_x
    :param test_y: test_y
    :return: fpr, tpr, auc_score
    '''
    clf_sgd = SGDClassifier(n_jobs=-1)
    clf_sgd.fit(train_x, train_y)
    predictions = clf_sgd.decision_function(test_x)
    fpr, tpr, _ = roc_curve(test_y, predictions, pos_label=1.0)
    score = round(roc_auc_score(test_y, predictions), 4)
    return fpr, tpr, score
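An illustrative call for SGDC above with synthetic data (the split and sizes are assumptions):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
train_x, test_x, train_y, test_y = train_test_split(X, y, random_state=0)
fpr, tpr, score = SGDC(train_x, train_y, test_x, test_y)
print('AUC:', score)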
Example #30
def GetSGDClassifier2(X_train, X_test, y_train, y_test):
    n = 36000
    some_digit_image = X_train[n]
    some_digit_target = y_train[n]

    y_train_5 = (y_train == 5)
    y_test_5 = (y_test == 5)

    sgd_classificator = SGDClassifier(random_state=42, max_iter=5, tol=-np.inf)
    sgd_classificator.fit(X_train, y_train_5)
    score = cross_val_score(sgd_classificator, X_train, y_train_5, cv=3, scoring="accuracy")
    y_train_predict = cross_val_predict(sgd_classificator, X_train, y_train_5, cv=3)

    confusion_matrix_digits = confusion_matrix(y_train_5, y_train_predict)

    precision_s = precision_score(y_train_5, y_train_predict)
    pelnosc_recall = recall_score(y_train_5, y_train_predict)
    f1 = f1_score(y_train_5, y_train_predict)

    y_scores = sgd_classificator.decision_function([some_digit_image])
    threshold_df = 200000
    y_some_digit_pred = (y_scores > threshold_df)

    y_scores_plot = cross_val_predict(sgd_classificator, X_train, y_train_5, cv=3, method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores_plot)
    # EditData.plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    # EditData.plot_precision_vs_recall(precisions, recalls)
    p = 70000
    y_train_pred_90 = (y_scores_plot > p)

    #EditData.plot_roc_curve(y_train_5, y_scores_plot)
    if False:
        print("predykcja: ", sgd_classificator.predict([some_digit_image]))
        print("target: ", some_digit_target)
        print("Score: ", score)
        print("confusion matrix: [PN, FP]: ", confusion_matrix_digits[0], ", [FN, PP]: ", confusion_matrix_digits[1])

        print("Wynik F1: ", f1)
        print("predykcja z progiem ", threshold_df, ": ", y_some_digit_pred)
        print("Precyzja z progiem", p, precision_score(y_train_5, y_train_pred_90))
        print("Pełność z progiem", p, recall_score(y_train_5, y_train_pred_90))
        print("................................................................................")
        print("ROC")
    print("........SGD............")
    print("Precyzja SGD: ", np.round(precision_s, 4))
    print("Pełność SGD: ", np.round(pelnosc_recall, 4))
    print(".......................")
    return y_train_5, y_scores_plot
Example #31
def main():
    train_x,test_x,train_y,test_y = split_data()
    print("train ",train_x.shape)
    print("test ",test_x.shape)
    # binary classification
    train_y_5 = (train_y == 5)
    test_y_5 = (test_y == 5)
    # sgd_clf(train_x,train_y_5,test_x,test_y_5)
    # cross_val(train_x,train_y_5)

    # multiclass classification
    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier()
    sgd_clf.fit(train_x,train_y)
    pred_y = sgd_clf.decision_function(test_x[50].reshape(1,-1))
    print(pred_y)
Example #32
def train_custom_one_vs_all(X_train, X_test, Y_train, topk):

    #convert matrix to row for efficient splicing
    Y_train = Y_train.tocsc()
    tag_classifiers = []
    num_training, numclasses = Y_train.shape
    num_test_examples = X_test.shape[0]

    # hold a vector mxk, containing top k prediction classes for each example, maintain m heaps for that
    num_examples = X_test.shape[0]
    num_classes = len(tag_classifiers)
    topk_class_distances = []
    for i in xrange(num_examples):
        heap = []
        topk_class_distances += [heap]

    for j in xrange(numclasses):
        # train on each class label for all the training examples
        y = numpy.ravel(Y_train.getcol(j).todense())

        clf = SGDClassifier(loss='hinge',
                            penalty='l2',
                            alpha=0.0001,
                            fit_intercept=True,
                            n_iter=10,
                            shuffle=True,
                            n_jobs=4,
                            learning_rate='optimal')

        clf.fit(X_train, y)
        print "Trained for class", j
        # get the decision for all test examples
        decision = clf.decision_function(X_test)
        # for each test example add its decision value to the heap of top k decision values
        for i in xrange(num_test_examples):
            h = topk_class_distances[i]
            if len(h) < topk: heapq.heappush(h, (decision[i], j))
            else: heapq.heappushpop(h, (decision[i], j))
        print "Predicted for class", j

    #clean the decision values and store the class labels
    class_label_indices = []
    for i in xrange(num_examples):
        topk_labels = [label for dist, label in topk_class_distances[i]]
        class_label_indices += [topk_labels]

    return class_label_indices
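A self-contained sketch of the top-k heap pattern used above, with made-up scores: heappushpop keeps only the k largest (score, label) pairs seen so far:

import heapq

scores = [(0.2, 'a'), (0.9, 'b'), (0.5, 'c'), (0.7, 'd')]
topk, k = [], 2
for item in scores:
    if len(topk) < k:
        heapq.heappush(topk, item)
    else:
        heapq.heappushpop(topk, item)  # evict the smallest if item beats it
print(sorted(topk, reverse=True))  # [(0.9, 'b'), (0.7, 'd')]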
Example #33
def train(input_filename, num_train_examples, num_test_examples, block_size):
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(
        input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))
    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}
    learner = SGDClassifier(loss="hinge",
                            penalty="l2",
                            learning_rate="invscaling",
                            alpha=0.0001,
                            average=10**4,
                            eta0=1.0,
                            class_weight=class_weights)

    num_passes = 3
    aucs = []

    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename,
                                 header=None,
                                 skiprows=i,
                                 nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            y_train = numpy.array(df.values[:, 0], numpy.int)
            del df

            learner.partial_fit(X_train, y_train, classes=numpy.array([0, 1]))
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
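A minimal sketch of the partial_fit streaming used above, on synthetic batches (illustrative only): the classes argument must be supplied so the model knows the full label set before it has seen every batch:

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
clf = SGDClassifier(loss='hinge')
for batch in range(5):
    Xb = rng.randn(100, 10)
    yb = (Xb[:, 0] > 0).astype(int)  # toy labels: sign of the first feature
    clf.partial_fit(Xb, yb, classes=np.array([0, 1]))
print(clf.score(Xb, yb))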
Example #34
def bursi_get_extremes(num=200):
    po, ne = list(gspan.gspan_to_eden("bursi.pos.gspan")), list(
        gspan.gspan_to_eden("bursi.neg.gspan"))
    X, y = graphs_to_Xy(po, ne)
    esti = SGDClassifier(average=True,
                         class_weight='balanced',
                         shuffle=True,
                         n_jobs=4,
                         loss='log')
    esti.fit(X, y)
    res = [(score, idd)
           for idd, score in enumerate(esti.decision_function(X))]  # list
    res.sort()
    graphs = po + ne
    # returns pos/neg
    return [graphs[idd] for (score, idd) in res[0 - num:]
            ], [graphs[idd] for (score, idd) in res[:num]]
Example #35
def test_not_robust_classif(loss, weighting, multi_class):
    clf = RobustWeightedClassifier(
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        multi_class=multi_class,
        random_state=rng,
    )
    clf_not_rob = SGDClassifier(loss=loss, random_state=rng)
    clf.fit(X_c, y_c)
    clf_not_rob.fit(X_c, y_c)
    pred1 = clf.base_estimator_.decision_function(X_c)
    pred2 = clf_not_rob.decision_function(X_c)

    assert np.mean((pred1 > 0) == (pred2 > 0)) > 0.8
Example #36
class LinearClassifier(object):
    def __init__(self,
                 decompose_func=None,
                 preprocessor=None,
                 nbits=15,
                 seed=1):
        self.decompose_func = decompose_func
        self.nbits = nbits
        feature_size, bitmask = set_feature_size(nbits=nbits)
        self.feature_size = feature_size
        self.bitmask = bitmask
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        self.classifier = SGDClassifier(penalty='elasticnet')

    def fit(self, graphs, targets):
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        self.classifier.fit(data_mtx, targets)
        return self

    def decision_function(self, graphs):
        # return probability associated to largest target type
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        preds = self.classifier.decision_function(data_mtx)
        return preds

    def predict(self, graphs):
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        preds = self.classifier.predict(data_mtx)
        return preds
Example #37
def train(input_filename, num_train_examples, num_test_examples, block_size):
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))
    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}
    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.0001,
        average=10 ** 4,
        eta0=1.0,
        class_weight=class_weights,
    )

    num_passes = 3
    aucs = []

    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename, header=None, skiprows=i, nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            y_train = numpy.array(df.values[:, 0], numpy.int)
            del df

            learner.partial_fit(X_train, y_train, classes=numpy.array([0, 1]))
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
Example #38
def train_custom_one_vs_all(X_train,X_test,Y_train,topk):

    #convert matrix to row for efficient splicing
    Y_train = Y_train.tocsc()
    tag_classifiers = []
    num_training,numclasses = Y_train.shape
    num_test_examples = X_test.shape[0]


    # hold a vector mxk, containing top k prediction classes for each example, maintain m heaps for that
    num_examples = X_test.shape[0]
    num_classes = len(tag_classifiers)
    topk_class_distances = []
    for i in xrange(num_examples):
        heap = []
        topk_class_distances += [heap]
    

    for j in xrange(numclasses):
        # train on each class label for all the training examples
        y = numpy.ravel(Y_train.getcol(j).todense());

        clf =  SGDClassifier(loss='hinge',penalty='l2',alpha=0.0001,fit_intercept=True,n_iter = 10,shuffle=True,n_jobs=4,learning_rate='optimal')
    
        clf.fit(X_train,y);
        print "Trained for class",j
        # get the decision for all test examples
        decision = clf.decision_function(X_test)
        # for each test example add its decision value to the heap of top k decision values
        for i in xrange(num_test_examples):
            h = topk_class_distances[i]
            if len(h) < topk: heapq.heappush(h,(decision[i],j))
            else:             heapq.heappushpop(h,(decision[i],j))
        print "Predicted for class",j

    #clean the decision values and store the class labels
    class_label_indices = []
    for i in xrange(num_examples):
        topk_labels = [label for dist,label in topk_class_distances[i]]
        class_label_indices += [topk_labels]

    return class_label_indices
Example #39
def multi_class(X, y, output_loc, number=16000, rand_state=42, cv=3):
    """ Model fitted, does a OvA, hence shown by desc_func output """

    sgd_clf = SGDClassifier(random_state=rand_state)
    sgd_clf.fit(X, y)

    # Can train on the OvO strategy
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=rand_state))

    ovo_clf.fit(X, y)

    forest_clf = RandomForestClassifier(random_state=rand_state)
    forest_clf.fit(X, y)

    # Index of the highest score is the given class
    scores = sgd_clf.decision_function(X[number, :].reshape(1, -1))
    """
    assert int(sgd_clf.predict(X[number,:].reshape(1, -1))[0,]) == \
        int(np.argmax(scores))

    assert int(ovo_clf.predict(X[number,:].reshape(1, -1))[0,]) == \
        int(sgd_clf.predict(X[number,:].reshape(1, -1))[0,])

    assert int(sgd_clf.predict(X[number,:].reshape(1, -1))[0,]) == \
        int(forest_clf.predict(X[number,:].reshape(1, -1))[0,])

    assert int(np.argmax(
        forest_clf.predict_proba(
            X[number,:].reshape(1, -1))[0,]).flatten()) == \
        int(forest_clf.predict(X[number,:].reshape(1, -1))[0,])
    """

    assert len(ovo_clf.estimators_) == 45

    sgd_score = cross_val_score(sgd_clf, X, y, cv=cv, scoring="accuracy")
    ovo_score = cross_val_score(ovo_clf, X, y, cv=cv, scoring="accuracy")
    rf_score = cross_val_score(forest_clf, X, y, cv=cv, scoring="accuracy")

    plot_save_loc = os.path.join(os.getcwd(), output_loc, "conf_mat_multi.jpg")
    conf_mat_normal = conf_mat(sgd_clf, X, y, plot_save_loc, None, cv=3)

    return np.mean(sgd_score), np.mean(rf_score), np.mean(ovo_score)
Example #40
class SGDClassifierImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
Example #41
class SGDC(object):
    def __init__(self, texts, classes, nlpdict):
        # TODO: add list of smileys to texts/classes
        self.s = SGDClassifier(loss="hinge", penalty="l1", shuffle=True, class_weight="auto")
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        vectors = self.dictionary.feature_vectors(texts)
        self.s.fit(vectors, classes)

    def classify(self, texts):
        vectors = self.dictionary.feature_vectors(texts)
        predictions = self.s.decision_function(vectors)
        predictions = predictions / 20 + 0.5
        predictions[predictions > 1] = 1
        predictions[predictions < 0] = 0
        return predictions
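The / 20 + 0.5 mapping above is an ad-hoc linear squashing of hinge margins into [0, 1]; a logistic squashing is a smoother alternative (illustrative sketch, not from the source):

import numpy as np

def squash(margins):
    # maps any real-valued margin into (0, 1) without hard clipping
    return 1.0 / (1.0 + np.exp(-np.asarray(margins)))

print(squash([-30.0, 0.0, 30.0]))  # ~[0.0, 0.5, 1.0]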
Example #42
	axes[i].set_ylim(y_min, y_max)
	pylab.sca(axes[i])
	plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap= plt.cm.prism)
	ys = (-clf.intercept_[i] - xs * clf.coef_[i,0])/ clf.coef_[i,1]
	plt.plot(xs, ys, hold=True)

# Show Triple Binary Classifier
plt.show()

# Predicts the Species of Flower with Sepal Width 4.7 and Sepal Length 3.1
# Selects the Class in which it is more confident (Boundary line whose distance
# to instance is longer)
print clf.predict(scaler.transform([[4.7, 3.1]]))

# Prints distance of all three boundary lines from the Point(4.7, 3.1)
print clf.decision_function(scaler.transform([[4.7, 3.1]]))

# Measure effectiveness of results (82% accuracy here on the train dataset)
from sklearn import metrics
y_train_pred = clf.predict(x_train)
print metrics.accuracy_score(y_train, y_train_pred)

# (68% accuracy on test data)
y_test_pred = clf.predict(x_test)
print metrics.accuracy_score(y_test, y_test_pred)

# Print Precision, F1-Score, Recall, Support
print metrics.classification_report(y_test, y_test_pred, target_names= iris.target_names)

# Print Confusion Matrix
print metrics.confusion_matrix(y_test, y_test_pred)
'''
lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(trained_model.vecs, trained_model.emos)


'''
emotion categories predicted for the test vectors
'''
predicted_op = lr.predict(test_model.vecs)


'''
decision_function provides the value by which the hyperplane is
separated which is used in ROC curves
'''
predicted_score = lr.decision_function(test_model.vecs)


def plot_confusion_matrix(cm, cmap=plt.cm.Greens):
    fig, ax = plt.subplots(figsize=(9,9))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    cb = plt.colorbar()
    cb.set_label("Predicted values")
    tick_marks = np.arange(len(emotion_categories))
    plt.xticks(tick_marks, emotion_categories.keys(), rotation=45)
    plt.yticks(tick_marks, emotion_categories.keys())
    width, height = np.shape(cm)
    for x in xrange(width):
        for y in xrange(height):
            ax.annotate(str(cm[x][y]), xy=(y, x), horizontalalignment='center', verticalalignment='center')
    ax.xaxis.tick_top()
Example #44
File: model.py Project: eamonnmag/magpie
class LearningModel(object):
    """
    Represents the model that can be trained and later used to predict
    keywords for unknown data
    """
    def __init__(self, global_index, word2vec_model):
        self.scaler = StandardScaler()
        self.classifier = SGDClassifier(n_jobs=-1)  # try loss log (logistic reg)
        self.global_index = global_index
        self.word2vec = word2vec_model

    def maybe_fit_and_scale(self, matrix):
        """
        If the scaler is not initialized, fit() is performed on the given data;
        an exception is thrown if the batch is not big enough. The input matrix
        is then scaled and returned.
        :param matrix: matrix to be transformed

        :return: scaled matrix
        """
        if not hasattr(self.scaler, 'n_samples_seen_'):
            if len(matrix) < 1000:
                raise ValueError("Please user bigger batch size. "
                                 "The feature matrix is too small "
                                 "to fit the scaler.")
            else:
                self.scaler.fit(matrix)
        return self.scaler.transform(matrix)

    def partial_fit_classifier(self, input_matrix, output_vector):
        """
        Fit the classifier on X, y matrices. Can be used for online training.
        :param input_matrix: feature matrix
        :param output_vector: vector of the same length as input_matrix

        :return: None
        """
        classes = np.array([0, 1], dtype=np.bool_)
        # TODO Maybe initialize the classifier with this for balancing classes
        # weights = compute_class_weight('balanced', classes, output_vector)

        self.classifier = self.classifier.partial_fit(
            input_matrix,
            output_vector,
            classes=classes,
        )

    def fit_classifier(self, input_matrix, output_vector):
        """
        Fit the classifier on X, y matrices. Previous fit is discarded.
        :param input_matrix: feature matrix
        :param output_vector: vector of the same length as input_matrix

        :return: None
        """
        self.classifier = self.classifier.fit(input_matrix, output_vector)

    def scale_and_predict(self, input_matrix):
        """
        Predict output for given samples
        :param input_matrix: a feature matrix

        :return: matrix with predictions for each sample
        """
        scaled_matrix = self.scaler.transform(input_matrix)
        return self.classifier.predict(scaled_matrix)

    def scale_and_predict_confidence(self, input_matrix):
        """
        Predict confidence values for given samples
        :param input_matrix: a feature matrix

        :return: matrix with confidence values for each sample
        """
        scaled_matrix = self.scaler.transform(input_matrix)
        return self.classifier.decision_function(scaled_matrix)

    def get_global_index(self):
        """ Get the GlobalFrequencyIndex field. """
        return self.global_index
Example #45
    def test_sgd_proba(self):
        """Check SGD.predict_proba"""

        # Hinge loss does not allow for conditional prob estimate.
        # We cannot use the factory here, because it defines predict_proba
        # anyway.
        clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y)
        assert_false(hasattr(clf, "predict_proba"))
        assert_false(hasattr(clf, "predict_log_proba"))

        # log and modified_huber losses can output probability estimates
        # binary case
        for loss in ["log", "modified_huber"]:
            clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10)
            clf.fit(X, Y)
            p = clf.predict_proba([3, 2])
            assert_true(p[0, 1] > 0.5)
            p = clf.predict_proba([-1, -1])
            assert_true(p[0, 1] < 0.5)

            p = clf.predict_log_proba([3, 2])
            assert_true(p[0, 1] > p[0, 0])
            p = clf.predict_log_proba([-1, -1])
            assert_true(p[0, 1] < p[0, 0])

        # log loss multiclass probability estimates
        clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2)

        d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]])
        p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]])
        assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
        assert_almost_equal(p[0].sum(), 1)
        assert_true(np.all(p[0] >= 0))

        p = clf.predict_proba([-1, -1])
        d = clf.decision_function([-1, -1])
        assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))

        l = clf.predict_log_proba([3, 2])
        p = clf.predict_proba([3, 2])
        assert_array_almost_equal(np.log(p), l)

        l = clf.predict_log_proba([-1, -1])
        p = clf.predict_proba([-1, -1])
        assert_array_almost_equal(np.log(p), l)

        # Modified Huber multiclass probability estimates; requires a separate
        # test because the hard zero/one probabilities may destroy the
        # ordering present in decision_function output.
        clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10)
        clf.fit(X2, Y2)
        d = clf.decision_function([3, 2])
        p = clf.predict_proba([3, 2])
        if not isinstance(self, SparseSGDClassifierTestCase):
            assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
        else:  # XXX the sparse test gets a different X2 (?)
            assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))

        # the following sample produces decision_function values < -1,
        # which would cause naive normalization to fail (see comment
        # in SGDClassifier.predict_proba)
        x = X.mean(axis=0)
        d = clf.decision_function(x)
        if np.all(d < -1):  # XXX not true in sparse test case (why?)
            p = clf.predict_proba(x)
            assert_array_almost_equal(p[0], [1 / 3.0] * 3)
Example #46
class EdenEstimator(BaseEstimator, ClassifierMixin):
    """Build an estimator for graphs."""

    def __init__(self, r=3, d=8, nbits=16, discrete=True,
                 balance=False, subsample_size=200, ratio=2,
                 normalization=False, inner_normalization=False,
                 penalty='elasticnet'):
        """construct."""
        self.set_params(r, d, nbits, discrete, balance, subsample_size,
                        ratio, normalization, inner_normalization,
                        penalty)

    def set_params(self, r=3, d=8, nbits=16, discrete=True,
                   balance=False, subsample_size=200, ratio=2,
                   normalization=False, inner_normalization=False,
                   penalty='elasticnet'):
        """setter."""
        self.r = r
        self.d = d
        self.nbits = nbits
        self.normalization = normalization
        self.inner_normalization = inner_normalization
        self.discrete = discrete
        self.balance = balance
        self.subsample_size = subsample_size
        self.ratio = ratio
        if penalty == 'perceptron':
            self.model = Perceptron(max_iter=5, tol=None)
        else:
            self.model = SGDClassifier(
                average=True, class_weight='balanced', shuffle=True,
                penalty=penalty, max_iter=5, tol=None)
        self.vectorizer = Vectorizer(
            r=self.r, d=self.d,
            normalization=self.normalization,
            inner_normalization=self.inner_normalization,
            discrete=self.discrete,
            nbits=self.nbits)
        return self

    def transform(self, graphs):
        """transform."""
        x = self.vectorizer.transform(graphs)
        return x

    @timeit
    def kernel_matrix(self, graphs):
        """kernel_matrix."""
        x = self.transform(graphs)
        return metrics.pairwise.pairwise_kernels(x, metric='linear')

    def fit(self, graphs, targets, randomize=True):
        """fit."""
        if self.balance:
            if randomize:
                bal_graphs, bal_targets = balance(
                    graphs, targets, None, ratio=self.ratio)
            else:
                samp_graphs, samp_targets = subsample(
                    graphs, targets, subsample_size=self.subsample_size)
                x = self.transform(samp_graphs)
                self.model.fit(x, samp_targets)
                bal_graphs, bal_targets = balance(
                    graphs, targets, self, ratio=self.ratio)
            size = len(bal_targets)
            logger.debug('Dataset size=%d' % (size))
            x = self.transform(bal_graphs)
            self.model = self.model.fit(x, bal_targets)
        else:
            x = self.transform(graphs)
            self.model = self.model.fit(x, targets)
        return self

    def predict(self, graphs):
        """predict."""
        x = self.transform(graphs)
        preds = self.model.predict(x)
        return preds

    def decision_function(self, graphs):
        """decision_function."""
        x = self.transform(graphs)
        preds = self.model.decision_function(x)
        return preds

    @timeit
    def cross_val_score(self, graphs, targets,
                        scoring='roc_auc', cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_score(
            self.model, x, targets, cv=cv, scoring=scoring)
        return scores

    @timeit
    def cross_val_predict(self, graphs, targets, cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores

    @timeit
    def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids

    @timeit
    def model_selection(self, graphs, targets,
                        n_iter=30, subsample_size=None):
        """model_selection_randomized."""
        param_distr = {"r": list(range(1, 5)), "d": list(range(0, 10))}
        if subsample_size:
            graphs, targets = subsample(
                graphs, targets, subsample_size=subsample_size)

        pool = mp.Pool()
        scores = pool.map(_eval, [(graphs, targets, param_distr)] * n_iter)
        pool.close()
        pool.join()

        best_params = max(scores)[1]
        logger.debug("Best parameters:\n%s" % (best_params))
        self = EdenEstimator(**best_params)
        return self

    @timeit
    def learning_curve(self, graphs, targets,
                       cv=5, n_steps=10, start_fraction=0.1):
        """learning_curve."""
        graphs, targets = paired_shuffle(graphs, targets)
        x = self.transform(graphs)
        train_sizes = np.linspace(start_fraction, 1.0, n_steps)
        scoring = 'roc_auc'
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, x, targets,
            cv=cv, train_sizes=train_sizes,
            scoring=scoring)
        return train_sizes, train_scores, test_scores

    @timeit
    def bias_variance_decomposition(self, graphs, targets,
                                    cv=5, n_bootstraps=10):
        """bias_variance_decomposition."""
        x = self.transform(graphs)
        score_list = []
        for i in range(n_bootstraps):
            scores = cross_val_score(
                self.model, x, targets, cv=cv)
            score_list.append(scores)
        score_list = np.array(score_list)
        mean_scores = np.mean(score_list, axis=1)
        std_scores = np.std(score_list, axis=1)
        return mean_scores, std_scores
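A hedged usage sketch for the estimator above (not from the original source); `graphs` and `targets` are assumed names for an iterable of networkx graphs and their binary labels:

# hypothetical usage; assumes graphs/targets are available in scope
est = EdenEstimator(r=3, d=8, nbits=16)
est = est.fit(graphs, targets)
scores = est.decision_function(graphs)
cv_scores = est.cross_val_score(graphs, targets, scoring='roc_auc', cv=5)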
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             f1_score, precision_recall_curve)

# sgd_clf, X_train, y_train_5 and some_digit are assumed to be defined earlier
# print(cross_val_score(sgd_clf, X_train, y_train_5,
# 	cv=3,
# 	scoring="accuracy"))

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

# print(y_train_pred)
# print(y_train_5)

# print(confusion_matrix(y_train_5, y_train_pred))
# print("precision:\n",precision_score(y_train_5, y_train_pred))
# print("recall:\n",recall_score(y_train_5, y_train_pred))

# print("f1:\n", f1_score(y_train_5, y_train_pred))


y_scores = sgd_clf.decision_function([some_digit])
# print(y_scores)

threshold = 0
y_some_digit_pred = (y_scores > threshold)
# print(y_some_digit_pred)

threshold = 200000
y_some_digit_pred = (y_scores > threshold)
# print(y_some_digit_pred)


y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
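With the precision/recall arrays computed above, a common next step is to pick the lowest threshold that reaches a target precision. A minimal sketch (not in the original), reusing the `precisions`, `thresholds` and `y_scores` arrays from the lines above; the 90% target is illustrative:

import numpy as np

# precisions has one more entry than thresholds; argmax returns the first True index
idx = np.argmax(precisions >= 0.90)
threshold_90_precision = thresholds[min(idx, len(thresholds) - 1)]
y_train_pred_90 = (y_scores >= threshold_90_precision)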
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_blobs

# we create 50 separable points
X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)

# fit the model
clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
clf.fit(X, Y)

# plot the line, the points, and the nearest vectors to the plane
xx = np.linspace(-1, 5, 10)
yy = np.linspace(-1, 5, 10)

X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for (i, j), val in np.ndenumerate(X1):
    x1 = val
    x2 = X2[i, j]
    p = clf.decision_function([[x1, x2]])  # decision_function expects a 2D array
    Z[i, j] = p[0]
levels = [-1.0, 0.0, 1.0]
linestyles = ['dashed', 'solid', 'dashed']
colors = 'k'
plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

plt.axis('tight')
plt.show()
from sklearn.linear_model import SGDClassifier
enetloglike = SGDClassifier(loss="log", penalty="elasticnet",
                            alpha=0.0001, l1_ratio=0.15, class_weight='balanced')
enethinge = SGDClassifier(loss="hinge", penalty="elasticnet",
                          alpha=0.0001, l1_ratio=0.15, class_weight='balanced')

enetloglike.fit(X, y)
enethinge.fit(X, y)

print(np.corrcoef(enetloglike.coef_, enethinge.coef_))
# The weights vectors are highly correlated

print(np.corrcoef(enetloglike.decision_function(X), enethinge.decision_function(X)))
# The decision function are highly correlated

plt.plot(enetloglike.decision_function(X), enethinge.decision_function(X), "o")
'''
## Exercise

Compare the predictions of the elastic-net logistic regression (LR) and the elastic-net hinge classifier.

- Compute the correlation between the pairs of weight vectors.

- Compare the predictions of the two classifiers using their decision functions:

    * Compute the correlation of the decision functions.
    * Plot the pairwise decision functions of the classifiers.
'''
Example #50
0
File: pcg.py Project: invinciblejha/kaggle
def classify(X, y):   
    print 'classify(X=%s,Y=%s)' % (X.shape, y.shape)
   
    # Normalize
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    if False:
        print '    X:', X.shape, X[0,:]
        print 'means:', means.shape, means
        print ' stds:', stds.shape, stds
    
    for i in range(X.shape[1]):
        X[:,i] = X[:,i] - means[i]
        if abs(stds[i]) > 1e-4:
            X[:,i] = X[:,i]/stds[i]
        
    if False:
        means = X.mean(axis=0)
        stds = X.std(axis=0)
        print 'After normalization'
        print '    X:', X.shape, X[0,:]
        print 'means:', means.shape, means
        print ' stds:', stds.shape, stds
   
    for k in [1,5,20]:
        for i in range(5):
            classify_nn(X,y,k)
        common.SUBHEADING()
    if False:    
        for k in range(1,200):
            for i in range(10):
                classify_nn(X,y,k)
            common.SUBHEADING()    
        exit()
    
    if False:  # disabled block: references stale names (Xa, ya, pl) from an earlier version
        X = Xa.tolist()
        y = ya.tolist()
        print 'X: %dx%d' %(len(X),len(X[0]))
        print 'y: %d' %(len(y))
        
        
        if False:
            X2 = []
            y2 = []

            for i in range(len(X)):
                if any(X[i]):
                    print 'X[%d]:%s' % (i, X[i])
                    print 'y[%d]:%s' % (i, y[i])
                    X2.append(X[i])
                    y2.append(y[i])
            X = X2
            y = y2
            
        # fit the model
        clf = SGDClassifier(loss="hinge", alpha = 0.01, n_iter=50) #, fit_intercept=True)
        clf.fit(X, Y)

        # plot the line, the points, and the nearest vectors to the plane
        xx = np.linspace(-5, 5, 10)
        yy = np.linspace(-5, 5, 10)
        X1, X2 = np.meshgrid(xx, yy)
        Z = np.empty(X1.shape)
        for (i,j), val in np.ndenumerate(X1):
            x1 = val
            x2 = X2[i,j]
            p = clf.decision_function([[x1, x2]])  # decision_function expects a 2D array
            Z[i,j] = p[0]
        levels = [-1.0, 0.0, 1.0]
        linestyles = ['dashed','solid', 'dashed']
        colors = 'k'
        pl.set_cmap(pl.cm.Paired)
        pl.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
        pl.scatter(np.asarray(X)[:, 0], np.asarray(X)[:, 1], c=y)
Example #51
0
class ExperimentalOneClassEstimator:
    '''
    there might be a bug connected to nx.digraph..
    '''

    def __init__(self, nu=.5, cv=2, n_jobs=-1, move_bias_calibrate=True, classifier=SGDClassifier(loss='log')):
        '''
        Parameters
        ----------
        nu: fraction of graphs that will be placed in the negative class (0..1)
        cv: number of cross-validation folds
        n_jobs: number of jobs for fitting
        move_bias_calibrate: after moving the bias we can recalibrate
        classifier: classifier object
        Returns
        -------
        '''
        self.status = 'new'
        self.nu = nu
        self.cv = cv
        self.n_jobs = n_jobs
        self.move_bias_recalibrate = move_bias_calibrate
        self.classifier = classifier
        self.inverse_prediction = False

        self.intercept_ = .5  # PROJECT: PRETEND TO BE UNCALIBRATED TO TRICK EDEN

    # trick eden into thinking this is a normal estimator
    def decision_function(self, vector):  # PROJECT: PRETEND TO BE UNCALIBRATED TO TRICK EDEN
        return self.superesti.decision_function(vector)

    def fit(self, data_matrix, random_state=None):

        if random_state is not None:
            random.seed(random_state)

        # use eden to fit the base estimator
        self.estimator = self.fit_estimator(data_matrix, n_jobs=self.n_jobs, cv=self.cv, random_state=random_state)

        # move bias to obtain oneclassestimator
        self.cal_estimator = self.move_bias(data_matrix, estimator=self.estimator, nu=self.nu, cv=self.cv)

        self.status = 'trained'
        return self

    '''
    disabled for now.. since the discsampler is not expected to work
    def fit_2(self, pos_iterator, neg_iterator, vectorizer=None, cv=2, n_jobs=-1):
        """
        This is used in the discsampler; I am not sure why I am not using eden directly.
        I will fix this when I look into the discsampler next time.
        :param pos_iterator:
        :param neg_iterator:
        :param vectorizer:
        :param cv:
        :param n_jobs:
        :return:
        """
        self.vectorizer=vectorizer
        data_matrix = vectorizer.fit_transform(pos_iterator)
        negative_data_matrix = vectorizer.transform(neg_iterator)
        estimator = eden_fit_estimator(SGDClassifier(loss='log'),
                                       positive_data_matrix=data_matrix,
                                       negative_data_matrix=negative_data_matrix,
                                       cv=cv,
                                       n_jobs=n_jobs,
                                       n_iter_search=10)
        # esti= CalibratedClassifierCV(estimator,cv=cv,method='sigmoid')
        # esti.fit( vstack[ X,Y], numpy.asarray([1]*X.shape[0] + [0]*Y.shape[0]))
        return estimator
    '''

    def fit_estimator(self, data_matrix, n_jobs=-1, cv=2, random_state=42):
        '''
        create self.estimator
        by negating the data_matrix to get a negative set
        and then using eden's fit_estimator
        '''
        # create negative set:
        data_matrix_neg = data_matrix.multiply(-1)
        # the loss is expected to be log; if not, the calibration step should compensate
        return eden_fit_estimator(self.classifier, positive_data_matrix=data_matrix,
                                  negative_data_matrix=data_matrix_neg,
                                  cv=cv,
                                  n_jobs=n_jobs,
                                  n_iter_search=10,
                                  random_state=random_state)

    def move_bias(self, data_matrix, estimator=None, nu=.5, cv=2):
        '''
            move the bias until a fraction nu of data_matrix falls in the negative class,
            then refit a log-loss model on the thresholded labels to recalibrate around the pivot
        '''
        #  move bias
        # l = [(estimator.decision_function(g)[0], g) for g in data_matrix]
        # l.sort(key=lambda x: x[0])
        # element = int(len(l) * nu)
        # estimator.intercept_ -= l[element][0]

        scores = [estimator.decision_function(sparse_vector)[0]
                  for sparse_vector in data_matrix]
        scores_sorted = sorted(scores)
        pivot = scores_sorted[int(len(scores_sorted) * self.nu)]
        estimator.intercept_ -= pivot

        # calibrate
        if self.move_bias_recalibrate:
            # data_matrix_binary = vstack([a[1] for a in l])
            # data_y = numpy.asarray([0] * element + [1] * (len(l) - element))
            data_y = numpy.asarray([1 if score >= pivot else -1 for score in scores])
            self.superesti = SGDClassifier(loss='log')
            self.superesti.fit(data_matrix, data_y)
            # estimator = CalibratedClassifierCV(estimator, cv=cv, method='sigmoid')
            # estimator = CalibratedClassifierCV(self.testimator, cv=cv, method='sigmoid')
            # estimator.fit(data_matrix, data_y)
        else:
            self.superesti = estimator
        return self.superesti

    def predict_single(self, vectorized_graph):

        return self.superesti.decision_function(vectorized_graph)[0]

    # probably broken... use predict_single instead
    def predict(self, things):
        # return self.predict_single(things)
        # return numpy.array( [ 1 if self.predict_single(thing)>.5 else 0 for thing in things] )
        return self.superesti.predict(things)
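A hedged usage sketch (not part of the original project); `data_matrix` is an assumed name for the sparse matrix an eden Vectorizer would produce from the positive graphs:

# hypothetical usage of the one-class estimator above
oce = ExperimentalOneClassEstimator(nu=.3)
oce.fit(data_matrix, random_state=42)
print(oce.predict_single(data_matrix[0]))  # signed score for a single instance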
from sklearn.linear_model import SGDClassifier

X = [[0., 0.], [1., 1.]]
y = [0, 1]

clf = SGDClassifier(loss="hinge", penalty="l2")

# Model fitting
print("Fitting:", clf.fit(X, y))

# Model to be used to predict new values
print("Prediction:", clf.predict([[2., 2.]]))

# Model parameters
print("Model parameter:", clf.coef_)

# Model intercept (aka offset or bias)
print("Model Intercept:", clf.intercept_)

# Signed distance to the hyperplane
print("Hyperplane distance:", clf.decision_function([[2., 2.]]))

# Logistic loss, so class probabilities are available
clf = SGDClassifier(loss="log").fit(X, y)

print("Classifier with LR:", clf.predict_proba([[1., 1.]]))

print(clf)




	def run(self, nFold=3, n_iter=10, verbose=1, loss='modified_huber', penalty='l2', shuffle=True):
		"""
			CV: -1 => total model (no cv)
			CV: nFold => mean metric over cv
		"""
		self.__database.createGOIDView(self.__goidtable, double=["AUROC", "AUPR", "Fmax"], drop=True)
		self.__database.createProteinView(self.__proteintable, \
						double=["ProteinID", "Label", "Score"], drop=True)
		
		# Get labels
		test = 0
		pp = permutation(self.__numproteins)
		resultid = 0
		for goid in self.__goid:
			print "____________ GOID= %d ____________" % goid
			# Get label for GOID
			goidindex = where(self.__goid==goid)
			goidindex = int(goidindex[0])
			annotations = self.selectAnnotatedProteinsMousefunc(goidindex)

			print "0s=", len([x for x in annotations if x == 0])
			print "1s=", len([x for x in annotations if x == 1])
			print "-1s=", len([x for x in annotations if x == -1])

			annotation = []
			for value in annotations:
				annotation.append(value)

			annotation = asarray(annotation).astype(float64)
			annotation = annotation.ravel()

			model = SGDClassifier(loss=loss, class_weight='auto', penalty=penalty, \
						n_iter=n_iter, shuffle=shuffle, verbose=verbose)
			model.fit(self.__network, annotation)
			scores = model.decision_function(self.__network)
			scores = self.convertScore(scores)
			
			per = Performance(annotations, scores)
			roc = per.AUROCGillis()
                        print "AUROC= ", roc
                        pr = per.AUPRGillis()
                        print "AUPR= ", pr
			fmax = per.Fmax()
                        print "Fmax= ", fmax

			self.__database.insertProteinView(self.__proteintable, resultid, goid[0], -1, \
						self.__proteins, annotations, scores)
			self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], -1, [roc, pr, fmax])
			resultid += 1

			del per

			labelIx = range(self.__numproteins)
			offset = 0
			fold = 0
			meanroc = []
			meanpr = []
			meanfmax = []

			while fold < nFold:
				print "____________ Fold= %d ____________" % fold
				lastelem = int(min(self.__numproteins, offset + floor(self.__numproteins/nFold)))
				ix = []
				for index in pp[offset:lastelem]:
					ix.append(labelIx[index])
				
				offset = lastelem
	
				labeltmp = []
				for value in annotations:
					labeltmp.append(float(value))
	
				for index in ix:
					labeltmp[index] = 0

				print "0s=", len([x for x in labeltmp if x == 0])
				print "1s=", len([x for x in labeltmp if x == 1])
				print "-1s=", len([x for x in labeltmp if x == -1])

				model = SGDClassifier(loss=loss, class_weight='auto', penalty=penalty, \
							n_iter=n_iter, shuffle=shuffle, verbose=verbose)
				model.fit(self.__network, labeltmp)
				scores = model.decision_function(self.__network)
				scores = self.convertScore(scores)

				score = []
				annotation = []
				proteins = []
				for index in ix:
					score.append(float(scores[index]))
					annotation.append(annotations[index])
					proteins.append(self.__proteins[index])

				per = Performance(annotation, score)
				roc = per.AUROCGillis()
				print "AUROC= ", roc
				meanroc.append(roc)
				pr = per.AUPRGillis()
				print "AUPR= ", pr
				meanpr.append(pr)
				fmax = per.Fmax()
				print "Fmax= ", fmax
				meanfmax.append(fmax)

				self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], fold,\
									[roc, pr, fmax])
				self.__database.insertProteinView(self.__proteintable, resultid, goid[0],\
								fold, proteins, annotation, score)

				del proteins
				del annotation
				del score
				del per
				fold += 1
				resultid += 1

			roc_mean = sum(meanroc) / float(len(meanroc))
			print "Mean AUROC= ", roc_mean
			pr_mean = sum(meanpr) / float(len(meanpr))
			print "Mean AUPR= ", pr_mean
			fmax_mean = sum(meanfmax) / float(len(meanfmax))
			print "Mean Fmax= ", fmax_mean
			
			self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], nFold, \
								[roc_mean, pr_mean, fmax_mean])
			resultid += 1

			test += 1
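The manual permutation and offset bookkeeping above can be written more compactly with scikit-learn's KFold. A sketch under the assumption that masking held-out labels to 0 (as the loop above does) is the intended protocol; `network` and `annotations` are stand-ins for the class attributes used above:

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold

kf = KFold(n_splits=3, shuffle=True)
for fold, (train_ix, test_ix) in enumerate(kf.split(network)):
    labeltmp = np.asarray(annotations, dtype=float).copy()
    labeltmp[test_ix] = 0.0  # hide the held-out annotations, as in the loop above
    model = SGDClassifier(loss='modified_huber', penalty='l2', shuffle=True)
    model.fit(network, labeltmp)
    fold_scores = model.decision_function(network)[test_ix]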
Example #54
0
    FN = open('fn.txt','wb')
    for tr_doc,te_doc in kf:
        train_index = doc_to_sen(tr_doc,index_map)
        test_index = doc_to_sen(te_doc,index_map)
        train_data = features.tocsr()[train_index,:]
        train_label = all_labels[train_index]
        test_data = features.tocsr()[test_index,:]
        test_label = all_labels[test_index]
        #train_data = scaler1.fit_transform(train_data)
        clf.fit(train_data,train_label)

        sorted_index_train = []     # the sorted index within the abstract
        for t in tr_doc:
            #current_scores = []
            sen_index = doc_to_sen([t],index_map)
            cur_train_score = clf.decision_function(features.tocsr()[sen_index,:])
            #obtain the sorted position of each sentence according to the distance to the boundary
            temp = [i[0] for i in sorted(enumerate(list(cur_train_score)),key=lambda x:x[1],\
                                                 reverse=True)]
            sorted_index = np.zeros(len(temp))
            for i, q in enumerate(temp): sorted_index[q] = i
            sorted_index_train += list(sorted_index)

        # append each sentence's rank (from the first SVM's scores) as an extra feature for a new SVM
        train_data = hstack([train_data,np.array(sorted_index_train).reshape(-1,1)])
        #train_data = scaler2.fit_transform(train_data)
        #clf1.fit(train_data,train_label)
        #test_data = scaler1.transform(test_data)
        test_score = clf.decision_function(test_data)
        sorted_index_test = []
        prediction = []
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# `train` and `test` are assumed to be pandas DataFrames loaded earlier
for f in train.columns:
    if train[f].dtype=='object':
        print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

features = [s for s in train.columns.ravel().tolist() if s != 'QuoteConversion_Flag']
print("Features: ", features)


print("Train a SGDClassifier model")
X_train, X_valid = train_test_split(train, test_size=0.01)
y_train = X_train['QuoteConversion_Flag']
y_valid = X_valid['QuoteConversion_Flag']

clf = SGDClassifier(loss="hinge", penalty="l2", n_jobs=-1)
clf.fit(X_train[features].values, y_train.values)


print("## Validating Data")
preds = clf.decision_function(X_valid[features])
auc_value = roc_auc_score(y_valid, preds)
print("ROC Score : " + str(auc_value))

print("## Predicting test data")
preds = clf.decision_function(test[features].values)
test["QuoteConversion_Flag"] = preds
test[['QuoteNumber',"QuoteConversion_Flag"]].to_csv('test_predictions.csv', index=False)
Example #56
0
# important to set this parameter higher if the results are not
# consistent.
classifier = SGDClassifier(n_iter=50, loss=config.get("classifier", "loss"),
                           shuffle=True, random_state=random_state)
# We fit (train) the classifier as follows:
classifier.fit(X_train, y_train)

show_most_informative_features(vectorizer, classifier,
                    n=config.getint("classifier", "top-features"))

if config.get('documents', 'test') == 'no':
    # now everything is ready to test the classifier on our test set
    preds = classifier.predict(X_test)

    # the decision_function method returns the actual values on which the
    # classification is based; these can be useful later to determine a
    # threshold value
    decisions = classifier.decision_function(X_test)
    print classification_report(y_test, preds)
    print "Area Under the Precision Recall Curve:",  average_precision_score(y_test, decisions)
    precision, recall, _ = precision_recall_curve(y_test, decisions)
    sb.plt.figure()
    sb.plt.plot(recall, precision)
    sb.plt.savefig("Precision-recall-curve.pdf")
else:
    decisions = classifier.decision_function(X_test)
    preds = classifier.predict(X_test)
    for doc_id, decision, pred in sorted(zip(doc_ids, decisions, preds), key=lambda i: i[1]):
        print 'Document:', doc_id, "Score: %.4f, Prediction: %s" % (
            decision, 'NL' if pred == 0 else 'B')
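As the comment above notes, the raw decision values make it possible to choose an operating point later without retraining; a minimal sketch reusing `decisions` from above (the threshold value is illustrative):

custom_threshold = 0.5  # illustrative value; the default decision boundary is 0
thresholded_preds = (decisions > custom_threshold).astype(int)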
Example #57
0
 def decision_function(self, X, *args, **kw):
     X = sp.csr_matrix(X)
     return SGDClassifier.decision_function(self, X, *args, **kw)
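For context, a self-contained sketch of how such an override might live in a subclass; the class name `CSRSGDClassifier` is hypothetical:

import scipy.sparse as sp
from sklearn.linear_model import SGDClassifier

class CSRSGDClassifier(SGDClassifier):
    """SGDClassifier whose decision_function coerces its input to CSR first."""
    def decision_function(self, X, *args, **kw):
        X = sp.csr_matrix(X)
        return SGDClassifier.decision_function(self, X, *args, **kw)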
Example #58
0
File: SGD.py Project: adaminfinitum/daily
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = SGDClassifier(loss="hinge", penalty="l2")
print clf.fit(X, y)
print clf.predict([[2., 2.]])
print clf.coef_
print clf.intercept_ 
print clf.decision_function([[2., 2.]])
clf = SGDClassifier(loss='log').fit(X, y)
print clf.predict_proba([[1., 1.]])


scaler = StandardScaler()
scaler.fit(X_train)  # X_train / X_test are assumed to be defined earlier
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
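The manual scale-then-transform steps above are commonly wrapped in a Pipeline so the training statistics are applied consistently to both splits; a sketch assuming the same X_train and X_test plus a y_train label vector:

from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# the scaler's statistics are learned on the training split only
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss="hinge", penalty="l2"))
pipe.fit(X_train, y_train)  # y_train is an assumed label vector
print(pipe.decision_function(X_test))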