Example #1
def lr_with_scale2():
    """
    Submission: lr_with_scale2_0704_03.csv
    E_val:
    E_in: 0.878996
    E_out: 0.8768131004917349
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

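    # Cs=50 evaluates 50 values of C on a log scale; the best one by 5-fold ROC AUC is kept.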
    clf = LogisticRegressionCV(Cs=50, cv=5, scoring='roc_auc', n_jobs=-1,
                               class_weight='auto')
    clf.fit(X_scaled, y)
    logger.debug('Best C: %f', clf.C_[0])
    logger.debug('Cs: %s', clf.Cs_)
    logger.debug('Grid scores: %s', clf.scores_)
    logger.debug('Ein: %f', Util.auc_score(clf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('lr', clf)]), 'lr_with_scale2_0704_03')
Example #2
def logistic_test_using_cosine(score_feature=False):
    logger.info('using cosine features in logistic regression')
    if score_feature:
        logger.info('also use score feature')
    Cs = [2**t for t in range(0, 10, 1)]
    Cs.extend([3**t for t in range(1, 10, 1)])
    snli2cosine = SNLI2Cosine('/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin')
    logger.info('loading snli data ...')
    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    train_df = train_df[:len(train_df) // 3]
    train_df.reset_index(inplace=True)
    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']
    test_df.reset_index(inplace=True)
    X_train, train_labels, X_test, test_labels = snli2cosine.calculate_cosine_features(train_df, test_df)
    if score_feature:
        y_train_proba, y_test_proba = joblib.load('./snli/logistic_score_snli.pkl')
        # y_train_proba = y_train_proba.flatten()
        # y_test_proba = y_test_proba.flatten()
        X_train = np.concatenate([X_train, y_train_proba.reshape((-1, 1))], axis=1)
        X_test = np.concatenate([X_test, y_test_proba.reshape((-1, 1))], axis=1)
    logger.info('X_train.shape: {0}'.format(X_train.shape))
    logger.info('X_test.shape: {0}'.format(X_test.shape))

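    # Fit over the hand-built C grid (powers of 2 and 3) with 3-fold cross-validation.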
    logreg = LogisticRegressionCV(Cs=Cs, cv=3, n_jobs=10, random_state=919)
    logreg.fit(X_train, train_labels)
    logger.info('best C is {0}'.format(logreg.C_))
    y_test_predicted = logreg.predict(X_test)
    acc = accuracy_score(test_labels, y_test_predicted)
    logger.info('test data predicted accuracy: {0}'.format(acc))
Example #3
def logistic_test(train_data, train_labels, test_data, test_labels, cv=False):
    # Perform logistic regression.
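    # With cv=True, LogisticRegressionCV tunes the regularization strength C via internal cross-validation.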
    clf = LogisticRegressionCV() if cv else LogisticRegression()
    clf.fit(train_data, train_labels)
    predicted_labels = clf.predict(test_data)

    # Count true positives, true negatives, false positives, false negatives.
    tp, tn, fp, fn = 0, 0, 0, 0
    for predicted, actual in zip(predicted_labels, test_labels):
        if predicted == 1 and actual == 1:
            tp += 1
        if predicted == 0 and actual == 0:
            tn += 1
        if predicted == 1 and actual == 0:
            fp += 1
        if predicted == 0 and actual == 1:
            fn += 1

    # Compute statistics (cast to float to avoid integer division under Python 2).
    accuracy = (tp + tn) / float(tp + tn + fp + fn)
    precision = 0 if (tp + fp) == 0 else tp / float(tp + fp)
    recall = 0 if (tp + fn) == 0 else tp / float(tp + fn)

    # Print report.
    print "Correctly classified {}/{}".format(tp + tn, tp + tn + fp + fn)
    print "Accuracy:", accuracy
    print "Precision:", precision
    print "Recall:", recall
    print "tp: {}; tn: {}; fp: {}; fn: {}".format(tp, tn, fp, fn)

    return accuracy
Example #4
def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)
    print(auc_score(clf, X_new, y))
    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('lr', clf)]), 'lr_with_fs_0620_02')
Example #5
def classify(_char):
    print 'to fetch data'
    start_time = time.time()
    char_count = Character.objects.filter(char=_char, is_correct=1).count()
    if char_count < 10:
        return
    char_lst = Character.objects.filter(char=_char)
    y, X, ty, tX, t_charid_lst, test_accuracy_lst = prepare_data_with_database(char_lst)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 10:
        return
    fetch_negative_samples(_char, X, y)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 50:
        return

    print "fetch data done, spent %s seconds." % int(time.time() - start_time)
    start_time = time.time()
    print "traning: data size: %d" % len(y)
    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        print "training done, spent %s seconds." % int(time.time() - start_time)
        #print 'params: '
        #for k, v in model.get_params().iteritems():
        #    print '\t', k, ' : ', v
        print 'score: ', model.score(X, y)
    except Exception, e:
        print 'except: ', e
        traceback.print_exc()
        return
Example #6
def optimal_l2(X, y):
    '''
    Find the optimal level of L2 regularization for logistic regression
    '''
    logit = LogisticRegressionCV(Cs=50, cv=10)
    logit.fit(X, y)
    return logit.C_
Example #7
def LogitSelector(x, y, cv, niter, njob):
    t_size = 1.0 / cv

    lb = prep.LabelBinarizer()
    y = lb.fit_transform(y).ravel()

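    # The L1-penalized CV fit is used only for feature selection: columns with nonzero coefficients are kept.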
    model = LogisticRegressionCV(penalty='l1', solver='liblinear', refit=False, cv=cv, n_jobs=njob)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        model.fit(x, y)
    columns = np.arange(x.shape[1])[model.coef_.ravel() != 0]

    accu = []
    prec = []
    rec = []
    f1 = []
    au = []
    cls = LogisticRegression()
    gn_cvset = (Cvset(x[i][:, columns], y[i], x[j][:, columns], y[j]) for (i, j) in ShuffleSplit(len(y), n_iter=niter, test_size=t_size))

    for cvt in gn_cvset:
        cls.fit(cvt.xtr, cvt.ytr)
        accu.append(accuracy_score(cvt.yte, cls.predict(cvt.xte)))
        prec.append(precision_score(cvt.yte, cls.predict(cvt.xte)))
        rec.append(recall_score(cvt.yte, cls.predict(cvt.xte)))
        f1.append(f1_score(cvt.yte, cls.predict(cvt.xte)))
        au.append(__Auc(cls, cvt.xte, cvt.yte))

    cls.fit(x[:,columns], y)
    return Mdc(model=cls, idx=columns, accu=np.mean(accu),
               prec=np.mean(prec), rec=np.mean(rec), f1=np.mean(f1),
               au=np.mean(au))
Example #8
def make_predictions():
    # Fit Logistic Regression Model
    logreg = LogisticRegressionCV(scoring='log_loss', n_jobs=-1, verbose=1, random_state=6156)
    logreg.fit(X=trainX, y=train['y'].values)
    
    # Validate
    pred_pr = logreg.predict_proba(valX)
    loss = log_loss(y_true=val['y'].values, y_pred=pred_pr)
    print "Validation log loss:", loss
    
    # Get Test predictions
    img_files = [os.path.join(IMG_DIR, f) for f in os.listdir(IMG_DIR)]
        
    if os.path.isfile('test_pca.csv'):
        test_pca = pd.read_csv('test_pca.csv', dtype={'id' : str})
    else:
        test_pca = prepare_test_data(img_files, STD_SIZE)
        
    test_predictions = logreg.predict_proba(test_pca.values[:, 1:])
    id_s = [re.sub(r'\D', '', f) for f in img_files]
    df_id = pd.DataFrame({'id' : id_s})
    col_names = ['col'+str(i) for i in range(1, 9)]
    df_yhat = pd.DataFrame(data=test_predictions, columns=col_names)
    df_id_yhat = pd.concat([test_pca['id'], df_yhat], axis=1)
    yhat = df_id.merge(df_id_yhat, on='id', how='left')
    yhat.fillna(1./8, inplace=True)
    yhat.to_csv('kaggle_430_2pm.csv', index=False)
Example #9
class Fraud(object):
    def __init__(self):
        self.model = None
        self.fitted = False

    def fit(self, jsonfile, target=0.3):
        self.model = LogisticRegressionCV(cv=15, scoring='recall')
        X, y = featurize_data(jsonfile)

        # Balance the classes
        X_oversample, y_oversample = oversample(X, y, target)
        print X_oversample, y_oversample

        # Fit the model
        self.model.fit(X_oversample, y_oversample)
        self.fitted = True

    def predict(self, X_test):
        return self.model.predict(X_test)[0]

    def save_model(self, picklefile):
        with open(picklefile, 'w') as f:
            pickle.dump(self.model, f)

    def load_model(self, picklefile):
        with open(picklefile, 'r') as f:
            self.model = pickle.load(f)
            self.fitted = True
Example #10
def train(trainingData, pklFile):
	# ========================================================================= #
	# =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
	# ========================================================================= #
	if (pklFile == ''):
		os.system('rm -rf learntModel && mkdir learntModel')
		pklFile = 'learntModel/learntModel.pkl'
	
	# ========================================================================= #
	# ================= STEP 2. PREPARE AND FORMATTING DATA =================== #
	# ========================================================================= #
	NUMBER_OF_FEATURES = len(trainingData[0]) - 1
	NUMBER_OF_TRAINING_POINTS = len(trainingData)

	x = trainingData[:, range(0, NUMBER_OF_FEATURES)]
	y = trainingData[:, NUMBER_OF_FEATURES]
	
	# ========================================================================= #
	# ============== STEP 3. DECLARE PRIMITIVES BEFORE THE PARTY ============== #
	# ========================================================================= #
	minSquareError = np.inf
	targetAlpha = None
	alphas = np.logspace(-10, -2, 500)			
	
	# ========================================================================= #
	# ===== STEP 4. PERFORM FITTING WITH THE BEST ALPHA AND SAVE THE MODEL ==== #
	# ========================================================================= #
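	# Note: Cs are inverse regularization strengths, so this logspace grid (1e-10 to 1e-2) only explores strongly regularized fits.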
	clf = LogisticRegressionCV(Cs=alphas)
	clf.fit(x, y)
	joblib.dump(clf, pklFile)
	
	return {"intercept": clf.intercept_, "coef":clf.coef_, "alpha":clf.C_, "accuracy":clf.score(x,y)}
Example #11
def LogitSelector(x, y, cv, njob):

    lb = prep.LabelBinarizer()
    y = lb.fit_transform(y).ravel()

    cls = LogisticRegression()
    def __Auc(xte, yte):
        ypo = cls.predict_proba(xte)
        flt_auc = roc_auc_score(yte, ypo[:,1])
        return flt_auc
    
    skf = StratifiedKFold(y, n_folds=cv)
    model = LogisticRegressionCV(penalty='l1', solver='liblinear', fit_intercept=False, cv=cv, n_jobs=njob)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        model.fit(x, y)
    columns = np.arange(x.shape[1])[model.coef_.ravel() != 0]
    
    mdl_eval = lambda func: lambda idx_tr, idx_te: func(y[idx_te], cls.fit(x[idx_tr][:,columns], y[idx_tr]).predict(x[idx_te][:,columns]))
    auc_eval = lambda idx_tr, idx_te: roc_auc_score(y[idx_te], cls.fit(x[idx_tr][:,columns], y[idx_tr]).predict_proba(x[idx_te][:,columns])[:,1])
    res_eval = lambda func: np.average(map(mdl_eval(func), *zip(*[(idx_tr, idx_te) for idx_tr, idx_te in skf])))

    accu = res_eval(accuracy_score)
    prec = res_eval(precision_score)
    rec = res_eval(recall_score)
    f1 = res_eval(f1_score)
    au = np.average(map(auc_eval, *zip(*[(idx_tr, idx_te) for idx_tr, idx_te in skf])))

    cls.fit(x[:,columns], y)
    return Mdc(model=cls, idx=columns, accu=accu, prec=prec, rec=rec, f1=f1, au=au)
Example #12
def compute_roc_auc(test_sa, adv_sa, split=1000):
    tr_test_sa = np.array(test_sa[:split])
    tr_adv_sa = np.array(adv_sa[:split])

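    # The first `split` scores from each set train the detector; the remaining scores are held out for the AUC estimate.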
    tr_values = np.concatenate(
        (tr_test_sa.reshape(-1, 1), tr_adv_sa.reshape(-1, 1)), axis=0
    )
    tr_labels = np.concatenate(
        (np.zeros_like(tr_test_sa), np.ones_like(tr_adv_sa)), axis=0
    )

    lr = LogisticRegressionCV(cv=5, n_jobs=-1).fit(tr_values, tr_labels)

    ts_test_sa = np.array(test_sa[split:])
    ts_adv_sa = np.array(adv_sa[split:])
    values = np.concatenate(
        (ts_test_sa.reshape(-1, 1), ts_adv_sa.reshape(-1, 1)), axis=0
    )
    labels = np.concatenate(
        (np.zeros_like(ts_test_sa), np.ones_like(ts_adv_sa)), axis=0
    )

    probs = lr.predict_proba(values)[:, 1]

    _, _, auc_score = compute_roc(
        probs_neg=probs[: (len(test_sa) - split)],
        probs_pos=probs[(len(test_sa) - split) :],
    )

    return auc_score
Example #13
def mdl_1d_cat(x, y):
    """builds univariate model to calculate AUC"""
    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x = sb_cutz(x)

    series = pd.get_dummies(x, dummy_na=True)
    lr = LogisticRegressionCV(scoring='roc_auc')

    lr.fit(series, y)

    try:
        preds = (lr.predict_proba(series)[:, -1])
        #preds = (preds > preds.mean()).astype(int)
    except ValueError:
        Tracer()()

    plot = plot_cat(x, y)

    imgdata = BytesIO()
    plot.savefig(imgdata)
    imgdata.seek(0)

    aucz = roc_auc_score(y, preds)
    cmatrix = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()
    return aucz, cmatrix
Example #14
def fit_logistic_regression(y, X):
    """
    Fits a logistic regression.
    """
    model_log = LogisticRegressionCV(cv=5, penalty='l2', verbose=1, max_iter=1000)
    fit = model_log.fit(X, y)

    return fit
Example #15
def classify_maxEnt(train_X, train_Y, test_X):

    print("Classifying using Maximum Entropy ...")
    maxEnt = LogisticRegressionCV()
    maxEnt.fit(train_X, train_Y)
    yHat = maxEnt.predict(test_X)

    return yHat
Example #16
def build_classifier_lr(data, labels, regularization='l2', **kwargs):
    if (regularization == 'l1') or (regularization == 'l2'):
        log_reg = LogisticRegressionCV(penalty=regularization, Cs=100, cv=10, solver='liblinear', refit=False,
                                       n_jobs=10, verbose=1, class_weight='balanced', **kwargs)
    else:
        # lambda = 1/C:  if C->inf lambda -> 0. So if we want no regularization we need to set C to a high value
        log_reg = LogisticRegression(C=100000000., class_weight='balanced', solver='liblinear', n_jobs=10,
                                     verbose=1, **kwargs)
    log_reg.fit(data, labels)
    return log_reg
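For reference, a minimal self-contained sketch of the trade-off noted in the comment above (the synthetic X and y below are illustrative placeholders, not data from the original repository): a cross-validated L1 fit next to an effectively unregularized fit obtained by setting C very high, since lambda = 1/C.

import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# L1-penalized fit with C chosen by 5-fold cross-validation
clf_l1 = LogisticRegressionCV(penalty='l1', Cs=10, cv=5, solver='liblinear').fit(X, y)
# "No regularization": a very large C makes the penalty term negligible
clf_plain = LogisticRegression(C=1e8, solver='liblinear').fit(X, y)
print(clf_l1.C_, clf_plain.coef_)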
Example #17
def fitModels(training_data, training_labels, test_data, test_labels):
    print('=========fitModels========:')

    # print('RandomForestClassifier:')
    # clf =RandomForestClassifier(n_estimators=100)
    # clf.fit(training_data, training_labels)  # train the model
    # getReport(clf, test_data, test_labels)
    # print('='*50)

    # print('GradientBoostingClassifier: ')
    # gbdt = GradientBoostingClassifier()
    # gbdt.fit(training_data, training_labels)
    # getReport(gbdt, test_data, test_labels)
    # print('='*50)

    # print('MultinomialNB: ')
    # clf =MultinomialNB()
    # clf.fit(training_data, training_labels)  # train the model
    # getReport(clf, test_data, test_labels)
    # print('='*50)
    #
    # print('GaussianNB: ')
    # clf =GaussianNB()
    # clf.fit(training_data, training_labels)  # train the model
    # getReport(clf, test_data, test_labels)
    # print('='*50)

    print('LogisticRegression: ')
    lr = LogisticRegressionCV()
    lr.fit(training_data, training_labels)  # train the model
    print(lr)
    getReport(lr, test_data, test_labels)
    print('='*50)

    print('LinearSVC: ')
    linSVC = LinearSVC()
    linSVC.fit(training_data, training_labels)  # train the model
    predict_labels = linSVC.predict(test_data)  # predict on the test data
    getPRF(predict_labels, test_labels)
    print('='*50)
    
    # print('svm: ')
    # clf =svm.SVC()
    # clf.fit(training_data, training_labels)  # train the model
    # getReport(clf, test_data, test_labels)
    # print('='*50)

    # print('DecisionTreeClassifier: ')
    # clf =tree.DecisionTreeClassifier()
    # clf.fit(training_data, training_labels)  # train the model
    # getReport(clf, test_data, test_labels)
    # print('='*50)

    return lr, linSVC
Example #18
def classify_maxEnt_twitter(train_X, train_Y, test_X, test_Y):

    print("Classifying using Maximum Entropy ...")
    maxEnt = LogisticRegressionCV()
    maxEnt.fit(train_X, train_Y)
    yHat = maxEnt.predict(test_X)
    conf_mat = confusion_matrix(test_Y,yHat)
    print(conf_mat)
    Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
    print("Accuracy: ", Accuracy)
    evaluate_classifier(conf_mat)
Example #19
def compute_classifier(pow_mat, recalls):
    print 'Computing logistic regression:', pow_mat.shape[0], 'samples', pow_mat.shape[1], 'features'

    lr_classifier = LogisticRegressionCV(penalty='l1', solver='liblinear')
    lr_classifier.fit(pow_mat, recalls)
    probs = lr_classifier.predict_proba(pow_mat)[:,1]
    auc = roc_auc_score(recalls, probs)

    print 'AUC =', auc

    return lr_classifier
Example #20
def doLearn(xtrain, xtest, ytrain, ytest):
    # do the learning by creating an instance of an sklearn class and fitting it to the data
    # score the accuracy of the predictions
    clf = RandomForestClassifier()
    s = clf.fit(xtrain, ytrain).score(xtest, ytest)
    print('rf acc', s)
    log = LogisticRegressionCV(verbose=6)
    ss = log.fit(xtrain, ytrain).score(xtest, ytest)
    print("logistic acc", ss)
    svc = SVC()
    sss = svc.fit(xtrain, ytrain).score(xtest, ytest)
    print("svc acc", sss)
Example #21
def lr_with_fs():
    """
    Submission: lr_with_fs_0703_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    pkl_path = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(pkl_path)
    if rfe is None:
        rfe = RFECV(estimator=LogisticRegression(class_weight='auto'),
                    cv=StratifiedKFold(y, 5), scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, pkl_path)

        print("Optimal number of features : %d" % rfe.n_features_)

        # Plot number of features VS. cross-validation scores
        pl.figure()
        pl.xlabel("Number of features selected")
        pl.ylabel("Cross validation score (AUC)")
        pl.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
        pl.savefig('lr_with_fs.refcv')

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rfe', rfe),
                                 ('scale_new', new_scaler),
                                 ('lr', clf)]), 'lr_with_fs_0703_01')
Example #22
    def classify(self, mp, x_train, y_train, x_test):
        x_train = sm.add_constant(x_train)
        x_test = sm.add_constant(x_test)
        clf = LogisticRegressionCV(verbose=1, cv=5)
        log_to_info('Fitting a Logistic Regression to labeled training data...')
        clf = clf.fit(x_train, y_train)
        log_to_info('Training details')
        log_to_info('Classifier parameters: {}'.format(clf.get_params()))
        log_to_info('On training: {}'.format(clf.score(x_train, y_train) * 100.0))
        log_to_info('Predicting test value')
        y_test = clf.predict(x_test)
        log_to_info('Done!')
        return y_test
Example #23
def lr():
    """
    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV
    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))
    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X, y)
    print(auc_score(clf, X, y))
    to_submission(clf, 'lr_0618_xxx')
Example #24
class LogisticModelCombination(ClassifierMixin):

    """
        Combine multiple models using a Logistic Regression
    """

    def __init__(self, classifiers, cv_folds=1, use_original_features=False, random_state=None, verbose=0):
        self.classifiers = classifiers
        self.cv_folds = cv_folds
        self.use_original_features = use_original_features
        self.logistic = LogisticRegressionCV(
            Cs=[10, 1, 0.1, 0.01, 0.001], refit=True)
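        # The meta-learner stacks the base classifiers' predictions and picks C from the listed grid by cross-validation.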

        if random_state is None:
            self.random_state = random.randint(0, 10000)
        else:
            self.random_state = random_state

    def fit(self, X, y):
        sss = StratifiedShuffleSplit(
            y, n_iter=self.cv_folds, random_state=self.random_state)
        for train_index, test_index in sss:
            train_x = X[train_index]
            train_y = y[train_index]

            test_x = X[test_index]
            test_y = y[test_index]

            self._fit_logistic(train_x, train_y)

    def _fit_logistic(self, X, y):
        pred_X = self.convert_data(X)
        self.logistic.fit(pred_X, y)
        return self

    def convert_data(self, X):
        preds = []
        for i, clf in enumerate(self.classifiers):
            class_proba = clf.predict(X)
            preds.append(class_proba)
        pred_X = np.vstack(preds).T

        if self.use_original_features:
            pred_X = np.concatenate([X, pred_X], axis=1)
        return pred_X

    def predict_proba(self, X):
        pred_X = self.convert_data(X)
        return self.logistic.predict_proba(pred_X)
Example #25
def classify_with_random_samples(char, positive_sample_count, auto_apply=False, random_sample=0):
    print char, positive_sample_count
    started = timezone.now()
    start_time = time.time()
    query = Character.objects.filter(char=char)
    positive_samples, negative_samples, test_X, test_y, test_char_id_lst, test_accuracy_lst = \
        prepare_data_with_database2(query)
    X = []
    y = []
    if random_sample != 0:
        if positive_sample_count > 0:
            if len(positive_samples) > positive_sample_count:
                positive_samples = random.sample(positive_samples, positive_sample_count)
            if len(negative_samples) > positive_sample_count:
                negative_samples = random.sample(negative_samples, positive_sample_count)
    else:
        if len(positive_samples) > positive_sample_count:
            positive_samples.sort(key=itemgetter(2), reverse=True)
            positive_samples = positive_samples[:positive_sample_count]
        if len(negative_samples) > positive_sample_count:
            negative_samples.sort(key=itemgetter(2))
            negative_samples = negative_samples[:positive_sample_count]
    for sample in positive_samples:
        X.append(sample[0])
        y.append(sample[1])
    for sample in negative_samples:
        X.append(sample[0])
        y.append(sample[1])
    train_count = len(y)
    predict_count = len(test_y)
    if 1 == len(set(y)) or train_count < 10 or predict_count == 0:
        return
    fetch_spent = int(time.time() - start_time)
    print "fetch data done, spent %s seconds." % fetch_spent
    start_time = time.time()
    print "traning: data size: %d" % len(y)
    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        training_spent = int(time.time() - start_time)
        print "training done, spent %s seconds." % training_spent
        # print 'params: '
        # for k, v in model.get_params().iteritems():
        #    print '\t', k, ' : ', v
        print 'score: ', model.score(X, y)
    except Exception, e:
        print 'except: ', e
        traceback.print_exc()
        return
Example #26
class SentenceClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 sents_shuffle=False,
                 doc2vec=gensim.models.doc2vec.Doc2Vec()
                 ):
        argdict= locals()
        argdict.pop('argdict',None)
        argdict.pop('self',None)
        vars(self).update(argdict)
        #print argdict
    
    def fit(self, X, y):
        self.sents_train=X
        self.Y_train=y
        return self
    
    def doc2vec_set(self,all_docs):
        #print 'doc2vec_set,SentenceClassifier'
        if hasattr(self.doc2vec, 'syn0'):
            self.doc2vec.reset_weights()
            #del self.doc2vec.syn0
            delattr(self.doc2vec, 'syn0')
        self.doc2vec.build_vocab(all_docs)
        self.doc2vec.train(all_docs)

    def predict(self,X):
        self.sents_test=X
        self.sents_all=self.sents_train + self.sents_test

        if self.sents_shuffle :
            s_indexs=range(len(self.sents_all))
            random.shuffle(s_indexs)
            s_invers_indexs=range(len(s_indexs))
            for n in range(len(s_indexs)):
                s_invers_indexs[s_indexs[n]]=n
            sents_all=[self.sents_all[n] for n in s_indexs]
        else:
            sents_all=self.sents_all
        all_docs = list(LabeledListSentence(self.sents_all))
        
        self.doc2vec_set(all_docs)
        #print 'size',self.doc2vec.vector_size

        self.X_train = [self.doc2vec.infer_vector(s) for s in self.sents_train]
        self.X_test = [self.doc2vec.infer_vector(s) for s in self.sents_test]
        self.logistic = LogisticRegressionCV(class_weight='balanced')  # ,n_jobs=-1)
        self.logistic.fit(self.X_train, self.Y_train)
        Y_test_predict = self.logistic.predict(self.X_test)
        return Y_test_predict
Example #27
def logit(filepath_and_pathway_ids):

	filepath, first_pathway_id, second_pathway_id = filepath_and_pathway_ids

	# we had done dataset.to_csv(filename, index=True, header=True)
	dataset = pd.read_csv(filepath, index_col=0)
	labels = dataset.index.str.replace(first_pathway_id, "positive").str.replace(second_pathway_id, "positive").tolist()

	classifier = LogisticRegressionCV(solver='liblinear', penalty='l1', Cs=[5], cv=10)
	classifier.fit(dataset.values, labels)
	features = pd.DataFrame(classifier.coef_, columns=dataset.columns)
	features = features.ix[0, features.loc[0].nonzero()[0].tolist()].index.tolist()
	scores = list(classifier.scores_.values())[0].flatten().tolist()

	return first_pathway_id, second_pathway_id, scores, features
Example #28
def logit(pathway_id_and_filepath):

	pathway_id, filepath = pathway_id_and_filepath

	# we had done dataset.to_csv(filename, index=True, header=True)
	dataset = pd.read_csv(filepath, index_col=0)
	labels = dataset.index.tolist()

	classifier = LogisticRegressionCV(solver='liblinear', penalty='l1', Cs=[5], cv=10)
	classifier.fit(dataset.values, labels)
	features = pd.DataFrame(classifier.coef_, columns=dataset.columns)
	features = features.ix[0, features.loc[0].nonzero()[0].tolist()].index.tolist()
	scores = list(classifier.scores_.values())

	return pathway_id, scores, features
Example #29
def mdl_1d(x, y):
    """builds univariate model to calculate AUC"""
    lr = LogisticRegressionCV(scoring='roc_auc')
    lars = LassoLarsIC(criterion='aic')

    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x2 = sb_cutz(x)
        series = pd.get_dummies(x2, dummy_na=True)
    else:
        series = pd.get_dummies(x, dummy_na=True)

    lr.fit(series, y)
    lars.fit(series, y)

    try:
        preds = (lr.predict_proba(series)[:, -1])
        #preds = (preds > preds.mean()).astype(int)
    except ValueError:
        Tracer()()

    # try:
    #    cm = confusion_matrix(y, (preds > y.mean()).astype(int))
    # except ValueError:
    #    Tracer()()

    aucz = roc_auc_score(y, preds)

    ns = num_bin_stats(x, y)

    nplot = plot_num(ns)
    #plot = plot_confusion_matrix(cm, y)

    imgdata = BytesIO()
    nplot.savefig(imgdata)
    imgdata.seek(0)
    nplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()

    bplot = plot_bubble(ns)
    imgdatab = BytesIO()
    bplot.savefig(imgdatab)
    imgdatab.seek(0)
    bplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdatab.getvalue()))
    plt.close()

    return aucz, nplot, bplot
Example #30
def try_all_k_best(max=13):
    for k in range(1,max+1):
        data = featureFormat(my_dataset, features_list, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        features_train, features_test, labels_train, labels_test = \
            train_test_split(features, labels, test_size=0.3, random_state=42)
        selector = SelectKBest(k=k)
        features_train = selector.fit_transform(features_train, labels_train)
        features_test = selector.transform(features_test)
        choices.append(selector.transform(np.array(features_list[1:]).reshape(1, -1)))
        lr_cv = LogisticRegressionCV()
        lr_cv.fit(features_train, labels_train)
        pred.append(lr_cv.predict(features_test))
        acc.append(accuracy_score(labels_test, pred[k-1]))
        prec.append(precision_score(labels_test, pred[k-1]))
        reca.append(recall_score(labels_test, pred[k-1]))     
Example #31
        kf = KFold(n_splits=5)  # Define the split - into 5 folds
        kf.get_n_splits(x_train) # returns the number of splitting iterations in the cross-validator
        # print(kf)
        KFold(n_splits=5, random_state=None, shuffle=True)
        y=[]
        my_score_arr=[]
        for k,(train_index, test_index) in enumerate(kf.split(x_train,y_train)):
            # print('TRAIN:', train_index)
            # print('TEST:', test_index,'\n')
            X_train_K, X_test_K = x_train[train_index], x_train[test_index]
            y_train_K, y_test_K = y_train[train_index], y_train[test_index]

            # X_train_K, X_test_K , y_train_K, y_test_K = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

            model = LogisticRegressionCV(penalty='l1',Cs=10,cv=5,solver='liblinear')
            model.fit(X_train_K, y_train_K)
            preds = model.predict(X_test_K)

            # print(X_test_K)
            # model.fit(x_train[train_index], y_train[train_index])
            my_score=model.score(X_test_K, y_test_K)
            my_score_arr.append(my_score)

            # print("[fold {0}] score: {1:.5f}".format(k, my_score))
            # print('regression coef-values ')
            # print(model.coef_)
            # print('C',model.C_)
            # print('CS_',model.Cs_)

            # scores, pvalues = chi2(X_train_K, y_train_K)
Example #32
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# LogisticRegression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV

clf = LogisticRegressionCV(cv=5, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# SGDClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# Perceptron
Example #33
    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = np.asarray([
                    2 + np.floor(tree_sizes[i_])
                    for i_ in np.arange(len(tree_sizes))
                ],
                                        dtype=int)
                i = int(len(tree_sizes) / 4)
                while np.sum(tree_sizes[0:i]) < self.max_rules:
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                self.tree_generator.set_params(warm_start=False)
                curr_est_ = 0
                for i_size in np.arange(len(tree_sizes)):
                    size = tree_sizes[i_size]
                    self.tree_generator.set_params(n_estimators=curr_est_ + 1)
                    self.tree_generator.set_params(max_leaf_nodes=size)
                    random_state_add = self.random_state if self.random_state else 0
                    self.tree_generator.set_params(
                        random_state=i_size + random_state_add
                    )  # warm_start=True seems to reset random_state, such that the trees are highly correlated, unless we manually change the random_state here.
                    self.tree_generator.get_params()['n_estimators']
                    self.tree_generator.fit(np.copy(X, order='C'),
                                            np.copy(y, order='C'))
                    curr_est_ = curr_est_ + 1
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          RandomForestRegressor) or isinstance(
                              self.tree_generator, RandomForestClassifier):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:

            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
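        ## In regression mode the Cs grid (inverse regularization strengths) is converted to LassoCV alphas via alpha = 1/C.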
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                alphas = 1. / self.Cs
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self
Example #34
def main(data_dir, models_dir):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    planes = ['axial', 'coronal', 'sagittal']
    conditions = ['abnormal', 'acl', 'meniscus']

    models = []

    print(f'Loading best CNN models from {models_dir}...')

    for condition in conditions:
        models_per_condition = []
        for plane in planes:
            checkpoint_pattern = glob(f'{models_dir}/*{plane}*{condition}*.pt')
            checkpoint_path = sorted(checkpoint_pattern)[-1]
            checkpoint = torch.load(checkpoint_path, map_location=device)

            model = MRNet().to(device)
            model.load_state_dict(checkpoint['state_dict'])
            models_per_condition.append(model)

        models.append(models_per_condition)

    print(f'Creating data loaders...')

    axial_loader = make_data_loader(data_dir, 'train', 'axial')
    coronal_loader = make_data_loader(data_dir, 'train', 'coronal')
    sagittal_loader = make_data_loader(data_dir, 'train', 'sagittal')

    print(f'Collecting predictions on train dataset from the models...')

    ys = []
    Xs = [[], [], []]  # Abnormal, ACL, Meniscus

    with tqdm(total=len(axial_loader)) as pbar:
        for (axial_inputs, labels), (coronal_inputs, _), (sagittal_inputs, _) in \
                zip(axial_loader, coronal_loader, sagittal_loader):

            axial_inputs, coronal_inputs, sagittal_inputs = \
                axial_inputs.to(device), coronal_inputs.to(device), sagittal_inputs.to(device)

            ys.append(labels[0].cpu().tolist())

            for i, model in enumerate(models):
                axial_pred = model[0](axial_inputs).detach().cpu().item()
                coronal_pred = model[1](coronal_inputs).detach().cpu().item()
                sagittal_pred = model[2](sagittal_inputs).detach().cpu().item()

                X = [axial_pred, coronal_pred, sagittal_pred]
                Xs[i].append(X)

            pbar.update(1)

    ys = np.asarray(ys).transpose()
    Xs = np.asarray(Xs)
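    # Xs has shape (3 conditions, n_exams, 3 planes); each row of ys holds the labels for one condition.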

    print(f'Training logistic regression models for each condition...')

    clfs = []

    for X, y in zip(Xs, ys):
        clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)
        clfs.append(clf)

    for i, (clf, X, y) in enumerate(zip(clfs, Xs, ys)):
        print(
            f'Training score for {conditions[i]}: {clf.score(X, y):.3f}'
        )
        clf_path = f'{models_dir}/lr_{conditions[i]}.pkl'
        joblib.dump(clf, clf_path)

    print(f'Logistic regression models saved to {models_dir}')
Example #35
    get_all = lambda fw: get_all_features(fw.pos) | get_all_features(fw.neg)
    all_features = get_all(res.targets[0].feature_weights)
    if len(all_features) > 1:
        f = list(all_features - {'<BIAS>'})[0]
        flt_res = get_res(x, feature_filter=lambda name, _: name != f)
        flt_features = get_all(flt_res.targets[0].feature_weights)
        assert flt_features == (all_features - {f})
        return True
    return False


@pytest.mark.parametrize(['clf'], [
    [LogisticRegression(random_state=42)],
    [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')],
    [LogisticRegression(random_state=42, fit_intercept=False)],
    [LogisticRegressionCV(random_state=42)],
    [SGDClassifier(**SGD_KWARGS)],
    [SGDClassifier(loss='log', **SGD_KWARGS)],
    [PassiveAggressiveClassifier(random_state=42)],
    [Perceptron(random_state=42)],
    [RidgeClassifier(random_state=42)],
    [RidgeClassifierCV()],
    [LinearSVC(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
])
def test_explain_linear(newsgroups_train, clf):
    assert_multiclass_linear_classifier_explained(newsgroups_train, clf,
                                                  explain_prediction)
    if isinstance(clf, OneVsRestClassifier):
        assert_multiclass_linear_classifier_explained(
            newsgroups_train, clf, explain_prediction_sklearn)
Example #36
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_test_1 = sc.transform(x_test_1)
x_test_2 = sc.transform(x_test_2)
x_test_3 = sc.transform(x_test_3)
x_test_4 = sc.transform(x_test_4)
x_test_5 = sc.transform(x_test_5)
x_test_6 = sc.transform(x_test_6)

#####          Data Prep ####### End

########                *******Logistic Regression,RF***********          #####################
#https://www.edureka.co/blog/logistic-regression-in-python/
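# LogisticRegressionCV with cv=10 selects the regularization strength C by 10-fold cross-validation.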
clf = LogisticRegressionCV(cv=10, random_state=0).fit(x_train, y_train) # for logistic

#################    for t zero ######################################
predictions = clf.predict(x_test)
probabilities = clf.predict_proba(x_test)[:,1]  

print(classification_report(y_test, predictions))
df_confusion = pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=False)
print(df_confusion)
import matplotlib.pyplot as plt

print ('accuracy:' + str(round(accuracy_score(y_test, predictions)*100,2)) + "%")
print (clf.coef_)
print ("-feature coefficients-")
for i,j in enumerate(list(x.columns)):
    print(str(j)+" :"+str(round(clf.coef_[:,i],3)))
Example #37
y = np.ndarray.astype(user_df.values[:, -1], int)
user_df = user_df.drop([1, user_df.columns[-1]],
                       axis=1)  # drop time and y column
article_df = pd.read_csv(af_name, header=None)

# process joined data
X_df = user_df.merge(article_df, on=0)
X = X_df.as_matrix()
X = np.ndarray.astype(X[:, 1:], float)  # remove user_id
X[np.isnan(X)] = 0  # clear NaNs

# min-max scaling
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler(feature_range=(-1, 1))
scalar_fit = scalar.fit(X)
dmin = scalar.data_min_
dmax = scalar.data_max_
Xnorm = scalar.transform(X)

# sample weights
yrat = np.sum(y == 1) / len(y)
xrat = 1 - yrat
s_weights = np.zeros(len(y))
s_weights[y == 0] = yrat
s_weights[y == 1] = xrat
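# inverse-frequency weighting: each class is weighted by the other class's prevalence so both contribute equally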

# Logistic Regression
clf = LR(penalty='l2', class_weight='balanced').fit(Xnorm, y)
preds = clf.predict_proba(Xnorm)[:, 1]
ll = log_loss(y, preds, sample_weight=s_weights)
Example #38
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic -- XGBoost')
plt.legend(loc="lower right")
plt.savefig('xgboost_roc.pdf', format='pdf')
#plt.show()

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import KFold

kFold = 3
cv = KFold(n_splits=kFold, shuffle=True, random_state=seed)

# Default is accuracy_score.
clf = LogisticRegressionCV(penalty='l2', cv=cv, random_state=seed)

clf.fit(X_train, y_train)  # for the ith class
C_optimal = clf.C_[0]

# This is the optimal C found by cross-validation
best_model_lr = clf.C_[0]
print(best_model_lr)

clf = LogisticRegression(penalty='l2', random_state=seed, C=C_optimal)
clf.fit(X_train, y_train)
y_score_logistic = clf.predict_proba(X_test)
y_hat_logistic = clf.predict(X_test)

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
Example #39
# coding=utf-8
Example #40
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark)
    plt.grid(b=True, ls=':')
    plt.xlabel(u'Component 1', fontsize=14)
    plt.ylabel(u'Component 2', fontsize=14)
    plt.title(u'Iris data after PCA dimensionality reduction', fontsize=18)
    # plt.savefig('1.png')
    plt.show()

    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
    model = Pipeline([('poly', PolynomialFeatures(degree=2,
                                                  include_bias=True)),
                      ('lr',
                       LogisticRegressionCV(Cs=np.logspace(-3, 4, 8),
                                            cv=5,
                                            fit_intercept=False))])
    model.fit(x, y)
    print('Optimal parameter:', model.get_params('lr')['lr'].C_)
    y_hat = model.predict(x)
    print('Training set accuracy:', metrics.accuracy_score(y, y_hat))
    y_test_hat = model.predict(x_test)
    print('Test set accuracy:', metrics.accuracy_score(y_test, y_test_hat))

    N, M = 500, 500  # number of sample points along each axis
    x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())  # range of column 0
    x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # generate the mesh-grid sample points
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # points to evaluate
Example #41
    def logistic(self):
        lr = make_pipeline(LogisticRegressionCV(cv=self.kfolds))
        lr.fit(self.X, self.y)

        return lr
Example #42
print(metrics.confusion_matrix(y_test, lr_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
print(metrics.recall_score(y_test, lr_predict_test))

# ### LogisticRegressionCV

# In[37]:

from sklearn.linear_model import LogisticRegressionCV
lr_cv_model = LogisticRegressionCV(
    n_jobs=-1,
    random_state=42,
    Cs=3,
    cv=10,
    refit=False,
    class_weight="balanced",
    max_iter=500
)  # set number of jobs to -1 which uses all cores to parallelize
lr_cv_model.fit(X_train, y_train.ravel())

# ### Predict on Test data

# In[38]:

lr_cv_predict_test = lr_cv_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(
    metrics.accuracy_score(y_test, lr_cv_predict_test)))
Example #43
    def performance_analysis(self):
        """
        Analyze and print to stdout the performances of a big list of classifiers, in order
        to include only the best ones in the final version of RiskInDroid.
        :return: None.
        """

        # Category of permissions for which to calculate the performances.
        _cat = 'declared'

        _k_fold = StratifiedKFold(n_splits=10,
                                  shuffle=True,
                                  random_state=self.seed)

        # The original list of classifiers taken into consideration, before selecting
        # only the best ones for RiskInDroid.
        _all_models = (SVC(kernel='linear',
                           probability=True,
                           random_state=self.seed), GaussianNB(),
                       MultinomialNB(), BernoulliNB(),
                       DecisionTreeClassifier(random_state=self.seed),
                       RandomForestClassifier(random_state=self.seed),
                       AdaBoostClassifier(random_state=self.seed),
                       GradientBoostingClassifier(random_state=self.seed),
                       SGDClassifier(loss='log', random_state=self.seed),
                       LogisticRegression(random_state=self.seed),
                       LogisticRegressionCV(random_state=self.seed),
                       KNeighborsClassifier(), LinearDiscriminantAnalysis(),
                       QuadraticDiscriminantAnalysis(),
                       MLPClassifier(random_state=self.seed))

        _training_sets = list(self.get_training_vectors_3_sets())

        for model in _all_models:
            print('\n\n\nAnalysis of ' + model.__class__.__name__ + ':')

            # Goodware and malware scores for the current model.
            _malware_scores = numpy.array([])
            _goodware_scores = numpy.array([])

            # Correctly predicted targets for the current model.
            _ok_targets = numpy.array([])

            # We analyze the 3 training sets for each model.
            for (index, current_set) in enumerate(_training_sets):

                # current_set[0] = application set
                # current_set[1] = application targets

                # Goodware and malware scores for the current set.
                _loc_m_scores = numpy.array([])
                _loc_g_scores = numpy.array([])

                # Correctly predicted targets for the current set.
                _loc_ok_targets = numpy.array([])

                # The analysis is done using 10-cross fold validation.
                for train_index, test_index in _k_fold.split(
                        current_set[0][_cat], current_set[1]):

                    _train_data = numpy.array(current_set[0][_cat])
                    _train_targets = numpy.array(current_set[1])

                    model.fit(_train_data[train_index],
                              _train_targets[train_index])

                    # Correctly predicted targets for the current fold.
                    _fold_ok_targets = 0

                    for loc_index in test_index:

                        proba = list(
                            zip(
                                model.classes_,
                                model.predict_proba([_train_data[loc_index]
                                                     ])[0]))

                        # The malware probability is considered as the risk value.
                        if proba[0][0] == b'malware':
                            _result = proba[0]
                        else:
                            _result = proba[1]

                        # We consider only correct predictions for calculating the mean
                        # and the standard deviation.
                        _true_target = _train_targets[loc_index]

                        # If the current app under test is a malware.
                        if _result[1] >= 0.5:
                            # If the prediction is correct.
                            if _result[0] == _true_target:
                                _fold_ok_targets += 1
                                _loc_m_scores = numpy.append(
                                    _loc_m_scores, _result[1])

                        # If the current app under test is not a malware.
                        else:
                            # If the prediction is correct.
                            if _result[0] != _true_target:
                                _fold_ok_targets += 1
                                _loc_g_scores = numpy.append(
                                    _loc_g_scores, _result[1])

                    _loc_ok_targets = numpy.append(
                        _loc_ok_targets, _fold_ok_targets / len(test_index))

                print('    set_{0}:'.format(index + 1))
                print('        accuracy: {0:.2f}'.format(
                    _loc_ok_targets.mean() * 100))
                print('        malware mean: {0:.2f}'.format(
                    _loc_m_scores.mean() * 100))
                print('        malware std_dev: {0:.2f}'.format(
                    _loc_m_scores.std() * 100))
                print('        goodware mean: {0:.2f}'.format(
                    _loc_g_scores.mean() * 100))
                print('        goodware std_dev: {0:.2f}'.format(
                    _loc_g_scores.std() * 100))

                _ok_targets = numpy.append(_ok_targets, _loc_ok_targets)
                _malware_scores = numpy.append(_malware_scores, _loc_m_scores)
                _goodware_scores = numpy.append(_goodware_scores,
                                                _loc_g_scores)

            print('    total:')
            print('        accuracy: {0:.2f}'.format(_ok_targets.mean() * 100))
            print('        malware mean: {0:.2f}'.format(
                _malware_scores.mean() * 100))
            print('        malware std_dev: {0:.2f}'.format(
                _malware_scores.std() * 100))
            print('        goodware mean: {0:.2f}'.format(
                _goodware_scores.mean() * 100))
            print('        goodware std_dev: {0:.2f}'.format(
                _goodware_scores.std() * 100))
Example #44
    {
        'Reservoir': [
            'Artiodactyl', 'Carnivore', 'Fish', 'Galloanserae', 'Insect',
            'Neoaves', 'Plant', 'Primate', 'Pterobat', 'Rodent', 'Vespbat'
        ]
    }, {'Reservoir': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

data_array = experiment_data.iloc[:, 6:].to_numpy()
label_array = experiment_data['Reservoir'].to_numpy()

train_set, test_set, train_labels, test_labels = train_test_split(
    data_array,
    label_array,
    test_size=0.20,
    random_state=314,
    stratify=label_array)

# train_labels = preprocessing.label_binarize(train_labels, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# test_labels = preprocessing.label_binarize(test_labels, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

lr = LogisticRegressionCV()

parameters = {'Cs': [1, 5, 10, 20, 50], 'cv': [5], 'penalty': ['l2']}

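# The grid search tunes LogisticRegressionCV's own hyper-parameters (its internal C grid, fold count and penalty).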
clf = GridSearchCV(lr, parameters, cv=5)  # cv must be an integer or a CV splitter, not a (data, labels) tuple

clf.fit(train_set, train_labels)

print(accuracy_score(clf.predict(train_set), train_labels))
print(accuracy_score(clf.predict(test_set), test_labels))
Example #45
    def __init__(self):
        self.sentences = list()
        self.features = list()
        self.pos_labels = list()
        self.vectorizer = DictVectorizer()
        self.model = LogisticRegressionCV(random_state=123)
Example #46
plt.show()


def plot_decision_boundary(pred_func):  # decision-boundary function borrowed from CSDN; not fully understood, but the gist is clear
    # set the min and max values, with a little padding at the edges
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # run the prediction function over the grid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # then draw the plot
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)


from sklearn.linear_model import LogisticRegressionCV
# Build a logistic regression classifier with built-in cross-validation
clf = LogisticRegressionCV()
clf.fit(X, y)

# Plot the decision boundary
plot_decision_boundary(lambda x: clf.predict(x))
plt.title("Logistic Regression")
plt.show()
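The snippet above assumes X and y are already defined. A self-contained variant, using a toy two-moons dataset purely so the helper can be run end to end:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegressionCV

# Toy 2-D data; any two-feature dataset would do for visualisation.
X, y = make_moons(n_samples=300, noise=0.25, random_state=0)

clf = LogisticRegressionCV()
clf.fit(X, y)

plot_decision_boundary(lambda grid: clf.predict(grid))
plt.title("Logistic Regression")
plt.show()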
Пример #47
0
#
#
#
# =============================================================================

# In[51]:

# Run a k-fold cross-validated model on the data set and predict y for the test set

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
fold = KFold(n_splits=10, shuffle=True)
classifier = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),
                                  penalty='l2',
                                  scoring='roc_auc',
                                  cv=fold,
                                  max_iter=4000,
                                  fit_intercept=True,
                                  solver='newton-cg')

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

#print(classifier.scores_[1].max())

classifier.fit(X_train_scale, y_train)
y_pred_scale = classifier.predict(X_test_scale)

#print(classifier.scores_[1].max())

# In[52]:
Пример #48
0
elif preprocess == "scaler":
    scaler = StandardScaler()

else:
    ValueError("Unknown preprocessing option")

X_train, X_test, y_train, y_test = train_test_split(XX, Y)

ros = RandomOverSampler()

#%%

algorithms = {
    "lr":
    LogisticRegressionCV(n_jobs=-1, penalty="l2", solver="saga", verbose=True),
    "svc":
    SVC(C=10.0, kernel="rbf", gamma="auto", verbose=True),
    "rf":
    RandomForestClassifier(n_estimators=5000, n_jobs=-1),
    "mlp":
    MLPClassifier(hidden_layer_sizes=(100, 100)),
    "grb":
    GradientBoostingClassifier(n_estimators=1000),
    "auto":
    autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=10 * 3600)
}

# Resampling instead of using "class_weight" produces better results (empirically)
X_train, y_train = ros.fit_resample(X_train, y_train)
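For contrast with the comment above, the class_weight route the author found weaker would look roughly like this (a sketch, not a tuned configuration):

# Let the classifier reweight classes internally instead of oversampling.
lr_weighted = LogisticRegressionCV(n_jobs=-1, penalty="l2", solver="saga",
                                   class_weight="balanced")
lr_weighted.fit(X_train, y_train)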
Пример #49
0
     'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'],
     'dataset': 'classifier',
 },
 {
     'model':
     LogisticRegression(max_iter=100, multi_class='multinomial'),
     'methods': [
         'decision_function', 'predict', 'predict_proba',
         'predict_log_proba', 'score'
     ],
     'dataset':
     'classifier',
 },
 {
     'model':
     LogisticRegressionCV(max_iter=100),
     'methods': [
         'decision_function', 'predict', 'predict_proba',
         'predict_log_proba', 'score'
     ],
     'dataset':
     'classifier',
 },
 {
     'model': RandomForestRegressor(n_estimators=10),
     'methods': ['predict', 'score'],
     'dataset': 'regression',
 },
 {
     'model': LinearRegression(),
     'methods': ['predict', 'score'],
Пример #50
0
 def _reset_classifier(self) -> None:
     self.classifier = LogRegCV()
Пример #51
0
def tune_mahalanobis_hyperparams():
    def print_tuning_results(results, stypes):
        mtypes = ['FPR', 'DTERR', 'AUROC', 'AUIN', 'AUOUT']

        for stype in stypes:
            print(' OOD detection method: ' + stype)
            for mtype in mtypes:
                print(' {mtype:6s}'.format(mtype=mtype), end='')
            print('\n{val:6.2f}'.format(val=100. * results[stype]['FPR']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results[stype]['DTERR']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results[stype]['AUROC']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results[stype]['AUIN']),
                  end='')
            print(' {val:6.2f}\n'.format(val=100. * results[stype]['AUOUT']),
                  end='')
            print('')

    print('Tuning hyper-parameters...')
    stypes = ['mahalanobis']

    save_dir = os.path.join('output/hyperparams/', args.name, 'tmp')

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    normalizer = transforms.Normalize((125.3 / 255, 123.0 / 255, 113.9 / 255),
                                      (63.0 / 255, 62.1 / 255.0, 66.7 / 255.0))

    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    if args.in_dataset == "CIFAR-10":
        trainset = torchvision.datasets.CIFAR10('../../data',
                                                train=True,
                                                download=True,
                                                transform=transform)
        trainloaderIn = torch.utils.data.DataLoader(trainset,
                                                    batch_size=args.batch_size,
                                                    shuffle=True,
                                                    num_workers=2)

        testset = torchvision.datasets.CIFAR10(root='../../data',
                                               train=False,
                                               download=True,
                                               transform=transform)
        testloaderIn = torch.utils.data.DataLoader(testset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=2)

        num_classes = 10
    elif args.in_dataset == "CIFAR-100":
        trainset = torchvision.datasets.CIFAR100('./datasets/cifar100',
                                                 train=True,
                                                 download=True,
                                                 transform=transform)
        trainloaderIn = torch.utils.data.DataLoader(trainset,
                                                    batch_size=args.batch_size,
                                                    shuffle=True,
                                                    num_workers=2)

        testset = torchvision.datasets.CIFAR100(root='./datasets/cifar100',
                                                train=False,
                                                download=True,
                                                transform=transform)
        testloaderIn = torch.utils.data.DataLoader(testset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=2)

        num_classes = 100

    valloaderOut = torch.utils.data.DataLoader(
        TinyImages(transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.ToPILImage(),
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=2)

    model = dn.DenseNet3(args.layers, num_classes, normalizer=normalizer)

    checkpoint = torch.load(
        "./checkpoints/{name}/checkpoint_{epochs}.pth.tar".format(
            name=args.name, epochs=args.epochs))
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    model.cuda()

    # set information about feature extraction
    temp_x = torch.rand(2, 3, 32, 32)
    temp_x = Variable(temp_x)
    temp_list = model.feature_list(temp_x)[1]
    num_output = len(temp_list)
    feature_list = np.empty(num_output)
    count = 0
    for out in temp_list:
        feature_list[count] = out.size(1)
        count += 1

    print('get sample mean and covariance')
    sample_mean, precision = sample_estimator(model, num_classes, feature_list,
                                              trainloaderIn)

    print('train logistic regression model')
    m = 1000
    val_in = []
    val_out = []

    cnt = 0
    for data, target in trainloaderIn:
        for x in data:
            val_in.append(x.numpy())
            cnt += 1
            if cnt == m:
                break
        if cnt == m:
            break

    cnt = 0
    for data, target in valloaderOut:
        for x in data:
            val_out.append(x.numpy())
            cnt += 1
            if cnt == m:
                break
        if cnt == m:
            break

    train_lr_data = []
    train_lr_label = []
    train_lr_data.extend(val_in)
    train_lr_label.extend(np.zeros(m))
    train_lr_data.extend(val_out)
    train_lr_label.extend(np.ones(m))
    train_lr_data = torch.tensor(train_lr_data)
    train_lr_label = torch.tensor(train_lr_label)

    best_fpr = 1.1
    best_magnitude = 0.0

    for magnitude in np.arange(0, 0.0041, 0.004 / 20):
        train_lr_Mahalanobis = []
        total = 0
        for data_index in range(
                int(np.floor(train_lr_data.size(0) / args.batch_size))):
            data = train_lr_data[total:total + args.batch_size]
            total += args.batch_size
            Mahalanobis_scores = get_Mahalanobis_score(model, data,
                                                       num_classes,
                                                       sample_mean, precision,
                                                       num_output, magnitude)
            train_lr_Mahalanobis.extend(Mahalanobis_scores)

        train_lr_Mahalanobis = np.asarray(train_lr_Mahalanobis,
                                          dtype=np.float32)

        regressor = LogisticRegressionCV().fit(train_lr_Mahalanobis,
                                               train_lr_label)

        print('Logistic Regressor params:', regressor.coef_,
              regressor.intercept_)

        t0 = time.time()
        f1 = open(os.path.join(save_dir, "confidence_mahalanobis_In.txt"), 'w')
        f2 = open(os.path.join(save_dir, "confidence_mahalanobis_Out.txt"),
                  'w')
        ########################################In-distribution###########################################
        print("Processing in-distribution images")

        count = 0
        for i in range(int(m / args.batch_size) + 1):
            if i * args.batch_size >= m:
                break
            images = torch.tensor(
                val_in[i * args.batch_size:min((i + 1) * args.batch_size, m)])
            # if j<1000: continue
            batch_size = images.shape[0]

            Mahalanobis_scores = get_Mahalanobis_score(model, images,
                                                       num_classes,
                                                       sample_mean, precision,
                                                       num_output, magnitude)

            confidence_scores = regressor.predict_proba(Mahalanobis_scores)[:,
                                                                            1]

            for k in range(batch_size):
                f1.write("{}\n".format(-confidence_scores[k]))

            count += batch_size
            print("{:4}/{:4} images processed, {:.1f} seconds used.".format(
                count, m,
                time.time() - t0))
            t0 = time.time()

        ###################################Out-of-Distributions#####################################
        t0 = time.time()
        print("Processing out-of-distribution images")
        count = 0

        for i in range(int(m / args.batch_size) + 1):
            if i * args.batch_size >= m:
                break
            images = torch.tensor(
                val_out[i * args.batch_size:min((i + 1) * args.batch_size, m)])
            # if j<1000: continue
            batch_size = images.shape[0]

            Mahalanobis_scores = get_Mahalanobis_score(model, images,
                                                       num_classes,
                                                       sample_mean, precision,
                                                       num_output, magnitude)

            confidence_scores = regressor.predict_proba(Mahalanobis_scores)[:,
                                                                            1]

            for k in range(batch_size):
                f2.write("{}\n".format(-confidence_scores[k]))

            count += batch_size
            print("{:4}/{:4} images processed, {:.1f} seconds used.".format(
                count, m,
                time.time() - t0))
            t0 = time.time()

        f1.close()
        f2.close()

        results = metric(save_dir, stypes)
        print_tuning_results(results, stypes)
        fpr = results['mahalanobis']['FPR']
        if fpr < best_fpr:
            best_fpr = fpr
            best_magnitude = magnitude
            best_regressor = regressor

    print('Best Logistic Regressor params:', best_regressor.coef_,
          best_regressor.intercept_)
    print('Best magnitude', best_magnitude)

    return sample_mean, precision, best_regressor, best_magnitude
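Applying the tuned detector to a new batch mirrors the scoring done inside the tuning loop; a sketch assuming images is a tensor of test inputs and the values returned above are still in scope:

# Hypothetical use of the tuned Mahalanobis OOD detector on a batch `images`.
scores = get_Mahalanobis_score(model, images, num_classes,
                               sample_mean, precision,
                               num_output, best_magnitude)
# In-distribution images were labelled 0 and outliers 1 during tuning,
# so column 1 is the probability of being out-of-distribution.
ood_probability = best_regressor.predict_proba(scores)[:, 1]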
Пример #52
0
class PosTagger():
    """Part-of-speech(pos) tagger class for the English language"""
    def __init__(self):
        self.sentences = list()
        self.features = list()
        self.pos_labels = list()
        self.vectorizer = DictVectorizer()
        self.model = LogisticRegressionCV(random_state=123)

    def read_data(self, train_datapath):
        """Read sentences from given corpus data"""
        self.sentences = []
        with open(train_datapath, 'r') as infile:
            sent = []
            for line in infile:
                line = str.split(str.strip(line), '\t')
                if len(line) == 3:
                    token, tag_label = line[0], line[2]
                    sent.append((token, tag_label))
                    continue
                self.sentences.append(sent)
                sent = []
        print("-> %d sentences are read from '%s'." %
              (len(self.sentences), train_datapath))
        return

    def get_feature(self, token, token_index, sent):
        """Extract features of given word(token)"""
        token_feature = {
            'token':
            token,
            'is_first':
            token_index == 0,
            'is_last':
            token_index == len(sent) - 1,
            'is_capitalized':
            token[0].upper() == token[0],
            'is_all_capitalized':
            token.upper() == token,
            'is_capitals_inside':
            token[1:].lower() != token[1:],
            'is_numeric':
            token.isdigit(),
            'prefix-1':
            token[0],
            'prefix-2':
            '' if len(token) < 2 else token[:2],
            'suffix-1':
            token[-1],
            'suffix-2':
            '' if len(token) < 2 else token[-2:],
            'prev-token':
            '' if token_index == 0 else sent[token_index - 1][0],
            '2-prev-token':
            '' if token_index <= 1 else sent[token_index - 2][0],
            'next-token':
            '' if token_index == len(sent) - 1 else sent[token_index + 1][0],
            '2-next-token':
            '' if token_index >= len(sent) - 2 else sent[token_index + 2][0]
        }
        return token_feature

    def form_data(self):
        """Create datasets for training/evaluation/testing"""
        self.features = []
        self.pos_labels = []
        for sent in self.sentences:
            for token_index, token_pair in enumerate(sent):
                token = token_pair[0]
                self.features.append(self.get_feature(token, token_index,
                                                      sent))
                try:
                    pos_label = token_pair[1]
                    self.pos_labels.append(pos_label)
                except:
                    pass
        return

    def train(self, train_datapath):
        """Train part-of-speech(pos) tagger model"""
        self.read_data(train_datapath)
        self.form_data()
        print("-> Training phase is started.")
        t0 = time.time()
        self.model.fit(self.vectorizer.fit_transform(self.features),
                       self.pos_labels)
        print("-> Training is completed in %s secs." %
              (str(round(time.time() - t0, 3))))
        preds = self.model.predict(self.vectorizer.transform(self.features))
        acc_score = accuracy_score(self.pos_labels, preds)
        print("## Evaluation accuracy is %.2f on '%s'" %
              (acc_score, train_datapath))
        print()
        return

    def evaluate(self, datapath):
        """Evaluate the accuracy of trained part-of-speech(pos) tagger on given development/test corpus data"""
        self.read_data(datapath)
        self.form_data()
        preds = self.model.predict(self.vectorizer.transform(self.features))
        acc_score = accuracy_score(self.pos_labels, preds)
        print("## Evaluation accuracy is %.2f on '%s'" % (acc_score, datapath))
        print()
        return acc_score

    def test(self, datapath):
        """Measure various score values of part-of-speech(pos) tagger on given development/test corpus data"""
        self.read_data(datapath)
        self.form_data()
        preds = self.model.predict(self.vectorizer.transform(self.features))
        precision = precision_score(self.pos_labels, preds, average='micro')
        recall = recall_score(self.pos_labels, preds, average='micro')
        f1 = f1_score(self.pos_labels, preds, average='micro')
        accuracy = accuracy_score(self.pos_labels, preds)
        conf_matrix = confusion_matrix(self.pos_labels, preds)
        return precision, recall, f1, accuracy, conf_matrix

    def tag(self, sentence):
        """Tag single sentence"""
        self.sentences = list([sentence])
        self.form_data()
        preds = (self.model.predict(self.vectorizer.transform(self.features)))
        tagged_sent = list(zip(sentence, preds))
        return tagged_sent

    def tag_sents(self, sentences):
        """Tag multiple sentences"""
        tagged_sents = list()
        for sent in sentences:
            tagged_sents.append(self.tag(sent))
        return tagged_sents

    def save(self, save_path):
        """Save part-of-speech(pos) tagger"""
        with gzip.GzipFile(save_path, 'wb') as outfile:
            joblib.dump((self.vectorizer, self.model),
                        outfile,
                        compress=('gzip', 9))
        print("-> POS tagger is saved to '%s'" % save_path)
        return

    def load(self, load_path):
        """Load part-of-speech(pos) tagger"""
        with gzip.GzipFile(load_path, 'rb') as infile:
            self.vectorizer, self.model = joblib.load(infile)
        print("-> POS tagger is loaded from '%s'" % load_path)
        return
Пример #53
0
def main():
    # initial setup
    dataset_list = ['cifar10', 'cifar100', 'svhn']
    adv_test_list = ['FGSM', 'BIM', 'DeepFool', 'CWL2', 'PGD100']

    print('evaluate the LID estimator')
    score_list = [
        'LID_10', 'LID_20', 'LID_30', 'LID_40', 'LID_50', 'LID_60', 'LID_70',
        'LID_80', 'LID_90'
    ]
    list_best_results, list_best_results_index = [], []
    for dataset in dataset_list:
        print('load train data: ', dataset)
        outf = './adv_output/' + args.net_type + '_' + dataset + '/'

        list_best_results_out, list_best_results_index_out = [], []
        for out in adv_test_list:
            best_auroc, best_result, best_index = 0, 0, 0
            for score in score_list:
                print('load train data: ', out, ' of ', score)
                total_X, total_Y = lib_regression.load_characteristics(
                    score, dataset, out, outf)
                X_val, Y_val, X_test, Y_test = lib_regression.block_split_adv(
                    total_X, total_Y)
                pivot = int(X_val.shape[0] / 6)
                X_train = np.concatenate(
                    (X_val[:pivot], X_val[2 * pivot:3 * pivot],
                     X_val[4 * pivot:5 * pivot]))
                Y_train = np.concatenate(
                    (Y_val[:pivot], Y_val[2 * pivot:3 * pivot],
                     Y_val[4 * pivot:5 * pivot]))
                X_val_for_test = np.concatenate(
                    (X_val[pivot:2 * pivot], X_val[3 * pivot:4 * pivot],
                     X_val[5 * pivot:]))
                Y_val_for_test = np.concatenate(
                    (Y_val[pivot:2 * pivot], Y_val[3 * pivot:4 * pivot],
                     Y_val[5 * pivot:]))
                lr = LogisticRegressionCV(n_jobs=-1).fit(X_train, Y_train)
                y_pred = lr.predict_proba(X_train)[:, 1]
                #print('training mse: {:.4f}'.format(np.mean(y_pred - Y_train)))
                y_pred = lr.predict_proba(X_val_for_test)[:, 1]
                #print('test mse: {:.4f}'.format(np.mean(y_pred - Y_val_for_test)))
                results = lib_regression.detection_performance(
                    lr, X_val_for_test, Y_val_for_test, outf)
                if best_auroc < results['TMP']['AUROC']:
                    best_auroc = results['TMP']['AUROC']
                    best_index = score
                    best_result = lib_regression.detection_performance(
                        lr, X_test, Y_test, outf)
            list_best_results_out.append(best_result)
            list_best_results_index_out.append(best_index)
        list_best_results.append(list_best_results_out)
        list_best_results_index.append(list_best_results_index_out)

    print('evaluate the Mahalanobis estimator')
    score_list = ['Mahalanobis_0.0', 'Mahalanobis_0.01', 'Mahalanobis_0.005', \
                  'Mahalanobis_0.002', 'Mahalanobis_0.0014', 'Mahalanobis_0.001', 'Mahalanobis_0.0005']
    list_best_results_ours, list_best_results_index_ours = [], []
    for dataset in dataset_list:
        print('load train data: ', dataset)
        outf = './adv_output/' + args.net_type + '_' + dataset + '/'
        list_best_results_out, list_best_results_index_out = [], []
        for out in adv_test_list:
            best_auroc, best_result, best_index = 0, 0, 0
            for score in score_list:
                print('load train data: ', out, ' of ', score)
                total_X, total_Y = lib_regression.load_characteristics(
                    score, dataset, out, outf)
                X_val, Y_val, X_test, Y_test = lib_regression.block_split_adv(
                    total_X, total_Y)
                pivot = int(X_val.shape[0] / 6)
                X_train = np.concatenate(
                    (X_val[:pivot], X_val[2 * pivot:3 * pivot],
                     X_val[4 * pivot:5 * pivot]))
                Y_train = np.concatenate(
                    (Y_val[:pivot], Y_val[2 * pivot:3 * pivot],
                     Y_val[4 * pivot:5 * pivot]))
                X_val_for_test = np.concatenate(
                    (X_val[pivot:2 * pivot], X_val[3 * pivot:4 * pivot],
                     X_val[5 * pivot:]))
                Y_val_for_test = np.concatenate(
                    (Y_val[pivot:2 * pivot], Y_val[3 * pivot:4 * pivot],
                     Y_val[5 * pivot:]))
                lr = LogisticRegressionCV(n_jobs=-1).fit(X_train, Y_train)
                y_pred = lr.predict_proba(X_train)[:, 1]
                #print('training mse: {:.4f}'.format(np.mean(y_pred - Y_train)))
                y_pred = lr.predict_proba(X_val_for_test)[:, 1]
                #print('test mse: {:.4f}'.format(np.mean(y_pred - Y_val_for_test)))
                results = lib_regression.detection_performance(
                    lr, X_val_for_test, Y_val_for_test, outf)
                if best_auroc < results['TMP']['AUROC']:
                    best_auroc = results['TMP']['AUROC']
                    best_index = score
                    best_result = lib_regression.detection_performance(
                        lr, X_test, Y_test, outf)
            list_best_results_out.append(best_result)
            list_best_results_index_out.append(best_index)
        list_best_results_ours.append(list_best_results_out)
        list_best_results_index_ours.append(list_best_results_index_out)

    count_in = 0
    mtypes = ['TNR', 'AUROC', 'DTACC', 'AUIN', 'AUOUT']
    print("results of LID")
    for in_list in list_best_results:
        print('in_distribution: ' + dataset_list[count_in] + '==========')
        count_out = 0
        for results in in_list:
            print('out_distribution: ' + adv_test_list[count_out])
            for mtype in mtypes:
                print(' {mtype:6s}'.format(mtype=mtype), end='')
            print('\n{val:6.2f}'.format(val=100. * results['TMP']['TNR']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results['TMP']['AUROC']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results['TMP']['DTACC']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results['TMP']['AUIN']),
                  end='')
            print(' {val:6.2f}\n'.format(val=100. * results['TMP']['AUOUT']),
                  end='')
            print('Input noise: ' +
                  list_best_results_index[count_in][count_out])
            print('')
            count_out += 1
        count_in += 1

    count_in = 0
    print("results of Mahalanobis")
    for in_list in list_best_results_ours:
        print('in_distribution: ' + dataset_list[count_in] + '==========')
        count_out = 0
        for results in in_list:
            print('out_distribution: ' + adv_test_list[count_out])
            for mtype in mtypes:
                print(' {mtype:6s}'.format(mtype=mtype), end='')
            print('\n{val:6.2f}'.format(val=100. * results['TMP']['TNR']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results['TMP']['AUROC']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results['TMP']['DTACC']),
                  end='')
            print(' {val:6.2f}'.format(val=100. * results['TMP']['AUIN']),
                  end='')
            print(' {val:6.2f}\n'.format(val=100. * results['TMP']['AUOUT']),
                  end='')
            print('Input noise: ' +
                  list_best_results_index_ours[count_in][count_out])
            print('')
            count_out += 1
        count_in += 1
Пример #54
0
def QuickML_Ensembling(X_train,
                       y_train,
                       X_test,
                       y_test='',
                       modeltype='Regression',
                       Boosting_Flag=False,
                       scoring='',
                       verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set (numeric features only).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging1', model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('LassoLarsCV', model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10, -1, 50),
                         cv=scv,
                         random_state=seed)
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = rmse(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('LassoCV', model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = rmse(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('RidgeCV', model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging2', model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                min_samples_leaf=2, max_depth=1, random_state=seed),
                                       n_estimators=NUMS,
                                       random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print(
                '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                   estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging', model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20),
                                          cv=scv,
                                          scoring=scoring,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Logistic Regression', model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = accu(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('Linear Discriminant', model6, metrics2))
        if modeltype == 'Binary_Classification':
            float_cols = X_train.columns[(
                X_train.dtypes == float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
            if (X_train[float_cols + int_cols] <
                    0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = GaussianNB()
        else:
            float_cols = X_train.columns[(
                X_train.dtypes == float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
            if (X_train[float_cols + int_cols] <
                    0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = MultinomialNB()
        results3 = model7.fit(X_train, y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = accu(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('Naive Bayes', model7, metrics3))
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS,
                                          min_samples_leaf=2,
                                          random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging', model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                        n_estimators=NUMS,
                                        random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print(
                    '    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f'
                    % (estimator_names[0], metrics1, estimator_names[1],
                       metrics2, estimator_names[2], metrics3,
                       estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[results1, results2, results3, results4]
    if verbose == 1:
        print('    Time taken for Ensembling: %0.1f seconds' %
              (time.time() - start_time))
    return estimator_names, stacks


#########################################################
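A quick way to exercise QuickML_Ensembling on synthetic data; a sketch that assumes the helper metrics (accu, rmse) and the model imports used inside the function are available in the module:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=99)
X_demo = pd.DataFrame(X_demo, columns=['f%d' % i for i in range(X_demo.shape[1])])
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=99)

names, stacked = QuickML_Ensembling(X_tr, y_tr, X_te, y_te,
                                    modeltype='Classification',
                                    Boosting_Flag=False, verbose=2)
print(names, stacked.shape)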
Пример #55
0
datas['A16'] = label.fit_transform(df[classification])
# print(datas.info())

x = datas.iloc[:, :-1]
y = datas.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

# Train the logistic regression model
model = Pipeline([('ss', StandardScaler()),
                  ('lr',
                   LogisticRegressionCV(multi_class='ovr',
                                        fit_intercept=True,
                                        Cs=np.logspace(-4, 1, 50),
                                        penalty='l2',
                                        solver='lbfgs',
                                        tol=0.01))])

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

result = model.get_params()['lr']
print('r:', model.score(x_train, y_train))
print('coefficients:', result.coef_)
print('intercept:', result.intercept_)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=20,
                           algorithm='kd_tree',
                           weights='distance')
Пример #56
0
#         # Use cross-validation to select the regularization coefficient C
#         LR_model_2 = LogisticRegressionCV(Cs=[C_penalty], penalty='l2', solver='lbfgs', class_weight={1:bad_weight, 0:1})
#         LR_model_2_fit = LR_model_2.fit(X_train,y_train)
#         y_pred = LR_model_2_fit.predict_proba(X_test)[:,1]
#         scorecard_result = pd.DataFrame({'prob':y_pred, 'target':y_test})
#         performance = KS(scorecard_result,'prob','target')
#         # KS = performance['KS']
#         # KS = performance
#         model_parameter[(C_penalty, bad_weight)] = performance #KS
# sortparam = sorted(model_parameter,key=lambda x:x[1],reverse=True)
# print('sortedparam --> ',sortparam[0],model_parameter[sortparam[0]],sortparam[1],model_parameter[sortparam[0]])
# penalty,badWeight = sortparam[0]
LR_model_2 = LogisticRegressionCV(penalty='l2',
                                  solver='lbfgs',
                                  scoring='roc_auc',
                                  cv=3,
                                  class_weight={
                                      1: 10,
                                      0: 1
                                  })
LR_model_2_fit = LR_model_2.fit(X_train, y_train)
y_prob = LR_model_2_fit.predict_proba(X_test)[:, 1]
print('y_prob --> ', y_prob.shape)
y_pred = LR_model_2_fit.predict(X_test)
print('y_pred --> ', y_pred.shape)
scorecard_result = pd.DataFrame({
    'prob': y_prob,
    'target': y_test,
    'pred': y_pred
})
performance = KS(scorecard_result, 'prob', 'target')
print('ks --> ', performance)
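KS here is an external scorecard helper; the same Kolmogorov-Smirnov statistic can be recomputed directly from the ROC curve as a cross-check (a sketch, assuming binary 0/1 targets in y_test):

from sklearn.metrics import roc_curve

# KS equals the maximum gap between the cumulative score distributions of the
# two classes, i.e. the maximum of (TPR - FPR) over all thresholds.
fpr, tpr, _ = roc_curve(y_test, y_prob)
print('ks (from ROC) --> ', (tpr - fpr).max())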
Пример #57
0
# WOE (weight of evidence) encoding
woe = rpt.preprocessing.WeightOfEvidence(categorical_features=categorical_var,
                                         encoder_na=False)
X = woe.fit_transform(X, y)

# Discretization
#dis=rpt.preprocessing.Discretization(continous_features=continuous_var)
#X2=dis.fit_transform(X,y)

# Fill missing values and scale
X = X.fillna(-99)
X[continuous_var] = preprocessing.MinMaxScaler().fit_transform(
    X[continuous_var])


clfs = {'LogisticRegression': LogisticRegressionCV(),
        'RandomForest': RandomForestClassifier(),
        'GradientBoosting': GradientBoostingClassifier()}
y_preds, y_probas = {}, {}
for clf in clfs:
    clfs[clf].fit(X, y)
    y_preds[clf] = clfs[clf].predict(X)
    y_probas[clf] = clfs[clf].predict_proba(X)[:, 1]

models_report, conf_matrix = rpt.ClassifierReport(y, y_preds, y_probas)
print(models_report)

# Information-theoretic measures
p = y_probas['LogisticRegression'][y == 1]
q = y_probas['LogisticRegression'][y == 0]
print(rpt.metrics.entropyc.kl_div(p, q))
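rpt.metrics.entropyc.kl_div is assumed to estimate the divergence between the score distributions of the two classes; a rough histogram-based equivalent, shown only to make the quantity concrete (bin count and smoothing are arbitrary choices):

import numpy as np

def kl_div_hist(p_scores, q_scores, bins=20, eps=1e-9):
    """Histogram estimate of KL(P || Q) between two samples of scores in [0, 1]."""
    edges = np.linspace(0.0, 1.0, bins + 1)
    p_hist, _ = np.histogram(p_scores, bins=edges)
    q_hist, _ = np.histogram(q_scores, bins=edges)
    p_prob = (p_hist + eps) / (p_hist + eps).sum()
    q_prob = (q_hist + eps) / (q_hist + eps).sum()
    return float(np.sum(p_prob * np.log(p_prob / q_prob)))

print(kl_div_hist(p, q))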
Пример #58
0
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dataprocess import get_data_two
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score
import numpy as np

stand = StandardScaler()
x, y = get_data_two('/home/cooper/PycharmProjects/sxyl/Assignments/iris2.txt')

x = stand.fit_transform(X=x)

print(x.shape)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
model = LogisticRegressionCV(multi_class="ovr",
                             fit_intercept=True,
                             Cs=np.logspace(-2, 2, 20),
                             cv=2,
                             penalty="l2",
                             solver="lbfgs",
                             tol=0.01)

result = model.fit(x_train, y_train)

s = result.score(x_train, y_train)
print(s)
y_pre = model.predict(x_test)
print('recall: ', recall_score(y_test, y_pre))
print('precision', precision_score(y_test, y_pre))
print('f1_score', f1_score(y_test, y_pre))
Пример #59
0
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True, 
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree. 
                        FP 2004 (Sec. 2)
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when 
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear 
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of 
                        terminal nodes based on an exponential distribution about tree_size. 
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        random_state:   Integer to initialise random objects and provide repeatability.
        tree_generator: Optional: this object will be used as provided to generate the rules. 
                        This will override almost all the other properties above. 
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 random_state=None):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.winsorizer = Winsorizer(trim_quantile=lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        self.stddev = None
        self.mean = None
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.max_rules = max_rules
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.Cs = Cs

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = np.asarray([
                    2 + np.floor(tree_sizes[i_])
                    for i_ in np.arange(len(tree_sizes))
                ],
                                        dtype=int)
                i = int(len(tree_sizes) / 4)
                while np.sum(tree_sizes[0:i]) < self.max_rules:
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                self.tree_generator.set_params(warm_start=False)
                curr_est_ = 0
                for i_size in np.arange(len(tree_sizes)):
                    size = tree_sizes[i_size]
                    self.tree_generator.set_params(n_estimators=curr_est_ + 1)
                    self.tree_generator.set_params(max_leaf_nodes=size)
                    random_state_add = self.random_state if self.random_state else 0
                    self.tree_generator.set_params(
                        random_state=i_size + random_state_add
                    )  # warm_start=True seems to reset random_state, such that the trees are highly correlated, unless we manually change the random_state here.
                    self.tree_generator.get_params()['n_estimators']
                    self.tree_generator.fit(np.copy(X, order='C'),
                                            np.copy(y, order='C'))
                    curr_est_ = curr_est_ + 1
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          RandomForestRegressor) or isinstance(
                              self.tree_generator, RandomForestClassifier):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:

            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                alphas = 1. / self.Cs
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            rule_coefs = self.coef_[-len(self.rule_ensemble.rules):]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def predict_proba(self, X):
        """Predict probability of outcome for X

        """
        if self.rfmode == 'regress':
            raise ValueError(
                "Probaility prediction only works for classification tasks.")
        else:
            X_concat = np.zeros([X.shape[0], 0])
            if 'l' in self.model_type:
                if self.lin_standardise:
                    X_concat = np.concatenate(
                        (X_concat, self.friedscale.scale(X)), axis=1)
                else:
                    X_concat = np.concatenate((X_concat, X), axis=1)
            if 'r' in self.model_type:
                rule_coefs = self.coef_[-len(self.rule_ensemble.rules):]
                if len(rule_coefs) > 0:
                    X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                    if X_rules.shape[0] > 0:
                        X_concat = np.concatenate((X_concat, X_rules), axis=1)
            return self.lscv.predict_proba(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero.

        subregion: If None (default) returns global importances (FP 2004 eq. 28/29), else returns importance over 
                           subregion of inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
               the coefficients and 'support' the support of the rule in the training
               data set (X)
        """

        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                subregion = np.array(subregion)
                importance = sum(
                    abs(coef) *
                    abs([x[i] for x in self.winsorizer.trim(subregion)] -
                        self.mean[i])) / len(subregion)
            output_rules += [(self.feature_names[i], 'linear', coef, 1,
                              importance)]

        ## Add rules
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            coef = self.coef_[i + n_features]

            if subregion is None:
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support))**(1 / 2)
            else:
                rkx = rule.transform(subregion)
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)

            output_rules += [(rule.__str__(), 'rule', coef, rule.support,
                              importance)]
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            rules = rules[rules.coef != 0]
        return rules
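A minimal way to exercise the class above in classification mode; a sketch assuming the module's companion objects (RuleEnsemble, Winsorizer, FriedScale) are importable alongside it:

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=400, n_features=8, random_state=0)

rf_clf = RuleFit(rfmode='classify', max_rules=200, random_state=0)
rf_clf.fit(X_demo, y_demo)

rules = rf_clf.get_rules(exclude_zero_coef=True)
print(rules.sort_values('importance', ascending=False).head())
print(rf_clf.predict_proba(X_demo)[:5])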
Пример #60
0
class DCTrainer:
    """ Trains Diagnostic Classifiers (DC) on extracted activation data.

    For each activation that is part of the provided activation_names
    argument a different classifier will be trained.

    Parameters
    ----------
    save_dir : str, optional
        Directory to which trained models will be saved, if provided.
    corpus : Corpus
        Corpus containing the token labels for each sentence.
    activation_names : List[ActivationName]
        List of activation names on which classifiers will be trained.
    activations_dir : str, optional
        Path to folder containing the activations to train on. If not
        provided newly extracted activations will be saved to
        `save_dir`.
    test_activations_dir : str, optional
        Directory containing the extracted test activations. If not
        provided the train activation set will be split and partially
        used as test set.
    test_corpus : Corpus, optional
        Corpus containing the test labels for each sentence. If
        provided without `test_activations_dir` newly extracted
        activations will be saved to `save_dir`.
    model : LanguageModel, optional
        LanguageModel that should be provided if new activations need
        to be extracted prior to training the classifiers.
    selection_func : SelectFunc, optional
        Selection function that determines whether a corpus item should
        be taken into account for training. If such a function has been
        used during extraction, make sure to pass it along here as well.

    Attributes
    ----------
    data_loader : DataLoader
        Class that reads and preprocesses activation data.
    classifier : Classifier
        Current classifier that is being trained.
    """

    def __init__(
        self,
        save_dir: str,
        corpus: Corpus,
        activation_names: ActivationNames,
        activations_dir: Optional[str] = None,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        model: Optional[LanguageModel] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

        activations_dir, test_activations_dir = self._extract_activations(
            save_dir,
            corpus,
            activation_names,
            selection_func,
            activations_dir,
            test_activations_dir,
            test_corpus,
            model,
        )

        self.activation_names = activation_names
        self.data_loader = DataLoader(
            activations_dir,
            corpus,
            test_activations_dir=test_activations_dir,
            test_corpus=test_corpus,
            selection_func=selection_func,
        )
        self.classifier = LogRegCV()

    def train(
        self,
        calc_class_weights: bool = False,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> None:
        """ Trains DCs on multiple activation names.

        Parameters
        ----------
        calc_class_weights : bool, optional
            Set to True to calculate the classifier class weights based on
            the corpus class frequencies. Defaults to False.
        data_subset_size : int, optional
            Size of the subset on which training will be performed. Defaults
            to the full set of activations.
        train_test_split : float, optional
            Fraction of the data used for training; the remainder is used
            as the test set. If separate test activations are provided,
            this split is not applied. Defaults to 0.9.
        """
        for activation_name in self.activation_names:
            self._train(
                activation_name,
                calc_class_weights=calc_class_weights,
                data_subset_size=data_subset_size,
                train_test_split=train_test_split,
            )

    def _train(
        self,
        activation_name: ActivationName,
        calc_class_weights: bool = False,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> None:
        """ Initiates training the DC on 1 activation type. """
        self._reset_classifier()

        data_dict = self.data_loader.create_data_split(
            activation_name, data_subset_size, train_test_split
        )

        # Calculate class weights
        if calc_class_weights:
            self._set_class_weights(data_dict["train_y"])

        # Train
        self._fit(data_dict["train_x"], data_dict["train_y"], activation_name)
        results = self._eval(data_dict["test_x"], data_dict["test_y"])

        if self.save_dir is not None:
            self._save(results, activation_name)

    def _fit(
        self, train_x: Tensor, train_y: Tensor, activation_name: ActivationName
    ) -> None:
        start_time = time()
        print(f"\nStarting fitting model on {activation_name}...")

        self.classifier.fit(train_x, train_y)

        print(f"Fitting done in {time() - start_time:.2f}s")

    def _eval(self, test_x: Tensor, test_y: Tensor) -> Dict[str, Any]:
        pred_y = self.classifier.predict(test_x)

        acc = accuracy_score(test_y, pred_y)
        cm = confusion_matrix(test_y, pred_y)

        results = {"accuracy": acc, "confusion matrix": cm}
        for k, v in results.items():
            print(k, v, "", sep="\n")
        results["pred_y"] = pred_y

        return results

    def _save(self, results: Dict[str, Any], activation_name: ActivationName) -> None:
        layer, name = activation_name

        preds_path = os.path.join(self.save_dir, f"{name}_l{layer}_results.pickle")
        model_path = os.path.join(self.save_dir, f"{name}_l{layer}.joblib")

        dump_pickle(results, preds_path)
        joblib.dump(self.classifier, model_path)

    def _reset_classifier(self) -> None:
        self.classifier = LogRegCV()

    def _set_class_weights(self, train_y: Tensor) -> None:
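        # Compute per-class weights as the normalized label frequencies of the
        # training set and store them on the classifier before fitting.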
        classes, class_freqs = torch.unique(train_y, return_counts=True)
        norm = class_freqs.sum().item()
        class_weight = {
            classes[i].item(): class_freqs[i].item() / norm
            for i in range(len(class_freqs))
        }
        self.classifier.class_weight = class_weight

    @staticmethod
    def _extract_activations(
        save_dir: str,
        corpus: Corpus,
        activation_names: ActivationNames,
        selection_func: SelectFunc,
        activations_dir: Optional[str],
        test_activations_dir: Optional[str],
        test_corpus: Optional[Corpus],
        model: Optional[LanguageModel],
    ) -> Tuple[str, Optional[str]]:
        if activations_dir is None:
            activations_dir = os.path.join(save_dir, "activations")
            simple_extract(
                model, activations_dir, corpus, activation_names, selection_func
            )

        if test_corpus is not None and test_activations_dir is None:
            test_activations_dir = os.path.join(save_dir, "test_activations")
            simple_extract(
                model,
                test_activations_dir,
                test_corpus,
                activation_names,
                selection_func,
            )

        return activations_dir, test_activations_dir
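A minimal usage sketch of the class above. The `corpus` and `model` objects are assumed to come from the surrounding library and are not constructed here; the activation-name tuples are illustrative (layer, name) pairs rather than values prescribed by the source.

# Hypothetical end-to-end call; `corpus` and `model` are assumed to exist.
trainer = DCTrainer(
    save_dir="dc_models",
    corpus=corpus,
    activation_names=[(0, "hx"), (1, "hx")],  # illustrative (layer, name) pairs
    model=model,
)

# One diagnostic classifier is trained per activation name; the fitted model
# and its evaluation results are written to `save_dir` as
# <name>_l<layer>.joblib and <name>_l<layer>_results.pickle.
trainer.train(calc_class_weights=False, train_test_split=0.9)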