Example #1
def feature_selection_class(predictors, responses, test_predictors,
                            selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40)
        t = 500  # number of features to select
        model = SelectKBest(chi2, k=t).fit(predictors.replace(-1, 0),
                                           responses)  # chi2 requires non-negative inputs
        # print(model.scores_)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    elif selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)

    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new,
                                  index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
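For context, a minimal self-contained sketch of the SelectKBest/chi2 pattern used by the selectFeatTech == 0 branch above; the toy shapes and column names here are illustrative assumptions, not part of the original example:

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

rng = np.random.RandomState(0)
train = pd.DataFrame(rng.randint(0, 5, size=(20, 6)),
                     columns=["f%d" % i for i in range(6)])
test = pd.DataFrame(rng.randint(0, 5, size=(5, 6)), columns=train.columns)
y = rng.randint(0, 2, size=20)

model = SelectKBest(chi2, k=3).fit(train, y)  # chi2 requires non-negative inputs
cols = train.columns[model.get_support(indices=True)]
train_new = pd.DataFrame(model.transform(train), index=train.index, columns=cols)
test_new = pd.DataFrame(model.transform(test), index=test.index, columns=cols)
print(train_new.shape, test_new.shape)  # (20, 3) (5, 3)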
Example #2
def stability(X, Y):
    from sklearn.linear_model import RandomizedLogisticRegression

    clf = RandomizedLogisticRegression(random_state=1)
    clf.fit(X, Y)

    return clf.scores_
Example #3
def randomized_Logistic_regression(self):
    X = self.data[:, 1:len(self.data[0])]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    a = randomized_logistic.get_support()
    selected = np.where(a)
    return selected
Example #4
def run(self, df, target_label):
    target = df[target_label]
    feature = df.drop(target_label, axis=1)
    clf = RandomizedLogisticRegression()
    for col in feature.columns:
        if np.any(np.isnan(feature[col].values)) or np.any(
                np.isinf(feature[col].values)):
            print(list(feature[col].values))
    try:
        clf.fit(feature.values, target.values)
    except Exception:
        # dump the offending columns before re-raising
        for col in feature.columns:
            print(list(feature[col].values))
        raise
    scores = {}
    for col_index in range(len(feature.columns)):
        scores[feature.columns[col_index]] = abs(clf.scores_[col_index])
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(scores)
    position = {col: rank for rank, (col, _) in enumerate(scores)}
    print(position)
    return position
Example #5
def feature_select_RLR():
    data_x, data_y, names = get_data()
    rlr = RLR()
    rlr.fit(data_x, data_y)
    return sorted(zip(names, map(lambda x: round(x, 4), rlr.scores_)),
                  key=lambda x: x[1],
                  reverse=True)
Example #6
def Feature_sort(Feat_scale, Label, threads=4):  ## rank features with three feature-selection methods

    ranks = {}
    ## Univariate feature selection
    Selector = SelectKBest(f_classif, k='all')
    Selector.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(Selector.pvalues_)

    ## Randomized logistic regression; a larger n_resampling gives a more robust result
    ## From roughly feature 1900 onward, the ranking becomes questionable.
    rlogreg = RandomizedLogisticRegression(n_jobs=1,
                                           n_resampling=2000,
                                           selection_threshold=0,
                                           verbose=False,
                                           random_state=0)
    ##DeprecationWarning: Class RandomizedLogisticRegression is deprecated; The class RandomizedLogisticRegression is deprecated in 0.19 and will be removed in 0.21.
    ##warnings.warn(msg, category=DeprecationWarning)
    rlogreg.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(rlogreg.scores_))

    ## boruta based on randomforest n_jobs=**
    rf = RandomForestClassifier(random_state=0,
                                n_jobs=threads,
                                max_features='auto')
    feat_selector = BorutaPy(rf, n_estimators='auto', perc=80, random_state=0)
    feat_selector.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(feat_selector.ranking_)

    return ranks
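The comments above note that RandomizedLogisticRegression was deprecated in scikit-learn 0.19 and removed in 0.21. On newer versions, a rough hand-rolled substitute (a sketch under that assumption, not anything from the original snippet) subsamples rows and counts how often each feature keeps a nonzero L1-penalized coefficient:

import numpy as np
from sklearn.linear_model import LogisticRegression

def stability_scores(X, y, n_resampling=200, sample_fraction=0.75,
                     C=1.0, random_state=0):
    """Fraction of row subsamples in which each feature keeps a nonzero L1 coefficient."""
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        idx = rng.choice(n_samples, size=int(sample_fraction * n_samples), replace=False)
        clf = LogisticRegression(penalty="l1", solver="liblinear", C=C)
        clf.fit(X[idx], y[idx])
        # a feature "survives" if any class keeps a nonzero coefficient for it
        counts += np.abs(clf.coef_).max(axis=0) > 1e-9
    return counts / n_resampling

Scores near 1.0 mark features selected in almost every resample, playing the same role as rlogreg.scores_ above (X and y are assumed to be NumPy arrays).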
Example #8
def feature_selection(train, test, y):
    print("feature selection")
    clf = RLR(C=10, scaling=0.5, sample_fraction=0.6, n_resampling=200,
              selection_threshold=0.4, n_jobs=3)
    clf.fit(train, y)
    train = clf.transform(train)
    test = clf.transform(test)

    return train, test
Example #9
def randlogistic(self, selection_threshold=0.25, sample_fraction=0.75):
    rlr_model = RandomizedLogisticRegression(
        C=self.C,
        selection_threshold=selection_threshold,
        normalize=False,
        sample_fraction=sample_fraction)
    rlr_model.fit(self.data.values, self.target.values)
    return rlr_model
Example #10
def get_features(X_train, y_train, names, selection_threshold=0.2):
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    randomized_logistic = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    randomized_logistic.fit(X_train, y_train)
    mask = randomized_logistic.get_support()
    features = np.array(names)[mask]
    print('found {} ngrams:'.format(len(features)))
    print(list(features))
    return features
Example #11
def evaluate_stability(vocab, id_to_vec, mesh_to_id):
    labels = ('Male', 'Female', 'Both')
    Xs, ids = get_basic_Xs(id_to_vec, mesh_to_id, shuffle=True)
    Xtr, Ytr, Itr, Xte, Yte, Ite = get_test_train(labels, ids, Xs, 5)
    print('Fitting RandomizedLR...')
    logreg = RandomizedLogisticRegression(verbose=True,
                                          n_resampling=1000,
                                          n_jobs=16)
    logreg.fit(Xtr, Ytr)
    scores = logreg.scores_
    return {vocab[i]: score for i, score in enumerate(scores)}
Example #12
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    # print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    # print "\t\t\tfitting LR model..."
    clf.fit(x_imp.T, y_v)
    feature_importances = DataFrame(clf.scores_, index=df_filtered.index, columns=['importance'])
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
Example #13
def classify_logistic():
    print("logistic regression")
    (X_train, y_train), (X_test, y_test) = util.load_all_feat()
    print("original X_train shape", X_train.shape)
    selector = RandomizedLogisticRegression(n_jobs=2)
    selector.fit(X_train, y_train)
    # RandomizedLogisticRegression only selects features; a separate
    # classifier is needed to actually predict.
    clf = LogisticRegression()
    clf.fit(selector.transform(X_train), y_train)
    pred = clf.predict(selector.transform(X_test))
    print("accuracy score:", accuracy_score(y_test, pred))
Example #14
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''
    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''

    logiRegObj = RandomizedLogisticRegression()
    logiRegObj.fit(allFeatureParam, allLabelParam)
    ### Output ###
    #print "Model score: ", logiRegObj.scores_
    eligible_indices = logiRegObj.get_support(indices=True)
    return eligible_indices
Example #15
def randomlr(train_x, train_y, cv_x, test_x, regp, alpha=0.5):
    # Create the randomized logistic regression object with all the
    # parameters for the fit
    randomlr = RandomizedLogisticRegression(C=regp, scaling=alpha,
                                            fit_intercept=True,
                                            sample_fraction=0.75,
                                            n_resampling=200)

    # Fit the training data to the labels and select features
    randomlr = randomlr.fit(train_x, train_y)

    train_x = randomlr.transform(train_x)
    cv_x = randomlr.transform(cv_x)
    test_x = randomlr.transform(test_x)

    return train_x, cv_x, test_x
Example #17
def test_rflasso():
    train_X, test_X, train_Y, test_Y = train_test_split(index_data,
                                                        index_lable,
                                                        test_size=0.25,
                                                        random_state=1)
    from sklearn.linear_model import RandomizedLogisticRegression
    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
Example #18
def lasso_regression(X, y):
    """
	Use Randomized Logistic Regression to select the features based on the coefficient values
	"""

    clf = RandomizedLogisticRegression(C=1.0)
    clf.fit(X, y)
    print('Number of non zero valued coefficients: ', np.sum(clf.scores_ > 0))
    imp_feature_idx = clf.scores_.argsort()

    qualities = []

    X_train, X_test, y_train, y_test = split_examples(X, y)

    for i in range(0, 100, 4):
        clf = LogisticRegression(C=0.1)
        clf.fit(X_train[:, imp_feature_idx[i:]], y_train)
        q = roc_auc_score(
            y_test,
            clf.predict_proba(X_test[:, imp_feature_idx[i:]])[:, 1])

        qualities.append(q)
    plt.plot(range(0, 100, 4), qualities)
    plt.show()

    return qualities
Example #19
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the 
    RandomizedLogisticRegression algorithm.
    """
    
    rlr = RandomizedLogisticRegression(C=[0.001, 0.01, 0.1, 1.], 
                                       sample_fraction=0.7,
                                       n_resampling=200, selection_threshold=0.25,
                                       verbose=5, n_jobs=-1, random_state=0)                                   
    rlr.fit(X_train, y_train)
    np.save('save/feat_sel_log_reg.npy', rlr.scores_)
    
    return rlr.scores_
Example #20
def select_features(X, y):
    '''
    Select the relevant features from X that are useful for predicting
    the labels in y.

    Args:
        X: numpy 2D array containing input features
        y: numpy 1D array containing labels

    Returns:
        feature_list: List of indices of the selected important features
    '''

    # Get the selection model (stability selection)
    selection_model = RandomizedLogisticRegression(random_state=0)
    selection_model.fit(X, y)

    # Use a cross validated logistic regression to choose the importance
    # threshold at which a feature is included
    step_size = 50
    max_weight = int(max(selection_model.scores_)) + 1
    trial_thresholds = [
        i / step_size for i in range(1, max_weight * step_size + 1)
    ]
    threshold = 0
    max_score = 0
    for trial in trial_thresholds:
        selected_features = [
            i for i, score in enumerate(selection_model.scores_)
            if score > trial
        ]
        if len(selected_features) > 0:
            X_reduced = X[:, selected_features]
            model = LogisticRegression(multi_class='multinomial',
                                       class_weight='balanced',
                                       solver='newton-cg',
                                       random_state=0,
                                       max_iter=1000)
            scores = cross_val_score(model, X_reduced, y, cv=5)
            score = scores.mean()
            if score >= max_score:
                max_score = score
                threshold = trial  # trial values are already divided by step_size

    importance = {i: s for i, s in enumerate(selection_model.scores_)}
    return [
        i for i, score in enumerate(selection_model.scores_)
        if score > threshold
    ]
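A hypothetical driver for select_features, assuming an environment old enough (scikit-learn < 0.21) to still ship RandomizedLogisticRegression:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
kept = select_features(X, y)  # indices of features scoring above the tuned threshold
print("kept feature indices:", kept)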
Example #21
def learning_curves(X, y, clf, params, train_sizes=None, feature_selection=False, n_folds=3, scoring='accuracy'):
    """
    Builds learning curves on the test set, with parameters chosen on the train and validation sets using nested cross-validation
    :param X: data
    :param y: labels
    :param clf: classifier
    :param params: parameters for grid search
    :param train_sizes: train sizes for building learning curves
    :param feature_selection: whether to choose features by randomized logistic regression
    :param n_folds: number of outer cv folds
    :param scoring: scoring metric
    :return: train and test curves
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.5, 1.0, 5)

    kf = KFold(X.shape[0], n_folds=n_folds)

    train_curve = np.zeros_like(train_sizes)
    test_curve = np.zeros_like(train_sizes)

    for train_inds, test_inds in kf:
        train_data = X[train_inds]
        test_data = X[test_inds]
        train_labels = y[train_inds]
        test_labels = y[test_inds]

        if feature_selection:
            rlr = RandomizedLogisticRegression()
            rlr.fit(train_data, train_labels)

            inds = [i for i in range(X.shape[1]) if rlr.all_scores_[i] > 0.0]
            print(len(inds), 'features chosen')
            train_data = train_data[:, inds]

        gs = GridSearchCV(clf, params, scoring=scoring, cv=5)
        gs.fit(train_data, train_labels)
        bp = gs.best_params_
        print('chosen params:', bp)

        for p in bp:
            setattr(clf, p, bp[p])
        lc = learning_curve(clf, test_data, test_labels, scoring=scoring, train_sizes=train_sizes)
        train_curve += lc[1].mean(axis=1)
        test_curve += lc[2].mean(axis=1)

    train_curve /= n_folds
    test_curve /= n_folds
    return train_curve, test_curve
Example #23
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40)
        t = 40
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    elif selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
Example #24
def predictWithAdaBoost(config, X, Y, testFeatures):
    adaConfig = config.getConfig('model/adaboost')
    if adaConfig.get('useRandomLog', False):
        clf = RandomizedLogisticRegression()
        clf.fit(X, Y)
        X_new = clf.transform(X)
        if X_new.size != 0:
            X = X_new
            testFeatures = clf.transform(testFeatures)
    clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    clf.fit(X, Y)
    return clf.predict(testFeatures)
Example #25
def perform_stability_selection(X_train, y_train, round_id=0):
    # Defaults: RandomizedLasso(alpha='aic', scaling=0.5, sample_fraction=0.75, n_resampling=200, n_jobs=1)
    X_train = perform_scaling(X_train, scaling='minmax')

    # logistic = LogisticRegression(penalty='l2', class_weight='auto', max_iter=1000, random_state=30)
    # logistic.fit(X_train, y_train)
    print("Round%d - Stability selection -" % (round_id))
    # print("Logistic (L1 penalty) Feature_Importances: ",
    #       sorted(zip(map(lambda x: round(x, 5), logistic.coef_), header[1:]), reverse=True))
    # print("Logistic Feature_Importances: ", logistic.coef_)

    rlog = RandomizedLogisticRegression(random_state=30, n_jobs=3, n_resampling=400)
    rlog.fit(X_train, y_train)
    print("Randomized Logistic Feature_Importances: ", rlog.scores_)
    print("Randomized Logistic Feature_Importances: ",
          sorted(zip(map(lambda x: round(x, 5), rlog.scores_), header[1:]),
                 reverse=True))
Example #26
def build_classifier(definition, datas):

    if definition['classification'] == 'lr':
        classifier = LogisticRegression(C=1.5)
    elif definition['classification'] == 'sgd':
        classifier = SGDClassifier(alpha=0.0001, n_iter=10**2)
    elif definition['classification'] == 'sgd_grid':

        best_params = grid_search_params(datas)
        classifier = SGDClassifier(n_iter=10**2, **best_params)

    rlr_feature_selection = RandomizedLogisticRegression(C=1.5,
                                                         n_jobs=-1,
                                                         verbose=0)

    # Standard sklearn classifier

    clf = Pipeline([
        #        ('string_encoder', pp_encode_strings),
        #        ('drop_nan_cols', pp_drop_nan_cols),
        #        ('fix_collinear', pp_fix_collinear),
        #
        #        ('float_imputer', pp_imputer),
        #        ('scaler', pp_scaler),
        # ('feature_selection', rlr_feature_selection),
        ('classification', classifier)
    ])

    return clf
Example #27
def select_top_features(X, y, ques_dict, top_count=7):
    '''
    Run RandomizedLogisticRegression and return top number of features
    
    Args:
        X(dataframe) -- features
        y(dataframe) -- outcome
        ques_dict(dict) -- variable name to questions dictionary
        top_count(int) -- number of top features to return
    '''
    rand_log = RandomizedLogisticRegression()
    X_feat = rand_log.fit(X, y)
    questions = features_to_questions(X.columns, ques_dict)
    all_features = sorted(zip(questions, X_feat.scores_), key=lambda tup: tup[1], reverse=True)
    top_features = [f for f in all_features if f[1] > 0][:top_count]
    return top_features
Example #28
def _get_clfs(self):
    clf_dict = {"rlrclf": RandomizedLogisticRegression(),
                "rfclf": RandomForestClassifier(criterion='entropy'),
                "dtrclf": DecisionTreeClassifier(criterion='entropy'),
                "lrclf": LogisticRegression()
                }
    return clf_dict
Example #29
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    model_map = {
        'classifier': {
            'SelectFromModel':
            SelectFromModel(RandomForestClassifier(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1),
                           step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel':
            SelectFromModel(RandomForestRegressor(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1),
                           step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
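A hypothetical lookup against the map above; the keys are exactly those defined in model_map:

selector = get_feature_selection_model_from_name('classifier', 'RFECV')
# selector.fit_transform(X, y) would then reduce X; the 'KeepAll' entry, by contrast,
# is a plain string sentinel that the caller has to special-case.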
Example #30
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel':
            SelectFromModel(RandomForestClassifier(n_jobs=-1,
                                                   max_depth=10,
                                                   n_estimators=15),
                            threshold='20*mean'),
            'RFECV':
            RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect':
            GenericUnivariateSelect(),
            'RandomizedSparse':
            RandomizedLogisticRegression(),
            'KeepAll':
            'KeepAll'
        },
        'regressor': {
            'SelectFromModel':
            SelectFromModel(RandomForestRegressor(n_jobs=-1,
                                                  max_depth=10,
                                                  n_estimators=15),
                            threshold='0.7*mean'),
            'RFECV':
            RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect':
            GenericUnivariateSelect(),
            'RandomizedSparse':
            RandomizedLasso(),
            'KeepAll':
            'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Example #32
def predictWithQDA(config, X, Y, testFeatures):
    qdaConfig = config.getConfig('model/qda')
    if qdaConfig.get('useRandomLog', False):
        clf = RandomizedLogisticRegression()
        clf.fit(X, Y)
        X_new = clf.transform(X)
        if X_new.size != 0:
            X = X_new
            testFeatures = clf.transform(testFeatures)

    priors = qdaConfig.get('priors', None)
    clf = QDA(priors = priors)
    clf.fit(X, Y)
    return clf.predict(testFeatures)
Example #33
class LogisticModelBuilder(object):

	def __init__(self):
		self.inter_levels = None
		self.dicts_rep = None
		self.dict_vectorizer = DictVectorizer()
		self.ff_model = None
		self.model = None
		
	def set_data(self, user_atts, inter_atts, responses):
		self.build_data_representations(user_atts, inter_atts)
		# Convert from dict representation into matrix:
		predictor_rows = self.dict_vectorizer.fit_transform(self.dicts_rep).toarray()
		print(predictor_rows)
		print('Finding optimal feature set...')
		self.ff_model = RandomizedLogisticRegression() # Finds best set of features
		# Fit data and get transformed input rows:
		X_new = self.ff_model.fit_transform(predictor_rows, responses)
		print(X_new)
		print('Done! Final Shape: ' + str(X_new.shape))
		print('Building Final model...')		  
		self.model = LogisticRegression().fit(X_new, responses)
		print('Done!')
	
	# Set data based on tuples/rows
	def set_data_rows(self, tuples):
		self.set_data(*ut.unzip(tuples))
	
	# Builds a list-of-dictionaries representation and builds 
	# msg/interaction factor level matrix.
	def build_data_representations(self, user_atts, inter_atts):
		print('Building internal data representations...')
		print('   Building factor level matrix...')
		itp = [set(col) for col in zip(*inter_atts)]  # transpose and get per-attribute value sets
		self.inter_levels = [lv if any(isinstance(v, str) for v in lv) else (min(lv), max(lv)) for lv in itp]
		print('   Building dict list representation...')
		self.dicts_rep = dict_list_representation(user_atts, inter_atts)
		print('Done!')	
	
	# Returns a function of form f: X x Y -> P
	# where X = <user_att vals>, Y = <inter. att vals>, and P = P(R = 1)
	def prob_f(self):
		dv = self.dict_vectorizer
		dlr = lambda x, y: dict_list_representation([x], [y])
		ff = self.ff_model
		mod = self.model
		f = lambda X, Y: mod.predict_proba(ff.transform(dv.transform(dlr(X, Y)).toarray()))
		return lambda X, Y: [z[1] for z in f(X, Y)][0]
	
	# Return a vector of interaction attribute levels corresponding to each
	# interaction attribute. For each attribute the following rule is applied:
	# 1) If the attribute is categorical the attribute levels are a list of unique values
	# 2) If the attribute is numeric then a pair (min, max) is returned bounding the values.
	def inter_attr_levels(self):
		return [lv if isinstance(lv, tuple) else list(lv) for lv in self.inter_levels]
		
Example #34
def programmer_1():
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)

    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()

    rlr = RLR()
    rlr.fit(x, y)
    rlr_support = rlr.get_support()
    support_col = data.drop('违约', axis=1).columns[rlr_support]

    print(
        "rlr_support_columns: {columns}".format(columns=','.join(support_col)))
    x = data[support_col].as_matrix()

    lr = LR()
    lr.fit(x, y)

    print("lr: {score}".format(score=lr.score(x, y)))
Example #36
def tipdm_chapter5_test():
    # parameter initialization
    filename = '../../../MyFile/chapter5/data/bankloan.xls'
    data = pd.read_excel(filename)
    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()

    # feature selection
    rlr = RLR()  # build a randomized logistic regression model to screen variables
    rlr.fit(x, y)  # train the model
    features = rlr.get_support()  # selection mask; per-feature scores are also available via .scores_
    print('Feature screening with the randomized logistic regression model finished.')
    print('Selected features: {0}'.format(','.join(data.columns[features])))
    x = data[data.columns[features]].as_matrix()  # keep only the selected features

    # training and test
    lr = LR()  # build a logistic regression model
    lr.fit(x, y)  # train it on the selected feature data
    print('Logistic regression model training finished.')
    print('Mean accuracy of the model: {0}'.format(lr.score(x, y)))
Example #37
    def rank_random_logistic_regression(self,
                                        features_indep_df: PandasDataFrame,
                                        feature_target: List,
                                        n_jobs: int = -1,
                                        **kwargs: Any) -> object:
        """Use Randomized Logistic Regression to rank features.
        Attributes:
        model.scores_
        model.all_scores_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param n_jobs: number of CPUs to use during the resampling. If -1, use all the CPUs.
        :param kwargs: C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=0.25, tol=0.001,
        fit_intercept=True, verbose=False, normalize=True, random_state=None, pre_dispatch='3*n_jobs'
        :return: the importance ranking model.
        """
        self.__logger.debug("Run Random Logistic Regression.")
        classifier = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
        return classifier.fit(features_indep_df, feature_target)
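A hypothetical call, using the kwargs enumerated in the docstring (the ranker instance and data names are assumptions, not from the original):

model = ranker.rank_random_logistic_regression(features_indep_df, feature_target,
                                               n_jobs=-1, C=1, scaling=0.5,
                                               sample_fraction=0.75, n_resampling=200,
                                               selection_threshold=0.25)
print(model.scores_)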
Example #38
def feature_method_selection(data, label, fsname):
    """
    select features by option 'fsname'
    :param data:
    :param label:
    :param fsname:
    :return: new_data, selected data
    :return: selected_features_inx, the index of selected feature, starts with 0
    """
    if fsname == 'variance_threshold':  # discard features that vary little (discrete values)
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)

    elif fsname == 'select_kbest':
        model = SelectKBest(chi2, k=10)  # feature values must be non-negative; chi2 is for classification

    elif fsname == 'rfe':  # recursive feature elimination; very time-consuming
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)

    elif fsname == 'rfecv':  # RFE with cross-validation; label must be numeric
        svc = SVC(kernel="linear")
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')  # assign to model (the original bound a separate, unused name here)

    elif fsname == 'RandLasso':  # reselect on shuffled subsets; may raise 'cannot perform reduce with flexible type' on non-numeric data
        model = RandomizedLogisticRegression()

    elif fsname == 'linear_svc':
        model = LinearSVC()  # has no feature importances

    elif fsname == 'tree':
        model = ExtraTreesClassifier()

    elif fsname == 'fclassif':
        model = SelectFpr()  # defaults to f_classif; larger values mean more useful features

    elif fsname == 'pearsonr':  # label must be numeric
        label = turn_label_2num(label)
        return pearsonr(data, label)  # correlation between the two samples

    elif fsname == 'RandForReg':  # label must be numeric
        label = turn_label_2num(label)
        model = RandomForestRegressor()

    else:
        logging.error('ERROR: feature selection option is wrong')

    model.fit(data, label)
    new_data = model.transform(data)  # keep only the selected data

    return new_data
Example #39
def run_logreg(X_train, y_train, selection_threshold=0.2):
    print('\nrunning logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    pipe = Pipeline([
        ('feature_selection', RandomizedLogisticRegression(
            selection_threshold=selection_threshold)),
        ('classification', LogisticRegression())
    ])
    pipe.fit(X_train, y_train)
    print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
    # X_test and y_test are assumed to be defined in the enclosing scope
    print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))
    return pipe
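Since run_logreg reads X_test and y_test from the enclosing scope, a hypothetical driver (names assumed; a feature matrix X and label vector y must already exist) would define them first:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
pipe = run_logreg(X_train, y_train, selection_threshold=0.2)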
Example #40
def rank_features(algorithm, X, y):
    # The RFE approach can be used with various different classifiers
    if algorithm == 'random_forest_rfe':
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import RFE
        estimator = RandomForestClassifier(n_estimators=50,
                                           random_state=R_SEED,
                                           n_jobs=1)
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'svm_rfe':
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE
        estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)

        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features),
                        reverse=True):
            print(x[1])
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        #rlasso = RandomizedLasso(alpha=0.025, random_state=R_SEED)
        rlasso.fit(X, y)

        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features),
                        reverse=True):
            print(x[1])
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))
        order = lexsort((random_array, pval))  # will break ties by random
        for i in order:
            print(features[i])
    else:
        print "Invalid algorithm: %s" % algorithm
        exit(1)
Example #41
def get_support_fields(X, Y):
    '''
    Function for getting support fields
    '''
    rlr = RLR()  # build a randomized logistic regression model to screen variables
    rlr.fit(X, Y)  # train the model
    rlr.get_support()  # selection mask; per-feature scores are also available via .scores_
    print(rlr.scores_)
    # 'data' is assumed to be a module-level DataFrame
    print('Selected features: %s' % ','.join(data.columns[rlr.get_support()]))
    X = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features
    return X
Example #42
def rdlg_variables(X, y, threshold=0.25):  # default threshold 0.25
    """
    Randomized logistic regression selects the variables linearly related to y (stability selection).
    The selection algorithm (rlr) is run on different data subsets and feature subsets,
    and the per-subset results are aggregated into a final score for each feature.
    Stability selection is a relatively recent method that combines subsampling with a
    selection algorithm, which may be regression, an SVM, or something similar. The main
    idea is to run the selection algorithm repeatedly on different subsets of the data and
    of the features and aggregate the results, for example the frequency with which a
    feature is deemed important (times selected divided by times its subset was tested).
    Ideally, important features score close to 100%, slightly weaker features get non-zero
    scores, and the most useless features score close to 0. Notably, good features do not
    get zero scores merely because similar, correlated features exist, unlike with Lasso.
    For feature selection, stability selection is often among the best-performing methods
    across many datasets and settings.
    """

    rlr = RandomizedLogisticRegression(selection_threshold=threshold)  # randomized logistic regression
    rlr.fit(X, y)
    scoretable = pd.DataFrame(rlr.all_scores_, index=X.columns)  # aggregate the final feature scores
    scoretable = scoretable.reset_index()
    scoretable = scoretable.rename(columns={'index': 'Col', 0: 'value_ratio'}, copy=False)
    df_score = scoretable[scoretable.value_ratio > threshold]  # keep only features scoring above the threshold
    refreshed_data = X[list(df_score['Col'])]

    return scoretable, refreshed_data
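A hypothetical call, assuming X is a pandas DataFrame (the function indexes X.columns) and y a binary target:

scoretable, refreshed_data = rdlg_variables(X, y, threshold=0.25)
print(scoretable.sort_values('value_ratio', ascending=False).head())
print(refreshed_data.shape)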
Example #43
def pick_variables(self,
                   discover=True,
                   method="rlr",
                   threshold=0.25,
                   auto_pick=True):  # default threshold 0.25
    # variable-picking helper (feature selection)
    if method == "rlr":
        """
        Top-level feature selection algorithm.
        Randomized logistic regression selects the variables linearly related to y (stability selection).
        The selection algorithm (rlr) is run on different data subsets and feature subsets,
        and the per-subset results are aggregated into a final score for each feature.
        Stability selection combines subsampling with a selection algorithm (regression, SVM, or
        similar): run the selection repeatedly on different subsets of data and features, then
        aggregate, e.g. the frequency with which a feature is deemed important.
        Ideally, important features score close to 100%, slightly weaker features get non-zero
        scores, and the most useless features score close to 0.
        RandomizedLogisticRegression()
        fit(X, y)	Fit the model using X, y as training data.
        fit_transform(X[, y])	Fit to data, then transform it.
        get_params([deep])	Get parameters for this estimator.
        get_support([indices])	Get a mask, or integer index, of the features selected
        inverse_transform(X)	Reverse the transformation operation
        set_params(**params)	Set the parameters of this estimator.
        transform(X)	Reduce X to the selected features.
        """
        rlr = RandomizedLogisticRegression(
            selection_threshold=threshold)  # randomized logistic regression
        rlr.fit(self.X_train, self.y_train)
        scoretable = pd.DataFrame(rlr.all_scores_,
                                  index=self.X_train.columns,
                                  columns=['var_score'])  # aggregate the final feature scores
        columns_need = list(self.X_train.columns[rlr.get_support()])  # mask/index of the selected features
        self.X_train = self.X_train[columns_need]
        self.X_test = self.X_test[columns_need]
        columns_need.append("y")
        if auto_pick:
            self.picked_data = self.data[columns_need]
        return scoretable
Example #44
def programmer_1():
    # parameter initialization
    filename = r'bankloan.xls'
    data = pd.read_excel(filename)
    x = data.iloc[:, :8].as_matrix()  # reading with pandas means the label column needs no special handling
    y = data.iloc[:, 8].as_matrix()

    rlr = RLR()  # build a randomized logistic regression model for feature selection and variable screening
    rlr.fit(x, y)  # train the model
    egeList = rlr.get_support()  # get the selected features
    egeList = np.append(egeList, False)  # append a False element with np.append(array, ele)
    print("rlr.get_support():")
    print(egeList)
    print('Randomized logistic regression feature selection finished!')
    print('Selected features: %s' % ','.join(data.columns[egeList]))
    x = data[data.columns[egeList]].as_matrix()  # keep the selected feature values

    lr = LR()  # build a logistic regression model
    lr.fit(x, y)  # train on the selected features
    print('Logistic regression training finished!')
    print('Mean accuracy of the model: %s' % lr.score(x, y))  # about 81.4% in this example
Example #46
import numpy as np
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import LogisticRegression

X = np.load("../feats/train_formatted.npy")
y = np.load("../feats/train_y.npy")
X_test = np.load("../feats/test_formatted.npy")
y_test = np.load("../feats/test_y.npy")

clf = RandomizedLogisticRegression()
clf.fit(X, y)
scores = clf.scores_
print('Index    :   score')
sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1], reverse=True)]
top = 30
for i in range(top):
    print(str(sortedIdx[i]) + ' :   ' + str(scores[sortedIdx[i]]))

lr = LogisticRegression()
lr.fit(clf.transform(X), y)
pred = lr.predict(clf.transform(X_test))
accuracy = sum(pred == y_test) / y_test.size
print('Logistic Regression Accuracy: ' + str(accuracy))
Example #47
    y_inv = Counter(lb_encoder.inverse_transform(y))
    print("Classes:", y_inv)

    # 'Normalize/Scale features if needed. Our data is standardized by default'
    # X = StandardScaler(copy=False).fit_transform(X)

    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    featureNames=featureNames[Fwe.get_support()]
    print("F-test filter ->",X.shape)

    FeatSelection_SVM=True
    FeatSelection_RandLogReg=False

    if FeatSelection_RandLogReg == True:
        LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
         sample_fraction=0.8, n_resampling=60, selection_threshold=0.2,n_jobs=-1)
        X = LogRegFeats.fit_transform(X,y)
        featureNames=featureNames[LogRegFeats.get_support()]
        print("RandomizedLogisticRegression Feature Selection ->:",X.shape)

    elif FeatSelection_SVM == True:
        svc_l1 = LinearSVC(C=1, penalty="l1", dual=False, class_weight='auto').fit(X, y)
        # X = LogisticRegression(C=0.01, class_weight='auto').fit_transform(X, y)
        nonzero = list(set(np.where(svc_l1.coef_ != 0)[-1]))
        X = X[:, nonzero]
        featureNames = featureNames[nonzero]  # the original indexed with LogRegFeats here, which is undefined in this branch
        print("SVC Transformed X:", X.shape)

    '''
    print("Plot #Feats vs Classification performance:")
    PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
    '''
Example #48
def GetAllPerf (filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)


    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])


    #redDict holds results for each file/class, for saving to output-file

    i=-1
    for filePath in filePaths:
        i +=1

        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1

        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"

        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        KFilt=None
        KFilt=350  #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)


        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                             # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                             #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)

            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)

            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'

        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()

        "Make custom F1 scorer. May not have fixed problem!"
        from sklearn.metrics import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none

        # print("Dummy classifiers output:")

        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))

        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
##        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean

        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        "Modified to get 2 estimators"
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)

        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep='\t')
Example #49
# -*- coding: utf-8 -*-
# Logistic regression: automated modeling
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

data = pd.read_excel("c://mldata//bankloan.xls", header=0)
# x = data.iloc[:, :8].as_matrix()
# y = data.iloc[:, 8].as_matrix()   和下边的两种读取数据的方式,都会带来精度的影响
train_data = data.values  # 将读取的数据其转换为矩阵形式
train_x = train_data[0::, :8]
train_label = train_data[0::, 8]

rlr = RLR()  # 建立随机回归模型,筛选变量
rlr.fit(train_x, train_label)  # 训练模型
rlr.get_support()  # 获取特征筛选结果
print u"特征筛选结束"
print u"有效特征为:%s" % u'、'.join(data.columns[rlr.get_support()])

x = data[data.columns[rlr.get_support()]].as_matrix()  # 筛选好的特征

lr = LR()
lr.fit(x, train_label)  # 用筛选好的特征数据来训练模型
print u'逻辑回归训练结束'
print u'模型的平均正确率为:%s' % lr.score(x, train_label)
Example #50
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)
ff_model = RandomizedLogisticRegression() # Finds best set of features
X_new = ff_model.fit_transform(X, y)  # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)
print(X[0:4])
print(ff_model.transform(X[0:4]))  # Transform the first 4 rows of data to get only best features
model = LogisticRegression().fit(X_new, y) # Fit logistic regression with best features
print(model.predict_proba(ff_model.transform(X[0:4]))) # predict probabilities for first 4 rows of data
print(ff_model.inverse_transform(ff_model.transform(X[0:4]))) # Test inverse transforming
arr = np.array([[1,1,1]])
print(ff_model.inverse_transform(arr)) # Get original matrix structure with 1's only in columns of retained features.
Example #51
# -*- coding: utf-8 -*-
# Logistic regression, automated modeling
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
# parameter initialization
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:, :8].as_matrix()  # the 8 attributes
y = data.iloc[:, 8].as_matrix()  # column 9 holds the label

# stability selection to pick features
rlr = RLR(selection_threshold=0.5)  # randomized logistic regression; 0.5 overrides the default threshold of 0.25
rlr.fit(x, y)  # train the model
rlr.get_support()  # get the feature-selection result
print('Feature screening with the randomized logistic regression model finished.')
print('Selected features: %s' % ','.join(data.columns[rlr.get_support()]))

x = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features and retrain
lr = LR()  # build a logistic regression model
lr.fit(x, y)  # train with the selected feature data
print('Logistic regression model training finished.')
print('Mean accuracy of the model: %s' % lr.score(x, y))
Example #52
# -*- coding: utf-8 -*-
# Logistic regression, automated modeling
import pandas as pd

# parameter initialization
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:, :8].as_matrix()
y = data.iloc[:, 8].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()  # build a randomized logistic regression model to screen variables
rlr.fit(x, y)  # train the model
rlr.get_support()  # selection mask; per-feature scores are also available via .scores_
print('Feature screening with the randomized logistic regression model finished.')
print('Selected features: %s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features

lr = LR()  # build a logistic regression model
lr.fit(x, y)  # train with the selected feature data
print('Logistic regression model training finished.')
print('Mean accuracy of the model: %s' % lr.score(x, y))  # about 81.4% in this example
Example #53
# Listing 5-1: logistic regression code

import pandas as pd
# parameter initialization
fileName = 'data/bankloan.xls'
data = pd.read_excel(fileName)
x = data.iloc[:, :8].as_matrix()
y = data.iloc[:, 8].as_matrix()

# logistic regression model
from sklearn.linear_model import LogisticRegression as LR
# randomized logistic regression model
from sklearn.linear_model import RandomizedLogisticRegression as RLR
# build a randomized logistic regression model to screen variables
rlr = RLR()
# train the model
rlr.fit(x, y)
# get the selection result; per-feature scores are also available via .scores_
rlr.get_support()
print('Feature screening with the randomized logistic regression model finished.')
print('Selected features: %s' % '.'.join(data.columns[rlr.get_support()]))
# keep the selected features
x = data[data.columns[rlr.get_support()]].as_matrix()

# build a logistic regression model
lr = LR()
# train with the selected feature data
lr.fit(x, y)
print('Logistic regression model training finished.')
# report the model's mean accuracy, about 81.4% in this example
print('Mean accuracy of the model: %s' % lr.score(x, y))
Example #54
import pandas as pda
fname = "C:/Users/Administrator/Desktop/data/luqu.xls"
dataf = pda.read_excel(fname)
# DataFrame.as_matrix: Convert the frame to its Numpy-array representation
# DataFrame.iloc: Purely integer-location based indexing for selection by position
x = dataf.iloc[:, 1:4].as_matrix()
y = dataf.iloc[:, 0:1].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
r1 = RLR()
r1.fit(x, y)
eff = r1.get_support()  # find the effective features, remove noneffective ones
# print(dataf.columns[1:4][eff])
t = dataf[dataf.columns[1:4][eff]].as_matrix()  # get_support() indexes the three feature columns, not the label in column 0
r2 = LR()
r2.fit(t, y)
print("training ends")
print("accuracy: " + str(r2.score(t, y)))  # score() returns the mean accuracy; the model was fitted on the selected features t
Example #55
def evaluate_model(model, X, y, labels, save_features=False, group_ablation=False, feature_output_name="features.csv", ticks=XTICKS):

    model_fs = RandomizedLogisticRegression(C=1, random_state=1)
    # Split into folds using labels
    # label_kfold = LabelKFold(labels, n_folds=10)
    group_kfold = GroupKFold(n_splits=10).split(X,y,groups=labels)
    folds  = []
    
    # For feature analysis
    feat_scores = []

    # For ablation study
    # Group ablation study
    feature_groups = feature_sets.get_all_groups()
    ablated = {key: set() for key in feature_groups.keys()}
    roc_ab  = {key: list() for key in feature_groups.keys()}
    roc_ab['true_roc_score'] = []

    for train_index, test_index in group_kfold:
        print "processing fold: %d" % (len(folds) + 1)

        # Split
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        scores   = []

        for k in ticks:  # use the ticks parameter (XTICKS is its default)
            indices = util.get_top_pearson_features(X_train, y_train, k)

            # Select k features
            X_train_fs = X_train[:, indices]
            X_test_fs  = X_test[:, indices]

            model = model.fit(X_train_fs, y_train)
            # summarize the selection of the attributes
            yhat  = model.predict(X_test_fs)                  # Predict
            scores.append(f1_score(y_test, yhat))     # Save
            if group_ablation:
                true_roc_score = roc_auc_score(y_test, yhat)
                roc_ab['true_roc_score'].append(true_roc_score)

                for group in feature_groups.keys():
                    # Get group features
                    features     = feature_groups[group]
                    features_idx = util.get_column_index(features, X)

                    # Get indices
                    indices_ab      = [i for i in indices if i not in features_idx]
                    removed_indices = [i for i in indices if i in features_idx]

                    # Filter
                    X_train_ab = X_train[:, indices_ab]
                    X_test_ab  = X_test[:, indices_ab]

                    # Fit
                    model_ab = model.fit(X_train_ab, y_train)
                    # Predict
                    yhat_ab  = model_ab.predict(X_test_ab)

                    # Save
                    ablated[group].update(X.columns[removed_indices])
                    roc_ab_score = roc_auc_score(y_test, yhat_ab)
                    roc_ab[group].append(roc_ab_score - true_roc_score)

        # ----- save row -----
        folds.append(scores)
        # ----- save row -----

        # ----- save features -----
        if save_features:
            model_fs = model_fs.fit(X_train, y_train)
            feat_scores.append(model_fs.scores_)
        # --------------------

    if save_features:
        feat_scores = np.asarray(feat_scores)  # convert to np array
        feat_scores = feat_scores.mean(axis=0)  # squash

        # Map the averaged scores to features and sort by score, with the feature name in the first position
        feat_scores = sorted(zip(X.columns, map(lambda x: round(x, 4), feat_scores)),
                             reverse=True, key=lambda x: x[1])
        feat_scores = pd.DataFrame(feat_scores)

        csv_path = "output/feature_scores/" + feature_output_name
        feat_scores.to_csv(csv_path, index=False)
        util.print_full(feat_scores)

    if group_ablation:
        roc_ab = pd.DataFrame(roc_ab).mean()
        print "======================="
        print "True AUC Score: %f" % roc_ab['true_roc_score']
        print "=======================\n\n"

        for group in ablated.keys():
            print "-----------------------"
            print "Group: %s " % group
            print "Removed: %s" % list(ablated[group])
            print "Change in AUC: %f" % (roc_ab[group])
            print "-----------------------\n"

    folds = np.asarray(folds)
    return folds
Example #56
import numpy as np
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import fmriUtils as fm  # custom helper functions

n_folds = 10

f = fm.outTo()  # redirect output to a file
X, y = fm.loadData2()
X2, y2 = fm.loadData2()

y = fm.defineClass(y)

randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
randomized_logistic.fit(X, y)
XX = randomized_logistic.transform(X)
print("============ features remaining after selection ============")
print(XX.shape)

yy = y
cv = StratifiedKFold(yy, n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    cv_scores.append(np.sum(prediction == yy[test]) / float(np.size(yy[test])))

print("======== classification accuracy ========")
print(np.mean(cv_scores))