示例#1
0
def featureSelection(train_x, train_y):
    """Compare RandomizedLasso and RFECV feature picks, then plot the CV curve.

    A RandomizedLasso and a cross-validated recursive feature elimination
    (linear SVC, 5 folds, accuracy scoring) are both fitted on the training
    data. Features kept by both are printed, and the cross-validation score
    for each candidate number of features is plotted.

    Feature names are looked up in a ``feats`` sequence that is defined
    outside this function.
    """
    linear_svc = LinearSVC(C=1, class_weight='balanced')
    stability = RandomizedLasso()
    stability.fit(train_x, train_y)
    # "accuracy" scoring is proportional to the number of correct
    # classifications.
    rfecv = RFECV(estimator=linear_svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    rankings = rfecv.ranking_
    lasso_ranks = stability.get_support()

    lassoFeats = []       # kept by the randomized lasso
    recursiveFeats = []   # ranked 1 by recursive elimination
    shouldUseFeats = []   # kept by both methods
    for idx, rank in enumerate(rankings):
        if lasso_ranks[idx]:
            lassoFeats.append(feats[idx])
        if rank != 1:
            continue
        recursiveFeats.append(feats[idx])
        if lasso_ranks[idx]:
            shouldUseFeats.append(feats[idx])

    keyboard()
    print('Should use ' + ', '.join(shouldUseFeats))

    # Number of features vs. cross-validation score.
    cv_scores = rfecv.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.show()
def plot_stable_features(X_train,y_train,featnames,**kwargs):
    """Bar-plot the RandomizedLasso selection score of every feature.

    The regularisation grid is derived from a cross-validated LassoLars fit:
    six values evenly spaced between its first alpha and a tenth of it.
    Bars are drawn in decreasing score order.

    Args:
        X_train: Training samples passed to the estimators' ``fit``.
        y_train: Training targets passed to the estimators' ``fit``.
        featnames: Feature names, one per score. Any sequence is accepted;
            it is converted to an array so it can be reordered by score.
        **kwargs: ``n_resampling`` (default 200) and ``n_jobs`` (default -1)
            are read; any other keyword is ignored.
    """
    from sklearn.linear_model import LassoLarsCV,RandomizedLasso

    n_resampling = kwargs.pop('n_resampling',200)
    n_jobs = kwargs.pop('n_jobs',-1)
    # A plain list cannot be indexed with an index array, so normalise first.
    featnames = np.asarray(featnames)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        # Estimate the alphas via cross-validation.
        lars_cv = LassoLarsCV(cv=6,n_jobs=n_jobs).fit(X_train,y_train)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        clf = RandomizedLasso(alpha=alphas, random_state=42, n_jobs=n_jobs,
                              n_resampling=n_resampling)
        clf.fit(X_train,y_train)
        importances = clf.scores_
        # Indices of the features from highest to lowest score.
        indices = np.argsort(importances)[::-1]

        pl.bar(range(len(featnames)), importances[indices],
               color="r", align="center")
        pl.xticks(np.arange(len(featnames))+0.5,featnames[indices],
                  rotation=45,horizontalalignment='right')
        pl.xlim(-0.5,len(featnames)-0.5)
        pl.subplots_adjust(bottom=0.2)

        pl.ylim(0,np.max(importances)*1.01)
        pl.ylabel('Selection frequency (%) for %d resamplings '%n_resampling)
        pl.title("Stability Selection: Selection Frequencies")
def lasso_fs(X, y):
    """Fit a RandomizedLasso on (X, y) and print the columns ranked by score.

    Each printed entry is ``(score rounded to 4 decimals, column index)``;
    the list is sorted in descending order.
    """
    selector = RandomizedLasso()
    selector.fit(X, y)
    column_ids = range(0, X.shape[1])
    rounded_scores = [round(score, 4) for score in selector.scores_]

    print("Features sorted by their score:")
    print(sorted(zip(rounded_scores, column_ids), reverse=True))
示例#4
0
def select_feature_importance():
    """Print column names ranked by their RandomizedLasso score.

    Works on names defined outside this function: ``dataset`` supplies the
    column names (minus 'Max overdue'), ``data`` is scaled with ``scaler``
    and fitted against ``target``.
    """
    feature_frame = dataset.drop(['Max overdue'], axis=1)
    column_names = np.asarray(feature_frame.columns.values)

    selector = RandomizedLasso(alpha=0.025)
    selector.fit(scaler.fit_transform(data), target)

    # Pair each rounded score with its column name, best score first.
    rounded_scores = [round(value, 4) for value in selector.scores_]
    print(sorted(zip(rounded_scores, column_names), reverse=True))
示例#5
0
def featureselection(datset, output='results'):
    """Score every feature of a CSV dataset with several selection methods.

    Reads the CSV at ``datset`` (it must contain a ``label`` column), writes
    one score column per method to ``ResultTableIndividualFeature.xlsx``,
    saves correlation plots through ``corplot``, then copies every pdf, csv
    and xlsx file of the working directory into ``./output/`` and zips that
    folder.

    Args:
        datset: Path of the input CSV file.
        output: Name whose last four characters are stripped to build the
            name of the CSV copy and of the zip archive (presumably a file
            name with a three-letter extension -- confirm with callers).

    Returns:
        Always 0.
    """
    dataset = pd.read_csv(datset)
    dataset.to_csv(output[:-4] + '.csv')

    # Feature evaluation results go into one worksheet: a row per feature,
    # a column per method.
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "feature selection scores"

    def write_column(column, values, first_row=2):
        """Write ``values`` downwards in ``column`` starting at ``first_row``."""
        for offset, value in enumerate(values):
            ws1.cell(column=column, row=first_row + offset).value = value

    headers = [
        'Feature name',
        'Stability Selection',
        'Univariate using random forest regressor (r2 measure)',
        'Univariate using random forest regressor (auc measure)',
        'L1 regularization / Lasso',
        'L2 regularization / Ridge',
        "Mean decrease impurity",
        'Recursive feature elimination',
    ]
    for column, title in enumerate(headers, start=1):
        ws1.cell(column=column, row=1).value = title

    Collumnheadeers = list(dataset.columns.values)
    for imagebiom in Collumnheadeers:
        print(imagebiom)
    print(dataset)
    print(Collumnheadeers)

    # Split labels from features.
    y = dataset['label'].values
    Collumnheadeers.remove('label')
    # Names are written only after dropping 'label', so that row k of every
    # score column refers to the feature named in row k of column 1.
    write_column(1, Collumnheadeers)
    X = dataset[Collumnheadeers]
    corplot(X, filesavename='all.pdf')

    # Stability selection.
    rlasso = RandomizedLasso(alpha=0.00025)
    rlasso.fit(X, y)
    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                     Collumnheadeers),
                 reverse=True))
    write_column(2, rlasso.scores_.tolist())
    elementselect = np.where(rlasso.scores_ > 0.8)[0]
    print(elementselect + 1)
    print(Collumnheadeers)
    Collumnheadeersel = [Collumnheadeers[i] for i in elementselect]
    corplot(X[Collumnheadeersel], filesavename='selectedstability.pdf')
    wb.save(filename='ResultTableIndividualFeature.xlsx')

    # Univariate scores: one small random forest per single feature.
    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    X1 = X.values
    for column, scoring in ((3, "r2"), (4, "roc_auc")):
        # Start from an empty list for each measure so the printed ranking
        # does not mix r2 and auc values.
        scores = []
        for i in range(X.shape[1]):
            score = cross_val_score(rf,
                                    X1[:, i:i + 1],
                                    y,
                                    scoring=scoring,
                                    cv=ShuffleSplit(len(X), 3, .3))
            scores.append((round(np.mean(score), 3), Collumnheadeers[i]))
        print(sorted(scores, reverse=True))
        write_column(column, [value for value, _ in scores])

    # Linear models are fitted on standardised features.
    X3 = StandardScaler().fit_transform(X1)

    lasso = Lasso(alpha=.0003)
    lasso.fit(X3, y)
    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), lasso.coef_), Collumnheadeers),
                 reverse=True))
    write_column(5, lasso.coef_.tolist())

    ridge = Ridge(alpha=10)
    ridge.fit(X3, y)
    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), ridge.coef_), Collumnheadeers),
                 reverse=True))
    write_column(6, ridge.coef_.tolist())

    # Mean decrease impurity from a default random forest.
    rf = RandomForestRegressor()
    rf.fit(X, y)
    write_column(7, rf.feature_importances_.tolist())

    # Recursive feature elimination with linear regression; eliminating down
    # to a single feature yields a full ranking.
    lr = LinearRegression()
    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, y)
    write_column(8, rfe.ranking_.tolist())
    wb.save(filename='ResultTableIndividualFeature.xlsx')

    # Collect every generated artefact into ./output/ and archive it.
    directory = os.getcwd() + '/output/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    files_grabbed = []
    for pattern in ('*.pdf', '*.csv', '*.xlsx'):
        files_grabbed.extend(glob.glob(pattern))
    for path in files_grabbed:
        if os.path.isfile(path):
            shutil.copy2(path, directory)
    shutil.make_archive(output[:-4], 'zip', directory)
    return 0
def lass_varselect(train, num_vars, target, alpha):
    """Return the RandomizedLasso support of ``train[num_vars]`` for ``train[target]``.

    The selector is built with the given ``alpha`` and 5 resamplings; the
    value returned is whatever its ``get_support()`` yields after fitting.
    """
    predictors = train[num_vars]
    response = train[target]
    selector = RandomizedLasso(alpha=alpha, n_resampling=5)
    selector.fit(predictors, response)
    return selector.get_support()
示例#7
0
rfe = rfe.fit(X_train, y_train)

rfe2 = RFE(estimator=RandomForestClassifier(criterion='entropy', n_estimators=10,random_state=3,n_jobs=2), n_features_to_select=2)
rfe2 = rfe2.fit(X_train, y_train)
"""
"""
from sklearn.svm import SVR
rfe3 = RFE(estimator=SVR(kernel="linear"), n_features_to_select=2)
rfe3 = rfe3.fit(X_train, y_train)
"""
"""stability selection: see which feature is selected most"""

from sklearn.linear_model import RandomizedLasso

# Stability selection: fit a randomized lasso on the training data; the
# target is flattened to 1-D before fitting.
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X_train, y_train.values.ravel())

rlasso_score = rlasso.scores_

# Convert scores to ranks: ``temp`` lists feature indices in ascending score
# order, so after the scatter below ``ranks[i]`` is the position of feature i
# in that order (0 = lowest score, len - 1 = highest score).
temp = rlasso.scores_.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(rlasso.scores_))

#print (sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True))

# Summary table: one row per feature name with its stability-selection rank.
# ``names`` is defined outside this snippet -- presumably the feature names in
# the column order of X_train; TODO confirm.
rank_summary = pd.DataFrame()
rank_summary['features'] = names
#rank_summary['ranking_logi']=94-rfe.ranking_
#rank_summary['ranking_RF']=94-rfe2.ranking_
#rank_summary['ranking_svm']=94-rfe3.ranking_
rank_summary['ranking_stab_sel'] = ranks
def feature_selection(df, target_column, id_column):
    """Select a subset of the feature columns of ``df`` and return them.

    The problem type is decided from the target column. If every target
    value is 0 or 1, features are selected with a genetic algorithm (DEAP):
    each candidate subset is scored by the Gini, ``|2 * AUC - 1|``, of an SVM
    fitted and evaluated on the same data. Otherwise several regression
    models rank the features and the best half according to the random
    forest importance is kept.

    Args:
        df: The training dataframe.
        target_column: The column containing the target variable.
        id_column: The column containing the id variable.
            NOTE(review): this argument is not used; the id column is not
            removed from the candidate inputs -- confirm whether it should be.

    Returns:
        ``df`` restricted to the selected feature columns.
    """
    print("IDENTIFYING TYPES...")
    output_var = target_column
    # Every distinct column except the target, in dataframe column order so
    # that the gene -> column mapping is reproducible between runs.
    list_inputs = [col for col in dict.fromkeys(df) if col != target_column]

    if df[output_var].isin([0, 1]).all():
        method_type = 'categorical'
    else:
        method_type = 'numerical'

    print(method_type)

    if method_type == "numerical":
        # Regression: rank all candidate inputs with several models.
        X_train = df[list_inputs]
        Y_train = df[output_var]

        names = list(X_train)
        ranks = {}

        # Absolute coefficients of a plain linear regression.
        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        # Absolute coefficients of a ridge regression.
        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        # Absolute coefficients of a lasso regression.
        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        # Stability-selection scores of a randomized lasso.
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        # Random forest feature importances.
        rf = RandomForestRegressor()
        rf.fit(X_train, Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        # Univariate F-scores against the target.
        f, pval = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        # Mean score per feature over all methods, rounded to 2 decimals.
        r = {}
        for name in names:
            r[name] = round(
                np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])

        # Print the feature scores as a tab-separated table.
        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print("%s\t%s" % (name, "\t".join(
                map(str, [ranks[method][name] for method in methods]))))

        # Sort features by random forest importance, best first, and keep
        # the better half.
        ranks_f = pd.DataFrame(ranks)
        ranks_f.sort_values("RF", axis=0, ascending=False, inplace=True)
        print(ranks_f)
        featureset = ranks_f.index.values[0:len(ranks_f) // 2]
        print(featureset)
        return df[featureset]

    print("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")

    # Set up the genetic algorithm: an individual is a list of 0/1 genes, one
    # per candidate input column.
    creator.create("FitnessMax", base.Fitness, weights=(1.0, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual",
                     tools.initRepeat,
                     creator.Individual,
                     toolbox.attr_bool,
                     n=len(list_inputs))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)

    def evalOneMax(individual):
        return sum(individual),

    toolbox.register("evaluate", evalOneMax)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    NPOPSIZE = 50  # random starting pool size
    population = toolbox.population(n=NPOPSIZE)

    Y_train = df[output_var]
    # Maps "<gini>;<bit string>" to the individual that produced it.
    dic_gini = {}

    def gini_of(key):
        """Return the Gini value encoded at the front of a ``dic_gini`` key."""
        return float(key.split(';')[0])

    def assess(individual):
        """Score ``individual`` with the training Gini and record it."""
        var_model = [
            col for col, gene in zip(list_inputs, individual) if gene == 1
        ]
        if not var_model:
            # A model cannot be fitted on zero columns; skip this candidate.
            return
        X_train = df[var_model]

        # CHANGE_HERE: swap in your own modelling technique if needed.
        model = svm.SVC(probability=True).fit(X_train, Y_train)
        Y_predict = model.predict(X_train)

        # CHANGE_HERE: this is the development (training) Gini; an
        # out-of-time Gini, or sqrt(dev_gini * oot_gini), would be a better
        # selection criterion.
        fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
        auc = metrics.auc(fpr, tpr)
        gini_power = abs(2 * auc - 1)

        gini = str(gini_power) + ";" + str(individual).replace(
            '[', '').replace(', ', '').replace(']', '')
        dic_gini[gini] = individual

    def ranked_keys():
        """Return the ``dic_gini`` keys ordered from best to worst Gini."""
        # Compare the numeric Gini first; the raw key only breaks ties. A
        # purely textual sort misplaces values such as '1e-05'.
        return sorted(dic_gini,
                      key=lambda key: (gini_of(key), key),
                      reverse=True)

    # Assess the Gini of the starting pool.
    for individual in population:
        assess(individual)
    list_gini = ranked_keys()

    # Main loop: repeat until the summed Gini of the pool stops improving,
    # at least a little, relative to the two previous generations.
    sum_current_gini = 0.0
    sum_current_gini_1 = 0.0
    sum_current_gini_2 = 0.0
    OK = 1
    a = 0
    while OK:
        a = a + 1
        print('loop ', a)
        OK = 0

        # Generate offspring: crossover probability 50%, mutation 10%.
        offspring = algorithms.varAnd(population,
                                      toolbox,
                                      cxpb=0.5,
                                      mutpb=0.1)
        fits = toolbox.map(toolbox.evaluate, offspring)
        for fit, ind in zip(fits, offspring):
            ind.fitness.values = fit
        population = toolbox.select(offspring, k=len(population))

        sum_current_gini_2 = sum_current_gini_1
        sum_current_gini_1 = sum_current_gini
        sum_current_gini = 0.0

        # Assess only the offspring that have not been scored before.
        for individual in population:
            if individual not in dic_gini.values():
                assess(individual)

        # Keep the best candidates among everything scored so far.
        list_gini = ranked_keys()
        population = []
        for key in list_gini[:NPOPSIZE]:
            population.append(dic_gini[key])
            sum_current_gini += gini_of(key)

        print('sum_current_gini=', sum_current_gini, 'sum_current_gini_1=',
              sum_current_gini_1, 'sum_current_gini_2=',
              sum_current_gini_2)
        if (sum_current_gini > sum_current_gini_1 + 0.0001
                or sum_current_gini > sum_current_gini_2 + 0.0001):
            OK = 1

    # Decode the best candidate: one '0'/'1' character per input column.
    gini_max = list_gini[0]
    gini = gini_of(gini_max)
    features = gini_max.split(';')[1]
    featureset = [
        col for col, gene in zip(list_inputs, features) if gene == '1'
    ]

    # Print the list of selected features.
    for f, col in enumerate(featureset, start=1):
        print('feature ', f, ':', col)
    print('gini: ', gini)

    return df[featureset]
示例#9
0
    new_y = y[pass_vals]
    new_X = X[pass_vals]
    return new_X, new_y


X, labels = transform_Xy(X, labels)

# Centre both label columns, regress 'Multicov' on 'Specificity' and keep the
# residuals returned by ``residual`` for that fit.
expr = center_data(labels['Multicov'].values)
spec = center_data(labels['Specificity'].values)
slope, intercept, r_value, p_value, stderr = linregress(spec, expr)
residues = residual(spec, expr, slope, intercept)

# ``exp`` picks the regression target: the centred expression itself or the
# residuals. Any other value leaves ``y`` unchanged.
if exp == 'full':
    y = expr
elif exp == 'res':
    y = residues

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.5,
                                                    random_state=0)

# Stability selection on the training half; one score per entry of
# ``features``.
rl = RandomizedLasso()
fs = rl.fit(X_train, y_train)
hist_scores = dict(zip(features, fs.scores_))

# Use a context manager so the pickle file is flushed and closed.
with open(folder + 'results/histScores' + condition + cell + exp + '.pkl',
          'wb') as out_file:
    pickle.dump(hist_scores, out_file)
示例#10
0
def main(train_label, train_feat, modelsdir, selfeat):
    """Train an ExtraTrees regressor, optionally after feature selection.

    The features are standardised, optionally reduced with a RandomizedLasso,
    and the hyper-parameters are tuned with a randomized search scored by
    (negated) mean absolute error. A final model is then fitted with the best
    parameters. The model, the scaler and, when feature selection ran, the
    selector are pickled into ``modelsdir``.

    Settings ``n_jobs``, ``RR_Iter`` and ``folds`` are read from a ``config``
    mapping defined outside this function.

    Args:
        train_label: Path of the space-delimited file of training targets.
        train_feat: Path of the space-delimited file of training features.
        modelsdir: Directory receiving the selection artefacts and the
            pickled objects.
        selfeat: Any value accepted by ``int()``; non-zero enables feature
            selection.
    """
    # NaNs in either file are replaced by ``np.nan_to_num``.
    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # Stays None when feature selection is disabled, so that saving at the
    # end can tell whether there is a selector to persist.
    sel_est = None
    if int(selfeat):
        print("Performing feature selection ...")
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=int(config['n_jobs']),
                                  random_state=42, n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])

        # Save both the indices and the boolean mask of the kept features.
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        np.save(sel_feats_path + ".mask", selected_mask)

    estimator = ExtraTreesRegressor(random_state=42,
                                    n_jobs=int(config['n_jobs']))

    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # Parameter optimisation using random search.
    print("Performing parameter optimization ... ")

    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }

    search = RandomizedSearchCV(
        estimator, param_distributions,
        n_iter=int(config['RR_Iter']),
        scoring=mae_scorer, n_jobs=int(config['n_jobs']), refit=True,
        cv=KFold(X_train.shape[0], int(config['folds']), shuffle=True,
                 random_state=42),
        verbose=1, random_state=42)
    search.fit(X_trains, y_train)

    # Refit a fresh estimator with the best parameters found.
    best = search.best_params_
    estimator2 = ExtraTreesRegressor(
        bootstrap=best["bootstrap"],
        max_depth=best["max_depth"],
        max_features=best["max_features"],
        min_samples_leaf=best["min_samples_leaf"],
        min_samples_split=best["min_samples_split"],
        n_estimators=best["n_estimators"],
        verbose=1,
        random_state=42,
        n_jobs=int(config['n_jobs']))

    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir+"/XRT.pkl")
    joblib.dump(scaler, modelsdir+"/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir+"/sel_est.pkl")
def feature_selection(df,target_column):
    """Select features of ``df`` by genetic algorithm or by model ranking.

    Columns are typed by their name prefix: ``ib_`` (binary), ``icn_``
    (categorical nominal) and ``ico_`` (categorical ordinal) columns are the
    candidate inputs. ``if_`` (continuous) columns are recognised but not
    used as candidates. A column starting with ``ob_`` replaces
    ``target_column`` as the output variable.

    If every output value is 0 or 1, a genetic algorithm (DEAP) searches for
    the input subset with the best training Gini, ``|2 * AUC - 1|``, of a
    ``sm.Logit`` model. Otherwise several regression models rank the
    candidate inputs and the five best according to the random forest
    importance are kept.

    Args:
        df: The training dataframe.
        target_column: Name of the output column, used unless an ``ob_``
            column is present.

    Returns:
        Classification: a string of '0'/'1' characters, one per candidate
        input in dataframe column order, where '1' marks a selected input.
        Regression: an array with the names of the five best features.
        NOTE(review): the two branches return different kinds of value; this
        is kept for compatibility with existing callers.
    """
    print("IDENTIFYING TYPES...")
    # Candidate inputs, kept in dataframe column order so the returned bit
    # string has a stable meaning.
    list_inputs = []
    output_var = target_column

    def add_input(var_name, description):
        """Register ``var_name`` as a candidate input and report its type."""
        if var_name not in list_inputs:
            list_inputs.append(var_name)
        print (var_name, description)

    for var_name in df.columns:
        if var_name.startswith('ib_'):
            add_input(var_name, "is input binary")
        elif var_name.startswith('icn_'):
            add_input(var_name, "is input categorical nominal")
        elif var_name.startswith('ico_'):
            add_input(var_name, "is input categorical ordinal")
        elif var_name.startswith('if_'):
            # Continuous inputs are reported but not added as candidates.
            print (var_name,"is input numerical continuos (input float)")
        elif var_name.startswith('ob_'):
            output_var = var_name
        else:
            print ("ERROR: unable to identify the type of:", var_name)

    if (df[output_var].isin([0,1]).all()):
        method_type = 'categorical'
    else:
        method_type = 'numerical'

    print(method_type)

    if method_type == "numerical":
        # Regression: rank all candidate inputs against the output variable.
        X_train=df[list_inputs]
        Y_train=df[output_var]

        names = list(X_train)
        ranks = {}

        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        rf = RandomForestRegressor()
        rf.fit(X_train,Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        f, pval  = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        # Mean score per feature over all methods, rounded to 2 decimals.
        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])

        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print ("%s\t%s" % (name, "\t".join(map(str,
                [ranks[method][name] for method in methods]))))

        # Sort by random forest importance, best first, and keep the top 5.
        ranks_f = pd.DataFrame(ranks)
        ranks_f.sort_values("RF", axis=0, ascending=False, inplace=True)
        print(ranks_f)

        featureset = ranks_f.index.values[0:5]
        print(featureset)
        return(featureset)

    print ("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")

    # Set up the genetic algorithm: an individual is a list of 0/1 genes, one
    # per candidate input column.
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(list_inputs))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    def evalOneMax(individual):
        return sum(individual),

    toolbox.register("evaluate", evalOneMax)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    NPOPSIZE = 50 # random starting pool size
    population = toolbox.population(n=NPOPSIZE)

    Y_train=df[output_var]
    # Maps "<gini>;<bit string>" to the individual that produced it.
    dic_gini={}

    def gini_of(key):
        """Return the Gini value encoded at the front of a ``dic_gini`` key."""
        return float(key.split(';')[0])

    def assess(individual):
        """Score ``individual`` with the training Gini and record it."""
        var_model = [col for col, gene in zip(list_inputs, individual) if gene==1]
        if not var_model:
            # A model cannot be fitted on zero columns; skip this candidate.
            return
        X_train=df[var_model]

        # CHANGE_HERE: swap in your own modelling technique if needed.
        model=sm.Logit(Y_train, X_train).fit()
        Y_predict=model.predict(X_train)

        # CHANGE_HERE: this is the development (training) Gini; an
        # out-of-time Gini, or sqrt(dev_gini * oot_gini), would be a better
        # selection criterion.
        fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
        auc = metrics.auc(fpr, tpr)
        gini_power = abs(2*auc-1)

        gini=str(gini_power)+";"+str(individual).replace('[','').replace(', ','').replace(']','')
        dic_gini[gini]=individual

    def ranked_keys():
        """Return the ``dic_gini`` keys ordered from best to worst Gini."""
        # Compare the numeric Gini first; the raw key only breaks ties. A
        # purely textual sort misplaces values such as '1e-05'.
        return sorted(dic_gini, key=lambda key: (gini_of(key), key), reverse=True)

    # Assess the Gini of the starting pool.
    for individual in population:
        assess(individual)
    list_gini=ranked_keys()

    # Main loop: repeat until the summed Gini of the pool stops improving,
    # at least a little, relative to the two previous generations.
    sum_current_gini=0.0
    sum_current_gini_1=0.0
    sum_current_gini_2=0.0
    OK = 1
    a=0
    while OK:
        a=a+1
        print('loop ', a)
        OK=0

        # Generate offspring: crossover probability 50%, mutation 10%.
        offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
        fits = toolbox.map(toolbox.evaluate, offspring)
        for fit, ind in zip(fits, offspring):
            ind.fitness.values = fit
        population =toolbox.select(offspring, k=len(population))

        sum_current_gini_2=sum_current_gini_1
        sum_current_gini_1=sum_current_gini
        sum_current_gini=0.0

        # Assess only the offspring that have not been scored before.
        for individual in population:
            if individual not in dic_gini.values():
                assess(individual)

        # Keep the best candidates among everything scored so far.
        list_gini=ranked_keys()
        population=[]
        for key in list_gini[:NPOPSIZE]:
            population.append(dic_gini[key])
            sum_current_gini+=gini_of(key)

        print ('sum_current_gini=', sum_current_gini, 'sum_current_gini_1=', sum_current_gini_1, 'sum_current_gini_2=', sum_current_gini_2)
        if(sum_current_gini>sum_current_gini_1+0.0001 or sum_current_gini>sum_current_gini_2+0.0001):
            OK=1

    # Decode the best candidate: one '0'/'1' character per input column.
    gini_max=list_gini[0]
    gini=gini_of(gini_max)
    features=gini_max.split(';')[1]

    # Print the list of selected features.
    f=0
    for col, gene in zip(list_inputs, features):
        if gene=='1':
            f+=1
            print('feature ', f, ':', col)
    print ('gini: ', gini)

    featureset = features
    return(featureset)
def lasso_hq(X, y, alpha=0.3):
    """Return randomized-Lasso feature scores for ``X`` against ``y``.

    A ``RandomizedLasso`` is built with the given ``alpha``, fitted on
    ``X`` and ``y``, and its ``scores_`` attribute is returned.
    """
    # Function-scope import: the estimator is only needed by this helper.
    from sklearn.linear_model import RandomizedLasso

    selector = RandomizedLasso(alpha=alpha)
    selector.fit(X, y)
    scores = selector.scores_
    return scores
    def do_rank(self):
        """Rank the features of the house data and plot their mean rank.

        Reads the CSV at ``self.data``, drops the ``date`` column and uses
        ``price`` as the regression target. Every remaining column is scored
        by six methods (randomized Lasso, RFE, linear regression, ridge,
        lasso and random forest). Each method's scores are min-max scaled and
        rounded to two decimals; the per-feature mean over the six methods is
        drawn as a bar chart and saved with ``plt.savefig``.

        Returns nothing; the only outputs are console messages and the saved
        image.
        """
        house = pd.read_csv(self.data)

        # Only the date column is removed; every other column stays a feature.
        house = house.drop(['date'], axis=1)

        Y = house.price.values
        house = house.drop(['price'], axis=1)
        # ``.values`` is used instead of ``as_matrix()``, which newer pandas
        # releases no longer provide; it yields the same 2-D array.
        X = house.values
        colnames = house.columns

        ranks = {}

        def ranking(scores, names, order=1):
            # Min-max scale one method's scores. ``order=-1`` negates them
            # first so that a smaller raw value (such as an RFE rank) ends up
            # with the larger scaled score.
            minmax = MinMaxScaler()
            scaled = minmax.fit_transform(order * np.array([scores]).T).T[0]
            return dict(zip(names, [round(x, 2) for x in scaled]))

        rlasso = RandomizedLasso(alpha=0.04)
        # long time
        rlasso.fit(X, Y)
        ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)
        print('finished')

        # RFE fits its own copy of the estimator, so ``lr`` itself is only
        # fitted once, further down, for the plain linear-regression ranking.
        lr = LinearRegression(normalize=True)
        rfe = RFE(lr, n_features_to_select=1, verbose=3)
        rfe.fit(X, Y)
        ranks["RFE"] = ranking(list(map(float, rfe.ranking_)),
                               colnames,
                               order=-1)

        # Using linear regression
        lr.fit(X, Y)
        ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)

        # Using ridge
        ridge = Ridge(alpha=7)
        ridge.fit(X, Y)
        ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

        # Using lasso
        lasso = Lasso(alpha=0.05)
        lasso.fit(X, Y)
        ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)

        # long time
        rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
        rf.fit(X, Y)
        ranks["RF"] = ranking(rf.feature_importances_, colnames)

        # Mean scaled score per feature over all methods collected so far.
        r = {}
        for name in colnames:
            r[name] = round(
                np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")

        meanplot = pd.DataFrame(list(r.items()),
                                columns=['Feature', 'Mean Ranking'])

        meanplot = meanplot.sort_values('Mean Ranking', ascending=False)

        sns.factorplot(x='Mean Ranking',
                       y='Feature',
                       data=meanplot,
                       kind='bar',
                       size=4,
                       aspect=1.9,
                       palette='coolwarm')
        plt.savefig('..\\Images\\feature_ranking.jpg')
示例#14
0
    def run(self):
        """Rank the loan features for predicting ``int_rate`` and save the table.

        The CSV produced by ``processData(...)`` is loaded, ``int_rate`` is
        taken as the target, and the columns in ``cols_to_keep`` are passed
        through ``createDummies``. The numeric columns of the result are
        scored by linear regression, ridge, lasso, randomized Lasso, random
        forest, RFE and ``f_regression``; each score vector goes through
        ``rank_to_dict`` and a per-feature mean is added as "Mean".

        The table is printed, written tab-separated to ``testing.txt``,
        re-read with pandas and finally saved as CSV at ``self.output().path``.
        """
        loanfreature_df = pd.read_csv(
            processData(loginemail=self.loginemail,
                        loginpassword=self.loginpassword).output().path,
            low_memory=False,
            encoding='ISO-8859-1')
        Y = loanfreature_df.int_rate
        loanfreature_df.drop('int_rate', axis=1, inplace=True)
        # Each column is listed exactly once: a repeated name would select
        # that column twice and duplicate its rows in the output table.
        cols_to_keep = [
            'loan_amnt', 'term', 'emp_length', 'home_ownership_category',
            'annual_inc', 'verification_status_category', 'purpose',
            'addr_state', 'dti', 'delinq_2yrs', 'last_meanfico',
            'inq_last_6mths', 'open_acc', 'revol_bal', 'revol_util',
            'total_acc', 'mths_since_last_major_derog', 'funded_amnt_inv',
            'installment', 'application_type', 'pub_rec'
        ]
        loanfreature_df = loanfreature_df[cols_to_keep]
        loanfreature_df = createDummies(loanfreature_df)

        X = loanfreature_df._get_numeric_data()
        names = ["%s" % i for i in X]
        ranks = {}

        # All model fits run with DeprecationWarning silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)

            lr = LinearRegression(normalize=True)
            lr.fit(X, Y)
            ranks["Linear reg"] = rank_to_dict((lr.coef_), names)

            ridge = Ridge(alpha=7)
            ridge.fit(X, Y)
            ranks["Ridge"] = rank_to_dict((ridge.coef_), names)

            lasso = Lasso(alpha=.05)
            lasso.fit(X, Y)
            ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

            rlasso = RandomizedLasso(alpha=0.00)
            rlasso.fit(X, Y)
            ranks["Stability"] = rank_to_dict((rlasso.scores_), names)

            rf = RandomForestRegressor()
            rf.fit(X, Y)
            ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

            # stop the search when 15 features are left (they will get equal scores)
            rfe = RFE(lr, n_features_to_select=15)
            rfe.fit(X, Y)
            ranks["RFE"] = rank_to_dict(rfe.ranking_, X.columns, order=-1)

        f_scores, pval = f_regression(X, Y, center=True)
        ranks["Corr."] = rank_to_dict(f_scores, names)

        # Per-feature mean over every method collected so far.
        r = {}
        for name in names:
            r[name] = round(
                np.mean([ranks[method][name] for method in ranks.keys()]), 2)
        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")

        header = "\t".join(methods)
        print("\t%s" % header)
        # The context manager closes the file even if a write fails.
        with open("testing.txt", 'w') as out:
            out.write(header)
            out.write("\n")
            for name in names:
                cells = [str(ranks[method][name]) for method in methods]
                # The file separates cells with " \t"; the console with "\t".
                out.write(name + "\t" + " \t".join(cells))
                out.write("\n")
                print("%s\t%s" % (name, "\t".join(cells)))
        feature = pd.read_csv('testing.txt', sep='\t')
        feature.to_csv(self.output().path)
示例#15
0
# Using Randomized Lasso
import pandas as pd
import numpy as np
from sklearn.linear_model import RandomizedLasso

dataset = pd.read_csv('ARJUNANADHI.csv')
X = dataset.iloc[:, 1:4].values
Y = dataset.iloc[:, [6,9,12]].values
# One flat target vector per crop, taken from the three selected columns.
y_paddy, y_maize, y_cereals = (Y[:, [col]].ravel() for col in range(3))

rlasso = RandomizedLasso(alpha=0.04)
mapping = {0:'Meteorological', 1:'Hydrological',2:'Agricultural '}


def _score_table(target):
    """Fit ``rlasso`` on ``X`` against ``target``.

    Returns the fitted estimator and a one-column 'Scores' table whose rows
    are relabelled through ``mapping``.
    """
    fitted = rlasso.fit(X, target)
    table = pd.DataFrame(fitted.scores_, columns=['Scores'])
    return fitted, table.rename(mapping)


fit, Paddy = _score_table(y_paddy)
Paddy.plot.bar(title='Paddy',color='g',rot=0)

fit, Maize = _score_table(y_maize)
Maize.plot.bar(title='Maize',color='y',rot=0)

fit, Cereals = _score_table(y_cereals)
Cereals.plot.bar(title='Cereals',color='c',rot=0)
示例#16
0
def machinelearningpipeline(datset, output='results.zip'):
    """Evaluate, select and model the features of a labelled CSV data set.

    Parameters
    ----------
    datset : path of the input CSV. It must contain a ``label`` column.
    output : name of the result archive. Its last four characters are cut off
        to build the CSV copy and the archive base name, so a four-character
        suffix such as ``.zip`` is expected.

    Steps, in order:
      1. Save a CSV copy of the data and write one row per column (from
         ``analyticscalc``) to ``ResultTableIndividualFeature.xlsx``.
      2. Call ``corplot`` on all features, run ``RandomizedLasso`` and call
         ``corplot`` again on the features whose score exceeds 0.8.
      3. Split into train/test, call ``calibrationplot`` and cross-validate
         four default classifiers on the training part.
      4. Grid-search an RBF SVM on standardized training data and save the
         score heat map; grid-search the forest size of a random forest and
         save that curve.
      5. Plot the learning curve of the best SVM, fit it on the training part,
         report on the test part and save the confusion-matrix figure.
      6. Copy every ``*.pdf``, ``*.csv`` and ``*.xlsx`` file of the working
         directory into ``./output/`` and zip that directory.

    Returns 0.
    """
    t0 = time()
    dataset = pd.read_csv(datset)
    dataset.to_csv(output[:-4] + '.csv')
    # Do the per-feature evaluation and write an xlsx file.
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "ResultTableIndividualFeature"
    table_header = [
        'Feature name', 'Az', 'Optimal threshold', 'Sensitivity',
        'Specificity', 'Confidence interval: low', 'Confidence interval: high'
    ]
    for column, caption in enumerate(table_header, start=1):
        ws1.cell(column=column, row=1).value = caption
    Collumnheadeers = list(dataset.columns.values)
    Targets = dataset['label'].values
    # Row 1 holds the header, so the data rows start at 2.
    for rownum, imagebiom in enumerate(Collumnheadeers, start=2):
        ValuesMetric = dataset[imagebiom].values
        row_values = analyticscalc(ValuesMetric, Targets, imagebiom)
        az, optimalval, sens, spec, confidence_lower, confidence_upper = row_values
        ws1.cell(column=1, row=rownum).value = imagebiom
        formatted = (az, optimalval, sens, spec, confidence_lower,
                     confidence_upper)
        for column, value in enumerate(formatted, start=2):
            ws1.cell(column=column, row=rownum).value = "{:0.3f}".format(value)
    wb.save(filename='ResultTableIndividualFeature.xlsx')
    # Single-argument print calls behave the same on Python 2 and 3.
    print(dataset)
    print(Collumnheadeers)
    # Create and save correlation plots.
    ## Get labels
    y = dataset['label'].values
    ## Delete the label from the feature list
    Collumnheadeers.remove('label')
    X = dataset[Collumnheadeers]
    corplot(X, filesavename='all.pdf')
    # Perform feature selection
    rlasso = RandomizedLasso(alpha=0.00025)
    rlasso.fit(X, y)
    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                     Collumnheadeers),
                 reverse=True))
    elementselect = np.where(rlasso.scores_ > 0.8)[0]
    print(elementselect + 1)
    print(Collumnheadeers)
    Collumnheadeersel = [Collumnheadeers[i] for i in elementselect]
    corplot(X[Collumnheadeersel], filesavename='selected.pdf')
    # Optimize and evaluate classifiers
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # Compare classifiers
    calibrationplot(X_train,
                    X_test,
                    y_train,
                    y_test,
                    filesavename='calibrationplot.pdf')
    X = X_train.copy()
    y = y_train.copy()
    # Run a quick example on non-optimal classifiers
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    clf4 = SVC()
    print('5-fold cross validation:\n')
    for clf, label in zip(
        [clf1, clf2, clf3, clf4],
        ['Logistic Regression', 'Random Forest', 'naive Bayes', 'SVM']):
        cv_scores = cross_validation.cross_val_score(clf,
                                                     X,
                                                     y,
                                                     cv=5,
                                                     scoring='roc_auc',
                                                     n_jobs=1)
        print("roc_auc: %0.2f (+/- %0.2f) [%s]" %
              (cv_scores.mean(), cv_scores.std(), label))
    # SVM
    scaler = StandardScaler()
    X1 = scaler.fit_transform(X)
    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='rbf'),
                        param_grid=param_grid,
                        cv=cv,
                        scoring='roc_auc')
    grid.fit(X1, y)
    svm_scores = [x[1] for x in grid.grid_scores_]
    svm_scores = np.array(svm_scores).reshape(len(C_range), len(gamma_range))
    # Draw a heat map of the grid-search score as a function of gamma and C
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(svm_scores, interpolation='nearest', cmap=plt.cm.jet)
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.savefig('SVMheatmap.pdf', dpi=300)
    print("The best parameters are %s with a score of %0.2f" %
          (grid.best_params_, grid.best_score_))

    # Check out random forest accuracy...
    # The same grid is searched below and used as the x axis of the plot.
    n_estimators_grid = [
        1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600,
        700, 800, 900, 1000
    ]
    scoring_names = ['roc_auc']  #['precision_weighted', 'recall_weighted','roc_auc']
    Random_plot = []
    tuned_parameters = [{'n_estimators': n_estimators_grid}]
    for score in scoring_names:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(RandomForestClassifier(),
                           tuned_parameters,
                           cv=5,
                           n_jobs=40,
                           scoring='%s' % score)
        clf.fit(X, y)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, fold_scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" %
                  (mean_score, fold_scores.std() * 2, params))
            Random_plot.append(mean_score)
    param = clf.best_params_

    f, ax = plt.subplots(figsize=(20, 20))
    plt.plot(n_estimators_grid, Random_plot, lw=2)
    plt.title(
        "The best parameter is n_estimators=%s with area under ROC of %0.2f" %
        (param.get("n_estimators"), clf.best_score_),
        fontweight='bold')
    plt.xlabel('Numer of estimators', fontweight='bold')
    plt.ylabel('Area Under ROC (Az)', fontweight='bold')
    plt.savefig('RandomForrest.pdf', tight_layout=True, dpi=600)
    f, ax = plt.subplots(figsize=(20, 20))
    title = 'Learning Curves (SVM)'
    param = grid.best_params_
    estimator = SVC(kernel='rbf', C=param.get("C"), gamma=param.get("gamma"))
    print(y_train)
    cv = cross_validation.ShuffleSplit(X.shape[0],
                                       n_iter=10,
                                       test_size=0.2,
                                       random_state=0)
    plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)
    plt.savefig('LearningCurvesSVM.pdf', tight_layout=True, dpi=600)
    print(str((time() - t0)))
    estimator.fit(X, y)
    y_true, y_pred = y_test, estimator.predict(X_test)
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    plt.figure()
    # Normalize the confusion matrix by row (i.e by the number of samples
    # in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)
    plt.figure()
    plot_confusion_matrix(y_test, y_pred)
    plt.savefig('ConfusionMatrixSVM.pdf', tight_layout=True, dpi=600)
    # Gather every result file of the working directory and archive it.
    path_ = os.getcwd()
    directory = path_ + '/output/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    types = ('*.pdf', '*.csv', '*.xlsx')  # the tuple of file types
    files_grabbed = []
    for pattern in types:
        files_grabbed.extend(glob.glob(pattern))
    for result_file in files_grabbed:
        if os.path.isfile(result_file):
            shutil.copy2(result_file, directory)
    shutil.make_archive(output[:-4], 'zip', directory)
    return 0
def _gene_string(individual):
    """Join the 0/1 genes of one individual into a string such as '1011'."""
    return ''.join(str(gene) for gene in individual)


def _development_gini(df, output_var, input_names, individual):
    """Fit a Logit on the inputs switched on in `individual` and return its Gini on `df`."""
    var_model = [name for name, gene in zip(input_names, individual) if gene == 1]
    if not var_model:
        # No input selected: there is nothing to fit, so no discriminating power.
        return 0.0

    X_train = df[var_model]
    Y_train = df[output_var]

    ######
    # CHANGE_HERE: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
    ######
    model = sm.Logit(Y_train, X_train).fit()
    Y_predict = model.predict(X_train)

    ######
    # CHANGE_HERE: THIS USES THE DEVELOPMENT GINI TO SELECT VARIABLES. YOU SHOULD USE A DIFFERENT GINI,
    # EITHER THE OOT GINI OR SQRT(DEV_GINI*OOT_GINI).
    ######
    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
    auc = metrics.auc(fpr, tpr)
    return abs(2 * auc - 1)


def feature_selection(df,dfo,target_column,id_column):
    """
    Select features for predicting the output column of `df`.

    df = The training dataframe
    dfo = The test dataframe (not used by the current implementation)
    target_column = The column containing the target variable
    id_column = The column containing the id variable (not used by the current implementation)

    Columns are classified by name prefix: `ib_`, `icn_` and `ico_` columns are
    the candidate inputs, `if_` columns are only reported, and an `ob_` column
    replaces `target_column` as the output.

    Based on the output column type (binary or numeric), it decides on the type
    of problem we are trying to solve.
    If the output column is binary (0/1), a genetic algorithm searches for the
    input subset whose logistic regression has the highest Gini. The result is
    a string with one '0'/'1' character per candidate input, in the order the
    candidates appear in `df.columns`.
    Otherwise the candidate inputs are ranked by several regression methods and
    the index values of the first five rows of the rank table, sorted by the
    random-forest score in descending order, are returned.
    """
    print("IDENTIFYING TYPES...")
    # Candidate inputs are kept in dataframe column order so that gene
    # positions map to the same columns on every run.
    input_names = []
    output_var = target_column

    for var_name in df.columns:
        if var_name.startswith('ib_'):
            description = "is input binary"
        elif var_name.startswith('icn_'):
            description = "is input categorical nominal"
        elif var_name.startswith('ico_'):
            description = "is input categorical ordinal"
        elif var_name.startswith('if_'):
            # Continuous inputs are reported but are not candidates.
            print (var_name,"is input numerical continuos (input float)")
            continue
        elif var_name.startswith('ob_'):
            output_var = var_name
            continue
        else:
            print ("ERROR: unable to identify the type of:", var_name)
            continue
        print (var_name, description)
        if var_name not in input_names:
            input_names.append(var_name)

    if (df[output_var].isin([0,1]).all()):
        method_type = 'categorical'
    else:
        method_type = 'numerical'

    print(method_type)

    if method_type == "categorical":
        print ("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")

        #####
        # SETTING UP THE GENETIC ALGORITHM AND THE STARTING POOL (STARTING CANDIDATE POPULATION)
        #####
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(input_names))
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        def evalOneMax(individual):
            return sum(individual),

        toolbox.register("evaluate", evalOneMax)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        NPOPSIZE = 50 #RANDOM STARTING POOL SIZE
        population = toolbox.population(n=NPOPSIZE)

        # gene string -> (gini, individual) for every individual evaluated so far
        scored = {}

        def score_new(individuals):
            # Evaluate only the gene combinations that have not been seen yet.
            for individual in individuals:
                genes = _gene_string(individual)
                if genes not in scored:
                    gini_power = _development_gini(df, output_var, input_names, individual)
                    scored[genes] = (gini_power, individual)

        def ranked():
            # Best first. The Gini is compared as a number; the gene string
            # only breaks ties.
            entries = [(gini_power, genes, individual)
                       for genes, (gini_power, individual) in scored.items()]
            entries.sort(key=lambda entry: entry[:2], reverse=True)
            return entries

        #####
        # ASSESSING GINI ON THE STARTING POOL
        #####
        score_new(population)

        #####
        # GENETIC ALGORITHM MAIN LOOP - START
        # - ITERATING UNTIL NO IMPROVEMENT HAPPENS IN ORDER TO FIND THE OPTIMAL SET OF VARIABLES
        #####
        sum_current_gini=0.0
        sum_current_gini_1=0.0
        sum_current_gini_2=0.0
        generation = 0
        improving = True
        while improving:  #REPEAT UNTIL THE GINI DOES NOT IMPROVE, AT LEAST A LITTLE, IN 2 GENERATIONS
            generation += 1
            print('loop ', generation)

            # GENERATING OFFSPRING: CROSS-X PROBABILITY = 50%, MUTATION PROBABILITY = 10%
            offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
            fits = toolbox.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            population = toolbox.select(offspring, k=len(population))

            # ASSESSING GINI ON THE OFFSPRING
            score_new(population)

            # SELECTING THE BEST FITTED AMONG EVERY INDIVIDUAL EVER CREATED
            ranking = ranked()[:NPOPSIZE]
            population = [individual for gini_power, genes, individual in ranking]

            sum_current_gini_2=sum_current_gini_1
            sum_current_gini_1=sum_current_gini
            sum_current_gini=0.0
            for gini_power, genes, individual in ranking:
                sum_current_gini+=gini_power

            #HAS THE GINI IMPROVED AT LEAST A LITTLE IN THE LAST 2 GENERATIONS
            print ('sum_current_gini=', sum_current_gini, 'sum_current_gini_1=', sum_current_gini_1, 'sum_current_gini_2=', sum_current_gini_2)
            improving = (sum_current_gini>sum_current_gini_1+0.0001 or sum_current_gini>sum_current_gini_2+0.0001)
        #####
        # GENETIC ALGORITHM MAIN LOOP - END
        #####

        gini, features, best_individual = ranking[0]

        ####
        # PRINTING OUT THE LIST OF FEATURES
        #####
        f=0
        for name, gene in zip(input_names, features):
            if gene=='1':
                f+=1
                print('feature ', f, ':', name)
        print ('gini: ', gini)

        featureset = features

    else:
        ####
        # RANKING THE INPUTS FOR A NUMERICAL OUTPUT
        ####
        # NOTE(review): this branch previously read a feature list that only
        # the genetic algorithm builds, so it could not run for a numerical
        # output. All identified inputs are ranked instead; confirm that this
        # is the intended feature set.
        X_train=df[input_names]
        Y_train=df[output_var]

        names = list(X_train)
        ranks = {}

        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        rf = RandomForestRegressor()
        rf.fit(X_train,Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        f, pval  = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        # Per-feature mean over every method collected so far.
        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])

        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print ("%s\t%s" % (name, "\t".join(map(str,
                [ranks[method][name] for method in methods]))))

        ranks_f = pd.DataFrame(ranks)
        # Highest random-forest score first.
        ranks_f.sort_values("RF", axis=0, ascending=False, inplace=True)

        print(ranks_f)

        featureset = ranks_f.index.values[0:5]

        print(featureset)

    return featureset
示例#18
0
文件: main.py 项目: valeman/AutoLearn
def stable(ress, test, labels):  # ress is training data
    """Two-phase feature selection; writes the selected columns to CSV files.

    Phase 1 fits a ``RandomizedLasso`` on ``ress``/``labels`` and keeps the
    columns whose score, rounded to four decimals, exceeds 0.1. Those columns
    of ``ress`` and ``test`` are written to
    ``sonar_stable_trainfeatures.csv`` / ``sonar_stable_testfeatures.csv``.

    Phase 2 scores the kept columns with ``mutual_info_classif`` and keeps
    those whose rounded score is above 0. They are written to
    ``sonar_ensemble_trainfeatures.csv`` / ``sonar_ensemble_testfeatures.csv``.

    ``ress`` and ``test`` are indexed as ``array[:, columns]``, so they are
    expected to be 2-D arrays with the same column layout. The module-level
    counters ``nc_val``, ``stable_val`` and ``ensemble_val`` are increased by
    the number of input, phase-1 and phase-2 columns respectively.
    """
    global nc_val, stable_val, ensemble_val

    def save_csv(path, data):
        # Replace any earlier output file with the new matrix.
        if os.path.exists(path):
            os.remove(path)
        with open(path, "wb") as myfile:
            np.savetxt(myfile, data, delimiter=",", fmt="%s")

    x, y = ress.shape
    names = np.arange(y)
    rlasso = RandomizedLasso()
    rlasso.fit(ress, labels)

    # (score, column index) pairs, best score first
    val = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names),
                 reverse=True)

    print("len of val")  # newly constructed features
    print(len(val))
    nc_val += len(val)

    # 0.1 is eta for stability selection; finale holds the kept column indices
    finale = [s for r, s in val if r > 0.1]

    print("Total features after stability selection:")
    print(len(finale))
    stable_val += len(finale)

    dataset1 = ress[:, finale]
    dataset3 = test[:, finale]

    save_csv("sonar_stable_testfeatures.csv", dataset3)
    save_csv("sonar_stable_trainfeatures.csv", dataset1)

#-----------------------------------------------------------------------------------
# check the inter-feature dependence - 2nd phase of ensemble

    ress_new = SelectKBest(mutual_info_classif, k='all')
    ress_new.fit(ress[:, finale], labels)

    # scores_ follows the column order of ress[:, finale], so every score is
    # paired with the original column index stored in finale. Pairing it with
    # 0..n-1 would pick unrelated columns below.
    feats = sorted(zip(map(lambda x: round(x, 4), ress_new.scores_), finale),
                   reverse=True)

    # 0 is eta-o; ensemble_finale holds the columns kept by the second phase
    ensemble_finale = [s for r, s in feats if r > 0]

    print("Total features after 2 phase selection:")
    print(len(ensemble_finale))
    ensemble_val += len(ensemble_finale)

    dataset2 = ress[:, ensemble_finale]
    dataset4 = test[:, ensemble_finale]

    save_csv("sonar_ensemble_testfeatures.csv", dataset4)
    save_csv("sonar_ensemble_trainfeatures.csv", dataset2)
	Lasso picks out the top performing features, while forcing other features to be close to zero. 
	It is useful when reducing the number of features is required.
	"""
    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names_hashingfile)
    """
	Stability selection applies a feature selection algorithm on different subsets of data and with different subsets of features. 
	After repeating the process a number of times, the selection results can be aggregated, for example by checking how many times 
	a feature ended up being selected as important when it was in an inspected feature subset. We can expect strong features to have 
	scores close to 100%, since they are always selected when possible. Weaker, but still relevant features will also have non-zero 
	scores, since they would be selected when stronger features are not present in the currently selected subset, while irrelevant 
	features would have scores (close to) zero, since they would never be among selected features.
	"""
    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_),
                                      names_hashingfile)
    """
	Recursive feature elimination is a greedy optimization based on the idea to repeatedly construct a model and choose either the 
	best or worst performing feature setting the feature aside and then repeating the process with the rest of the features.
	We have constructed the model using Linear Regression.
	"""
    rfe = RFE(
        lr, n_features_to_select=topk
    )  #stop the search when topk features are left (they will get equal scores)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)),
                                names_hashingfile,
                                order=-1)
    """
示例#20
0
## RandomizedLasso, feature stability selection
# Script: score features with RandomizedLasso (using a grid of alphas derived
# from a cross-validated LassoLars fit), then with ExtraTreesClassifier
# importances. Relies on X, y, df_merge3, np and pd being defined earlier.
from sklearn.linear_model import (RandomizedLasso, lasso_stability_path,
                                  LassoLarsCV)
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence UserWarning / ConvergenceWarning only while the LassoLarsCV fit runs.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    warnings.simplefilter('ignore', ConvergenceWarning)
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

# Six evenly spaced alphas from lars_cv.alphas_[0] down to one tenth of it.
alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
# Every column name except the last one
# (presumably the last column is the target -- TODO confirm).
names = df_merge3.columns.tolist()[:-1]
# Print (score rounded to 4 decimals, name) pairs, highest score first.
print(sorted(zip(map(lambda x: round(x, 4), clf.scores_), names),
             reverse=True))

from sklearn.ensemble import ExtraTreesClassifier

# NOTE: `clf` is rebound here; the RandomizedLasso model above is no longer
# reachable under that name.
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
df_tree = pd.DataFrame(clf.feature_importances_)
df_tree['fea_index'] = df_merge3.columns.tolist()[:-1]
# Rename both columns: importances -> "weight", names -> "feature_index".
df_tree.columns = ["weight", "feature_index"]
# The ten rows with the largest weight. The result is not assigned or printed,
# so it is only visible when run as the last expression of a notebook cell.
df_tree.sort_values("weight").tail(10)

#model = SelectFromModel(lsvc, prefit=True)
#X_new = model.transform(X)
#X_new.shape
#4 Two top-level feature selection algorithms

#4.1 Stability selection  [0,1]
#The main idea is to run a feature selection algorithm on different subsets of the data and of the features,
#repeat this many times, and finally aggregate the selection results,
#e.g. by counting how often a feature was judged important (number of times it was selected divided by the
#number of times a subset containing it was tested).

from sklearn.linear_model import RandomizedLasso  #randomized Lasso
from sklearn.datasets import load_boston
boston = load_boston()
#using the Boston housing data.
#Data gets scaled automatically by sklearn's implementation
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]
rlasso = RandomizedLasso(alpha=0.025) #alpha is passed explicitly as 0.025 (NOTE(review): the original comment said alpha is chosen automatically, which does not match this call)
rlasso.fit(X, Y)
print "Features sorted by their score:"      #scores come from rlasso.scores_
print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)

#Conclusion (original author's note): good features do not get a score of 0 merely because similar or
#correlated features exist, which differs from Lasso.
#For feature selection tasks, stability selection is often among the best performing methods across many
#datasets and settings.


#4.2 Recursive feature elimination (RFE)    greedy search for the best feature subset
#Repeatedly build a model (e.g. an SVM or a regression model), pick the best (or worst) feature (e.g. by
#coefficient), set it aside,
#then repeat on the remaining features until all features have been visited. The order in which features
#are eliminated is the feature ranking.

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression,Ridge
boston = load_boston()
X = boston["data"]
示例#22
0
def feature_importance(
        df,
        train,
        target,
        dummies=[],
        fill_na=-999,
        methods=['rlasso', 'RFE', 'LinReg', 'Ridge', 'Lasso', 'RF', 'GBM']):
    """Rank features with several models and plot the mean ranking.

    Each requested method produces one score per feature; the scores are
    min-max scaled to [0, 1], rounded to two decimals and collected in a
    table together with their per-feature mean. The table is displayed as
    HTML and the mean ranking is drawn as a bar plot.

    Args:
        df: DataFrame holding both the feature columns and the target.
        train: Column labels of ``df`` to use as features.
        target: Column label of the target variable.
        dummies: Feature columns to one-hot encode explicitly. Columns of
            dtype ``object`` are always encoded, whether listed or not.
        fill_na: Value used to replace missing feature values.
        methods: Names of the rankers to run (matched case-insensitively):
            'rlasso', 'RFE', 'LinReg', 'Ridge', 'Lasso', 'RF', 'GBM'.

    Returns:
        None. The results are displayed and plotted only.

    Raises:
        ValueError: If none of the requested methods is recognised.
    """
    # Case-insensitive lookup of the requested methods. The list defaults in
    # the signature are only read, never mutated.
    requested = {name.lower() for name in methods}

    # target
    Y = df[target].values

    # keep only the training columns and fill missing values
    df = df[train].fillna(fill_na)

    def expand_dummies(frame, column):
        # One-hot encode `column` (first level dropped) and replace the
        # source column with the encoded ones. The original code read
        # `frame.x` (an attribute literally named "x") and left the source
        # column in place, so string values ended up in the model matrix.
        encoded = pd.get_dummies(frame[column],
                                 prefix=column + '_').iloc[:, 1:]
        return pd.concat([frame.drop(column, axis=1), encoded], axis=1)

    # explicitly requested dummies
    for column in dummies:
        df = expand_dummies(df, column)

    # automatically detected categorical (object dtype) variables
    for column in list(df.select_dtypes(include=['object']).columns):
        df = expand_dummies(df, column)

    # get all attributes names
    colnames = df.columns

    # attributes
    X = df.values

    # Define dictionary to store our rankings
    ranks = {}

    def ranking(scores, names, order=1):
        # Scale the scores to [0, 1] (order=-1 flips the direction so that a
        # smaller raw value ranks higher) and map them to the feature names.
        minmax = MinMaxScaler()
        scaled = minmax.fit_transform(order * np.array([scores]).T).T[0]
        return dict(zip(names, [round(value, 2) for value in scaled]))

    # Randomized Lasso (stability selection)
    if 'rlasso' in requested:
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X, Y)
        ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)

    # Recursive Feature Elimination ( RFE )
    if 'rfe' in requested:
        # Construct our Linear Regression model
        lr = LinearRegression(normalize=True)
        lr.fit(X, Y)
        # stop the search when only the last feature is left
        rfe = RFE(lr, n_features_to_select=1, verbose=0)
        rfe.fit(X, Y)
        ranks['RFE'] = ranking(list(map(float, rfe.ranking_)),
                               colnames,
                               order=-1)

    # Linear Model Feature Ranking
    if 'linreg' in requested:
        lr = LinearRegression(normalize=True)
        lr.fit(X, Y)
        ranks['LinReg'] = ranking(np.abs(lr.coef_), colnames)

    # Using Ridge
    if 'ridge' in requested:
        ridge = Ridge(alpha=7)
        ridge.fit(X, Y)
        ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

    # Using Lasso
    if 'lasso' in requested:
        lasso = Lasso(alpha=.05)
        lasso.fit(X, Y)
        ranks['Lasso'] = ranking(np.abs(lasso.coef_), colnames)

    # Random forest
    if 'rf' in requested:
        # The original dict listed 'max_features' twice (0.3, then 'sqrt');
        # the later entry won, so only 'sqrt' is kept.
        rf_params = {
            'n_jobs': -1,
            'n_estimators': 100,
            'warm_start': True,
            'max_depth': 3,
            'min_samples_leaf': 2,
            'max_features': 'sqrt',
            'random_state': 100,
            'verbose': 0
        }
        rf = RandomForestRegressor(**rf_params)
        rf.fit(X, Y)

        ranks['RF'] = ranking(rf.feature_importances_, colnames)

    # Gradient Boosting Machine
    if 'gbm' in requested:
        gbm_params = {
            'nthread': -1,
            'colsample_bytree': 0.4,
            'gamma': 0,
            'reg_alpha': 0.75,
            'reg_lambda': 0.45,
            'subsample': 0.6,
            'learning_rate': 0.07,
            'max_depth': 3,
            'min_child_weight': 1.5,
            'n_estimators': 100,
            'seed': 100
        }

        gbm = xgb.XGBRegressor(**gbm_params)
        gbm.fit(X, Y)
        ranks['GBM'] = ranking(gbm.feature_importances_, colnames)

    if not ranks:
        raise ValueError(
            "no known ranking method requested: %r" % (methods,))

    # Mean of all method scores per feature (computed before "Mean" itself
    # is added to `ranks`).
    r = {
        name: round(np.mean([ranks[method][name] for method in ranks]), 2)
        for name in colnames
    }
    ranks["Mean"] = r

    matrix_importance = pd.DataFrame(ranks)

    print(matrix_importance.columns)

    # change the display order of cols; only columns that were actually
    # computed are selected so a subset of methods does not raise KeyError
    ordered_cols = [
        'rlasso/Stability', 'LinReg', 'Lasso', 'Ridge', 'RFE', 'GBM', 'RF',
        'Mean'
    ]
    matrix_importance = matrix_importance[[
        col for col in ordered_cols if col in matrix_importance.columns
    ]]

    # display the summary table
    display(HTML(matrix_importance.to_html()))

    # Put the mean scores into a Pandas dataframe
    meanplot = pd.DataFrame(list(r.items()),
                            columns=['Feature', 'Mean Ranking'])

    # Sort the dataframe
    meanplot = meanplot.sort_values('Mean Ranking', ascending=False)

    # Let's plot the ranking of the features
    sns.factorplot(x="Mean Ranking",
                   y="Feature",
                   data=meanplot,
                   kind="bar",
                   size=14,
                   aspect=1.9,
                   palette='coolwarm')
示例#23
0
    for train_index, test_index in skf:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train.shape)
        if feature_selection == "randomized_lasso":
            feature_selector = RandomizedLasso(sample_fraction=0.5,
                                               n_resampling=50,
                                               verbose=False,
                                               n_jobs=-1)
        elif feature_selection == "RFECV_linearSVM":
            #            print(feature_selection % "selected")
            feature_selector = RFECV(SVC(kernel="linear"),
                                     step=1,
                                     cv=StratifiedKFold(y, 5),
                                     scoring="accuracy")
        else:
            print("Options are: randomized_lasso, RFECV_linearSVM")

        feature_selector.fit(X_train, y_train)
        result = {
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'feature_selector': feature_selector
        }
        list_dicts.append(result)

    dict_for_attribute[attribute] = list_dicts
    print("done in %0.3fs" % (time() - t0))
'from_this_person_to_poi', 'shared_receipt_with_poi','from_poi_fraction','to_poi_fraction',\
'tot_to_salary','tot_to_bonus','restr_to_total']
# Script: build the feature matrix for features_list, score every feature
# with RandomizedLasso and keep those scoring above 0.3.
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

#SCALE FEATURES:
#For RandomForest and DecisionTree, scaling is not necessary. 

#scaler = MinMaxScaler()
#features = scaler.fit_transform(features)


#Stability Selection:
#http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
rlasso = RandomizedLasso(random_state=2)
rlasso.fit(features,labels)
scores = rlasso.scores_
print scores

# Print each score next to its feature name. The index is shifted by one:
# scores[j] is paired with features_list[j+1]
# (presumably features_list[0] is the label 'poi' -- TODO confirm).
for j in range(len(scores)):
    print features_list[j+1],": ",scores[j]
    
# The selected list always starts with 'poi', followed by every feature
# whose score is strictly greater than 0.3.
features_list_selected = ['poi']
for j in np.where(scores > 0.3)[0]:
    features_list_selected.append(features_list[j+1])


print "-------------Selected features:-------------"
print features_list_selected

# Rebuild the data array using only the selected features.
data = featureFormat(data_dict, features_list_selected)
from sklearn.cross_validation import train_test_split
from scipy import io as sio
from tensorflow.python.framework import ops
from dfs2 import DeepFeatureSelectionNew
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize

# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat("/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")

# Feature matrix (normalised along axis 0), first row of the label array,
# and the column names stored in the .mat file.
inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)
inputY = ourdataB['Y'][0, :]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(
    inputX, inputY, test_size=0.2, random_state=42)

# Fit the selector on the training split only.
randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)

featureMask = randomized_lasso.get_support()

# Apply the same feature mask to both splits. The original sliced X_train
# for the test matrix as well, so the test split was never reduced.
X_train_lasso = X_train[:, featureMask]
X_test_lasso = X_test[:, featureMask]

# Names of the selected columns; the value is not stored or printed, so it
# is only visible in an interactive session.
columnNames[0][:100][featureMask]

# The original dict used the key 'X_train_lasso' twice, so only the second
# value was saved and no 'X_test_lasso' entry existed.
sio.savemat('RandomLasso-result', {
    'X_train_lasso': X_train_lasso,
    'X_test_lasso': X_test_lasso,
    'featureMask': featureMask,
})
示例#26
0
def main():
    """Select features with RandomizedLasso, train a tree, write predictions.

    Loads the data with ``get_data('data')``, fits a RandomizedLasso on the
    first ``train_size`` rows, reduces both the training rows and the quiz
    data to the selected features, trains a DecisionTreeClassifier, prints
    the training accuracy and stores the quiz predictions with
    ``store_csv(..., "prediction")``. Timing information is printed along
    the way.

    Returns:
        None.
    """
    start = time.time()
    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size
    data, test_data = get_data('data')
    X = data[0:train_size, 0:-1]
    y = [lbl for lbl in data[0:train_size, -1]]
    print(X.shape)
    print(len(y))

    def accuracy(y_true, y_pred):
        # Fraction of matching labels. The original counters started at 1
        # instead of 0, which inflated every reported value by 1/len.
        hits = sum(yi == y_hati for yi, y_hati in zip(y_true, y_pred))
        return hits / float(len(y_true))

    # use randomized lasso for feature selection
    phase_start = time.time()
    clfR = RandomizedLasso(alpha='aic',
                           scaling=0.5,
                           sample_fraction=0.75,
                           n_resampling=200,
                           selection_threshold=0.25,
                           fit_intercept=True,
                           verbose=False,
                           normalize=True,
                           precompute='auto',
                           max_iter=500,
                           eps=2.2204460492503131e-16,
                           random_state=None,
                           n_jobs=1,
                           pre_dispatch='3*n_jobs',
                           #memory=Memory(cachedir=None)
                           )
    # fit regression
    clfR.fit(X, y)

    # Transform train data to the selected features. The copy is a small
    # hack to avoid an "assignment destination is read-only" error.
    X = clfR.transform(np.array(X).copy())
    ## transform quiz dataset the same way
    test_data = clfR.transform(np.array(test_data).copy())

    print('Dimensions after feature Reduction: ' + str(X.shape))
    # The original printed `duration` here, before it was ever assigned,
    # which raised UnboundLocalError. Each phase is now timed on its own.
    print("Elapsed Time For Feature Reduction: " +
          str(time.time() - phase_start))

    # Training classifier
    phase_start = time.time()
    clf1 = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_features=None,
                                  random_state=None,
                                  max_leaf_nodes=None,
                                  class_weight=None,
                                  presort=False)

    clf1.fit(X, y)
    print("Elapsed Time For Classifier Training: " +
          str(time.time() - phase_start))

    # predict & calculate training accuracy
    y_hat = clf1.predict(X)
    print("train: " + str(accuracy(y, y_hat)))

    # validation data - calculate validation accuracy
    val_start = train_size
    val_end = train_size + val_size

    # TODO: put this back in
    # NOTE: val_size == MAX_TRAIN_SIZE - train_size, so this condition is
    # never true and validation is currently disabled.
    if MAX_TRAIN_SIZE - train_size > val_size:
        print("Beginning test validation...")
        # The classifier was trained on the reduced feature set, so the
        # validation rows must go through the same selector.
        X_val = clfR.transform(np.array(data[val_start:val_end, 0:-1]).copy())
        y_val = [lbl for lbl in data[val_start:val_end, -1]]
        y_val_hat = clf1.predict(X_val)
        print("val: " + str(accuracy(y_val, y_val_hat)))

    # quiz data
    print("Beginning quiz validation...")
    # test_data = get_data('quiz')
    X_test = test_data[:, :]
    print(X_test.shape)
    y_test_hat = clf1.predict(X_test)
    store_csv(y_test_hat, "prediction")
    end = time.time()
    duration = end - start
    print("Took this many seconds: " + str(duration))
示例#27
0
def regression(file_name):
    """Score every feature of a CSV dataset with several regressors.

    Reads ``file_name``, drops rows containing NaN or infinite values,
    min-max scales the features to [0, 1], splits off a training set
    (``train_size=0.3``) and writes one row of per-feature scores per
    method to ``Ranks_reg.csv``. Every row lists its values in the same
    column order as the header. Finally prints the elapsed time as
    ``H:MM:SS``.

    Fixes relative to the previous version:
      * ``df.replace`` and ``scaler.fit_transform`` return new objects; the
        results were discarded, so infinities were kept and no scaling was
        applied.
      * The "Select K Best" row was sorted by score, so its values did not
        line up with the feature names in the header.
      * Output files are now closed via ``with`` even when a write fails.

    Args:
        file_name: Path of a CSV file with numeric columns and a column
            named ``Target``.

    Returns:
        None. Results go to ``Ranks_reg.csv`` and stdout.
    """
    import time
    startTime = time.time()

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn.linear_model import (Lasso, LinearRegression,
                                      RandomizedLasso, Ridge)
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    out_path = "Ranks_reg.csv"

    def write_row(label, values, mode='a'):
        # Write "<label>,<v1>,<v2>,...\n" to the result file.
        with open(out_path, mode) as fp:
            fp.write(label + ',' + ",".join(str(v) for v in values) + '\n')

    df = pd.read_csv(file_name)
    # Turn +/-inf into NaN so that dropna() removes those rows as well.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna()
    df = df.astype(float)
    y = df['Target'].values
    X = df.drop(['Target'], axis=1)
    feature_names = X.columns.tolist()

    # Scale the features, keeping a DataFrame so column labels survive.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = pd.DataFrame(scaler.fit_transform(X),
                     columns=X.columns,
                     index=X.index)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.3,
                                                        random_state=33)

    # ------------------------- SELECT K BEST -------------------------
    test = SelectKBest(score_func=f_regression, k=2)
    test.fit(X_train, y_train)

    # The first write truncates the file and emits the header.
    write_row('Classifiers', feature_names, mode='w')
    write_row('Select K Best', test.scores_)

    # --------------------- EXTRA TREES REGRESSOR ---------------------
    forest = ExtraTreesRegressor(n_estimators=250, random_state=0)
    forest.fit(X_train, y_train)
    write_row('Extra Trees Regressor', forest.feature_importances_)

    # -------------------- RANDOM FOREST REGRESSOR --------------------
    # Takes 186 seconds to complete (original author's measurement)
    clf = RandomForestRegressor(n_estimators=10000, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    write_row('Random Forest Regressor', clf.feature_importances_)

    # ----------------------------- RIDGE -----------------------------
    ridge = Ridge(alpha=7)
    ridge.fit(X_train, y_train)
    write_row('Ridge Regressor', np.abs(ridge.coef_))

    # ----------------------- LINEAR REGRESSION -----------------------
    lr = LinearRegression(normalize=True)
    lr.fit(X_train, y_train)
    # Label keeps its trailing space to match previously written files.
    write_row('Linear Regression ', np.abs(lr.coef_))

    # -------------------------- F_REGRESSOR --------------------------
    # Only the F statistics are written; the p-values are not used.
    f, pval = f_regression(X_train, y_train, center=True)
    write_row('F_regressor', f)

    # ----------------------------- LASSO -----------------------------
    lasso = Lasso(alpha=0.05, max_iter=5000)
    lasso.fit(X_train, y_train)
    # Label keeps its trailing space to match previously written files.
    write_row('Lasso ', np.abs(lasso.coef_))

    # ------------------------ RANDOMIZED LASSO ------------------------
    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X_train, y_train)
    write_row('Randomized Lasso', np.abs(rlasso.scores_))

    # -------------------------- CORRELATION --------------------------
    # Correlation of every feature with the target, over all cleaned rows.
    corr = [df['Target'].corr(df[name]) for name in feature_names]
    write_row('Correlation With Target', corr)

    #################
    endTime = time.time()
    final_time = endTime - startTime

    def convert(seconds):
        # Format a duration in seconds as H:MM:SS, wrapping at 24 hours.
        seconds = seconds % (24 * 3600)
        hour = seconds // 3600
        seconds %= 3600
        minutes = seconds // 60
        seconds %= 60

        return "%d:%02d:%02d" % (hour, minutes, seconds)

    n = final_time
    print(convert(n))
    for key in final_feats:
        final_inputs[x][count] = final_feats[key][x]
        count = count+1

# Script: select features with RFE (linear SVR) and score them with
# RandomizedLasso. Relies on final_inputs, final_feats and outputs being
# defined earlier in the file.
# NOTE: the comprehension variable shadows the builtin `input`.
inputs = [input for input in final_inputs.values()]

# Recursive feature elimination

svr = SVR(kernel="linear")
rfe = RFE(svr, step=1)
rfe = rfe.fit(inputs,outputs[1])
# The two bare expressions below have no effect when run as a script; they
# only display a value in an interactive session.
rfe.support_
rfe.ranking_


# selected features by RFE
# Walk the keys of final_feats in order and keep those whose position is
# flagged in rfe.support_ (assumes the key order matches the column order of
# `inputs` -- TODO confirm).
selected_features = []
count = 0
for key in final_feats.keys():
    if (rfe.support_[count] == True):
        selected_features.append(key)
    count = count + 1


# Randomized Lasso for feature selection
# NOTE: fitted against outputs[2], whereas the RFE above used outputs[1].
rlasso = RandomizedLasso(alpha=1)
rlasso.fit(inputs, outputs[2])
# Bare expression; only displayed in an interactive session.
rlasso.scores_


示例#29
0
def score_calculate(flag):
    """Score every feature with the algorithms selected by ``flag``.

    ``flag == 'whole'`` runs all algorithms: four seeded models (stability
    selection, random forest, GBDT, extra trees) are fitted for seeds 0..9
    and their rankings accumulated with ``add``; MIC, linear regression,
    ridge, lasso and RFE are run once. ``'extra'``, ``'gbdt'`` and ``'rf'``
    run only the named tree ensemble once. Any other value runs nothing.

    Reads the module-level ``data``, ``mark`` and ``names`` and appends to
    the module-level lists ``new_sta``, ``new_rf``, ``new_gbdt``, ``new_ex``.

    Returns:
        dict mapping algorithm name to its per-feature ranking dict, plus a
        "Mean" entry holding the per-feature mean rounded to 4 decimals.
    """
    # Keys are algorithm names; values map feature name -> score.
    algorithm = {}
    if flag == 'whole':
        sta_total, rf_total, gbdt_total, extra_total = {}, {}, {}, {}
        for seed in range(10):
            # stability selection
            stability = RandomizedLasso(random_state=seed)
            stability.fit(data, mark)
            sta_rank = rank_to_dict(np.abs(stability.scores_), names, cv=True)
            new_sta.append(sta_rank['白球比'])
            sta_total = add(
                sta_total,
                rank_to_dict(np.abs(stability.scores_), names, cv=True))

            # random forest
            forest = RandomForestClassifier(random_state=seed)
            forest.fit(data, mark)
            rf_rank = rank_to_dict(forest.feature_importances_, names, cv=True)
            new_rf.append(rf_rank['白球比'])
            rf_total = add(
                rf_total,
                rank_to_dict(forest.feature_importances_, names, cv=True))

            # GBDT
            boosted = GradientBoostingClassifier(random_state=seed)
            boosted.fit(data, mark)
            gbdt_rank = rank_to_dict(boosted.feature_importances_, names,
                                     cv=True)
            new_gbdt.append(gbdt_rank['白球比'])
            gbdt_total = add(
                gbdt_total,
                rank_to_dict(boosted.feature_importances_, names, cv=True))

            # extra trees
            extra = ExtraTreesClassifier(random_state=seed)
            extra.fit(data, mark)
            extra_rank = rank_to_dict(extra.feature_importances_, names,
                                      cv=True)
            new_ex.append(extra_rank['白球比'])
            extra_total = add(
                extra_total,
                rank_to_dict(extra.feature_importances_, names, cv=True))

        algorithm["stability"] = sta_total
        algorithm["RF"] = rf_total
        algorithm["GBDT"] = gbdt_total
        algorithm["Extra"] = extra_total
        #print(len(algorithm["stability"]))

        # MIC: score each column of `data` against `mark`
        mine = MINE()
        mic_scores = []
        for col_idx in range(len(data[0])):
            column = [row[col_idx] for row in data]
            mine.compute_score(column, mark)
            mic_scores.append(mine.mic())
        algorithm["MIC"] = rank_to_dict(mic_scores, names)

        # linear regression
        linear = LinearRegression(normalize=True)
        linear.fit(data, mark)
        algorithm["Linear"] = rank_to_dict(np.abs(linear.coef_), names)

        # ridge, with alpha taken from a RidgeCV fit
        ridge_search = RidgeCV()
        ridge_search.fit(data, mark)
        #print(ridge_search.alpha_)
        ridge_model = Ridge(alpha=ridge_search.alpha_)
        ridge_model.fit(data, mark)
        algorithm["Ridge"] = rank_to_dict(np.abs(ridge_model.coef_), names)

        # lasso, with alpha taken from a LassoCV fit
        lasso_search = LassoCV()
        lasso_search.fit(data, mark)
        #print(lasso_search.alpha_)
        lasso_model = Lasso(alpha=lasso_search.alpha_)
        lasso_model.fit(data, mark)
        algorithm["Lasso"] = rank_to_dict(np.abs(lasso_model.coef_), names)

        # RFE on logistic regression, keeping 10 features
        selector = RFE(LogisticRegression(), n_features_to_select=10)
        selector.fit(data, mark)
        algorithm["RFE"] = rank_to_dict(
            [float(rank) for rank in selector.ranking_], names, order=-1)
        # Disabled F-test ranking, kept for reference:
        #   f, pval = f_classif(data, mark)
        #   algorithm["Corr"] = rank_to_dict(f, names)
    else:
        # Single-model modes: (flag value, result key, estimator class).
        single_modes = (
            ('extra', "Extra", ExtraTreesClassifier),
            ('gbdt', "GBDT", GradientBoostingClassifier),
            ('rf', "RF", RandomForestClassifier),
        )
        for mode, label, estimator_cls in single_modes:
            if flag == mode:
                estimator = estimator_cls()
                estimator.fit(data, mark)
                algorithm[label] = rank_to_dict(
                    estimator.feature_importances_, names)
                break

    # Per-feature mean over every algorithm that ran.
    mean_scores = {}
    for name in names:
        per_method = [algorithm[method][name] for method in algorithm]
        mean_scores[name] = round(np.mean(per_method), 4)
    column_order = sorted(algorithm)
    algorithm["Mean"] = mean_scores
    column_order.append("Mean")

    # Feature x algorithm matrix; built but not returned (the CSV export
    # below is disabled).
    content = [[algorithm[method][name] for method in column_order]
               for name in names]
    fea_matrix = pd.DataFrame(content, index=names)
    #fea_matrix.to_csv('/Users/hhy/Desktop/fea_importance_'+flag+'.csv',encoding='utf-8-sig',header=column_order)
    return algorithm
示例#30
0
# ###identify features and labels

# In[ ]:

# Features are every column except the last; labels are the last column,
# kept as a one-column DataFrame because of the `-1:` slice.
Abalone_data_features = Abalone_data.iloc[:, :-1]
Abalone_data_labels = Abalone_data.iloc[:, -1:]
print(Abalone_data_features.head())
print(Abalone_data_labels.head())

# ###predictive models

# In[ ]:

# Stability selection with a fixed alpha, fitted against the "Rings" column.
model = RandomizedLasso(alpha=0.01)
model.fit(Abalone_data_features, Abalone_data_labels["Rings"])
# list(DataFrame) yields the column labels.
names = list(Abalone_data_features)

print("Features by their score:")
# (score rounded to 4 decimals, name) pairs, highest score first.
print(
    sorted(zip(map(lambda x: round(x, 4), model.scores_), names),
           reverse=True))

# In[ ]:

# Same selection as Abalone_data_labels above (last column as a DataFrame).
sring_labels = Abalone_data.iloc[:, -1:]

# In[ ]:

# `tts` is presumably an alias of train_test_split -- TODO confirm import.
# 20% of the rows are held out for testing.
splits = tts(Abalone_data_features, sring_labels, test_size=0.2)
X_train, X_test, y_train, y_test = splits
def data_analyse(path,file_name):
    """Rank loan attributes with RandomizedLasso, then fit a linear model.

    Steps:
      1. Load the dataset, impute missing values and label-encode the
         ``loan_status`` target and every candidate attribute.
      2. Fit RandomizedLasso and dump the attribute scores (highest first,
         prefixed with the alpha used) to
         ``./output/d_analysis-<alpha>-.json``.
      3. Fit LinearRegression on six hand-picked attributes using the first
         75% of the rows and report coefficients, mean squared error and
         the score on the remaining 25%.

    Fix relative to the previous version: the split used negative slice
    bounds (``dataset[:-int(0.75 * n)]``), which selected only the first
    25% of the rows for training and left the middle half unused.

    Args:
        path: Location passed through to ``data_loader``.
        file_name: File name passed through to ``data_loader``.

    Returns:
        None. Results are written to the JSON file and printed.
    """
    #This is used to load the dataset
    dataset = data_loader(path,file_name)

    #Clean up the dataset by imputing missing values
    dataset = DataFrameImputer().fit_transform(dataset)

    #Our Class/Prediction Variable
    Y = dataset['loan_status']

    # 1.) Transformations

    #Transforming the Class Variable
    le = p.LabelEncoder()
    le.fit(Y)
    tr = le.transform(Y)

    #Penalty applied by the lasso model
    alpha = 0.001

    #Just to keep a count on if any exceptions occur
    count = 0

    #Candidate attributes. Declared once so that the model matrix and the
    #names paired with the scores cannot drift apart (the list used to be
    #written out three times).
    names = [
        "id", "member_id", "loan_amnt", "funded_amnt", "funded_amnt_inv",
        "term", "int_rate", "installment", "grade", "sub_grade",
        "emp_title", "emp_length", "home_ownership", "annual_inc",
        "verification_status", "issue_d", "pymnt_plan", "url", "purpose",
        "title", "zip_code", "addr_state", "dti", "delinq_2yrs",
        "earliest_cr_line", "inq_last_6mths", "mths_since_last_delinq",
        "mths_since_last_record", "open_acc", "pub_rec", "revol_bal",
        "revol_util", "total_acc", "initial_list_status", "out_prncp",
        "out_prncp_inv", "total_pymnt", "total_pymnt_inv",
        "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
        "recoveries", "collection_recovery_fee", "last_pymnt_d",
        "last_pymnt_amnt", "next_pymnt_d", "last_credit_pull_d",
        "collections_12_mths_ex_med", "mths_since_last_major_derog",
        "policy_code", "application_type", "acc_now_delinq",
        "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim",
        "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy", "bc_util",
        "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
        "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl",
        "mort_acc", "mths_since_recent_bc", "mths_since_recent_bc_dlq",
        "mths_since_recent_inq", "mths_since_recent_revol_delinq",
        "num_accts_ever_120_pd", "num_actv_bc_tl", "num_actv_rev_tl",
        "num_bc_sats", "num_bc_tl", "num_il_tl", "num_op_rev_tl",
        "num_rev_accts", "num_rev_tl_bal_gt_0", "num_sats",
        "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
        "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75",
        "pub_rec_bankruptcies", "tax_liens", "tot_hi_cred_lim",
        "total_bal_ex_mort", "total_bc_limit", "total_il_high_credit_limit",
    ]

    #Walk through each attribute
    for column in names:
        try:
            #Label-encode the column in place.
            #NOTE(review): this is applied to every column, not only to
            #categorical ones -- confirm that is intended.
            le = p.LabelEncoder()
            le.fit(dataset[column])
            dataset[column] = le.transform(dataset[column])
        except Exception:
            #Best effort: a column that cannot be encoded is left as is
            count += 1

    #Model matrix, taken after the in-place transformation above
    X = dataset[names]

    # 2.) Lasso Implementation
    rlasso = RandomizedLasso(alpha=alpha)
    rlasso.fit(X, tr)

    #To sort the attributes according to the Lasso Suggested Score
    output = sorted(zip([round(score, 4) for score in rlasso.scores_],
                        names), reverse=True)

    #Just to tag the alpha related to the output, so that keeping track is easy
    output.insert(0,[alpha,"Alpha:"])

    #Writing the output to a file for further reference.
    with open('./output/d_analysis-'+str(alpha)+'-.json','w') as output_file:
        json.dump(output,output_file,indent=4,ensure_ascii=False)

    # 3.) Linear Regression Model

    #Selecting the output from Lasso and selecting variables with high scores
    dataset = dataset[['total_rec_late_fee','total_rec_int','last_pymnt_amnt','recoveries','acc_open_past_24mths','last_pymnt_d']]

    #First 75% of the rows for training, the remaining rows for testing
    split_at = int(len(dataset.index)*0.75)
    X_train = dataset[:split_at]
    Y_train = tr[:split_at]
    X_test = dataset[split_at:]
    Y_test = tr[split_at:]

    # Create linear regression object
    regr = linear_model.LinearRegression()
    regr.fit(X_train, Y_train)

    #Just to see how algorithm performed
    print('Coefficients: \n', regr.coef_)

    #Mean squared error on the test rows (the label text is kept unchanged)
    print("Residual sum of squares: %.2f"
      % np.mean((regr.predict(X_test) - Y_test) ** 2))

    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % regr.score(X_test, Y_test))
示例#32
0
def do_ml(day):
    """Run the feature-selection and hyper-parameter search pipeline.

    Loads a pickled feature matrix and target, makes an 80/20 train/test
    split, and then:

    1. optionally runs plain RFE and a PCA/Isomap dimensionality-reduction
       search (both switched off by the flags below);
    2. for AdaBoost and random forest, selects features with RFECV and
       grid-searches hyper-parameters on the selected features;
    3. repeats the grid search on the features kept by RandomizedLasso.

    Fitted selectors and models are pickled under
    ``performance-<day>-days/models`` and test-set MSEs are printed.

    Parameters
    ----------
    day
        Value interpolated into the output directory and file names; it is
        not used for anything else in this function.
    """
    ################################################################
    # Modules to use
    ###############################################################
    USE_SAX = False
    FEATURE_REDUCTION = False
    DIM_REDUCTION_SEARCH = False

    #Create folder for administration
    # NOTE(review): if the first mkdir raises FileExistsError the second
    # mkdir is skipped, so the models/ sub-folder is only created on a
    # fresh run.
    try:
        os.mkdir('performance-{}-days'.format(day))
        os.mkdir('performance-{}-days/models'.format(day))
    except FileExistsError as e:
        None
    ################################################################

    print('Using sax: {}'.format(USE_SAX))
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with trusted pickles. The file handles are not closed explicitly.
    if USE_SAX:
        X = pickle.load(open('X_sax.p', 'rb'))
        y = pickle.load(open('y_sax.p', 'rb'))
    else:
        X = pickle.load(open('X_reg.p', 'rb'))
        y = pickle.load(open('y_reg.p', 'rb'))

    # Fixed seed so every run evaluates on the same 20% hold-out.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1234)

    ################################################################
    # Dimensionality reduction
    ################################################################

    def pca_reduce(X, dim):
        """Fit a PCA with ``dim`` components on X and return the projection."""
        pca = PCA(n_components=dim)

        X_reduced = pca.fit_transform(X)
        return X_reduced

    def isomap_reduce(X, dim):
        """Fit an Isomap with ``dim`` components on X and return the embedding."""
        iso = Isomap(n_components=dim)

        X_reduced = iso.fit_transform(X)
        return X_reduced

    def _find_best_dim_red(dims, model, model_name, X, y, params):
        """Grid-search ``model`` on PCA- and Isomap-reduced data for each dim.

        For every (dimension, reduction technique) pair the tuned model is
        pickled, and one row with its test MSE and best parameters is
        collected; the rows are written to a CSV at the end.
        """

        rows_list = []
        for dim in dims:
            for f in [pca_reduce, isomap_reduce]:

                print(
                    'Start reducing dimensionality using {} to {} dimensions'.
                    format(f.__name__, dim))
                t0 = time.time()
                #reduce dimensionality
                #print(X.shape)
                X_red = f(X, dim)
                #print(X_red.shape)
                X_train_red, X_test_red, y_train, y_test = train_test_split(
                    X_red, y, test_size=0.2, random_state=1234)
                # NOTE(review): the reducer is fitted a second time, and
                # separately on the train and test parts, so the two parts
                # are projected by independently fitted transforms --
                # confirm this is intended.
                X_train_red = f(X_train_red, dim)
                X_test_red = f(X_test_red, dim)
                t1 = time.time()
                print('Reducing dimensions cost {} seconds'.format(t1 - t0))
                #Optimize model using grid search and cross validation
                print('Start optimizing {} model'.format(model_name))
                t0 = time.time()
                optimized_model = GridSearchCV(model,
                                               params,
                                               cv=10,
                                               refit=True)
                optimized_model.fit(X_train_red, y_train)
                t1 = time.time()
                print('Optimizing took {} seconds'.format(t1 - t0))
                #print('best found parameters')
                #print(optimized_model.best_params_)

                y_pred = optimized_model.predict(X_test_red)

                mse = sk.metrics.mean_squared_error(y_test, y_pred)
                print("MSE on test set: {}".format(mse))
                #administration
                rows_list.append({
                    'model':
                    deepcopy(model_name),
                    'dimensions':
                    deepcopy(dim),
                    'reduction technique':
                    deepcopy(f.__name__),
                    'mse':
                    deepcopy(mse),
                    'parameters':
                    str(deepcopy(optimized_model.best_params_))
                })
                #store model
                doc = open(
                    'performance-{}-days/models/{}-{}-{}.pickle'.format(
                        day, f.__name__, model_name, dim), 'wb')
                pickle.dump(optimized_model, doc)
                doc.close()

        adm_df = pd.DataFrame(rows_list)
        adm_df.to_csv('performance-{}-days/{}-dim_reduction.csv'.format(
            day, model_name))

    def dim_reduction_search(X, y):
        """Run ``_find_best_dim_red`` for AdaBoost and random forest."""

        rf_params = {
            "max_depth": [3, None],
            "max_features": [1, 3, 10, 'sqrt', 'log2', 'auto'],
            "min_samples_split": [2, 3, 10],
            "min_samples_leaf": [1, 3, 10],
            "bootstrap": [True, False],
            "criterion": ["mse"]
        }

        ada_params = {
            'n_estimators': [10, 50, 100, 300, 500],
            'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
            'loss': ['linear', 'square', 'exponential']
        }

        dims = [10, 20, 30]

        _find_best_dim_red(dims, AdaBoostRegressor(), 'AdaBoost', X, y,
                           ada_params)
        _find_best_dim_red(dims, RandomForestRegressor(), 'RandomForest', X, y,
                           rf_params)

    #X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2)

    ##############################################################
    # Feature selection
    ##############################################################

    #Recursive Feature Elimination
    def ReFeEl(nr_features,
               X_train,
               y_train,
               X_test,
               y_test,
               estimator,
               nr_models=5):
        """Fit one RFE selector per feature count and keep the best few.

        Returns a list of ``(mse, fitted_selector)`` tuples sorted by
        ascending MSE and truncated to ``nr_models`` entries.
        """
        print('Start selecting features')
        t1 = time.time()
        #estimator = AdaBoostRegressor(learning_rate= 0.001, loss ='square', n_estimators  = 50)
        result = []
        for nr_feature in nr_features:
            # The feature count is passed positionally as RFE's second argument.
            selector = RFE(estimator, nr_feature, step=1)
            selector.fit(X_train, y_train)
            y_pred = selector.predict(X_test)
            # NOTE(review): selectors are ranked by MSE on X_test/y_test, so
            # the hold-out set takes part in model selection here.
            mse = sk.metrics.mean_squared_error(y_test, y_pred)
            result.append((mse, selector))
        #sort models and take nr_models best ones
        result.sort(key=lambda x: x[0])
        result = result[:nr_models]

        t2 = time.time()
        print('selecting features took {} seconds'.format(t2 - t1))
        print(result[0][1].support_)
        print(result[0][1].ranking_)
        # Indexing support_ with itself keeps only the True entries, so the
        # length is the number of selected features.
        print('Minimum MSE: {}, number of selected features: {}'.format(
            result[0][0], len(result[0][1].support_[result[0][1].support_])))

        return result

    if FEATURE_REDUCTION:
        # Different fixed AdaBoost settings are used depending on USE_SAX.
        if not USE_SAX:
            estimator = AdaBoostRegressor(learning_rate=0.001,
                                          loss='square',
                                          n_estimators=50)
        else:
            estimator = AdaBoostRegressor(learning_rate=0.01,
                                          loss='linear',
                                          n_estimators=50)
        _, total_nr_features = X.shape
        # range() excludes its end, so the full feature count is never tried.
        nr_features = range(1, total_nr_features)

        opt_features_models = ReFeEl(nr_features, X_train, y_train, X_test,
                                     y_test, estimator)
        print(opt_features_models)

        with open('optimal_features_model_{}.pickle'.format(USE_SAX),
                  'wb') as f:
            pickle.dump(opt_features_models, f)

    # Feature Importance using Extra Trees
    def ET_feature_selection():
        """Print and box-plot ExtraTrees feature importances on the full data.

        Not called anywhere inside this function.
        """
        estimator = ExtraTreesRegressor()
        estimator.fit(X, y)
        print(estimator.feature_importances_)
        # (number of features with importance below 0.01, total features)
        print((len(estimator.feature_importances_[
            estimator.feature_importances_ < 0.01]),
               len(estimator.feature_importances_)))
        print(np.mean(estimator.feature_importances_))
        print(np.std(estimator.feature_importances_))

        fig = plt.figure()
        ax = fig.add_subplot(111)
        bp = ax.boxplot(estimator.feature_importances_)
        fig.savefig('boxplot.png', bbox_inches='tight')
        # Blocks until the user answers the prompt.
        input('hallo')

    ##############################################################
    #Dimensionality reduction search
    ##############################################################
    if DIM_REDUCTION_SEARCH:
        dim_reduction_search(X, y)

    ##############################################################
    #Grid search + CV
    ##############################################################
    def append_deep_copy(rows_list, model_name, nr_features, mse, params):
        """Append a result row built from deep copies and return the list.

        Not called anywhere inside this function.
        """
        result = {
            'model': deepcopy(model_name),
            'nr_features': deepcopy(nr_features),
            'MSE': deepcopy(mse),
            'parameters': str(deepcopy(params))
        }
        rows_list.append(result)

        return rows_list

    # Hyper-parameter grids, keyed by the short model name used in the
    # (estimator, name) tuples of the loops below.
    params = {
        'RF': {
            "max_depth": [3, None],
            "max_features": [1, 3, 10, 'sqrt', 'log2', 'auto'],
            "min_samples_split": [2, 3, 10],
            "min_samples_leaf": [1, 3, 10],
            "bootstrap": [True, False],
            "criterion": ["mse"]
        },
        'AdaBoost': {
            'n_estimators': [10, 50, 100, 300, 500],
            'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
            'loss': ['linear', 'square', 'exponential']
        }
    }

    ##############################################################
    # Recursive Feature Elimination
    #############################################################

    # Each entry is (estimator instance, short name); the name indexes
    # `params` and is used in the output file names.
    for estimator in [(AdaBoostRegressor(), 'AdaBoost'),
                      (RandomForestRegressor(), 'RF')]:

        #Get features and store feature selector
        print('Optimizing {} using CV RFE'.format(estimator[1]))
        t0 = time.time()
        selector = RFECV(estimator[0], step=1, cv=10)
        selector = selector.fit(X_train, y_train)
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)
        t1 = time.time()
        print('Optimizing features done in {} seconds, storing model..'.format(
            t1 - t0))
        #print('Selected features ({}): {}'.format(len(selector.get_support()[selector.get_support()]),selector.get_support()))
        doc = open(
            'performance-{}-days/models/RFE-{}-selector.pickle'.format(
                day, estimator[1]), 'wb')
        pickle.dump(selector, doc)
        doc.close()

        #Optimize hyperparameters and evaluate model
        print('Start optimizing hyperparameters using determined features...')
        t0 = time.time()
        opt_model = GridSearchCV(estimator[0],
                                 params[estimator[1]],
                                 cv=10,
                                 refit=True)
        opt_model.fit(X_train_transformed, y_train)
        t1 = time.time()

        print('Optimizing took {} seconds'.format((t1 - t0)))
        #print('best found parameters')
        #print(opt_model.best_params_)
        y_pred = opt_model.predict(X_test_transformed)
        mse = sk.metrics.mean_squared_error(y_test, y_pred)
        print("MSE on test set: {}".format(mse))
        model_doc = open(
            'performance-{}-days/models/RFE-{}-model.pickle'.format(
                day, estimator[1]), 'wb')
        pickle.dump(opt_model, model_doc)
        model_doc.close()

    ##############################################################
    # Feature Stability Selection
    #############################################################

    RL = RandomizedLasso(alpha='aic')
    print('Start optimizing using Randomized Lasso')
    t0 = time.time()
    # NOTE(review): the selector is fitted on the full X, y, which includes
    # the rows later used as the test split.
    RL.fit(X, y)
    t1 = time.time()
    print('Optimizing done in {} seconds'.format(t1 - t0))
    #print('Best parameters: {}'.format(RL.get_params()))
    #print('Best features: {}'.format(RL.get_support()))
    doc = open(
        'performance-{}-days/models/RandomizedLasso-selector.pickle'.format(
            day), 'wb')
    pickle.dump(RL, doc)
    doc.close()

    X_train_RL = RL.transform(X_train)
    X_test_RL = RL.transform(X_test)
    print('Using RL features to optimize model..')
    for estimator in [(AdaBoostRegressor(), 'AdaBoost'),
                      (RandomForestRegressor(), 'RF')]:
        # NOTE(review): this message formats estimator[0] (the estimator
        # object) and still says "CV RFE" although this loop uses the
        # RandomizedLasso features.
        print('Optimizing {} using CV RFE'.format(estimator[0]))
        t0 = time.time()
        opt_model = GridSearchCV(estimator[0],
                                 params[estimator[1]],
                                 cv=10,
                                 refit=True)
        opt_model.fit(X_train_RL, y_train)
        t1 = time.time()

        print('Optimizing took {} seconds'.format((t1 - t0)))
        #print('best found parameters')
        #print(opt_model.best_params_)
        y_pred = opt_model.predict(X_test_RL)
        mse = sk.metrics.mean_squared_error(y_test, y_pred)
        print("MSE on test set: {}".format(mse))
        model_doc = open(
            'performance-{}-days/models/RandomizedLasso-{}-model.pickle'.
            format(day, estimator[1]), 'wb')
        pickle.dump(opt_model, model_doc)
        model_doc.close()
# Assemble one dense feature vector per sample: row x holds, in the iteration
# order of final_feats, the x-th value of every feature.
for x in range(len(outputs[1])):
    final_inputs[x] = np.zeros(len(final_feats))
    for count, key in enumerate(final_feats):
        final_inputs[x][count] = final_feats[key][x]

# list(...) replaces a comprehension whose loop variable shadowed the
# builtin input().
inputs = list(final_inputs.values())


# Recursive feature elimination driven by a linear SVR, against outputs[1].
svr = SVR(kernel="linear")
rfe = RFE(svr, step=1)
rfe = rfe.fit(inputs, outputs[1])
# Bare expressions: they only display a value in an interactive session.
rfe.support_
rfe.ranking_


# Names of the features RFE kept; final_feats iterates in the same order that
# was used to fill the columns above, so it lines up with rfe.support_.
selected_features = [
    key for key, keep in zip(final_feats, rfe.support_) if keep
]


# Stability selection against outputs[2].
rlasso = RandomizedLasso(alpha=1)
rlasso.fit(inputs, outputs[2])
rlasso.scores_
示例#34
0
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RandomizedLasso

# Bare expression: only displays the column names in an interactive session.
data_final.columns.values

# CLM_YesNo is the target; every remaining column is used as a predictor.
y_all = data_final['CLM_YesNo']
X_all = data_final.drop('CLM_YesNo', axis=1)

logreg = LogisticRegression()

# Disabled alternative: recursive feature elimination down to 13 features.
#rfe = RFE(logreg, 13)
#rfe = rfe.fit(X_all, y_all)
#print (rfe.support_)
#print (rfe.ranking_)
#X_rfe = X_all[X_all.columns[rfe.support_]]

# Randomized Lasso scores for every predictor column.
rlasso = RandomizedLasso(scaling=0.025).fit(X_all, y_all)

print(rlasso.scores_)
def f():
    """Rank the predictors of a fixed training CSV with several methods.

    Loads the training table, uses column ``'1'`` as the target and ignores
    column ``'0'`` (the id), then scores every remaining column with linear
    regression, ridge, lasso, randomized lasso, RFE, random-forest
    importances and an ANOVA F-test. Each score vector is min-max scaled and
    rounded to two decimals; the per-feature mean over all methods is added
    as ``"Mean"``. The table is printed tab-separated, followed by the number
    of features. Returns ``None``.
    """
    boston = pd.read_csv('/Users/ufenqi/Documents/dataming/base1/data/data_1510/traindata_use.csv')
    # fillna returns a new frame; the result used to be discarded, which left
    # the missing values in place for every estimator below.
    boston = boston.fillna(-1)
    target = '1'
    IDcol = '0'
    predictors = [x for x in boston.columns if x not in [target, IDcol]]
    print(len(predictors))
    X = boston[predictors]
    Y = boston[target]
    names = predictors

    ranks = {}

    def rank_to_dict(ranks, names, order=1):
        """Min-max scale a score vector and map each name to its score.

        ``order=-1`` flips the direction first, for inputs where a smaller
        value means a better feature (e.g. RFE rankings).
        """
        minmax = MinMaxScaler()
        ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
        # A list (not map) so the result is concrete on Python 3 as well.
        ranks = [round(x, 2) for x in ranks]
        return dict(zip(names, ranks))

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    # stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    # A lazy map object would end up as a 0-d object array inside np.array on
    # Python 3, so the rankings are materialised as a list of floats.
    ranks["RFE"] = rank_to_dict([float(r) for r in rfe.ranking_], names,
                                order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    # Named f_scores so the result does not shadow this function's own name.
    f_scores, _pval = f_classif(X, Y)
    ranks["Corr."] = rank_to_dict(f_scores, names)

    # Mean score per feature over every method collected so far.
    r = {}
    for name in names:
        r[name] = round(np.mean([ranks[method][name]
                                 for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            [str(ranks[method][name]) for method in methods])))
    print(len(names))
示例#36
0
def continuousFeaturesSelecting(df_continuous_removed_corr, *path_to_write):
    """Select continuous features with Randomized Lasso and standardize them.

    All columns are standardized; the first column is used as the dependent
    variable y and the remaining columns as the independent variables X.
    Randomized Lasso scores every X column, and the columns whose score
    (rounded to four decimals) is at least 1.0 are kept.

    If a path is passed as the first extra positional argument, the full
    feature/score ranking is written there as CSV (without the index).

    Returns a DataFrame holding the 'ovd_daynum' column followed by the kept
    features, re-standardized, with the input's index.

    NOTE(review): y is taken positionally (column 0) while the returned frame
    names 'ovd_daynum' explicitly -- this presumes the first input column is
    'ovd_daynum'; confirm against callers.
    """
    # Feature names for the X columns only (the first column is y).
    feature_names = df_continuous_removed_corr.columns.tolist()[1:]

    # Standardize everything, then split into target and predictors.
    scaled = StandardScaler().fit_transform(df_continuous_removed_corr.values)
    y = scaled[:, 0]
    X = scaled[:, 1:]

    rlasso = RandomizedLasso()
    rlasso.fit(X, y)

    # Pair every feature name with its rounded score, best score first.
    scored = [(label, round(score, 4))
              for label, score in zip(feature_names, rlasso.scores_)]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    df_features_rank = pd.DataFrame(
        scored, columns=['features_label', 'features_rank'])

    # Persist the ranking when the caller supplied an output path.
    if path_to_write:
        df_features_rank.to_csv(path_to_write[0], index=False)

    # Keep the features scoring 1.0 or more, with the target column in front.
    keep_mask = df_features_rank['features_rank'] >= 1.0
    kept_labels = df_features_rank[keep_mask]['features_label'].values.tolist()
    selected_columns = ['ovd_daynum'] + kept_labels

    df_features_selected = df_continuous_removed_corr[selected_columns]

    # Standardize the reduced frame again so it can be clustered directly.
    return pd.DataFrame(
        StandardScaler().fit_transform(df_features_selected.values),
        index=df_features_selected.index,
        columns=df_features_selected.columns)
def randomLasso_hq(X, y, alpha=0.025):
    """Fit a Randomized Lasso with the given alpha and return its ``scores_``."""
    from sklearn.linear_model import RandomizedLasso

    selector = RandomizedLasso(alpha=alpha)
    selector.fit(X, y)
    return selector.scores_
示例#38
0
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.datasets import make_classification
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.datasets import make_regression
# Synthetic classification problem: 100 samples, 100 features, of which
# 5 are informative and 2 redundant.
X, y = make_classification(n_samples=100,
                           n_features=100,
                           n_informative=5,
                           n_redundant=2,
                           random_state=101)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.30, random_state=101)

# Baseline: L1-penalised logistic regression on every feature.
classifier = LogisticRegression(C=0.1, penalty='l1', random_state=101)
classifier.fit(X_train, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test, y_test))

# Randomized logistic regression as a feature selector.
selector = RandomizedLogisticRegression(n_resampling=300, random_state=101)
selector.fit(X_train, y_train)
# Public get_support() replaces the private _get_support_mask(); summing the
# boolean mask counts the selected features.
print("Variance selected: %i" % sum(selector.get_support()))

# Refit the same classifier on the selected features only.
X_train_s = selector.transform(X_train)
X_test_s = selector.transform(X_test)
classifier.fit(X_train_s, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test_s, y_test))

# Regression counterpart: Randomized Lasso scores on a synthetic problem.
XX, yy = make_regression(n_samples=100,
                         n_features=10,
                         n_informative=4,
                         random_state=101)
rlasso = RandomizedLasso()
rlasso.fit(XX, yy)
print(list(enumerate(rlasso.scores_)))
示例#39
0
def train_and_analyse(_X, _y, features):
    """Score every feature with several methods and pick the strongest ones.

    Each method's scores are turned into a ``{feature: score}`` dict by
    ``rank_to_dict``; the per-feature mean over all methods is added as the
    ``"Mean"`` column.

    Parameters
    ----------
    _X : feature matrix; it is indexed as ``X[:, i]`` and ``X.shape`` is
        read, so an array-like supporting both is required.
    _y : target values.
    features : feature names, in the column order of ``_X``.

    Returns
    -------
    (ranks, selection_feature)
        ``ranks`` is a DataFrame with one column per method plus ``"Mean"``;
        ``selection_feature`` holds the index values whose mean score is
        greater than 0.12.
    """
    X = _X
    Y = _y
    # Shared 10-fold shuffled splitter for the cross-validated estimators.
    cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
                                  shuffle=True, random_state=1)
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    ridge = RidgeCV(cv=cv_l)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)

    # LassoCV chooses alpha by cross-validation; that alpha is reused for the
    # randomized lasso below.
    lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001,
                    max_iter=170000)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)

    rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)

    # Eliminating down to a single feature gives every feature its own rank;
    # order=-1 because a smaller RFE rank means a better feature.
    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float),
                                features, order=-1)

    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

    # Only the F statistics are used; NaNs are replaced before ranking.
    f_scores, _ = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f_scores), features)

    # MIC score of every column against the target.
    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, features)

    # Mean score per feature over all methods collected above.
    ranks["Mean"] = {
        name: round(np.mean([scores[name] for scores in ranks.values()]), 2)
        for name in features
    }

    ranks = pd.DataFrame(ranks)

    selection_feature = ranks[ranks.Mean > 0.12].index.values

    return ranks, selection_feature

# Ordinary least squares: importance is the absolute coefficient.
lr = LinearRegression(normalize=True).fit(X, y)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

# Ridge regression, same importance measure.
ridge = Ridge(alpha=7).fit(X, y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

# Lasso regression, same importance measure.
lasso = Lasso(alpha=.05).fit(X, y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

# Randomized lasso: importance is the stability score.
rlasso = RandomizedLasso(alpha=0.04).fit(X, y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

# RFE stops once 5 features are left, so those five share the best rank;
# order=-1 because a smaller rank means a better feature.
rfe = RFE(lr, n_features_to_select=5).fit(X, y)
ranks["RFE"] = rank_to_dict([float(rank) for rank in rfe.ranking_], names,
                            order=-1)

# Random forest impurity-based importances.
rf = RandomForestRegressor().fit(X, y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

# Univariate F-test; only the F statistics are ranked.
f, pval = f_regression(X, y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

mine = MINE()
示例#41
0
def _write_text(path, text):
    """Write *text* to *path* as UTF-8, closing the file before returning."""
    with codecs.open(path, "w", "utf-8") as handle:
        handle.write(text)


def run(args):
    """Train an ExtraTrees regressor and evaluate it on every test set.

    Steps: load and optionally scale the training data, optionally select
    features with Randomized Lasso, tune an ExtraTreesRegressor with a
    randomized search scored by MAE, persist a refitted model plus the
    fitted preprocessing objects, write the best parameters as YAML, and
    finally write MAE/RMSE (model and mean baseline) and predictions for
    every test set.

    Attributes read from ``args``: ``training_data``, ``training_labels``,
    ``delimiter``, ``scale``, ``select_features``, ``output_folder``,
    ``iterations``, ``seed``, ``folds``, ``models_dir``, ``test_data`` and
    ``test_labels``.

    Labels and predictions are clipped to [0, 1]; NaNs in the feature files
    are replaced via ``np.nan_to_num``.
    """
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    # scaler / sel_est stay None when the corresponding step is disabled, so
    # the persistence code below can tell whether there is anything to save.
    scaler = None
    X_trains = X_train
    if args.scale:
        print("Scaling features (mean removal divided by std)...")
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
        args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    paramF = outF + "/parameters/"
    buildDir(paramF)

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    sel_est = None
    featsel_str = ".all-feats"
    if args.select_features:
        print("Performing feature selection ...")
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join(
            [maskF, os.path.basename(args.training_data)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)

    # greater_is_better=False: the search minimises the mean absolute error.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")

    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }

    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer, n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds,
                                         shuffle=True,
                                         random_state=args.seed),
                                verbose=1,
                                random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Refit a standalone model with the best parameters and persist it
    # together with whichever preprocessing objects were actually fitted.
    best = search.best_params_
    estimator2 = ExtraTreesRegressor(bootstrap=best["bootstrap"],
                                     max_depth=best["max_depth"],
                                     max_features=best["max_features"],
                                     min_samples_leaf=best["min_samples_leaf"],
                                     min_samples_split=best["min_samples_split"],
                                     n_estimators=best["n_estimators"],
                                     verbose=1,
                                     random_state=42,
                                     n_jobs=8)

    estimator2.fit(X_trains, y_train)
    from sklearn.externals import joblib
    print("koooonnn %s" % args.models_dir)
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    # These two dumps used to be unconditional and raised NameError whenever
    # scaling or feature selection was switched off.
    if scaler is not None:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")

    # Two spaces keep the output identical to the former two-argument print.
    print("Best parameters:  %s" % (search.best_params_,))

    # saves parameters on yaml file
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    with codecs.open(param_path, "w", "utf-8") as param_file:
        yaml.dump(search.best_params_, stream=param_file)
    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    # Training-label mean, used as the constant baseline prediction.
    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print("Evaluating on %s" % test_label)
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        # Apply the same preprocessing that was fitted on the training data.
        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)

        if args.select_features:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print("Test MAE = %2.8f" % mae)
        print("Test RMSE = %2.8f" % rmse)
        print("Prediction range: [%2.4f, %2.4f]" % (y_pred.min(),
                                                    y_pred.max()))
        # saves evaluation
        testFX = testF + "/" + os.path.basename(test_label)
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")

        eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label)
        _write_text(eval_path + ".mae", str(mae) + "\n")
        _write_text(eval_path + ".rmse", str(rmse) + "\n")

        mu = m * np.ones(y_test.shape[0])  # baseline on test set
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print("Test MAE Baseline= %2.8f" % maeB)
        print("Test RMSE Baseline= %2.8f" % rmseB)
        _write_text(eval_path + ".mae.Base", str(maeB) + "\n")
        _write_text(eval_path + ".rmse.Base", str(rmseB) + "\n")

        # saves predictions (once per test set)
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label) + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
示例#42
0
from sklearn.linear_model import RandomizedLasso
import csv
data, mark, name = [], [], []

# Every row of test.csv is a list of integer feature values followed by an
# integer label in the last column.
with open('/Users/hhy/Desktop/test.csv', 'r', encoding='utf-8_sig') as f:
    csv_reader = csv.reader(f)
    for row in csv_reader:
        data.append([int(value) for value in row[:-1]])
        mark.append(int(row[-1]))

# The last column of every row of feature.csv is used as a feature name.
with open('/Users/hhy/Desktop/feature.csv', 'r', encoding='utf-8_sig') as f:
    csv_reader = csv.reader(f)
    for row in csv_reader:
        name.append(row[-1])

rlasso = RandomizedLasso()
rlasso.fit(data, mark)

# Pair each rounded score with its feature name and list the highest first.
print("Features sorted by their score:")
scored_names = [(round(score, 4), label)
                for score, label in zip(rlasso.scores_, name)]
print(sorted(scored_names, reverse=True))

#rfe
from sklearn import cross_validation
from sklearn.linear_model import LinearRegression
from sklearn.linear_model.logistic import LogisticRegression
import csv
# Start again from empty containers for the RFE part of the script.
data, mark, name = [], [], []
# Only the reader is created here; no rows are consumed in this snippet.
with open('/Users/hhy/Desktop/test.csv', 'r', encoding='utf-8_sig') as f:
    csv_reader = csv.reader(f)
def lass_varselect(train, num_vars, target, alpha):
    """Return the Randomized Lasso support mask for ``train[num_vars]``.

    The selector is fitted on ``train[num_vars]`` against ``train[target]``
    using the given ``alpha`` and 5 resampling rounds.
    """
    selector = RandomizedLasso(alpha=alpha, n_resampling=5)
    predictors = train[num_vars]
    response = train[target]
    selector.fit(predictors, response)
    return selector.get_support()
示例#44
0
def stability(features, labels):
    """Return Randomized Lasso (alpha=0.025) scores for ``features``.

    ``labels`` is flattened before fitting.
    """
    flat_labels = labels.flatten()
    selector = RandomizedLasso(alpha=0.025)
    selector.fit(features, flat_labels)
    return selector.scores_
示例#45
0
class LinearAll:
    """
    A repertoire of Linear Variable Selection and Prediction Models.

    ``fit`` selects variables with a cross-validated LassoLars model
    (followed by either a LassoLars refit or a RandomizedLasso, depending on
    ``rlasso_selection_threshold``) and then grid-searches PLS and Ridge
    regressions on the selected columns.  ``predict`` uses whichever of the
    two searches reports the higher ``best_score_``.

    Parameters
    ----------
    cv : int, optional
        Cross-validation setting handed to LassoLarsCV and to every
        GridSearchCV (default 20).
    scoring : string, optional
        Scoring name handed to every GridSearchCV
        (default 'mean_squared_error').
    n_jobs : int, optional
        Number of jobs to run in parallel (default 1).
        If -1 all CPUs are used. This will only provide speedup for
        n_targets > 1 and sufficient large problems
    refit : boolean
        Refit the best estimator with the entire dataset. If "False",
        it is impossible to make predictions using the GridSearchCV
        instances after fitting, and ``predict`` refuses to run.
    iid : boolean, optional
        If True, the data is assumed to be identically distributed across
        the folds, and the score is computed from all samples individually,
        and not the mean loss across the folds.
        (If the number of data points is the same across folds, either
        returns the same thing)
    pre_pred : boolean, optional
        Meant to enable the prediction models on the full, unselected data.
        NOTE(review): ``fit`` currently forces this to False, so the value
        passed here has no effect.
    param_ridge_post : list, optional
        Candidate Ridge ``alpha`` values for the models fitted after
        variable selection.  Defaults to ``list(np.arange(1, 3, 0.1))``;
        a fresh list is created for every instance.
    rlasso_selection_threshold : float, optional
        ``selection_threshold`` of the RandomizedLasso (default 0.5).
        A value of 0 skips RandomizedLasso and keeps the variables with a
        non-zero coefficient in a LassoLars refit instead.

    Attributes
    ----------
    lasso_cv : fitted LassoLarsCV used to choose ``alpha``.
    lasso_refit, active : LassoLars refit and boolean column mask
        (only when ``rlasso_selection_threshold == 0``).
    rlasso : fitted RandomizedLasso (only when the threshold is non-zero).
    pls_post, ridge_post : GridSearchCV results after variable selection.
    best_model : the search chosen by the last ``predict`` call.
    """

    # Value passed as ``eps`` to the LARS-based estimators
    # (float64 machine epsilon).
    _EPS = 2.2204460492503131e-16

    def __init__(self, cv=20, scoring='mean_squared_error',
                 n_jobs=1, refit=False, iid=False, pre_pred=True,
                 param_ridge_post=None,
                 rlasso_selection_threshold=0.5):
        """
        Store the configuration; no model is built until ``fit``.

        CAUTION: we changed to __main__ so that parallelization works
        """
        # self.__name__ = '__main__'
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred = pre_pred
        # Build the default grid per instance: a list in the signature would
        # be created once and shared (and mutated) across all instances.
        if param_ridge_post is None:
            param_ridge_post = list(np.arange(1, 3, 0.1))
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """
        Prediction Models.

        Grid-searches a PLS regression (1 to 4 components) and a Ridge
        regression (``alpha`` taken from ``param_ridge``) on ``X``/``y``.
        The OLS model is currently disabled.

        Parameters
        ----------
        X : training data handed to ``GridSearchCV.fit``
        y : target values handed to ``GridSearchCV.fit``
        param_ridge : list
            Candidate Ridge ``alpha`` values.

        Returns
        -------
        (pls_cv, ridge_cv) : tuple
            The two fitted GridSearchCV objects.
        """

        ##################################
        ## OLS CV (disabled)
        ##################################
        #ols = linear_model.LinearRegression(fit_intercept=True,
        #                                          normalize=False,
        #                                          copy_X=True)
        #ols_cv_score = cross_validation.cross_val_score(
        #        ols, X, y,
        #        cv=self.cv, scoring=self.scoring,
        #        n_jobs=self.n_jobs)
        # self.ols_cv_score.shape = (cv,)

        ##################################
        ## PLS CV
        ##################################
        # list(...) keeps the grid a real list on Python 2 and Python 3.
        tuned_parameters = [{'n_components': list(range(1, 5))}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                              cv=self.cv, scoring=self.scoring,
                              n_jobs=self.n_jobs,
                              refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)

        ##################################
        ## Ridge CV
        ##################################
        tuned_parameters = [{'alpha': param_ridge}]
        ridge = linear_model.Ridge(alpha=1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """

        ##################################
        ## OLS Train (disabled)
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        # fit_intercept=True, center the data
        # copy=True, because centering data involves X -= X_mean
        #
        # CAUTION:
        # normalization=False, otherwise involves taking squares of X,
        # lose precision
        #
        # self.rss_ols_train.shape = (1,1)

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        # NOTE(review): this override makes the branch below unreachable and
        # silently discards the ``pre_pred`` constructor argument.  It is
        # kept because removing it would change what fit() computes by
        # default -- confirm the intent before deleting it.
        self.pre_pred = False
        if self.pre_pred:
            print("Computing ... ")
            param_ridge_pre = list(np.arange(1e9, 2e9, 1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        # normalize=True, lasso seems to be able to handle itself
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True,
                                    precompute='auto',
                                    max_iter=X.shape[1] + 1000,
                                    max_n_alphas=X.shape[1] + 1000,
                                    eps=self._EPS, copy_X=True,
                                    cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)

        if self.rlasso_selection_threshold == 0:
            # Plain lasso selection: refit at the cross-validated alpha and
            # keep the columns whose coefficient is non-zero.
            self.lasso_refit = linear_model.LassoLars(
                alpha=self.lasso_cv.alpha_,
                fit_intercept=True, normalize=True, precompute='auto',
                max_iter=X.shape[1] + 1000,
                eps=self._EPS, copy_X=True,
                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            # Indexing assumes coef_ is 2-D here; only its first row is
            # used as the column mask.
            self.active = self.active[0, :]
            X_selected = X[:, self.active]
        else:
            # Stability selection at the cross-validated alpha.
            self.rlasso = RandomizedLasso(
                alpha=self.lasso_cv.alpha_, scaling=0.5,
                sample_fraction=0.75, n_resampling=200,
                selection_threshold=self.rlasso_selection_threshold,
                fit_intercept=True,
                verbose=False, normalize=True, precompute='auto',
                max_iter=500, eps=self._EPS,
                random_state=None, n_jobs=self.n_jobs,
                pre_dispatch='3*n_jobs')
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)

        return self

    def predict(self, X_test):
        """
        Predict with the better of the post-selection PLS and Ridge models.

        The model with the higher ``best_score_`` is stored in
        ``self.best_model``; ``X_test`` is reduced to the columns selected
        during ``fit`` before its ``best_estimator_`` predicts.

        Raises
        ------
        AssertionError
            If the instance was created with a false ``refit``: the grid
            searches then hold no refitted ``best_estimator_``.
        """
        # Raised explicitly (instead of ``assert``) so the check also runs
        # under ``python -O``; the exception type is unchanged for callers.
        if not self.refit:
            raise AssertionError(
                "predict() requires the model to be built with refit=True")
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print("Chosen Model: pls")
        else:
            self.best_model = self.ridge_post
            print("Chosen Model: ridge")

        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
    if attribute is "_all":
        continue
    else:
        # select the columns containing the attribute
        attribute_columns=filter(lambda x:re.search(attribute,x), data.iloc[:,10:].columns)
        X = data[attribute_columns[:20]] # use only 20 mode paramteres
        
    remove_highly_correlated(X,threshold=0.98)
    print(X.columns.values)
    list_dicts = list()
    for train_index, test_index in skf:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train.shape)
        if feature_selection == "randomized_lasso":
            feature_selector=RandomizedLasso(sample_fraction=0.5,n_resampling=50,verbose=False,n_jobs=-1)
        elif feature_selection == "RFECV_linearSVM":
#            print(feature_selection % "selected")
            feature_selector = RFECV(SVC(kernel="linear"),step=1,cv=StratifiedKFold(y,5),scoring="accuracy")
        else:
            print("Options are: randomized_lasso, RFECV_linearSVM")
            
        feature_selector.fit(X_train,y_train)
        result = {'X_train':X_train,'y_train':y_train,'X_test':X_test,'y_test':y_test,'feature_selector':feature_selector}
        list_dicts.append(result)
        
        
    dict_for_attribute[attribute] = list_dicts
    print("done in %0.3fs" % (time()-t0))