def spline(X, y, features, outcome):
    # estimator = linear_model.Ridge(solver='svd')
    estimator = Earth(feature_importance_type='rss')
    feature_s = pd.Series(
        data=features, index=['x' + str(i) for i in np.arange(len(features))])
    table = []

    estimator.fit(X, y)
    # if estimator.rsq_ > 0.5:
    #     print outcome, 'r2 score: ', estimator.rsq_
    print '================='
    print 'Features Importance:'
    # printing feature importances with x0 to xn
    # mapped to real features name
    for line in estimator.summary_feature_importances().split('\n'):
        line_cleaned = line.split()
        if len(line_cleaned) == 2:
            print line_cleaned[0], line_cleaned[1], \
                feature_s[line_cleaned[0]]
    print 'r2 score: ', estimator.rsq_
    return estimator
示例#2
0
     10 * X[:, 3] +
     5 * X[:, 4] + numpy.random.uniform(size=m))
# Fit an Earth model
criteria = ('rss', 'gcv', 'nb_subsets')
model = Earth(max_degree=3,
              max_terms=10,
              minspan_alpha=.5,
              feature_importance_type=criteria,
              verbose=True)
model.fit(X, y)
rf = RandomForestRegressor()
rf.fit(X, y)
# Print the model
print(model.trace())
print(model.summary())
print(model.summary_feature_importances(sort_by='gcv'))

# Plot the feature importances
importances = model.feature_importances_
importances['random_forest'] = rf.feature_importances_
criteria = criteria + ('random_forest',)
idx = 1

fig = plt.figure(figsize=(20, 10))
labels = ['$x_{}$'.format(i) for i in range(n)]
for crit in criteria:
    plt.subplot(2, 2, idx)
    plt.bar(numpy.arange(len(labels)),
            importances[crit],
            align='center',
            color='red')
示例#3
0
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction

    :param df: pandas dataframe with first column as gRNA and second column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as gRNA and value as Hamming metrics
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: CSC adjustment

    """
    # MARS compatible file
    df_mars_lst = []
    df_v = np.asarray(df)
    for i in range(len(df_v)):
        row_lst = []
        grna, metric = df_v[i][0], df_v[i][1]
        try:
            metric = float(metric)
        except ValueError:
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        row_lst.append(grna)
        try:
            for jj in hamming_string_dict[grna]:
                row_lst.append(jj)
            row_lst.append(metric)
            df_mars_lst.append(row_lst)
        except KeyError:
            sys.stdout.write('\n%s not found in selected library: passing\n' %
                             grna)
            continue

    df = pd.DataFrame(df_mars_lst,
                      columns=[
                          'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3',
                          'original_value'
                      ])

    # exclude infinte specificity non-target gRNAs
    df = df[df['h0'] != 0]

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # knots
    knots = df['original_value'].quantile([0.25, 0.5, 0.75, 1])

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(df_confounders,
                                                        df['original_value'],
                                                        test_size=0.10,
                                                        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n'
        )
        model_processed = 'F'
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n' %
            (train_x, train_y))
        return model_processed

    # Print the model
    print(model.trace())
    print(model.summary())
    print(model.summary_feature_importances())

    # Plot the model
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s' %
                      (model.trace(), model.summary(),
                       model.summary_feature_importances(), rms1))

    if rms1 <= 1.0:

        #model processed
        model_processed = 'T'

        # full data prediction
        df['earth_adjustment'] = model.predict(df_confounders)

        # CSC correction
        df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

        # main write out
        df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

        # pickle write out
        model_file = open(
            '%s/csc_output_%s_earth_model.pl' % (outdir, filename), 'wb')
        pl.dump(model, model_file)
        model_file.close()

        sys.stdout.write('\nCSC adjustment complete\n')
        sys.stdout.write('\nCSC output files written to %s\n' % outdir)
        return model_processed

    else:
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
        )
        model_processed = 'F'
        return model_processed
示例#4
0
data = df[start_p:stop_p]
data['Day'] = data.index.dayofyear #add day
data = data.interpolate(limit=300000000,limit_direction='both').astype('float32') #interpolate neighbor first, for rest NA fill with mean() 

conclude_df=pd.DataFrame()
for n_out in range(1,n_future+1):
    X,y,xlabels = to_supervise(data,target,n_out)
    criteria = ('rss', 'gcv', 'nb_subsets')
    model = Earth(enable_pruning = True,
                #   max_degree=3,
                #  max_terms=20,
                minspan_alpha=.5,
                feature_importance_type=criteria,
                verbose=True)
    model.fit(X,y,xlabels=xlabels)
    nbsub = model.summary_feature_importances(sort_by='nb_subsets')[:2000].split()[3:83]
    gcv = model.summary_feature_importances(sort_by='gcv')[:2000].split()[3:83]
    rss = model.summary_feature_importances(sort_by='rss')[:2000].split()[3:83]
    
    rss,gcv,nbsub = toDF(rss),toDF(gcv),toDF(nbsub)
    top20=pd.concat([rss,gcv,nbsub],ignore_index=True)
    top20 = pd.concat([rss,gcv,nbsub],ignore_index=True).drop_duplicates('feature')
    top20['timestep'] = n_out

    #ADDED combine all result
    conclude_df = pd.concat([conclude_df,top20],ignore_index=True)
    if mode=='day':
        conclude_df.drop_duplicates('feature').to_csv('/home/song/Public/Song/Work/Thesis/MAR/[CPY012]featurelist_MAR_daily.csv',index=False)
        conclude_df.to_csv('/home/song/Public/Song/Work/Thesis/MAR/[CPY012]featurelist_MAR_daily_Fullreport.csv',index=False)

    elif mode=='hour':