def spline(X, y, features, outcome):
    """Fit an Earth (MARS) model and print its feature importances.

    The estimator's importance summary labels the inputs ``x0`` .. ``xn``;
    every two-column line of that summary is printed together with the
    entry of ``features`` stored under the same label, so the report shows
    the real feature names. The model's ``rsq_`` value is printed last.

    :param X: training inputs, passed unchanged to ``Earth.fit``
    :param y: training targets, passed unchanged to ``Earth.fit``
    :param features: feature names; ``features[i]`` is reported for
        label ``x<i>``
    :param outcome: not used by the function body; kept so existing
        callers keep working
    :return: the fitted ``Earth`` estimator
    """
    estimator = Earth(feature_importance_type='rss')

    # Lookup table from the generic labels (x0, x1, ...) to the real names.
    feature_s = pd.Series(
        data=features,
        index=['x%d' % i for i in range(len(features))])

    estimator.fit(X, y)

    # Single-argument print calls with %-formatting emit the same text
    # under both Python 2 and Python 3.
    print('=================')
    print('Features Importance:')
    for line in estimator.summary_feature_importances().split('\n'):
        line_cleaned = line.split()
        # Only "<label> <importance>" rows are reported; header and
        # separator lines do not split into exactly two tokens.
        if len(line_cleaned) == 2:
            label, importance = line_cleaned
            print('%s %s %s' % (label, importance, feature_s[label]))
    print('r2 score:  %s' % (estimator.rsq_,))
    return estimator
10 * X[:, 3] + 5 * X[:, 4] + numpy.random.uniform(size=m)) # Fit an Earth model criteria = ('rss', 'gcv', 'nb_subsets') model = Earth(max_degree=3, max_terms=10, minspan_alpha=.5, feature_importance_type=criteria, verbose=True) model.fit(X, y) rf = RandomForestRegressor() rf.fit(X, y) # Print the model print(model.trace()) print(model.summary()) print(model.summary_feature_importances(sort_by='gcv')) # Plot the feature importances importances = model.feature_importances_ importances['random_forest'] = rf.feature_importances_ criteria = criteria + ('random_forest',) idx = 1 fig = plt.figure(figsize=(20, 10)) labels = ['$x_{}$'.format(i) for i in range(n)] for crit in criteria: plt.subplot(2, 2, idx) plt.bar(numpy.arange(len(labels)), importances[crit], align='center', color='red')
def _build_mars_rows(df, hamming_string_dict):
    """Join each gRNA's metric with its Hamming metrics into table rows.

    Rows whose metric cannot be converted to ``float``, and rows whose gRNA
    is not a key of ``hamming_string_dict``, are reported on stdout and
    skipped.

    :param df: table whose first column is the gRNA and second the metric
    :param hamming_string_dict: mapping of gRNA to an iterable of metrics
    :return: list of ``[gRNA, <hamming metrics...>, metric]`` rows
    """
    rows = []
    for record in np.asarray(df):
        grna, metric = record[0], record[1]
        try:
            metric = float(metric)
        except ValueError:
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        # Keep the try limited to the lookup: it is the only step that is
        # expected to raise KeyError.
        try:
            hamming_metrics = hamming_string_dict[grna]
        except KeyError:
            sys.stdout.write(
                '\n%s not found in selected library: passing\n' % grna)
            continue
        rows.append([grna] + list(hamming_metrics) + [metric])
    return rows


def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction.

    Fits an Earth (MARS) model that predicts the input metric from the
    Hamming/specificity confounders. When the RMSE on the held-out 10% is
    at most 1.0, the model prediction is subtracted from the original
    value and the corrected table and pickled model are written to
    ``outdir``. The model metrics file is written whenever training
    succeeds.

    :param df: pandas dataframe with first column as gRNA and second
        column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as
        gRNA and value as Hamming metrics (specificity, h0, h1, h2, h3)
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output
        filename
    :return: ``'T'`` when the correction was computed and written,
        ``'F'`` when the model could not be trained or its RMSE exceeds 1.0
    """
    # MARS compatible table
    df = pd.DataFrame(
        _build_mars_rows(df, hamming_string_dict),
        columns=[
            'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3', 'original_value'
        ])

    # Exclude infinite-specificity non-target gRNAs (h0 == 0). The copy
    # makes the filtered frame independent, so the result columns added
    # further down are not written to a slice of the unfiltered frame.
    df = df[df['h0'] != 0].copy()

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(
        df_confounders,
        df['original_value'],
        test_size=0.10,
        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n'
        )
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n' %
            (train_x, train_y))
        return 'F'

    # Report the model; the same three summaries also go to the metrics file.
    trace = model.trace()
    summary = model.summary()
    importances = model.summary_feature_importances()
    print(trace)
    print(summary)
    print(importances)

    # RMSE on the held-out data
    y_hat = model.predict(test_x)
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s' %
                      (trace, summary, importances, rms1))

    if rms1 <= 1.0:
        # full data prediction
        df['earth_adjustment'] = model.predict(df_confounders)

        # CSC correction
        df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

        # main write out
        df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

        # pickle write out; the context manager closes the file even if
        # dumping the model raises
        with open('%s/csc_output_%s_earth_model.pl' % (outdir, filename),
                  'wb') as model_file:
            pl.dump(model, model_file)

        sys.stdout.write('\nCSC adjustment complete\n')
        sys.stdout.write('\nCSC output files written to %s\n' % outdir)
        return 'T'

    sys.stdout.write(
        '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
    )
    return 'F'
data = df[start_p:stop_p] data['Day'] = data.index.dayofyear #add day data = data.interpolate(limit=300000000,limit_direction='both').astype('float32') #interpolate neighbor first, for rest NA fill with mean() conclude_df=pd.DataFrame() for n_out in range(1,n_future+1): X,y,xlabels = to_supervise(data,target,n_out) criteria = ('rss', 'gcv', 'nb_subsets') model = Earth(enable_pruning = True, # max_degree=3, # max_terms=20, minspan_alpha=.5, feature_importance_type=criteria, verbose=True) model.fit(X,y,xlabels=xlabels) nbsub = model.summary_feature_importances(sort_by='nb_subsets')[:2000].split()[3:83] gcv = model.summary_feature_importances(sort_by='gcv')[:2000].split()[3:83] rss = model.summary_feature_importances(sort_by='rss')[:2000].split()[3:83] rss,gcv,nbsub = toDF(rss),toDF(gcv),toDF(nbsub) top20=pd.concat([rss,gcv,nbsub],ignore_index=True) top20 = pd.concat([rss,gcv,nbsub],ignore_index=True).drop_duplicates('feature') top20['timestep'] = n_out #ADDED combine all result conclude_df = pd.concat([conclude_df,top20],ignore_index=True) if mode=='day': conclude_df.drop_duplicates('feature').to_csv('/home/song/Public/Song/Work/Thesis/MAR/[CPY012]featurelist_MAR_daily.csv',index=False) conclude_df.to_csv('/home/song/Public/Song/Work/Thesis/MAR/[CPY012]featurelist_MAR_daily_Fullreport.csv',index=False) elif mode=='hour':