# Assumed module-level imports (not shown in the original snippet):
import numpy as np
from sklearn.decomposition import PCA
from loess.loess_1d import loess_1d

# `gene_density`, `chr_len` and `mnorm` are module-level names
# (see the last snippet in this section).
def cp_tune(dataT):
    # Correlation matrix of the transposed input
    dataCorr = np.corrcoef(dataT.T)
    pca = PCA(n_components=2)
    pc = pca.fit_transform(dataCorr)

    # Renormalize the first two explained-variance ratios to sum to 1
    pervals = pca.explained_variance_ratio_[0:2]
    pervals = pervals / np.sum(pervals)

    MX = pervals[1] * pc[:, 0] + pervals[0] * np.mean(dataT, axis=0)
    SX = pervals[0] * mnorm(np.exp(1 - gene_density)) + pervals[1] * chr_len
    EX = pervals[1] * mnorm(np.exp(1 - gene_density)) + pervals[0] * chr_len

    # LOESS-smooth MX against each of the two mixing coordinates;
    # loess_1d returns (xout, yout, weights), so take the smoothed y values
    GX = loess_1d(SX, MX, frac=2 / 3.0)[1]
    LX = loess_1d(EX, MX, frac=2 / 3.0)[1]
    return (GX, LX)
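# `mnorm` is not defined anywhere in these snippets. A minimal sketch, assuming
# it is a simple min-max normalization to [0, 1] (hypothetical reconstruction;
# only the name comes from the source):
def mnorm(v):
    v = np.asarray(v, dtype=float)
    return (v - v.min()) / (v.max() - v.min())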
def test_loess_1d():
    """Usage example for loess_1d."""
    n = 200
    np.random.seed(123)
    x = np.random.uniform(-1, 1, n)
    x.sort()

    y = np.sin(3 * x)
    sigy = 0.4
    yran = np.random.normal(y, sigy)

    # Introduce 10% strong outliers at random positions
    nbad = int(n * 0.1)
    w = np.random.randint(0, n, nbad)  # random indices in [0, n)
    yran[w] += np.random.normal(0, 5 * sigy, nbad)

    xout, yout, weights = loess_1d(x, yran, frac=0.3)
    w = weights < 0.34  # identify outliers from the robust weights

    plt.clf()
    plt.plot(x, yran, 'ro', label='Noisy')
    plt.plot(xout, yout, 'b', linewidth=4, label='LOESS')
    plt.plot(x, y, color='limegreen', linewidth=4, label='True')
    plt.plot(x[w], yran[w], '+k', ms=20, label='Outliers')
    plt.legend(loc='lower right')
    plt.pause(1)
def find_outliers(self):
    # Clean separately by work type: working days vs. all others
    work_types_flags = ['WorkDay', 'no']
    countOfReplace = 0

    for wt in work_types_flags:
        # Select the sub-frame for the current work type
        if wt == work_types_flags[0]:
            workingDays = self.dfToClean.loc[self.dfToClean['WorkType'] == work_types_flags[0], :]
        else:
            workingDays = self.dfToClean.loc[self.dfToClean['WorkType'] != work_types_flags[0], :]

        # Clean year by year
        years = workingDays.Year.unique()
        for yr in years:
            myDf = workingDays.copy().loc[workingDays['Year'] == yr, :]

            # Clean month by month within the current year
            months = myDf['Month'].unique()
            for mn in months:
                # Copy to avoid pandas SettingWithCopyWarning when adding columns
                my_df_current_month_in_year = myDf.loc[myDf['Month'] == mn, :].copy()

                # LOESS fit of the load against time
                xout, yout, weights = loess_1d.loess_1d(
                    my_df_current_month_in_year['Time'].values,
                    my_df_current_month_in_year['DiffElLoad'].values,
                    frac=0.2)
                my_df_current_month_in_year['LoessSm'] = yout

                # Residuals of the fit from the initial data
                residuals = (my_df_current_month_in_year['DiffElLoad'].values
                             - my_df_current_month_in_year['LoessSm'].values)
                my_df_current_month_in_year['Residuals'] = residuals

                # Build a confidence interval from the 15th/85th percentiles
                id_vec = my_df_current_month_in_year.index.tolist()
                qL, qH = np.percentile(my_df_current_month_in_year['Residuals'], [15, 85])
                my_iqr = qH - qL
                coef_conf = 2.0
                lower_conf = qL - coef_conf * my_iqr
                top_conf = qH + coef_conf * my_iqr

                # Replace outliers, i.e. values outside the confidence
                # interval, with the smoothed LOESS value
                # (.loc replaces the deprecated .ix indexer)
                for k in id_vec:
                    candidate = my_df_current_month_in_year.loc[k, 'Residuals']
                    if candidate < lower_conf or candidate > top_conf:
                        self.dfToClean.loc[k, 'DiffElLoad'] = \
                            my_df_current_month_in_year.loc[k, 'LoessSm']
                        countOfReplace += 1

    print('Count of all outliers = ', countOfReplace)
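# Note: the fences above use the 15th/85th percentiles with a coefficient of
# 2.0 rather than the classic Tukey rule (25th/75th with 1.5). A minimal
# standalone sketch of the same fence test, assuming a 1-D residual array
# (hypothetical helper, not part of the original class):
import numpy as np

def percentile_fence_mask(residuals, q_lo=15, q_hi=85, coef=2.0):
    """Boolean mask marking residuals outside the percentile fences."""
    qL, qH = np.percentile(residuals, [q_lo, q_hi])
    spread = qH - qL
    return (residuals < qL - coef * spread) | (residuals > qH + coef * spread)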
def bleach_fit(brange, frange, intensity, fitter):
    """Fit the decay in intensity for bleach correction."""
    intensity_values = np.array([intensity[x] for x in brange])

    # Choose the type of decay model
    if fitter == 'linear':
        # Regularized linear model
        reg = linear_model.Ridge(alpha=1000, fit_intercept=True)
        try:
            reg.fit(brange.reshape(-1, 1), intensity_values.reshape(-1, 1))
        except Exception as exc:
            raise ValueError('Fit not found - try a larger range') from exc
        pred = reg.predict(frange.reshape(-1, 1))
    elif fitter == 'exponential':
        # Exponential decay model
        guess = (intensity[0], 0.001, 0)
        try:
            popt, _ = curve_fit(exp_func, brange, intensity_values, p0=guess)
        except Exception as exc:
            raise ValueError('Fit not found - try a larger range') from exc
        pred = exp_func(frange, *popt)
    elif fitter == 'loess':
        # LOESS model
        try:
            _, pred, _ = loess_1d.loess_1d(brange, intensity_values, xnew=None,
                                           degree=1, frac=0.5, npoints=None,
                                           rotate=False, sigy=None)
        except Exception as exc:
            raise ValueError('Fit not found - try a larger range') from exc

    # Bleach-corrected intensity values, normalized to the first prediction
    corr = np.divide(pred[0], pred)
    return corr
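# `exp_func` is referenced above but not defined in this snippet. A minimal
# sketch, assuming the three-parameter exponential decay implied by the initial
# guess (amplitude, rate, offset); the exact form in the source may differ:
import numpy as np

def exp_func(x, a, b, c):
    """Exponential decay: a * exp(-b * x) + c."""
    return a * np.exp(-b * x) + c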
# Assumed imports and setup (not shown in the original snippet): noisy samples
# of a ground-truth function f, evaluated on the grid x0. The data generation
# below is a hypothetical reconstruction.
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn.kernel_ridge import KernelRidge
from statsmodels.nonparametric.smoothers_lowess import lowess as statslowess
from loess.loess_1d import loess_1d
# `lowess` below refers to a GitHub lowess implementation whose import is not
# shown in the original snippet.

def f(x):
    return np.sin(x)

rng = np.random.default_rng(0)
x = np.sort(rng.uniform(0, 10, 1000))
y = f(x) + 0.2 * rng.standard_normal(x.size)
x0 = np.linspace(0, 10, 200)

# Scikit-learn Kernel Ridge (degree is ignored for the rbf kernel)
t0 = time()
clf = KernelRidge(kernel='rbf', gamma=0.1, degree=5)
clf.fit(x[:, None], y)
f_kernelridge = clf.predict(x0[:, None])
print("Scikit-Learn: ", time() - t0)

# Lowess GitHub library
t0 = time()
f_lowess = lowess(x, y, x0, deg=2, l=0.5)
print("Lowess GitHub library: ", time() - t0)

# Statsmodels
t0 = time()
res = statslowess(y, x, return_sorted=True, frac=0.1, it=0)
x_stats = res[:, 0]
f_stats = res[:, 1]
print("Statsmodels: ", time() - t0)

# Loess from PyPI
t0 = time()
x_loess, f_loess, w_loess = loess_1d(x, y, degree=2, frac=0.1, x0=x0)
print("Loess for PyPI: ", time() - t0)

plt.plot(x, y, '.', markersize=1)
plt.plot(x0, f(x0), '--', label='Ground truth')
plt.plot(x_stats, f_stats, label='Statsmodels')
plt.plot(x0, f_lowess, label='LOWESS')
plt.plot(x0, f_loess, label='LOESS')
plt.plot(x0, f_kernelridge, label='Kernel Ridge')
plt.legend()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess
from loess.loess_1d import loess_1d

np.random.seed(1234)

# Generate some data
x = np.arange(0, 10, 0.1)
y = np.sin(x) + 0.2 * np.random.randn(len(x))

# Eliminate some points, so that we don't have equal sampling distances
cur_ind = np.where((x > 5) & (x < 6))
x_space = np.delete(x, cur_ind)
y_space = np.delete(y, cur_ind)

plt.plot(x_space, y_space, '.', label='rawdata')

# Smooth the data with LOWESS from the package "statsmodels"
smoothed = lowess(y_space, x_space, frac=0.1)
index, data = smoothed.T
plt.plot(index, data, label='lowess')

# Smooth with LOESS from the package "loess"
x_out, y_out, weights = loess_1d(x_space, y_space, frac=0.1)
plt.plot(x_out, y_out, label='loess')
plt.legend()

# Save and show the image
# (the `quality` kwarg was deprecated and removed in Matplotlib 3.5)
out_file = 'loess.jpg'
plt.savefig(out_file, dpi=200)
print(f'Image saved to {out_file}')
plt.show()
print("Number of points: {}".format(len(facet_currents))) if len(sys.argv) > 3: dz = float(sys.argv[3]) else: dz = 0.0001 window = 7.5e-3 frac = window / l # z0 = np.arange(zmin, zmax + dz/2, dz) z0 = np.linspace(zmin, zmax, int(l / dz) + 1, endpoint=True) # res = lowess(facet_currents, z_centroids, it=0, return_sorted=True, frac=frac) # Is = res[:,1] z_loess, Is, w_loess = loess_1d(z_centroids, facet_currents, degree=2, frac=frac, x0=z0) Is *= 2 * np.pi * r facet_currents *= 2 * np.pi * r with open(sys.argv[2], 'w') as file: file.write("#:xaxis\tz\n") file.write("#:name\tz\tOML\tI\tg\ti\n") file.write("#:units\tm\tA/m\tA/m\tdimensionless\tA/m\n") for z, facet_current, I in zip(z0, facet_currents, Is): file.write("{}\t{}\t{}\t{}\t{}\n".format(z, I_OML, I, I / I_OML, facet_current))
def run_loraccs(self, ref_img_band, tgt_img_band, band_num, band_name,
                band_max_spectra, loess_frac, tgt_img_fp, outdir):
    '''Runs the LORACCS method.'''
    os.chdir(outdir)

    # Plot 2D histogram of target vs. reference values
    index = (ref_img_band > 0) & (tgt_img_band > 0)
    ref_img_band_sub = ref_img_band[index]
    tgt_img_band_sub = tgt_img_band[index]

    plt.hist2d(tgt_img_band_sub, ref_img_band_sub, bins=200, cmin=5,
               cmap=plt.cm.jet)
    plt.colorbar()
    plt.title('%s Band 2D Histogram' % band_name)
    plt.xlabel('Target')
    plt.ylabel('Reference')
    save_fig = '%s_2d_hist.png' % band_name
    plt.savefig(save_fig)
    plt.show()

    ### Extract spectral values into a dict

    # Get unique values from the target image, excluding nodata (0)
    tgt_uniq = np.unique(tgt_img_band)
    if 0 in tgt_uniq:
        tgt_uniq = tgt_uniq[tgt_uniq != 0]

    counts_dict = dict()
    for uniq in tgt_uniq:
        counts_dict[uniq] = []

    # Collect the reference values observed for each target value
    for band_row in range(tgt_img_band.shape[0]):    # iterate through rows
        for pixel in range(tgt_img_band.shape[1]):   # iterate through pixels
            tgt_val = tgt_img_band[band_row][pixel]
            ref_val = ref_img_band[band_row][pixel]
            if tgt_val != 0 and ref_val != 0:
                counts_dict[tgt_val].append(ref_val)

    # Generate stats
    if max(tgt_uniq) < band_max_spectra:
        spec_range = list(range(min(tgt_uniq), max(tgt_uniq)))
    else:
        spec_range = list(range(min(tgt_uniq), band_max_spectra))
    print('Maximum spectral value being set to: ', max(spec_range))

    stats_df = pd.DataFrame()
    stats_df['Spec_vals'] = spec_range
    stats_df['Mean'] = 0
    stats_df['Pixels'] = 0

    for uniq in tgt_uniq:
        values = np.array(counts_dict[uniq])
        if len(values) > 5:
            # Trim the tails (2.5% each side) to get rid of outliers
            sub = np.sort(values)
            sub = sub[sub < band_max_spectra]
            val_sub = sub[int(len(sub) * .025):int(len(sub) * .975)]
            mean = np.mean(val_sub)
            stats_df.loc[stats_df['Spec_vals'] == uniq, 'Mean'] = mean
            stats_df.loc[stats_df['Spec_vals'] == uniq, 'Pixels'] = len(values)

    # Remove all NaN, then entries with pixel count less than 6
    stats_df = stats_df.fillna(0)
    stats_df_valid = stats_df[stats_df.Mean != 0]
    stats_df_valid = stats_df_valid[stats_df_valid.Pixels > 5]

    ### Create model

    # Set up params for LOESS
    x = stats_df_valid.Spec_vals.values
    xnew = stats_df.Spec_vals.values
    y = stats_df_valid.Mean.values

    # Run LOESS
    xout, yout, wout = loess_1d(x, y, xnew=xnew, frac=loess_frac, degree=2,
                                rotate=False)

    # Save values into the dataframe
    stats_df['Mean_LOESS'] = yout

    # Remove any bad LOESS values (rare)
    stats_df = stats_df[stats_df['Mean_LOESS'].values < band_max_spectra].copy()
    stats_df = stats_df[stats_df['Mean_LOESS'].values != 0].copy()

    # Save the data to CSV
    stats_df.to_csv('%s_df.csv' % band_name, index=False)

    ### Plot result of LORACCS along with the histogram
    fig, ax = plt.subplots(nrows=1, figsize=(6, 4))

    for_plot = stats_df.copy()
    for_plot = for_plot[for_plot['Pixels'] != 0]
    x = for_plot['Spec_vals'].values
    y1 = for_plot['Mean_LOESS'].values
    y2 = for_plot['Pixels'].values
    y3 = for_plot['Mean'].values

    # Plot histogram
    ax.bar(x, y2, width=1, color='lightgray')
    gray_patch = mpatches.Patch(color='lightgray', label='Histogram')

    # Set plot to have two y axes
    ax2 = ax.twinx()

    # Original target values as a scatterplot
    ax2.scatter(x, y3, color='tab:gray', marker='.', label='Mean Reference')

    # LORACCS regression line
    ax2.plot(x, y1, color='tab:orange', label='LORACCS Target', linewidth=2)

    # Fix tick marks
    ylabs = ax2.get_yticks()
    ax2.yaxis.tick_left()
    ax2.set_yticklabels(ylabs, fontsize=13)
    ax2.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    y2labs = ax.get_yticks()
    ax.yaxis.tick_right()
    ax.set_yticklabels(y2labs, fontsize=13)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    xlabs = ax2.get_xticks()
    ax2.set_xticklabels(xlabs, fontsize=13)
    ax2.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    ax.set_title('LORACCS Model: %s Band' % band_name, fontsize=20)
    ax.set_xlabel('Target Spectral Values', fontsize=15)
    ax.yaxis.set_label_position('right')
    ax.set_ylabel('Reference Histogram', fontsize=15)
    ax2.yaxis.set_label_position('left')
    ax2.set_ylabel('Reference Spectral Values', fontsize=15)
    ax.legend(fontsize=12, loc='upper left', handles=[gray_patch])
    ax2.legend(fontsize=12, loc='lower right')

    save_fig = '%s_LORACCS_full_spectra_plot.png' % band_name
    plt.savefig(save_fig)
    plt.show()

    ### Transform the image using the fitted LORACCS function

    # Read in the target band as a numpy array
    full_tgt_img = rasterio.open(tgt_img_fp)
    data = full_tgt_img.read(band_num)

    spec_vals_dict = dict(zip(stats_df.Spec_vals, stats_df.Mean_LOESS))

    # Change the data type in preparation for changing values
    data = data.astype('float32')

    # Loop through spectral values, replacing each with its new value / 100000.
    # The division ensures already-replaced values are not overwritten.
    for spec_val in spec_vals_dict:
        data[data == spec_val] = spec_vals_dict[spec_val] / 100000

    # Multiply by 100000 to restore proper values and return the dtype
    data = data * 100000
    data = data.astype('uint16')

    return data  # Band array transformed by the LORACCS method
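# Design note: the divide-by-100000 trick above avoids collisions between
# original and already-remapped values during the in-place loop. A minimal
# alternative sketch using a NumPy lookup table, assuming a uint16 band array
# and the same spec_vals_dict mapping (hypothetical, not part of LORACCS):
import numpy as np

def remap_band(data_uint16, spec_vals_dict):
    """Remap a uint16 band through spec_vals_dict in one vectorized pass."""
    lut = np.arange(65536, dtype='float32')      # identity mapping by default
    for spec_val, new_val in spec_vals_dict.items():
        lut[int(spec_val)] = new_val             # override mapped values
    return lut[data_uint16].astype('uint16')     # single vectorized remap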
def run_loraccs(self, ref_img_band, tgt_img_band, band_num, band_name,
                band_max_spectra, loess_frac, tgt_img_fp, outdir):
    '''Runs the LORACCS method.'''
    os.chdir(outdir)

    # Plot 2D histogram of target vs. reference values
    index = (ref_img_band > 0) & (tgt_img_band > 0)
    ref_img_band_sub = ref_img_band[index]
    tgt_img_band_sub = tgt_img_band[index]

    plt.hist2d(tgt_img_band_sub, ref_img_band_sub, bins=200, cmin=5,
               cmap=plt.cm.jet)
    plt.colorbar()
    plt.title('%s Band 2D Histogram' % band_name)
    plt.xlabel('Target')
    plt.ylabel('Reference')
    save_fig = '%s_2d_hist.png' % band_name
    plt.savefig(save_fig)
    plt.show()

    ### Extract spectral values into a dict

    # Get unique values from the target image
    tgt_uniq = np.unique(tgt_img_band)

    counts_dict = dict()
    for uniq in tgt_uniq:
        counts_dict[uniq] = []

    # Collect the reference values observed for each target value
    for band_row in range(tgt_img_band.shape[0]):    # iterate through rows
        for pixel in range(tgt_img_band.shape[1]):   # iterate through pixels
            tgt_val = tgt_img_band[band_row][pixel]
            ref_val = ref_img_band[band_row][pixel]
            if tgt_val != 0 and ref_val != 0:
                counts_dict[tgt_val].append(ref_val)

    # Generate stats
    for uniq in tgt_uniq:
        values = np.array(counts_dict[uniq])
        pixels = len(values)

        # Trim the tails (2.5% each side) to get rid of outliers
        sub = np.sort(values)
        sub = sub[sub < band_max_spectra]
        val_sub = sub[int(len(sub) * .025):int(len(sub) * .975)]
        try:
            mean = np.mean(val_sub)
            std = np.std(val_sub)
        except Exception:
            print('Exception used')
            mean = np.mean(counts_dict[uniq])
            std = np.std(counts_dict[uniq])

        counts_dict[uniq] = {'values': counts_dict[uniq], 'mean': mean,
                             'std': std, 'pixels': pixels}

    # Create pandas DataFrame of values
    spec_vals = tgt_uniq
    mean = []
    std = []
    pix = []
    for uniq in tgt_uniq:
        mean.append(counts_dict[uniq]['mean'])
        std.append(counts_dict[uniq]['std'])
        pix.append(counts_dict[uniq]['pixels'])

    stats_df = pd.DataFrame()
    stats_df['Spec_vals'] = spec_vals
    stats_df['Mean'] = mean
    stats_df['Std'] = std
    stats_df['Pixels'] = pix

    # Remove all NaN, then entries with pixel count less than 6
    stats_df = stats_df.fillna(0)
    stats_df_valid = stats_df[stats_df.Mean != 0]
    stats_df_valid = stats_df_valid[stats_df_valid.Pixels > 5].copy()

    ### Create model

    # Set up params for LOESS
    x = stats_df_valid.Spec_vals.values
    y = stats_df_valid.Mean.values

    # Run LOESS
    xout, yout, wout = loess_1d(x, y, frac=loess_frac, degree=2, rotate=False)

    # Save values into the dataframe
    stats_df_valid['Mean_LOESS'] = yout

    # Remove any bad LOESS values (rare)
    stats_df_valid = stats_df_valid[
        stats_df_valid['Mean_LOESS'].values < band_max_spectra].copy()
    stats_df_valid = stats_df_valid[
        stats_df_valid['Mean_LOESS'].values != 0].copy()

    # Save the data to CSV
    stats_df_valid.to_csv('%s_df.csv' % band_name, index=False)

    # Fill gaps in the spectra
    min_spectra = min(stats_df_valid.Spec_vals.values)
    max_spectra = max(stats_df_valid.Spec_vals.values)
    if max_spectra > band_max_spectra:
        reasonable_spec_vals = stats_df_valid[
            stats_df_valid['Spec_vals'] < band_max_spectra]
        max_spectra = reasonable_spec_vals['Spec_vals'].values[-1]
        print('Maximum spectral value being set to: ', max_spectra)

    spectral_range = range(int(min_spectra), int(max_spectra + 1))
    full_spectra = pd.DataFrame()
    full_spectra['Spec_vals'] = spectral_range
    full_spectra = full_spectra.merge(stats_df_valid, how='left', on='Spec_vals')
    full_spectra.drop(['Std'], axis=1, inplace=True)
    full_spectra.rename(columns={'Mean': 'Org_Mean'}, inplace=True)

    # Identify missing spectral values
    full_spectra['Missing'] = pd.isna(full_spectra['Mean_LOESS'])

    all_y_values = []

    # Predict missing spectral values
    for item in range(0, len(full_spectra)):
        if full_spectra['Missing'].iloc[item]:
            # Find the nearest valid value on either side
            invalid_before_value = True
            n = item
            while invalid_before_value:
                n = n - 1
                invalid_before_value = full_spectra['Missing'].iloc[n]
            x1 = full_spectra['Spec_vals'].iloc[n]
            y1 = full_spectra['Mean_LOESS'].iloc[n]

            n = item
            invalid_after_value = True
            while invalid_after_value:
                n = n + 1
                invalid_after_value = full_spectra['Missing'].iloc[n]
            x2 = full_spectra['Spec_vals'].iloc[n]
            y2 = full_spectra['Mean_LOESS'].iloc[n]

            # Predict the new spectral value using the equation of the line
            # between the two points
            new_x = full_spectra['Spec_vals'].iloc[item]
            new_y = self.get_new_spec_val(x1, x2, y1, y2, new_x)
        else:
            new_y = full_spectra['Mean_LOESS'].iloc[item]
        all_y_values.append(new_y)

    full_spectra['Filled_LOESS'] = all_y_values
    full_spectra.fillna(0, inplace=True)

    ### Write the full spectra data frame to CSV
    full_spectra.to_csv('%s full spectra.csv' % band_name, index=False)

    ### Plot result of LORACCS along with the histogram
    fig, ax = plt.subplots(nrows=1, figsize=(6, 4))

    for_plot = full_spectra.copy()
    for_plot = for_plot[for_plot['Missing'] == False]
    x = for_plot['Spec_vals'].values
    y1 = for_plot['Filled_LOESS'].values
    y2 = for_plot['Pixels'].values
    y3 = for_plot['Org_Mean'].values

    # Plot histogram
    ax.bar(x, y2, width=1, color='lightgray')
    gray_patch = mpatches.Patch(color='lightgray', label='Histogram')

    # Set plot to have two y axes
    ax2 = ax.twinx()

    # Original target values as a scatterplot
    ax2.scatter(x, y3, color='tab:gray', marker='.', label='Mean Reference')

    # LORACCS regression line
    ax2.plot(x, y1, color='tab:orange', label='LORACCS Target', linewidth=2)

    # Fix tick marks
    ylabs = ax2.get_yticks()
    ax2.yaxis.tick_left()
    ax2.set_yticklabels(ylabs, fontsize=13)
    ax2.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    y2labs = ax.get_yticks()
    ax.yaxis.tick_right()
    ax.set_yticklabels(y2labs, fontsize=13)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    xlabs = ax2.get_xticks()
    ax2.set_xticklabels(xlabs, fontsize=13)
    ax2.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    ax.set_title('LORACCS Model: %s Band' % band_name, fontsize=20)
    ax.set_xlabel('Target Spectral Values', fontsize=15)
    ax.yaxis.set_label_position('right')
    ax.set_ylabel('Reference Histogram', fontsize=15)
    ax2.yaxis.set_label_position('left')
    ax2.set_ylabel('Reference Spectral Values', fontsize=15)
    ax.legend(fontsize=12, loc='upper left', handles=[gray_patch])
    ax2.legend(fontsize=12, loc='lower right')

    save_fig = '%s_LORACCS_full_spectra_plot.png' % band_name
    plt.savefig(save_fig)
    plt.show()

    ### Transform the image using the filled-in LORACCS function

    # Read in the target band as a numpy array
    full_tgt_img = gdal.Open(tgt_img_fp)
    band_data = full_tgt_img.GetRasterBand(band_num)
    data = gdal_array.BandReadAsArray(band_data)

    spec_vals_dict = dict(zip(full_spectra.Spec_vals, full_spectra.Filled_LOESS))

    # Change the data type in preparation for changing values
    data = data.astype('float')

    # Loop through spectral values, replacing each with its new value / 100000.
    # The division ensures already-replaced values are not overwritten.
    for spec_val in spec_vals_dict:
        data[data == spec_val] = spec_vals_dict[spec_val] / 100000

    # Multiply by 100000 to restore proper values and return the dtype
    data = data * 100000
    data = data.astype('uint16')

    return data  # Band array transformed by the LORACCS method
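# `self.get_new_spec_val` is called above but not shown in the snippet. A
# minimal sketch, assuming it evaluates the straight line through (x1, y1) and
# (x2, y2) at new_x, as the surrounding comment suggests (hypothetical
# reconstruction of the helper):
def get_new_spec_val(self, x1, x2, y1, y2, new_x):
    """Linearly interpolate between (x1, y1) and (x2, y2) at new_x."""
    slope = (y2 - y1) / (x2 - x1)
    return y1 + slope * (new_x - x1)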
def preOutlierDetection(self, frame: pd.DataFrame, options: dict) -> dict:
    """
    This function uses the LOESS method to strip the seasonality from the
    target column and determine a trend. Based on the difference between
    the trend and the actual target column, outliers are identified as a
    function of the options['outlierStdevMultiplier'] value, which should
    be an int or a float.

    Parameters
    ----------
    frame : pd.DataFrame
        pandas dataframe that includes the data to be forecast
    options : dict
        dictionary that includes at least 'seasonalityBandwidth',
        'targetColumn', 'outlierStdevMultiplier'

    Returns
    -------
    dict
        Returns two keys: 'frame', the original pandas dataframe but now
        with the X_INTERPOLATED and X_OUTLIER columns, and 'options',
        whatever dictionary was originally passed through the options
        parameter.
    """
    targetColumn = options['targetColumn']
    frame['X_INDEX'] = frame.index.values
    frame['X_INTERPOLATED'] = frame[targetColumn]

    # Split the data into past/future based on nulls in the target column
    nullIdx = frame[targetColumn].isnull()
    futureData = frame[nullIdx]
    historicalIdx = list(map(operator.not_, nullIdx))
    historicalData = frame[historicalIdx]

    x = np.asarray(historicalData['X_INDEX'].tolist())
    y = np.asarray(historicalData[params.getParam('targetColumn', options)].tolist())
    bandwidth = params.getParam('seasonalityBandwidth', options)
    xout, yout, weights = lo.loess_1d(x, y, frac=bandwidth, degree=2)

    frame['X_TREND'] = np.append(
        yout, np.asarray(futureData[targetColumn].tolist()))
    frame['X_TREND_DIFF'] = frame[targetColumn] - frame['X_TREND']
    stdev = frame['X_TREND_DIFF'].std()
    avg = frame['X_TREND_DIFF'].mean()
    mult = params.getParam('outlierStdevMultiplier', options)

    # Identify outliers based on the number of standard deviations from the
    # mean, as specified by options['outlierStdevMultiplier']. This is not
    # the strict mean of the target column, but the mean of the difference
    # between the target column and the LOESS trend computed above.
    frame['X_OUTLIER'] = 0
    for index, row in frame.iterrows():
        diff = abs(frame['X_TREND_DIFF'][index])
        if diff > avg + mult * stdev:
            # Note: the upper bound must be strict, otherwise index + 1
            # would run off the end of the frame for the last row
            if index > 0 and index < frame.shape[0] - 1:
                # Interior point: interpolate between its neighbours
                # (.loc avoids chained-assignment warnings)
                frame.loc[index, 'X_INTERPOLATED'] = mean([
                    frame['X_INTERPOLATED'][index - 1],
                    frame['X_INTERPOLATED'][index + 1]
                ])
                frame.loc[index, 'X_OUTLIER'] = 1
            else:
                # Edge point: fall back to the trend value
                frame.loc[index, 'X_INTERPOLATED'] = frame['X_TREND'][index]
                frame.loc[index, 'X_OUTLIER'] = 1

    # drop() returns a new frame, so the result must be assigned
    frame = frame.drop(columns=['X_TREND', 'X_TREND_DIFF', 'X_INDEX'])

    fdict = dict()
    fdict['frame'] = frame
    fdict['options'] = options
    return fdict
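# A vectorized alternative sketch for the flagging loop above (hypothetical,
# not part of the original class): the same fence test in one pass over the
# trend-difference series.
import pandas as pd

def flag_outliers(trend_diff: pd.Series, mult: float) -> pd.Series:
    """Return 1 where |diff| exceeds mean + mult * std of the diffs, else 0."""
    fence = trend_diff.mean() + mult * trend_diff.std()
    return (trend_diff.abs() > fence).astype(int)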
def prepare(self, frame: pd.DataFrame, options: dict) -> dict:
    """
    This function does a few things to prepare the data for forecasting.
    First, if specified in the 'options' dictionary, it scales all of the
    predictor variables to between 0 and 1. It then adds X_TREND,
    X_TREND_DIFF, and X_TREND_RATIO columns to the dataframe 'frame' for
    use in later modeling/prediction. It also creates, as specified in the
    'options' dictionary, indexes that are passed through in the return to
    identify which parts of the data should be used for training,
    evaluation, etc.

    Parameters
    ----------
    frame : pd.DataFrame
        pandas dataframe that includes all the information necessary to
        forecast
    options : dict
        dictionary that includes at least the keys 'scalePredictors',
        'predictorColumns', 'targetColumn', 'numHoldoutRows', and
        'seasonalityBandwidth'

    Returns
    -------
    dict
        dictionary that includes the dataframe passed through the frame
        parameter with additions, the options dictionary, and the keys
        'historicalIdx', 'futureIdx', and 'evalIdx', corresponding to the
        training, forecasting, and holdout periods as measured by the
        index of the dataframe stored under the 'frame' key
    """
    # Create a copy of the target for modification
    # (zeros are later filled with a very small number)
    random.seed(158923)
    targetColumn = params.getParam('targetColumn', options)
    frame['X_INDEX'] = frame.index.values

    # Scale predictors between 0 and 1
    try:
        newPredCols = []
        if params.getParam('scalePredictors', options):
            for predCol in params.getParam('predictorColumns', options):
                newCol = 'X_' + predCol
                frame[newCol] = (frame[predCol] - frame[predCol].min()) / (
                    frame[predCol].max() - frame[predCol].min())
                newPredCols.append(newCol)
            options['predictorColumns'] = newPredCols
    except Exception as e:
        print("Unable to scale predictors: ", e)

    # Ensure predictors and target are float
    frame[targetColumn] = frame[targetColumn].astype(float)
    frame[params.getParam('predictorColumns', options)] = frame[
        params.getParam('predictorColumns', options)].astype(float)

    newTargetColumn = 'X_' + targetColumn

    # If outlier detection has run, there will be an interpolated column
    # holding the interpolated actuals
    if 'X_INTERPOLATED' in frame:
        frame[newTargetColumn] = list(
            map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                frame['X_INTERPOLATED']))
    else:
        frame[newTargetColumn] = list(
            map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                frame[params.getParam('targetColumn', options)]))
    options['targetColumn'] = newTargetColumn

    # Split the data into past/future based on nulls in the target column
    lastNonNullIdx = self.lastNonNullIndex(frame[targetColumn])
    fullHistoricalIdx = frame['X_INDEX'] <= lastNonNullIdx

    # Full history is used for trending/smoothing
    # (but NOT for modeling the future)
    fullHistoricalData = frame[fullHistoricalIdx]
    numHoldoutRows = params.getParam('numHoldoutRows', options)
    fullFutureIdx = frame['X_INDEX'] > lastNonNullIdx
    fullFutureData = frame[fullFutureIdx]

    # Store history minus the hold-out for future modeling.
    # Note: subtracting numHoldoutRows changes what lastNonNullIdx means
    # here, so these variables could arguably be renamed.
    lastNonNullIdx = lastNonNullIdx - numHoldoutRows
    historicalIdx = frame['X_INDEX'] <= lastNonNullIdx
    # historicalData = frame[historicalIdx]
    futureIdx = frame['X_INDEX'] > lastNonNullIdx
    # futureData = frame[futureIdx]

    if numHoldoutRows > 0:
        # Renaming the variables as discussed above would avoid some of the
        # awkward constructions here
        evalIdx = list(
            map(
                lambda x: x > lastNonNullIdx and
                x <= (lastNonNullIdx + numHoldoutRows),
                frame['X_INDEX']))
    else:
        evalIdx = historicalIdx

    x = np.asarray(fullHistoricalData['X_INDEX'].tolist())
    y = np.asarray(fullHistoricalData[newTargetColumn].tolist())
    bandwidth = params.getParam('seasonalityBandwidth', options)
    xout, yout, weights = lo.loess_1d(x, y, frac=bandwidth, degree=2)

    frame['X_TREND'] = np.append(
        yout, np.asarray(fullFutureData[targetColumn].tolist()))
    # For use with additive seasonality?
    frame['X_TREND_DIFF'] = frame[targetColumn] - frame['X_TREND']
    # For use with multiplicative seasonality?
    frame['X_TREND_RATIO'] = frame[targetColumn] / frame['X_TREND']

    fdict = dict()
    fdict['historicalIdx'] = historicalIdx
    fdict['futureIdx'] = futureIdx
    fdict['evalIdx'] = evalIdx
    fdict['frame'] = frame
    fdict['options'] = options
    return fdict
# `samples`, `chrm`, `node_label`, `cnodes` and `sd` are defined earlier in
# this script (not shown in the snippet).
distArray = np.zeros((samples, len(chrm)))

# Gene density per chromosome and chromosome lengths (bp, converted to Mb)
gene_density = np.array([
    7.86, 4.87, 5.20, 3.77, 4.62, 5.86, 5.37, 4.36, 5.30, 5.27, 9.16, 7.37,
    2.65, 5.37, 5.33, 8.67, 13.68, 3.29, 22.53, 8.22, 4.43, 8.15, 5.19
])
chr_len = np.array([
    249250621, 243199373, 198022430, 191154276, 180915260, 171115067,
    159138663, 146364022, 141213431, 135534747, 135006516, 133851895,
    115169878, 107349540, 102531392, 90354753, 81195210, 78077248,
    59128983, 63025520, 48129895, 51304566, 155270560
])
chr_len = chr_len / 1E6

gene_density = mnorm(gene_density)
chr_len = mnorm(chr_len)
# loess_1d returns the (xout, yout, weights) tuple
z = loess_1d(gene_density, chr_len, frac=2. / 3)

file_list = [
    sd + '/csn_' + str(_) + '_coor.txt' for _ in range(1, samples + 1)
]

for findex, file in enumerate(file_list):
    # print(findex)
    G_coor = np.loadtxt(file)
    chrcoords = [[] for _ in chrm]
    # Assumed initialization (missing in the original snippet):
    chrcounts = np.zeros(len(chrm), dtype=int)
    chrcenter = np.zeros((len(chrm), 3))
    for i in range(len(node_label)):
        if str(i) in cnodes:
            chrcounts[node_label[i]] += 1
            chrcenter[node_label[i]] += G_coor[i, :]
    for i in range(len(chrm)):