def denoise(nblck, filename, mode='sym', wv='sym5'):
    from statsmodels.robust import mad

    # Multilevel decomposition: pywt.dwt is single-level, pywt.wavedec handles more levels.
    # noisy_coefs = pywt.wavedec(nblck, 'sym5', level=5, mode='per')
    noisy_coefs = pywt.wavedec(nblck, wavelet=wv, mode=mode)  # level=5,

    # Robust noise estimate from the finest-scale detail coefficients.
    sigma = mad(noisy_coefs[-1])
    # uthresh = np.std(ca)/2
    uthresh = sigma * np.sqrt(2 * np.log(len(nblck)))

    denoised = noisy_coefs[:]
    denoised[1:] = [pywt.threshold(i, value=uthresh, mode='soft') for i in denoised[1:]]
    signal = pywt.waverec(denoised, wavelet=wv, mode=mode)

    from matplotlib import pyplot as plt
    fig, axes = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(8, 4))
    ax1, ax2 = axes
    ax1.plot(signal)
    # ax1.set_xlim(0, 2**10)
    ax1.set_title("Recovered Signal")
    ax1.margins(.1)
    ax2.plot(nblck)
    ax2.set_title("Noisy Signal")

    for ax in fig.axes:
        ax.tick_params(labelbottom=False, top=False, bottom=False, left=False, right=False)

    fig.tight_layout()
    fig.savefig(filename + '_' + wv + '.pdf')
    plt.clf()
    return signal
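# A minimal, self-contained sketch of calling the denoise() helper above on a synthetic
# noisy sine wave, assuming that helper (and its module-level pywt/np imports) is available.
# The output prefix 'example_signal' is hypothetical; it only sets the name of the saved PDF.
# Recent pywt releases spell the boundary mode 'symmetric'; older ones accepted the short 'sym'.
import numpy as np
import pywt

t = np.linspace(0, 1, 1024)
noisy = np.sin(2 * np.pi * 5 * t) + 0.3 * np.random.randn(t.size)
recovered = denoise(noisy, 'example_signal', mode='symmetric', wv='sym5')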
from statsmodels.robust import mad


def task_mad(data):
    """
    http://statsmodels.sourceforge.net/devel/generated/statsmodels.robust.scale.mad.html
    """
    mad_value = mad(data)
    return mad_value
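# A small illustration of why these snippets reach for the MAD: unlike the sample standard
# deviation, the (normal-consistent) median absolute deviation barely moves when a single
# gross outlier is added. The numbers printed are indicative only.
import numpy as np
from statsmodels.robust import mad

rng = np.random.default_rng(0)
clean = rng.normal(size=1000)
dirty = np.append(clean, 100.0)          # one gross outlier
print(np.std(clean), np.std(dirty))      # std inflates sharply
print(mad(clean), mad(dirty))            # mad stays close to 1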
def _guerrero_cv(self, x, bounds, window_length=4, scale='sd',
                 options={'maxiter': 25}):
    """
    Computes lambda using guerrero's coefficient of variation. If no
    seasonality is present in the data, window_length is set to 4 (as per
    Guerrero and Perera, (2004)).

    NOTE: Seasonality-specific auxiliaries *should* provide their own
    seasonality parameter.

    Parameters
    ----------
    x : array_like
    bounds : tuple
        Numeric 2-tuple, that indicate the solution space for the lambda
        parameter.
    window_length : int
        Seasonality/grouping parameter. Default 4, as per Guerrero and
        Perera (2004). NOTE: this indicates the length of the individual
        groups, not the total number of groups!
    scale : {'sd', 'mad'}
        The dispersion measure to be used. 'sd' indicates the sample
        standard deviation, but the more robust 'mad' is also available.
    options : dict
        The options (as a dict) to be passed to the optimizer.
    """
    nobs = len(x)
    groups = int(nobs / window_length)

    # remove the first n < window_length observations from consideration.
    grouped_data = np.reshape(x[nobs - (groups * window_length): nobs],
                              (groups, window_length))
    mean = np.mean(grouped_data, 1)

    scale = scale.lower()
    if scale == 'sd':
        dispersion = np.std(grouped_data, 1, ddof=1)
    elif scale == 'mad':
        dispersion = mad(grouped_data, axis=1)
    else:
        raise ValueError("Scale '{0}' not understood.".format(scale))

    def optim(lmbda):
        rat = np.divide(dispersion, np.power(mean, 1 - lmbda))  # eq 6, p 40
        return np.std(rat, ddof=1) / np.mean(rat)

    res = minimize_scalar(optim,
                          bounds=bounds,
                          method='bounded',
                          options=options)
    return res.x
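# A self-contained sketch of the same Guerrero coefficient-of-variation criterion, stripped
# of the class context above. The function name, the bounds (-1, 2), and the synthetic series
# are illustrative assumptions, not part of the original code.
import numpy as np
from scipy.optimize import minimize_scalar
from statsmodels.robust import mad

def guerrero_lambda(x, bounds=(-1.0, 2.0), window_length=4, robust_scale=False):
    """Pick a Box-Cox lambda by minimizing the CV of group dispersion / mean**(1 - lambda)."""
    x = np.asarray(x, dtype=float)
    nobs = len(x)
    groups = nobs // window_length
    grouped = x[nobs - groups * window_length:].reshape(groups, window_length)
    mean = grouped.mean(axis=1)
    dispersion = mad(grouped, axis=1) if robust_scale else grouped.std(axis=1, ddof=1)

    def cv(lmbda):
        rat = dispersion / mean ** (1.0 - lmbda)
        return np.std(rat, ddof=1) / np.mean(rat)

    return minimize_scalar(cv, bounds=bounds, method='bounded').x

# Illustrative call on a positive, trending series (values are synthetic).
rng = np.random.default_rng(1)
series = np.exp(0.02 * np.arange(200)) * (1 + 0.05 * rng.standard_normal(200))
print(guerrero_lambda(series))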
def madnormalize(vector):
    """
    Parameters
    ----------
    vector : array_like
        Input data.

    Returns
    -------
    array_like
        The input with its median removed, divided by its MAD (returned
        only demedianed if the MAD is zero).
    """
    demedianed = vector - np.median(vector)
    sigmad = mad(demedianed)
    if sigmad > 0.0:
        return demedianed / sigmad
    else:
        return demedianed
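# A quick check of madnormalize() on synthetic data: after the transform the median is ~0 and
# the normal-consistent MAD is ~1 even with a few outliers present. Assumes numpy and
# statsmodels' mad are imported at module level, as the function above expects.
import numpy as np
from statsmodels.robust import mad

x = np.concatenate([np.random.normal(10.0, 2.0, 500), [60.0, -40.0]])
z = madnormalize(x)
print(np.median(z), mad(z))   # expect roughly 0 and 1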
for lst in [k for k in pickedSubjects.keys() if k.startswith('sr')]:
    subjects += pickedSubjects[lst]

poss = np.zeros((len(subjects), 3))
for ii, subj in enumerate(subjects):
    raw_file = glob.glob(
        op.join(study_dir, 'bad_%s' % subj, 'raw_fif', '*mmn_raw.fif'))[0]
    raw = mne.io.read_raw_fif(raw_file, allow_maxshield='yes')
    poss[ii] = raw.info['dev_head_t']['trans'][:3, 3]
np.savez_compressed(op.join(study_dir, 'initial_head_poss.npz'),
                    poss=poss, subjects=subjects)

poss = np.load(op.join(study_dir, 'initial_head_poss.npz'))['poss']
poss_norm = LA.norm(poss, axis=1)
mad_poss_norm = mad(poss_norm)

'''
Median Absolute Deviation

R-bloggers - Absolute Deviation Around the Median
https://www.r-bloggers.com/absolute-deviation-around-the-median/

Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
Handle Outliers", The ASQC Basic References in Quality Control:
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
'''

# Outliers defined as more than 2.5 * MAD away from the median
mask = ~np.logical_or(poss_norm > np.median(poss_norm) + 2.5 * mad_poss_norm,
                      poss_norm < np.median(poss_norm) - 2.5 * mad_poss_norm)

# Figure window dressing
sns.set(style="white", palette="colorblind", color_codes=True)
colors = sns.color_palette()
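# A self-contained sketch of the same MAD-based outlier rule (Iglewicz & Hoaglin style) used
# above, without the MNE-specific data loading. The helper name is hypothetical; the 2.5
# threshold mirrors the masking code above (3.0 or 3.5 are also common choices).
import numpy as np
from statsmodels.robust import mad

def mad_outlier_mask(values, thresh=2.5):
    """Return a boolean mask that is True for values within thresh * MAD of the median."""
    values = np.asarray(values, dtype=float)
    center = np.median(values)
    spread = mad(values)
    return np.abs(values - center) <= thresh * spread

norms = np.array([0.61, 0.63, 0.60, 0.62, 0.64, 0.95, 0.59])  # toy head-position norms
print(mad_outlier_mask(norms))   # the 0.95 entry should be flagged False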
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from statsmodels import robust

col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
iris = pd.read_csv("iris.xlsx", names=col)
# print(iris)

iris_setosa = iris.loc[iris["type"] == "Iris-setosa"]
iris_virginica = iris.loc[iris["type"] == "Iris-virginica"]
iris_versicolor = iris.loc[iris["type"] == "Iris-versicolor"]

print("Median absolute deviations")
print("setosa", robust.mad(iris_setosa["petal_length"]))
print("virginica", robust.mad(iris_virginica["petal_length"]))
print("versicolor", robust.mad(iris_versicolor["petal_length"]))
def mad(X):
    X = np.array(X).astype('float64')
    return robust.mad(X)
def make_selection_lookups( all_series: np.ndarray, pattern_lookups: Dict[int, PatternLookup], subseries_lookups: Dict[int, Dict[int, SubSeriesLookup]], sub_clusters: SubClusters, sub_mrfs: SubMRFs ) -> Tuple[Dict[Tuple, np.ndarray], Dict[Tuple, np.ndarray]]: """ :param all_series: :param pattern_lookups: :param subseries_lookups: :param sub_clusters: :param sub_mrfs: :return: """ dispersions: Dict[Tuple, np.ndarray] = {} modes: Dict[Tuple, np.ndarray] = {} print("computing selection criteria") for k in pattern_lookups: print("k =", k) for cid in {p.cid for p in pattern_lookups[k]["base"]}: print(" cid", cid) ss = [ all_series[p.start_idx:p.end_idx] for p in pattern_lookups[k]["base"] if p.cid == cid ] ubiqs = np.array([[ len([x for x in s[:, i] if x > 0]) / len(s) for i in range(s.shape[1]) ] for s in ss]) dispersions[(k, cid, 0)] = np.mean(robust.mad(ubiqs, axis=0)) modes[(k, cid, 0)] = stats.mode(np.round(ubiqs, 1)).mode if cid not in sub_clusters[k]: continue idx_lookup = subseries_lookups[k][cid].idx_lookup ser = subseries_lookups[k][cid].series for sub_k in sub_clusters[k][cid]: if not any( is_null_cluster(mrf) for mrf in sub_mrfs[k][cid] [sub_k].values()) or len(idx_lookup) <= sub_k: continue ps = pattern_lookups[k][cid][sub_k] for sub_cid in {p.cid for p in ps}: ss = [ ser[p.start_idx:p.end_idx] for p in ps if p.cid == sub_cid ] ubiqs = np.array([[ len([x for x in s[:, i] if x > 0]) / len(s) for i in range(s.shape[1]) ] for s in ss]) dispersions[(k, cid, sub_k, sub_cid)] = np.mean(robust.mad(ubiqs, axis=0)) modes[(k, cid, sub_k, sub_cid)] = stats.mode(np.round(ubiqs, 1)).mode dispersion_lookup = { tag: [x[1] for x in sorted(xs)] for tag, xs in groupby(sorted(dispersions.items()), lambda x: x[0][:3]) } mode_lookup = { tag: [x[1] for x in sorted(xs)] for tag, xs in groupby(sorted(modes.items()), lambda x: x[0][:3]) } return dispersion_lookup, mode_lookup
def wave(data, wavelet='haar', mode='soft'):  # , pyr=0, wav=0):
    """Wavelet coefficients of the input data, and subsequent pyramid plotting
    built using pywt. Thresholding uses the universal threshold.

    Refer to jseabold wavelet regression
    http://jseabold.net/blog/2012/02/23/wavelet-regression-in-python/

    SYNTAX: [true_coef, signal, denoised] = wavelets.wave(ava['101'], mode='hard')

    INPUT:
        data: 1D-array or list with values for wavelets.
        wavelet: the mother wavelet. default = 'haar', refer to pywt.wavelist()
            for wavelets.
        mode: 'hard', 'soft', 'less', refer to pywt.threshold for details.

    OUTPUT:
        true_coefs: the coefficients for the original signal.
        signal: wavelet transformed data.
        denoised: the denoised coefficients.

    Ex. [tr, signal, dn] = wavelets.wave(ava['101'], wavelet='coif16', mode='hard')
    """
    # Evaluate data
    true_coefs = pywt.wavedec(data, wavelet, mode='per')

    # Pyramid plot
    '''
    if pyr == 1:
        fig = cpp.coef_pyramid_plot(true_coefs[1:])
        fig.show()
    '''

    # Robust noise estimate: median absolute deviation of the finest detail coefficients.
    sigma = mad(true_coefs[-1])

    # Calculating the universal threshold.
    uthresh = sigma * np.sqrt(2 * np.log(len(data)))
    # uthresh = sigma * np.sqrt(2 * np.log(len(data)) / len(data))

    # Denoising data using universal thresholding, resulting in a denoised signal.
    denoised = true_coefs[:]
    denoised[1:] = (pywt.threshold(i, value=uthresh, mode=mode, substitute=0)
                    for i in denoised[1:])
    signal = pywt.waverec(denoised, wavelet, mode='per')

    return true_coefs, signal, denoised

    '''
    # Number of coefficients
    comp = cmpt(denoised)

    # Evaluate chosen wavelet
    # let = pywt.Wavelet(wavelet)
    # sca, wave, x = let.wavefun()
    '''
def gamma(x, z):
    changes = np.empty(T)
    for t in range(T):
        changes[t] = np.linalg.norm(V_color(x, t) - V_color(z, t))
    mad = robust.mad(changes)
    return 1 / (1 + (LAMBDA_S * mad))
def prepare_features(window_data): # trimming window_data_x = window_data.x[:] window_data_y = window_data.y[:] window_data_z = window_data.z[:] window_data_x[window_data_x > MAX_VAL] = MAX_VAL window_data_x[window_data_x < MIN_VAL] = MIN_VAL window_data_y[window_data_y > MAX_VAL] = MAX_VAL window_data_y[window_data_y < MIN_VAL] = MIN_VAL window_data_z[window_data_z > MAX_VAL] = MAX_VAL window_data_z[window_data_z < MIN_VAL] = MIN_VAL assert np.sum(window_data_x[window_data_x > MAX_VAL]) == 0 assert np.sum(window_data_x[window_data_x < MIN_VAL]) == 0 assert np.sum(window_data_y[window_data_y > MAX_VAL]) == 0 assert np.sum(window_data_y[window_data_y < MIN_VAL]) == 0 assert np.sum(window_data_z[window_data_z > MAX_VAL]) == 0 assert np.sum(window_data_z[window_data_z < MIN_VAL]) == 0 magnitude = np.sqrt(window_data_x**2 + window_data_y**2 + window_data_z**2) # min x_min = np.min(window_data_x) y_min = np.min(window_data_y) z_min = np.min(window_data_z) overall_min = np.min(magnitude) # max x_max = np.max(window_data_x) y_max = np.max(window_data_y) z_max = np.max(window_data_z) overall_max = np.max(magnitude) # mean x_mean = np.mean(window_data_x) y_mean = np.mean(window_data_y) z_mean = np.mean(window_data_z) overall_mean = np.mean(magnitude) # standard deviation x_stdev = np.std(window_data_x) y_stdev = np.std(window_data_y) z_stdev = np.std(window_data_z) overall_stdev = np.std(magnitude) # mean average deviation x_mad = mad(window_data_x) y_mad = mad(window_data_y) z_mad = mad(window_data_z) overall_mad = mad(magnitude) # skewness x_skewness = skew(window_data_x) y_skewness = skew(window_data_y) z_skewness = skew(window_data_z) overall_skewness = skew(magnitude) # kurtosis x_kurtosis = kurtosis(window_data_x) y_kurtosis = kurtosis(window_data_y) z_kurtosis = kurtosis(window_data_z) overall_kurtosis = kurtosis(magnitude) # root mean square amplitude x_rms_amplitude = np.sqrt(np.abs(np.mean(window_data_x))) y_rms_amplitude = np.sqrt(np.abs(np.mean(window_data_y))) z_rms_amplitude = np.sqrt(np.abs(np.mean(window_data_z))) overall_rms_amplitude = np.sqrt(np.abs(np.mean(magnitude))) covariance_matrix = np.cov(window_data[['x', 'y', 'z']]) # covariance of two values x_y_covariance = covariance_matrix[0, 1] x_z_covariance = covariance_matrix[0, 2] y_z_covariance = covariance_matrix[1, 2] # min covariance of two values min_covariance = np.min([x_y_covariance, x_z_covariance, y_z_covariance]) # max covariance of two values max_covariance = np.max([x_y_covariance, x_z_covariance, y_z_covariance]) # window energy x_window_energy = np.sum(window_data_x) y_window_energy = np.sum(window_data_y) z_window_energy = np.sum(window_data_z) overall_window_energy = np.sum(magnitude) # window entropy # x_window_entropy = entropy(window_data.x) # y_window_entropy = entropy(window_data.y) # z_window_entropy = entropy(window_data.z) # min_window_entropy = np.min([ # x_window_entropy, y_window_entropy, z_window_entropy]) # max_window_entropy = np.max([ # x_window_entropy, y_window_entropy, z_window_entropy]) # overall_window_entropy = entropy(magnitude) # Fourier transform frequency_component_amplitudes = np.fft.fft(magnitude).real # spectral centroid x_spectral_centroid = spectral_centroid(window_data_x) y_spectral_centroid = spectral_centroid(window_data_y) z_spectral_centroid = spectral_centroid(window_data_z) overall_spectral_centroid = spectral_centroid(magnitude) # spectral energy x_spectral_energy = np.sum(np.fft.fft(window_data_x).real) y_spectral_energy = np.sum(np.fft.fft(window_data_y).real) 
z_spectral_energy = np.sum(np.fft.fft(window_data_z).real) overall_spectral_energy = np.sum(frequency_component_amplitudes) # spectral entropy # x_spectral_entropy = entropy(np.fft.fft(window_data.x).real) # y_spectral_entropy = entropy(np.fft.fft(window_data.y).real) # z_spectral_entropy = entropy(np.fft.fft(window_data.z).real) # overall_spectral_entropy = entropy(frequency_component_amplitudes) features = ( x_min, y_min, z_min, overall_min, x_max, y_max, z_max, overall_max, x_mean, y_mean, z_mean, overall_mean, x_stdev, y_stdev, z_stdev, overall_stdev, x_mad, y_mad, z_mad, overall_mad, x_skewness, y_skewness, z_skewness, overall_skewness, x_kurtosis, y_kurtosis, z_kurtosis, overall_kurtosis, x_rms_amplitude, y_rms_amplitude, z_rms_amplitude, overall_rms_amplitude, x_y_covariance, x_z_covariance, y_z_covariance, min_covariance, max_covariance, x_window_energy, y_window_energy, z_window_energy, overall_window_energy, # x_window_entropy, y_window_entropy, z_window_entropy, # min_window_entropy, max_window_entropy, # overall_window_entropy, x_spectral_centroid, y_spectral_centroid, z_spectral_centroid, overall_spectral_centroid, x_spectral_energy, y_spectral_energy, z_spectral_energy, overall_spectral_energy, # x_spectral_entropy, y_spectral_entropy, z_spectral_entropy, # overall_spectral_entropy ) features += tuple(frequency_component_amplitudes) return features
def add_robust_features(df):
    df['X_95_quantile'] = np.array(
        [np.quantile(df.iloc[i].X, 0.95) for i in range(len(df))])
    df['X_mad'] = np.array([robust.mad(df.iloc[i].X) for i in range(len(df))])
    return df
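# A minimal sketch of what add_robust_features() expects: a DataFrame whose 'X' column holds
# an array-like series per row. The toy signals below are made up for illustration, and the
# function's module-level np/robust imports are assumed to be in place.
import numpy as np
import pandas as pd
from statsmodels import robust

toy = pd.DataFrame({'X': [np.random.randn(256) for _ in range(3)]})
toy = add_robust_features(toy)
print(toy[['X_95_quantile', 'X_mad']])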
def dev_func(vector, parametric=False):
    if parametric:
        return np.std(vector)
    else:
        return mad(vector, c=1)
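# Note on the c argument used above: statsmodels' mad(x) defaults to c ~= 0.6745, which
# rescales the raw median absolute deviation so it estimates the standard deviation for
# normal data; c=1 returns the unscaled MAD. A quick, self-contained comparison:
import numpy as np
from statsmodels.robust import mad

x = np.random.default_rng(2).normal(0.0, 2.0, 10000)
print(mad(x, c=1))   # raw MAD, roughly 2 * 0.6745 ~= 1.35
print(mad(x))        # normal-consistent MAD, roughly 2.0
print(np.std(x))     # roughly 2.0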
def get_sta_median(v_APs):
    sta = np.median(v_APs, 0)
    sta_mad = mad(v_APs, axis=0)
    return sta, sta_mad
def fit(self,X,*args,**kwargs): """ Fit a projection pursuit dimension reduction model. Required input argument: X data as matrix or data frame Optinal input arguments: arg or kwarg: y data as vector or 1D matrix kwargs: h, int: option to overrule class's n_components parameter in fit. Convenient command line, yet should not be used in automated loops, e.g. cross-validation. dmetric, str: distance metric used internally. Defaults to 'euclidean' mixing, bool: to estimate mixing matrix (only relevant for ICA) Further parameters to the regression methods can be passed on here as well as kwargs, e.g. quantile=0.8 for quantile regression. kwargs only relevant if y specified: """ # Collect optional fit arguments biascorr = kwargs.pop('biascorr',False) if 'h' not in kwargs: h = self.n_components else: h = kwargs.pop('h') self.n_components = h if 'dmetric' not in kwargs: dmetric = 'euclidean' else: dmetric = kwargs.get('dmetric') if 'mixing' not in kwargs: mixing = False else: mixing = kwargs.get('mixing') if 'y' not in kwargs: na = len(args) if na > 0: #Use of *args makes it sklearn consistent flag = 'two-block' y = args[0] else: flag = 'one-block' y = 0 # to allow calls with 'y=y' in spit of no real y argument present else: flag = 'two-block' y = kwargs.get('y') if 'quantile' not in kwargs: quantile = .5 else: quantile = kwargs.get('quantile') if self.regopt == 'robust': if 'fun' not in kwargs: fun = 'Hampel' else: fun = kwargs.get('fun') if 'probp1' not in kwargs: probp1 = 0.95 else: probp1 = kwargs.get('probp1') if 'probp2' not in kwargs: probp2 = 0.975 else: probp2 = kwargs.get('probp2') if 'probp3' not in kwargs: probp3 = 0.99 else: probp3 = kwargs.get('probp3') if self.projection_index == dicomo: if self.pi_arguments['mode'] in ('M3','cos','c*k'): if 'option' not in kwargs: option = 1 else: option = kwargs.get('option') if option > 3: print('Option value >3 will compute results, but meaning may be questionable') # Initiate projection index self.most = self.projection_index(**self.pi_arguments) # Initiate some parameters and data frames if self.copy: X0 = copy.deepcopy(X) self.X0 = X0 else: X0 = X X = convert_X_input(X0) n,p = X0.shape trimming = self.trimming # Check dimensions if h > min(n,p): raise(MyException('number of components cannot exceed number of samples')) if (self.projection_index == dicomo and self.pi_arguments['mode'] == 'kurt' and self.whiten_data==False): warnings.warn('Whitening step is recommended for ICA') # Pre-processing adjustment if whitening if self.whiten_data: self.center_data = True self.scale_data = False self.compression = False print('All results produced are for whitened data') # Centring and scaling if self.scale_data: if self.center=='mean': scale = 'std' elif ((self.center=='median')|(self.center=='l1median')): scale = 'mad' else: scale = 'None' warnings.warn('Without scaling, convergence to optima is not given') # Data Compression for flat tables if required if ((p>n) and self.compression): V,S,U = np.linalg.svd(X.T,full_matrices=False) X = np.matmul(U.T,np.diag(S)) n,p = X.shape if (srs.mad(X)==0).any(): warnings.warn('Due to low scales in data, compression would induce zero scales.' 
+ '\n' + 'Proceeding without compression.') dimensions = False if copy: X = copy.deepcopy(X0) else: X = X0 else: dimensions = True else: dimensions = False # Initiate centring object and scale X data centring = VersatileScaler(center=self.center,scale=scale,trimming=trimming) if self.center_data: Xs = centring.fit_transform(X) mX = centring.col_loc_ sX = centring.col_sca_ else: Xs = X mX = np.zeros((1,p)) sX = np.ones((1,p)) fit_arguments = {} # Data whitening (best practice for ICA) if self.whiten_data: V,S,U = np.linalg.svd(Xs.T,full_matrices=False) del U K = (V/S)[:,:p] del V,S Xs = np.matmul(Xs, K) Xs *= np.sqrt(p) # Presently, X and y need to be matrices # Will be changed to use regular np.ndarray Xs = np.matrix(Xs) # Pre-process y data when available if flag != 'one-block': ny = y.shape[0] y = convert_y_input(y) if len(y.shape) < 2: y = np.matrix(y).reshape((ny,1)) # py = y.shape[1] if ny != n: raise(MyException('X and y number of rows must agree')) if self.copy: y0 = copy.deepcopy(y) self.y0 = y0 if self.center_data: ys = centring.fit_transform(y) my = centring.col_loc_ sy = centring.col_sca_ else: ys = y my = 0 sy = 1 ys = np.matrix(ys).astype('float64') else: ys = None # Initializing output matrices W = np.zeros((p,h)) T = np.zeros((n,h)) P = np.zeros((p,h)) B = np.zeros((p,h)) R = np.zeros((p,h)) B_scaled = np.zeros((p,h)) C = np.zeros((h,1)) Xev = np.zeros((h,1)) assovec = np.zeros((h,1)) Maxobjf = np.zeros((h,1)) # Initialize deflation matrices E = copy.deepcopy(Xs) f = ys bi = np.zeros((p,1)) opt_args = { 'alpha': self.alpha, 'trimming': self.trimming, 'biascorr': biascorr, 'dmetric' : 'euclidean', } if self.optimizer=='grid': # Define grid optimization ranges if 'ndir' not in self.optimizer_options: self.optimizer_options['ndir'] = 1000 optrange = np.sign(self.optrange) optmax = self.optrange[1] stop0s = np.arcsin(optrange[0]) stop1s = np.arcsin(optrange[1]) stop1c = np.arccos(optrange[0]) stop0c = np.arccos(optrange[1]) anglestart = max(stop0c,stop0s) anglestop = max(stop1c,stop1s) nangle = np.linspace(anglestart,anglestop,self.optimizer_options['ndir'],endpoint=False) alphamat = np.matrix([np.cos(nangle), np.sin(nangle)]) opt_args['_stop0c'] = stop0c opt_args['_stop0s'] = stop0s opt_args['_stop1c'] = stop1c opt_args['_stop1s'] = stop1s opt_args['optmax'] = optmax opt_args['optrange'] = self.optrange opt_args['square_pi'] = self.square_pi if optmax != 1: alphamat *= optmax if p>2: anglestart = min(opt_args['_stop0c'],opt_args['_stop0s']) anglestop = min(opt_args['_stop1c'],opt_args['_stop1s']) nangle = np.linspace(anglestart,anglestop,self.optimizer_options['ndir'],endpoint=True) alphamat2 = np.matrix([np.cos(nangle), np.sin(nangle)]) if optmax != 1: alphamat2 *= opt_args['optmax'] # Arguments for grid plane opt_args['alphamat'] = alphamat, opt_args['ndir'] = self.optimizer_options['ndir'], opt_args['maxiter'] = self.optimizer_options['maxiter'] if type(opt_args['ndir'] is tuple): opt_args['ndir'] = opt_args['ndir'][0] # Arguments for grid plane #2 grid_args_2 = { 'alpha': self.alpha, 'alphamat': alphamat2, 'ndir': self.optimizer_options['ndir'], 'trimming': self.trimming, 'biascorr': biascorr, 'dmetric' : 'euclidean', '_stop0c' : stop0c, '_stop0s' : stop0s, '_stop1c' : stop1c, '_stop1s' : stop1s, 'optmax' : optmax, 'optrange' : self.optrange, 'square_pi' : self.square_pi } if flag=='two-block': grid_args_2['y'] = f if flag=='two-block': opt_args['y'] = f # Itertive coefficient estimation for i in range(0,h): if self.optimizer=='grid': if p==2: wi,maximo = 
gridplane(E,self.most, pi_arguments=opt_args ) elif p>2: afin = np.zeros((p,1)) # final parameters for linear combinations Z = copy.deepcopy(E) # sort variables according to criterion meas = [self.most.fit(E[:,k], **opt_args) for k in np.arange(0,p)] if self.square_pi: meas = np.square(meas) wi,maximo = gridplane(Z[:,0:2],self.most,opt_args) Zopt = Z[:,0:2]*wi afin[0:2]=wi for j in np.arange(2,p): projmat = np.matrix([np.array(Zopt[:,0]).reshape(-1), np.array(Z[:,j]).reshape(-1)]).T wi,maximo = gridplane(projmat,self.most, opt_args ) Zopt = Zopt*float(wi[0]) + Z[:,j]*float(wi[1]) afin[0:(j+1)] = afin[0:(j+1)]*float(wi[0]) afin[j] = float(wi[1]) tj = Z*afin objf = self.most.fit(tj, **{**fit_arguments,**opt_args} ) if self.square_pi: objf *= objf # outer loop to run until convergence objfold = copy.deepcopy(objf) objf = -1000 afinbest = afin ii = 0 maxiter_2j = 2**round(np.log2(self.optimizer_options['maxiter'])) while ((ii < self.optimizer_options['maxiter'] + 1) and (abs(objfold - objf)/abs(objf) > 1e-4)): for j in np.arange(0,p): projmat = np.matrix([np.array(Zopt[:,0]).reshape(-1), np.array(Z[:,j]).reshape(-1)]).T if j > 16: divv = maxiter_2j else: divv = min(2**j,maxiter_2j) wi,maximo = gridplane_2(projmat, self.most, q=afin[j], div=divv, pi_arguments=grid_args_2 ) Zopt = Zopt*float(wi[0,0]) + Z[:,j]*float(wi[1,0]) afin *= float(wi[0,0]) afin[j] += float(wi[1,0]) # % evaluate the objective function: tj = Z*afin objfold = copy.deepcopy(objf) objf = self.most.fit(tj, q=afin, **opt_args ) if self.square_pi: objf *= objf if objf!=objfold: if self.constraint == 'norm': afinbest = afin/np.sqrt(np.sum(np.square(afin))) else: afinbest = afin ii +=1 if self.verbose: print(str(ii)) #endwhile afinbest = afin wi = np.zeros((p,1)) wi = afinbest Maxobjf[i] = objf # endif;%if p>2; else: # do not optimize by the grid algorithm if self.trimming > 0: warnings.warn('Optimization that involves a trimmed objective is not a quadratic program. The scipy-optimize result will be off!!') if 'center' in self.pi_arguments: if (self.pi_arguments['center']=='median'): warnings.warn('Optimization that involves a median in the objective is not a quadratic program. The scipy-optimize result will be off!!') constraint = {'type':'eq', 'fun': lambda x: np.linalg.norm(x) -1, } if len(self.optimizer_constraints)>0: constraint = [constraint,self.optimizer_constraints] wi = minimize(pp_objective, E[0,:].transpose(), args=(self.most,E,opt_args), method=self.optimizer, constraints=constraint, options=self.optimizer_options).x wi = np.matrix(wi).reshape((p,1)) wi /= np.sqrt(np.sum(np.square(wi))) # Computing projection weights and scores ti = E*wi if self.optimizer != 'grid': Maxobjf[i] = self.most.fit(E*wi,**opt_args) nti = np.linalg.norm(ti) pi = E.T*ti / (nti**2) if self.whiten_data: wi /= np.sqrt((wi**2).sum()) wi = K*wi wi0 = wi wi = np.array(wi) if len(W[:,i].shape) == 1: wi = wi.reshape(-1) W[:,i] = wi T[:,i] = np.array(ti).reshape(-1) P[:,i] = np.array(pi).reshape(-1) if flag != 'one-block': criteval = self.most.fit(E*wi0, **opt_args ) if self.square_pi: criteval *= criteval assovec[i] = criteval # Deflation of the datamatrix guaranteeing orthogonality restrictions E -= ti*pi.T # Calculate R-Weights R = np.dot(W[:,0:(i+1)],pinv2(np.dot(P[:,0:(i+1)].T,W[:,0:(i+1)]),check_finite=False)) # Execute regression y~T if y is present. Generate regression estimates. 
if flag != 'one-block': if self.regopt=='OLS': ci = np.dot(ti.T,ys)/(nti**2) elif self.regopt == 'robust': linfit = rm(fun=fun,probp1=probp1,probp2=probp2,probp3=probp3, centre=self.center,scale=scale, start_cutoff_mode='specific',verbose=self.verbose) linfit.fit(ti,ys) ci = linfit.coef_ elif self.regopt == 'quantile': linfit = QuantReg(y,ti) model = linfit.fit(q=quantile) ci = model.params # end regression if C[i] = ci bi = np.dot(R,C[0:(i+1)]) bi_scaled = bi bi = np.multiply(np.reshape(sy/sX,(p,1)),bi) B[:,i] = bi[:,0] B_scaled[:,i] = bi_scaled[:,0] # endfor; Loop for latent dimensions # Re-adjust estimates to original dimensions if data have been compressed if dimensions: B = np.matmul(V[:,0:p],B) B_scaled = np.matmul(V[:,0:p],B_scaled) R = np.matmul(V[:,0:p],R) W = np.matmul(V[:,0:p],W) P = np.matmul(V[:,0:p],P) bi = B[:,h-1] if self.center_data: Xs = centring.fit_transform(X0) mX = centring.col_loc_ sX = centring.col_sca_ else: Xs = X0 mX = np.zeros((1,p)) sX = np.ones((1,p)) bi = bi.astype("float64") if flag != 'one-block': # Calculate scaled and unscaled intercepts if dimensions: X = convert_X_input(X0) if(self.center == "mean"): intercept = sps.trim_mean(y - np.matmul(X,bi),trimming) else: intercept = np.median(np.reshape(y - np.matmul(X,bi),(-1))) yfit = np.matmul(X,bi) + intercept if not(scale == 'None'): if (self.center == "mean"): b0 = np.mean(ys - np.matmul(Xs.astype("float64"),bi)) else: b0 = np.median(np.array(ys.astype("float64") - np.matmul(Xs.astype("float64"),bi))) else: b0 = intercept # Calculate fit values and residuals yfit = yfit r = y - yfit setattr(self,"coef_",B) setattr(self,"intercept_",intercept) setattr(self,"coef_scaled_",B_scaled) setattr(self,"intercept_scaled_",b0) setattr(self,"residuals_",r) setattr(self,"fitted_",yfit) setattr(self,"y_loadings_",C) setattr(self,"y_loc_",my) setattr(self,"y_sca_",sy) setattr(self,"x_weights_",W) setattr(self,"x_loadings_",P) setattr(self,"x_rotations_",R) setattr(self,"x_scores_",T) setattr(self,"x_ev_",Xev) setattr(self,"crit_values_",assovec) setattr(self,"Maxobjf_",Maxobjf) if self.whiten_data: setattr(self,"whitening_",K) if mixing: setattr(self,"mixing_",np.linalg.pinv(W)) setattr(self,"x_loc_",mX) setattr(self,"x_sca_",sX) setattr(self,'scaling',scale) if self.return_scaling_object: setattr(self,'scaling_object_',centring) return(self)
from statsmodels import robust

url = "winequality-red.csv"
names = [
    'Fixed Acidity', 'Volatile Acidity', 'Citric Acid', 'Residual Sugar',
    'Chlorides', 'Free SO2', 'Total SO2', 'Density', 'pH', 'Sulphates',
    'ALC by Vol', 'Quality'
]
data = pandas.read_csv(url, names=names)

print(np.mean(data))
print("\n")

print("Median of all Attributes:")
print(np.median(data, axis=0))
print("\n")

print("Standard Deviation of all Attributes:")
print(np.std(data, axis=0))
print("\n")

mad = robust.mad(data, axis=0)
print("MAD of the attributes given is: ")
print(mad)
print("\n")

max_data = np.max(data, axis=0)
min_data = np.min(data, axis=0)
print("Maximum and minimum data points are given below:")
print(max_data)
print("\n")
print(min_data)

print(data['Quality'])
def lcStats(F_fileName, Fstat_fileName, S_fileName=None, filter=True):
    fPhot = open(F_fileName)
    fStat = open(Fstat_fileName, 'w')
    eof = False
    activeField = 0
    activeTile = 0
    lcDict = {}

    if S_fileName is not None:
        starData = np.loadtxt(S_fileName, delimiter=';', dtype=str)
        sTile = starData[:, 1].astype(int)
        sSeq = starData[:, 2].astype(int)
        sRchunk = starData[:, 7].astype(int)
        raDecPat = re.compile(r'\(([0-9-\.]+),([0-9-\.]+)\)')

    while not eof:
        photLine = fPhot.readline()
        if photLine == '':
            eof = True
        else:
            photFields = photLine.split(';')
            field = int(photFields[1])
            tile = int(photFields[2])
            seq = int(photFields[3])
            rmag = float(photFields[9])
            rerr = float(photFields[10])
            bmag = float(photFields[24])
            berr = float(photFields[25])
            if filter:
                if rmag <= -15 or bmag <= -15 or rmag > -2 or bmag > -2 or rerr < 0 or berr < 0:
                    continue
            if field != activeField or tile != activeTile:
                if activeField == 0:
                    activeField = field
                    activeTile = tile
                else:
                    # error exit
                    sys.exit('Input not all same field and tile')
            if seq in lcDict:
                lc = lcDict[seq]
                lc[0].append(rmag)
                lc[1].append(bmag)
                lc[2].append(rerr)
                lc[3].append(berr)
                lcDict[seq] = lc
            else:
                lcDict[seq] = [[rmag], [bmag], [rerr], [berr]]

    if debug:
        print(lcDict)

    if S_fileName is not None:
        fStat.write(
            '# F T S Rchunk RA DEC Rmed Rmad RmeanErr Vmed Vmad VmeanErr WScoeff WScoeffp\n'
        )
    else:
        fStat.write(
            '# F T S Rmed Rmad RmeanErr Vmed Vmad VmeanErr WScoeff WScoeffp\n')

    for seq in lcDict.keys():
        lc = lcDict[seq]
        lcr = np.array(lc[0])
        lcb = np.array(lc[1])
        lcrerr = lc[2]
        lcberr = lc[3]
        lcrMedian = np.median(lcr)
        lcrStdev = mad(lcr, center=lcrMedian)
        lcrAverr = np.median(lcrerr)
        lcbMedian = np.median(lcb)
        lcbStdev = mad(lcb, center=lcbMedian)
        lcbAverr = np.median(lcberr)
        bMinusR = lcb - lcr
        wsCoeff, wsCoeffp = pearsonr(lcb - lcbMedian, lcr - lcrMedian)
        if S_fileName is not None:
            idStar = np.where((sTile == tile) & (sSeq == seq))
            if len(idStar[0]) == 0:
                print('Star fts %d %d %d not found in Star file' % (field, tile, seq))
                raise ValueError
            thisStar = starData[idStar, :][0][0]
            redChunk = int(thisStar[7])
            raHMS = Angle(thisStar[3] + ' hours')
            decDMS = Angle(thisStar[4] + ' degrees')
            raDeg = raHMS.degree
            decDeg = decDMS.degree
            outputLine = '%d %d %d %d %.5f %.5f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n' % (
                field, tile, seq, redChunk, raDeg, decDeg, lcrMedian, lcrStdev,
                lcrAverr, lcbMedian, lcbStdev, lcbAverr, wsCoeff, wsCoeffp)
        else:
            outputLine = '%d %d %d %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n' % (
                field, tile, seq, lcrMedian, lcrStdev, lcrAverr, lcbMedian,
                lcbStdev, lcbAverr, wsCoeff, wsCoeffp)
        fStat.write(outputLine)
    fStat.close()
usedur[idx] = 5.0

ts = fed.timeseries(uowStart, uowEnd)
skylineData = np.zeros_like(ts.ephemCentral, dtype=np.int64)
print('alpha')
for i in range(len(alltic)):
    curTic = alltic[i]
    curper = useper[i]
    curepc = useepc[i]
    curdur = usedur[i]
    ts = ts.makeEphemVector(curper, curepc, curdur)
    skylineData = skylineData + np.copy(ts.ephemFull)

medSkyline = np.median(skylineData)
madSkyline = robust.mad(skylineData)
tmp = np.arange(len(skylineData))
plt.plot(tmp, skylineData, '.')
idxBad = np.where((skylineData - medSkyline) / madSkyline > 2.75)[0]
plt.plot(tmp[idxBad], skylineData[idxBad], '.r')
plt.show()

for j in idxBad:
    fout.write('{:11.5f}\n'.format(ts.ts[j]))
fout.close()
print('hello world')
def tpf_resamp(file, fileOut, RESAMP, lcFile): """ Resample TESS target pixel file and save as h5d format resamp - Resample factor just make it odd okay""" hdulist = fits.open(file) arr = hdulist[1].data[0]['FLUX'] nImage = len(hdulist[1].data[:]['CADENCENO']) shp = arr.shape nx = shp[0] ny = shp[1] saturate_pixel = np.zeros((nx, ny), dtype=int) median_image = np.zeros((nx, ny)) # Get header information that we should keep keepprihdr = ['TICID','SECTOR','CAMERA','CCD','PXTABLE','RA_OBJ', \ 'DEC_OBJ','PMRA','PMDEC','PMTOTAL','TESSMAG','TEFF', \ 'LOGG','RADIUS'] formatprihdr = [np.uint32, int, int, int,int, \ float,float,float,float,float, \ float,float,float,float,float] keep1hdr = ['1CRV4P', '2CRV4P', '1CRPX4', '2CRPX4'] format1hdr = [int, int, float, float] cadenceNo = hdulist[1].data[:]['CADENCENO'] timetbjd = hdulist[1].data[:]['TIME'] flux_array = hdulist[1].data[:]['FLUX'] flux_bkg_array = hdulist[1].data[:]['FLUX_BKG'] dq_flag = hdulist[1].data[:]['QUALITY'] f = h5py.File(lcFile, 'r') cadNo = np.array(f['cadenceNo']) cadNoBeg = np.array(f['cadenceNoBeg']) cadNoEnd = np.array(f['cadenceNoEnd']) ia, ib = cjb.intersect(cadNoBeg, cadenceNo) # In rare instances the light curve data doesnt exist for this sector and # ib will be empty causeing error if so # just return and do nothing for this sector try: frstIdx = ib[0] except: return ia, ib = cjb.intersect(cadNoEnd, cadenceNo) endIdx = ib[-1] #Make a fix for Sector 3 where not all cadences were used # in the backend DV #idx = np.where((cadenceNo>=114115) & (cadenceNo<=128706))[0] #nImage = len(idx) #cadenceNo, timetbjd, dq_flag = idx_filter(idx, cadenceNo, timetbjd, dq_flag) #flux_array = flux_array[idx, :, :] #flux_bkg_array = flux_bkg_array[idx, :, :] # trim off the excess images not integral into resamp cadenceNo = cadenceNo[frstIdx:endIdx + 1] timetbjd = timetbjd[frstIdx:endIdx + 1] flux_array = flux_array[frstIdx:endIdx + 1, :, :] flux_bkg_array = flux_bkg_array[frstIdx:endIdx + 1, :, :] dq_flag = dq_flag[frstIdx:endIdx + 1] newNImage = len(cadenceNo) // RESAMP # Do downsampling of data stream cadenceNo = np.mean(np.reshape(cadenceNo, (newNImage, RESAMP)), axis=1, dtype=int) timetbjd = np.mean(np.reshape(timetbjd, (newNImage, RESAMP)), axis=1) flux_array = np.sum(np.reshape(flux_array, (newNImage, RESAMP, nx, ny)), axis=1) flux_bkg_array = np.sum(np.reshape(flux_bkg_array, (newNImage, RESAMP, nx, ny)), axis=1) dq_flag = np.sum(np.reshape(dq_flag, (newNImage, RESAMP)), axis=1, dtype=int) # Identify data that is missing or NaN idx = np.where((np.isfinite(timetbjd)) & (np.isfinite(np.squeeze(flux_array[:, 0, 0]))) & (np.isfinite(np.squeeze(flux_bkg_array[:, 0, 0]))))[0] valid_data_flag = np.zeros((newNImage, ), dtype=np.bool_) valid_data_flag[idx] = True # Identify saturated pixels for i in range(nx): for j in range(ny): curflux = flux_array[:, i, j] diff_flux = np.diff(curflux[valid_data_flag]) robmad = robust.mad(diff_flux) medval = np.median(curflux[valid_data_flag]) median_image[i, j] = medval if medval > 1000.0 and np.log10(robmad / medval) < -3.5: saturate_pixel[i, j] = 1 # print("Saturated Pixel detected x: {0:d} y: {1:d}".format(i, j)) # Now save data as h5py epic = hdulist[0].header['TICID'] sec = hdulist[0].header['SECTOR'] # fileoutput = os.path.join(make_data_dirs(dirOut,sec,epic), 'tess_tpf_{0:016d}.h5d'.format(epic)) f = h5py.File(fileOut, 'w') tmp = f.create_dataset('cadenceNo', data=cadenceNo, compression='gzip') tmp = f.create_dataset('timetbjd', data=timetbjd, compression='gzip') tmp = f.create_dataset('flux_array', 
data=flux_array, compression='gzip') tmp = f.create_dataset('flux_bkg_array', data=flux_bkg_array, compression='gzip') tmp = f.create_dataset('dq_flag', data=dq_flag, compression='gzip') tmp = f.create_dataset('valid_data_flag', data=valid_data_flag, compression='gzip') tmp = f.create_dataset('saturate_pixel', data=saturate_pixel, compression='gzip') tmp = f.create_dataset('median_image', data=median_image, compression='gzip') # Now make many datasets from the header parameters for i in range(len(keepprihdr)): curval = hdulist[0].header[keepprihdr[i]] if np.isscalar(curval): tmp = f.create_dataset(keepprihdr[i], data=np.array( [hdulist[0].header[keepprihdr[i]]], dtype=formatprihdr[i])) else: tmp = f.create_dataset(keepprihdr[i], data=np.array([-1], dtype=formatprihdr[i])) for i in range(len(keep1hdr)): curval = hdulist[1].header[keep1hdr[i]] if np.isscalar(curval): tmp = f.create_dataset(keep1hdr[i], data=np.array( [hdulist[1].header[keep1hdr[i]]], dtype=format1hdr[i])) else: tmp = f.create_dataset(keep1hdr[i], data=np.array([-1], dtype=format1hdr[i])) f.close()
Asimetria_Colombia = []
MAD_Colombia = []
TriMd = []
YK_Colombia = []

for i in range(len(time)):
    Mapa_Colombia = precip[i, Colombia_Lat, :]
    Mapa_Colombia = Mapa_Colombia[:, Colombia_Lon]
    Mapa_NoNaN_Colombia = Mapa_Colombia[np.isfinite(Mapa_Colombia)]
    Media_Colombia.append(np.mean(Mapa_NoNaN_Colombia))
    Mediana_Colombia.append(np.median(Mapa_NoNaN_Colombia))
    Desviacion_Colombia.append(np.std(Mapa_NoNaN_Colombia))
    Curtosis_Colombia.append(scipy.stats.kurtosis(Mapa_NoNaN_Colombia))
    Asimetria_Colombia.append(stats.skew(Mapa_NoNaN_Colombia))
    MAD_Colombia.append(robust.mad(Mapa_NoNaN_Colombia))
    # TriMd_Colombia.append(np.median(Mapa_NoNaN_Colombia))
    # YK_Colombia.append(np.median(Mapa_NoNaN_Colombia))

Media_Colombia = np.array(Media_Colombia)
Mediana_Colombia = np.array(Mediana_Colombia)
Desviacion_Colombia = np.array(Desviacion_Colombia)
Curtosis_Colombia = np.array(Curtosis_Colombia)
Asimetria_Colombia = np.array(Asimetria_Colombia)
MAD_Colombia = np.array(MAD_Colombia)

Meses = np.array([fechas[i].month for i in range(len(fechas))])
Colombia_Media_mensual = np.zeros([12]) * np.NaN
Colombia_Mediana_mensual = np.zeros([12]) * np.NaN
Colombia_Desviacion_mensual = np.zeros([12]) * np.NaN
def _mad(x):
    return smrb.mad(x)
def normalize_mad(x):
    x = np.array(x, dtype=np.float32)
    med = np.median(x, axis=0)
    mad = robust.mad(x, axis=0)
    return (x - med) / (mad * 2)  # 2 is for having smaller values
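# normalize_mad() above divides by the column MAD, so a constant column (MAD == 0) would
# produce inf/nan. A hedged variant (name and eps are my own) that leaves such columns
# merely centered:
import numpy as np
from statsmodels import robust

def normalize_mad_safe(x, eps=1e-12):
    x = np.asarray(x, dtype=np.float32)
    med = np.median(x, axis=0)
    mad = robust.mad(x, axis=0)
    mad = np.where(mad < eps, 1.0, mad)   # avoid division by ~zero for constant columns
    return (x - med) / (mad * 2)

print(normalize_mad_safe(np.array([[1.0, 5.0], [1.0, 7.0], [1.0, 9.0]])))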
print('Airborne:', numpy.median(Airborne))
print('Aquatic:', numpy.median(Aquatic))
print('Predator:', numpy.median(Predator))
print('Toothed:', numpy.median(Toothed))
print('Backbone:', numpy.median(Backbone))
print('Breathes:', numpy.median(Breathes))
print('Venomous:', numpy.median(Venomous))
print('Fins:', numpy.median(Fins))
print('Legs:', numpy.median(Legs))
print('Tail:', numpy.median(Tail))
print('Domestic:', numpy.median(Domestic))
print('Catsize:', numpy.median(Catsize))
print('Type:', numpy.median(Type))
print('\n')

print('MAD:')
print('Hair:', robust.mad(Hair))
print('Feather:', robust.mad(Feather))
print('Eggs:', robust.mad(Eggs))
print('Milk:', robust.mad(Milk))
print('Airborne:', robust.mad(Airborne))
print('Aquatic:', robust.mad(Aquatic))
print('Predator:', robust.mad(Predator))
print('Toothed:', robust.mad(Toothed))
print('Backbone:', robust.mad(Backbone))
print('Breathes:', robust.mad(Breathes))
print('Venomous:', robust.mad(Venomous))
print('Fins:', robust.mad(Fins))
print('Legs:', robust.mad(Legs))
print('Tail:', robust.mad(Tail))
print('Domestic:', robust.mad(Domestic))
print('Catsize:', robust.mad(Catsize))
def find_noisy_channels(raw, linenoise): """ High-pass filters, detrends, and removes line noise from the EEG data. Additionally finds channels having Nans, no data, unusually high amplitudes poor correlation, high-frequency noise, and bad correlation in the low frequency portion of the signal using RANSAC. Inspired by the PREP pipleine [1]. Fischler and Bolles RANSAC method was used for finding outlier channels [2]. Parameters __________ raw: raw mne object contains the EEG data and other information related to it linenoise: int line frequency that needs to be removed by notch filtering Raises ______ IOE error If too few channels are present to perfom RANSAC Returns _______ noisy_channels: list of string list of the names of all the bad channels References __________ [1] Bigdely-Shamlo, N., Mullen, T., Kothe, C., Su, K., & Robbins, K. (2015). The PREP pipeline: standardized preprocessing for large-scale EEG analysis. Frontiers In Neuroinformatics, 9. doi: 10.3389/fninf.2015.00016 [2] Fischler, M., & Bolles, R. (1981). Random sample consensus: a paradigm for model fitting with applications to image analysis and automated cartography. Communications Of The ACM, 24(6), 381-395. doi: 10.1145/358669.358692 """ EEGData = raw.get_data() ch_names_original = raw.info["ch_names"] sample_rate = raw.info["sfreq"] mne.filter.filter_data(EEGData,sample_rate, 1, None, picks=None, filter_length="auto", l_trans_bandwidth="auto", h_trans_bandwidth="auto", n_jobs=1, method="fir", iir_params=None, copy=True,phase="zero",fir_window="hamming", fir_design="firwin",pad="reflect_limited", verbose=None) EEGData = signal.detrend(EEGData) # removing line noise mne.filter.notch_filter(EEGData,sample_rate,linenoise,filter_length="auto", notch_widths=None,trans_bandwidth=1,method="fir", iir_params=None,mt_bandwidth=None, p_value=0.05,picks=None,n_jobs=1,copy=True,phase="zero", fir_window="hamming",ir_design="firwin",pad="reflect_limited", verbose=None) # finding channels with NaNs or constant values for long periods of time original_dimensions = np.shape(EEGData) original_channels = np.arange(original_dimensions[0]) channels_interpolate = original_channels nan_channel_mask = [False] * original_dimensions[0] no_signal_channel_mask = [False] * original_dimensions[0] for i in range(0, original_dimensions[0]): nan_channel_mask[i] = np.sum(np.isnan(EEGData[i, :])) > 0 for i in range(0, original_dimensions[0]): no_signal_channel_mask[i] = robust.mad(EEGData[i, :]) < 10 ** (-10) or np.std( EEGData[i, :]) < 10 ** (-10) nan_channels = channels_interpolate[nan_channel_mask] no_data_channels = channels_interpolate[no_signal_channel_mask] for i in range(0, original_dimensions[0]): if nan_channel_mask[i] == True or no_signal_channel_mask[i] == True: EEGData = np.delete(EEGData, i, axis=0) nans_no_data_channels = np.union1d(nan_channels, no_data_channels) channels_interpolate = np.setdiff1d( channels_interpolate, nans_no_data_channels) nans_no_data_ChannelName = list() ch_names = raw.info["ch_names"] for i in range(0, len(nans_no_data_channels)): nans_no_data_ChannelName.append(ch_names[nans_no_data_channels[i]]) raw.drop_channels(nans_no_data_ChannelName) evaluation_channels = channels_interpolate new_dimension = np.shape(EEGData) # find channels that have abnormally high or low amplitude robust_channel_deviation = np.zeros(original_dimensions[0]) deviation_channel_mask = [False] * (new_dimension[0]) channel_deviation = np.zeros(new_dimension[0]) for i in range(0, new_dimension[0]): channel_deviation[i] = 0.7413 * iqr(EEGData[i, 
:]) channel_deviationSD = 0.7413 * iqr(channel_deviation) channel_deviationMedian = np.nanmedian(channel_deviation) robust_channel_deviation[evaluation_channels] = np.divide( np.subtract(channel_deviation, channel_deviationMedian), channel_deviationSD ) for i in range(0, new_dimension[0]): deviation_channel_mask[i] = abs(robust_channel_deviation[i]) > 5 or np.isnan( robust_channel_deviation[i] ) deviation_channels = evaluation_channels[deviation_channel_mask] # finding channels with high frequency noise EEGData = np.transpose(EEGData) dimension = np.shape(EEGData) if sample_rate > 100: new_EEG = np.zeros((dimension[0], dimension[1])) bandpass_filter = filter_design( N_order=100, amp=np.array([1, 1, 0, 0]), freq=np.array([0, 0.36, 0.4, 1]), sample_rate=sample_rate) for i in range(0, dimension[1]): new_EEG[:, i] = signal.filtfilt(bandpass_filter, 1, EEGData[:, i]) noisiness = np.divide(robust.mad(np.subtract(EEGData, new_EEG)), robust.mad(new_EEG)) noisiness_median = np.nanmedian(noisiness) noiseSD = (np.median(np.absolute(np.subtract(noisiness, np.median(noisiness)))) * 1.4826) zscore_HFNoise = np.divide(np.subtract(noisiness, noisiness_median), noiseSD) HFnoise_channel_mask = [False] * new_dimension[0] for i in range(0, new_dimension[0]): HFnoise_channel_mask[i] = zscore_HFNoise[i] > 5 or np.isnan( zscore_HFNoise[i]) else: new_EEG = EEGData noisiness_median = 0 noisinessSD = 1 zscore_HFNoise = np.zeros(dimension[1], 1) HFNoise_channels = [] HFNoise_channels = evaluation_channels[HFnoise_channel_mask] # finding channels by correlation CORRELATION_SECONDS = 1 # default value CORRELATION_FRAMES = CORRELATION_SECONDS * sample_rate correlation_window = np.arange(CORRELATION_FRAMES) correlation_offsets = np.arange(1, dimension[0] - CORRELATION_FRAMES, CORRELATION_FRAMES) w_correlation = len(correlation_offsets) maximum_correlations = np.ones((original_dimensions[0], w_correlation)) drop_out = np.zeros((dimension[1], w_correlation)) channel_correlation = np.ones((w_correlation, dimension[1])) noiselevels = np.zeros((w_correlation, dimension[1])) channel_deviations = np.zeros((w_correlation, dimension[1])) drop = np.zeros((w_correlation, dimension[1])) len_correlation_window = len(correlation_window) EEG_new_win = np.reshape( np.transpose(new_EEG[0: len_correlation_window * w_correlation, :]), (dimension[1], len_correlation_window, w_correlation), order="F") data_win = np.reshape( np.transpose(EEGData[0: len_correlation_window * w_correlation, :]), (dimension[1], len_correlation_window, w_correlation), order="F") for k in range(0, w_correlation): eeg_portion = np.transpose(np.squeeze(EEG_new_win[:, :, k])) data_portion = np.transpose(np.squeeze(data_win[:, :, k])) window_correlation = np.corrcoef(np.transpose(eeg_portion)) abs_corr = np.abs( np.subtract(window_correlation, np.diag(np.diag(window_correlation)))) channel_correlation[k, :] = np.quantile( abs_corr, 0.98, axis=0) # problem is here is solved noiselevels[k, :] = np.divide( robust.mad(np.subtract(data_portion, eeg_portion)), robust.mad(eeg_portion)) channel_deviations[k, :] = 0.7413 * iqr(data_portion, axis=0) for i in range(0, w_correlation): for j in range(0, dimension[1]): drop[i, j] = np.int( np.isnan(channel_correlation[i, j]) or np.isnan(noiselevels[i, j])) if drop[i, j] == 1: channel_deviations[i, j] = 0 noiselevels[i, j] = 0 maximum_correlations[evaluation_channels, :] = np.transpose(channel_correlation) drop_out[:] = np.transpose(drop) noiselevels_out = np.transpose(noiselevels) channel_deviations_out = 
np.transpose(channel_deviations) thresholded_correlations = maximum_correlations < 0.4 thresholded_correlations = thresholded_correlations.astype(int) fraction_BadCorrelationWindows = np.mean(thresholded_correlations, axis=1) fraction_BadDropOutWindows = np.mean(drop_out, axis=1) badCorrelation_channels = np.where(fraction_BadCorrelationWindows > 0.01) badCorrelation_channels_out = badCorrelation_channels[:] dropout_channels = np.where(fraction_BadDropOutWindows > 0.01) dropout_channels_out = dropout_channels[:] # medianMaxCorrelation = np.median(maximumCorrelations, 2); badSNR_channels = np.union1d(badCorrelation_channels_out, HFNoise_channels) noisy_channels = np.union1d(np.union1d(np.union1d(deviation_channels, np.union1d(badCorrelation_channels_out, dropout_channels_out)), badSNR_channels), np.union1d(nan_channels, no_data_channels)) # performing ransac bads = list() for i in range(0, len(noisy_channels)): bads.append(ch_names[noisy_channels[i]]) SAMPLES = 50 FRACTION_GOOD = 0.25 CORR_THRESH = 0.75 FRACTION_BAD = (0.4,) CORR_WIN_SEC = 4 chn_pos = raw._get_channel_positions() raw.info["bads"] = bads good_chn_labs = list() good_idx = mne.pick_channels(ch_names, include=[], exclude=raw.info["bads"]) for i in range(0, len(good_idx)): good_chn_labs.append(ch_names[good_idx[i]]) n_chans_good = good_idx.shape[0] chn_pos_good = chn_pos[good_idx, :] n_pred_chns = int(np.ceil(FRACTION_GOOD * n_chans_good)) EEGData_filtered = np.transpose(new_EEG) if n_pred_chns <= 3: raise IOError("Too few channels available to reliably perform ransac.") # Make the ransac predictions ransac_eeg = run_ransac(chn_pos=chn_pos, chn_pos_good=chn_pos_good, good_chn_labs=good_chn_labs, n_pred_chns=n_pred_chns, data=EEGData_filtered, n_samples=SAMPLES, raw=raw) signal_len = original_dimensions[1] n_chans = len(chn_pos) correlation_frames = CORR_WIN_SEC * raw.info["sfreq"] correlation_window = np.arange(correlation_frames) n = correlation_window.shape[0] correlation_offsets = np.arange( 0, (signal_len - correlation_frames), correlation_frames) w_correlation = correlation_offsets.shape[0] data_window = EEGData_filtered[:n_chans, : n * w_correlation] data_window = data_window.reshape(n_chans, n, w_correlation) pred_window = ransac_eeg[:n_chans, : n * w_correlation] pred_window = pred_window.reshape(n_chans, n, w_correlation) channel_correlations = np.ones((w_correlation, n_chans)) for k in range(w_correlation): data_portion = data_window[:, :, k] pred_portion = pred_window[:, :, k] corr = np.corrcoef(data_portion, pred_portion) corr = np.diag(corr[0:n_chans, n_chans:]) channel_correlations[k, :] = corr thresholded_correlations = channel_correlations < CORR_THRESH frac_bad_corr_windows = np.mean(thresholded_correlations, axis=0) # find the corresponding channel names and return bad_idxs_bool = frac_bad_corr_windows > FRACTION_BAD bad_idxs = np.argwhere(bad_idxs_bool) bad_by_ransac = list() noisy_channels = np.union1d(noisy_channels, bad_idxs[0: len(bad_idxs)][0]) ransac_channel_correlations = channel_correlations noisy_channels_list = list() for i in range(0, len(noisy_channels)): noisy_channels_list.append(ch_names_original[noisy_channels[i]]) print(noisy_channels_list) return noisy_channels_list
DecilesS3.append(np.percentile(S3, i))
DecilesS4.append(np.percentile(S4, i))
DecilesS5.append(np.percentile(S5, i))

# Interquartile range
IQR1 = S1_perc_75 - S1_perc_25
IQR2 = S2_perc_75 - S2_perc_25
IQR3 = S3_perc_75 - S3_perc_25
IQR4 = S4_perc_75 - S4_perc_25
IQR5 = S5_perc_75 - S5_perc_25

from statsmodels import robust

# Median absolute deviation
MAD1 = robust.mad(S1)
MAD2 = robust.mad(S2)
MAD3 = robust.mad(S3)
MAD4 = robust.mad(S4)
MAD5 = robust.mad(S5)

# Trimean
def Trimd(percentil25, mediana, percentil75):
    Trimedia = ((percentil25) + (2 * mediana) + (percentil75)) / 4
    return Trimedia

TriM1 = Trimd(S1_perc_25, S1_medi, S1_perc_75)
TriM2 = Trimd(S2_perc_25, S2_medi, S2_perc_75)
needD = 1
while (norm(y_dwtD[dwtD_sort_id[0:needD]]) / norm(y_dwtD) < (compressed_percentage / 100)):
    needD = needD + 1
print(needD, compressed_percentage / 100)

# zero the coeffs that do not really contribute to compressed_percentage% of y (thresholding)
y_dwtD[dwtD_sort_id[needD + 1:]] = 0
# y_dwtD = np.reshape(y_dwtD, (len(y_dwtD), 1))
# y_cmp = np.concatenate((y_dwtA, y_dwtD), axis=1)
# print(np.shape(y_cmp))

# get the compressed signal by inverse dwt of the finalized coeffs
y_cmp = idwt(y_dwtA, y_dwtD, 'db4')

'''
sigma = mad(coeff[-1])
threshold = sigma * np.sqrt(2 * np.log(len(y_data)))
# coeff[1:] = (pywt.threshold(i, value=threshold, mode="soft") for i in coeff[1:])
coeff[1:] = (pywt.threshold(i, value=threshold, mode="hard") for i in coeff[1:])
y_cmp = pywt.waverec(coeff, "db20", mode="per")
# print(np.shape(y_cmp))
'''

output_file("legend.html", title="legend.py example")

p1 = figure(title="Original", tools=TOOLS, plot_width=800, plot_height=400)
p2 = figure(title="After dwt", tools=TOOLS, plot_width=800, plot_height=400)

# p1.circle(x, y, legend="Control points", color="red", alpha=0.5)
p1.line(x, y_data, legend="Control Points", color="blue", alpha=0.8)
# p2.line(x, fft_y, legend="Control points", color="red", alpha=0.5)
# Median, Percentile, quantile, MAD
print("Median:")
print(np.median(haber_1["nodes"]))
print(np.median(haber_2["nodes"]))

print("\nQuantiles:")
print(np.percentile(haber_1["nodes"], np.arange(0, 100, 25)))
print(np.percentile(haber_2["nodes"], np.arange(0, 100, 25)))

print("\n20th Percentile range")
print(np.percentile(haber_1["nodes"], np.arange(0, 100, 20)))
print(np.percentile(haber_2["nodes"], np.arange(0, 100, 20)))

from statsmodels import robust
print("\nMedian Absolute Deviation:")
print(robust.mad(haber_1["nodes"]))
print(robust.mad(haber_2["nodes"]))

# Box plot and Whiskers
# Setting handles for the legend.
import matplotlib.patches as mpatches
blue_patch = mpatches.Patch(color="steelblue", label="1")
orange_patch = mpatches.Patch(color="orange", label="2")

# Box plot and whiskers for nodes
sns.boxplot(x="status", y="nodes", data=haber)
plt.title("Box plot for Nodes")
plt.legend(title="status", handles=[blue_patch, orange_patch])
plt.show()

# Box plot and whiskers for age
sns.boxplot(x="status", y="age", data=haber)
def check_station_residual(self, instaxml, period, runid = 0, discard = False, usemad = True, madfactor = 3., crifactor = 0.5, crilimit = 10.,\ plot = True, projection = 'merc', cmap = 'surf', vmin = None, vmax = None, clabel = 'average absolute'): stainv = obspy.read_inventory(instaxml) lats = [] lons = [] staids = [] for network in stainv: for station in network: stlo = float(station.longitude) if stlo < 0.: stlo += 360. if station.latitude <= self.maxlat and station.latitude >= self.minlat\ and stlo <= self.maxlon and stlo >= self.minlon: lats.append(station.latitude) lons.append(stlo) staids.append(network.code + '.' + station.code) smoothgroup = self['smooth_run_' + str(runid)] try: residdset = smoothgroup['%g_sec' % (period) + '/residual'] # id fi0 lam0 f1 lam1 vel_obs weight res_tomo res_mod delta residual = residdset[()] except: raise AttributeError('Residual data: ' + str(period) + ' sec does not exist!') if discard: res_tomo = residual[:, 7] # quality control to discard data with large misfit if usemad: from statsmodels import robust mad = robust.mad(res_tomo) cri_res = madfactor * mad else: cri_res = min(crifactor * per, crilimit) residual = residual[np.abs(res_tomo) < cri_res, :] lats = np.asarray(lats, dtype=np.float64) lons = np.asarray(lons, dtype=np.float64) Ncounts, absres, res = _tomo_funcs._station_residual( np.float64(lats), np.float64(lons), np.float64(residual)) # plot #----------- # plot data #----------- m = self._get_basemap(projection=projection) x, y = m(lons, lats) try: import pycpt if os.path.isfile(cmap): cmap = pycpt.load.gmtColormap(cmap) # cmap = cmap.reversed() elif os.path.isfile(cpt_path + '/' + cmap + '.cpt'): cmap = pycpt.load.gmtColormap(cpt_path + '/' + cmap + '.cpt') except: pass values = res / Ncounts im = m.scatter(x, y, marker='^', s=50, c=values, cmap=cmap, vmin=vmin, vmax=vmax) cb = m.colorbar( im, "bottom", size="5%", pad='2%' ) #, ticks=[20., 25., 30., 35., 40., 45., 50., 55., 60., 65., 70.]) cb.set_label(clabel, fontsize=20, rotation=0) plt.suptitle(str(period) + ' sec', fontsize=20) cb.ax.tick_params(labelsize=40) cb.set_alpha(1) cb.draw_all() # # cb.solids.set_rasterized(True) cb.solids.set_edgecolor("face") plt.show() return Ncounts, absres, res, staids
def baseline_als(x, y, lam=None, p=None, niter=10, return_baseline=False, offset_correction=False): """Baseline Correction with Asymmetric Least Squares Smoothing. Parameters ---------- x : array-like the sample time/number/position y : array-like the data series corresponding to ``x`` lam : float the lambda parameter of the ALS method. This control how much the baseline can adapt to local changes. A higher value corresponds to a stiffer baseline p : float the asymmetry parameter of the ALS method. This controls the overall slope tolerated for the baseline. A higher value correspond to a higher possible slope Other Parameters ---------------- niter : int The number of iterations to perform return_baseline : bool return the baseline? offset_correction : bool also correct for an offset to align with the running mean of the scan Returns ------- y_subtracted : array-like, same size as ``y`` The initial time series, subtracted from the trend baseline : array-like, same size as ``y`` Fitted baseline. Only returned if return_baseline is ``True`` Examples -------- >>> x = np.arange(0, 10, 0.01) >>> y = np.zeros_like(x) + 10 >>> ysub = baseline_als(x, y) >>> np.all(ysub < 0.001) True """ if lam is None: lam = 1e11 if p is None: p = 0.001 z = _als(y, lam, p, niter=niter) ysub = y - z offset = 0 if offset_correction: std = mad(ysub) good = np.abs(ysub) < 10 * std if len(x[good]) < 10: good = np.ones(len(x), dtype=bool) warnings.warn('Too few bins to perform baseline offset correction' ' precisely. Beware of results') offset = offset_fit(x[good], ysub[good], 0) if return_baseline: return ysub - offset, z + offset else: return ysub - offset
np.reshape(yr_accomp[i:i + seglen], (1, seglen))), axis=0) estimates = np.concatenate( (np.reshape(ye_vocals[i:i + seglen], (1, seglen)), np.reshape(ye_accomp[i:i + seglen], (1, seglen))), axis=0) [SDR, _, SIR, SAR] = museval.evaluate(references, estimates) #sdr, isr, sir, sar vocal_SDR.append(SDR[0]) vocal_SIR.append(SIR[0]) vocal_SAR.append(SAR[0]) print("Current vocal SDR median/mad/mean/std", np.median(np.asarray(vocal_SDR)), robust.mad(np.asarray(vocal_SDR)), np.mean(np.asarray(vocal_SDR)), np.std(np.asarray(vocal_SDR))) sw_SDR.append(np.median(np.asarray(vocal_SDR))) print("Current macro vocal SDR median/mad/mean/std", np.median(np.asarray(sw_SDR)), robust.mad(np.asarray(sw_SDR)), np.mean(np.asarray(sw_SDR)), np.std(np.asarray(sw_SDR))) print("Current vocal SIR median/mad/mean/std", np.median(np.asarray(vocal_SIR)), robust.mad(np.asarray(vocal_SIR)), np.mean(np.asarray(vocal_SIR)), np.std(np.asarray(vocal_SIR))) sw_SIR.append(np.median(np.asarray(vocal_SIR))) print("Current macro vocal SIR median/mad/mean/std", np.median(np.asarray(sw_SIR)), robust.mad(np.asarray(sw_SIR)), np.mean(np.asarray(sw_SIR)), np.std(np.asarray(sw_SIR))) print("Current vocal SAR median/mad/mean/std", np.median(np.asarray(vocal_SAR)),
plt.xlabel('{:s} ({:s})'.format(a_true, a_string)) plt.ylabel('{:s} ({:s})'.format(a_fit, a_string)) plt.title('(b) acceleration') # + subtitle + statistic_title[1]) plt.legend(framealpha=0.5, loc='upper left', fontsize=legend_fontsize_fraction*axis_fontsize) plt.grid() plt.tight_layout() if save: filename = 'acceleration_{:s}_{:s}.pdf'.format(name, root) plt.savefig(os.path.join(image_directory, filename), bbox_inches='tight', pad_inches=pad_inches) # # Median velocity and acceleration plots # v1 = np.median(z1v, axis=1) v1e = mad(z1v, axis=1, c=1.0) v2 = np.median(z2v, axis=1) v2e = mad(z2v, axis=1, c=1.0) a2 = np.median(z2a, axis=1) a2e = mad(z2a, axis=1, c=1.0) v_string = v0.unit.to_string('latex_inline') a_string = a0.unit.to_string('latex_inline') plt.figure(3) plt.errorbar(accs, v1, yerr=v1e, label='polynomial n=1, fit velocity') plt.errorbar(accs, v2, yerr=v2e, label='polynomial n=2, fit velocity') plt.xlim(np.min(accs), np.max(accs)) plt.axhline(v0.to(u.km/u.s).value, label='true velocity ({:n} {:s})'.format(v0.value, v_string), color='r') plt.xlabel('true acceleration ({:s})'.format(a_string))
def analysisIrradianceandPowerMismatch2(testfolder, writefiletitle, numpanels,
                                        sensorsy,
                                        portraitorlandscape='landscape'):
    '''
    Reads and calculates power output and mismatch for each file in the
    testfolder where all the bifacial_radiance irradiance results .csv are
    saved.  First it loads each file, cleans it and resamples it to the
    numsensors set in this function, and then calculates irradiance mismatch
    and PVMismatch power output for averaged, minimum, or detailed irradiances
    on each cell for the cases where A) only 12 or 8 downsampled values are
    considered (at the center of each cell), and B) 12 or 8 values are
    obtained by averaging all the irradiances falling in the area of the cell
    (no edges or inter-cell spacing are considered at this moment).
    Then it saves all the A and B irradiances, as well as the
    cleaned/resampled front and rear irradiances.

    Ideally sensorsy in the read data is >> 12 to give results for the
    irradiance mismatch within the cell. Also ideally n

    Parameters
    ----------
    testfolder : folder containing output .csv files for bifacial_radiance
    writefiletitle : .csv title where the output results will be saved.
    numpanels : 1 or 2 only at the moment, necessary for the cleaning routine.
    portraitorlandscape : 'portrait' or 'landscape', for PVMismatch input
        which defines the electrical interconnects inside the module.
    sensorsy : number of sensors. Ideally this number is >> 12 and is also
        similar to the number of sensors (points) in the .csv result files.
        We want more than 12 sensors to be able to calculate mismatch of
        irradiance within the cell.
    '''

    #INPUT VARIABLES NECESSARY:
    #\\nrel.gov\shared\5J00\Staff\CDeline\Bifacial mismatch data\Tracker mismatch data\3_26_19 Cairo_mismatch_1up tube
    #testfolder = r'C:\Users\sayala\Documents\RadianceScenes\Demo3\results'
    #testfolder = r'\\nrel.gov\shared\5J00\Staff\CDeline\Bifacial mismatch data\Tracker mismatch data\3_26_19 Cairo_mismatch_1up tube\results_noTorqueTube'
    #writefiletitle = r'C:\Users\sayala\Documents\RadianceScenes\results_Cairo_mismatch_1up_noTorqueTube.csv'
    #numpanels = 1
    #portraitorlandscape = 'portrait'  # portrait has 12 cells, landscape has 8
    #sensorsy = 120  # deepclean will clean and resample to this number of sensors.
    # Ideally a number close to the original number of sample points.
    # Also, if it's just 12 or 8 (for landscape or portrait), all the averaged
    # values and cell mismatch become a moot point.

    # User information.
    filelist = sorted(os.listdir(testfolder))
    print('{} files in the directory'.format(filelist.__len__()))

    # PVMISMATCH Initialization of System
    pvsys = pvsystem.PVsystem(numberStrs=1, numberMods=1)  # makes the system
    # 1 module, in portrait mode.
    pmp_ideal = pvsys.Pmp  # Panel ideal. Monofacial.
    stdpl = np.array([[0, 23, 24, 47, 48, 71, 72, 95],
                      [1, 22, 25, 46, 49, 70, 73, 94],
                      [2, 21, 26, 45, 50, 69, 74, 93],
                      [3, 20, 27, 44, 51, 68, 75, 92],
                      [4, 19, 28, 43, 52, 67, 76, 91],
                      [5, 18, 29, 42, 53, 66, 77, 90],
                      [6, 17, 30, 41, 54, 65, 78, 89],
                      [7, 16, 31, 40, 55, 64, 79, 88],
                      [8, 15, 32, 39, 56, 63, 80, 87],
                      [9, 14, 33, 38, 57, 62, 81, 86],
                      [10, 13, 34, 37, 58, 61, 82, 85],
                      [11, 12, 35, 36, 59, 60, 83, 84]])

    if portraitorlandscape == 'portrait':
        samplecells = 12
        repeatedcells = 8
    if portraitorlandscape == 'landscape':
        samplecells = 8
        repeatedcells = 12
        stdpl = stdpl.transpose()

    # SAMPLE POINT AND HEADER DEFINITION
    cellCenterPVM = []  # This grabs just the value at the 'center' of the cell.
    cellFrontandBackMismatch_Header = []
    cellBackMismatch_Header = []
    cellCenterFrontValue_Header = []
    cellCenterBackValue_Header = []
    cellFrontAveragedValue_Header = []
    cellBackAveragedValue_Header = []
    frontres_header = []
    backres_header = []

    for i in range(0, samplecells):
        # Midpoint of the i-th cell's sensor range.
        cellCenterPVM.append((i * sensorsy / (samplecells * 1.0) +
                              (i + 1) * sensorsy / (samplecells * 1.0)) / 2)
        cellFrontandBackMismatch_Header.append('FrontplusBack_Mismatch_cell_' + str(i))
        cellBackMismatch_Header.append('Back_Mismatch_cell_' + str(i))
        cellCenterFrontValue_Header.append('CellCenterFrontValue_cell' + str(i))
        cellCenterBackValue_Header.append('CellCenterBackValue_cell' + str(i))
        cellBackAveragedValue_Header.append('CellBack_AveragedValue_cell_' + str(i))
        cellFrontAveragedValue_Header.append('CellFront_AveragedValue_cell_' + str(i))

    for i in range(0, sensorsy):
        frontres_header.append('Clean_Front_cell' + str(i))
        backres_header.append('Clean_Back_cell' + str(i))

    # HEADERS:
    outputheaders = ['Timestamp', 'PowerAveraged_CellCenter',
                     'PowerMin_CellCenter', 'PowerDetailed_CellCenter',
                     'PowerAveraged_AverageValues', 'PowerMin_AverageValues',
                     'PowerDetailed_AverageValues', 'PowerFRONT_Averaged',
                     'PowerFRONT_Detailed', 'MAD_cellCenterVal',
                     'MAD_cellAverage', 'MAD_frontplusback_clean',
                     'Cell Front Min', 'Cell Back Min',
                     'Irradiance Mismatch Front+Back Max',
                     'Irradiance Mismatch Back Max']
    outputheaders += cellFrontandBackMismatch_Header
    outputheaders += cellBackMismatch_Header
    outputheaders += cellCenterFrontValue_Header
    outputheaders += cellCenterBackValue_Header
    outputheaders += cellBackAveragedValue_Header
    outputheaders += cellFrontAveragedValue_Header
    outputheaders += frontres_header
    outputheaders += backres_header

    with open(writefiletitle, 'w') as csvfile:
        sw = csv.writer(csvfile, delimiter=',', quotechar='|',
                        quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        sw.writerow(outputheaders)

        # LOOP OVER FILES HERE
        for z in range(0, filelist.__len__()):
            #for z in range(0, 1):
            data = load.read1Result(os.path.join(testfolder, filelist[z]))
            #sensorsy = len(data)  # 210 for this case. deepclean resamples to value given.
            [frontres, backres] = load.deepcleanResult(data, sensorsy,
                                                       numpanels,
                                                       automatic=True)
            cellAverageValues_FrontPlusBack = []
            cellFrontAverage = []  # This averages the number of sensors.
            cellBackAverage = []
            cellFrontandBackMismatch = []
            cellBackMismatch = []
            cellFrontMin = []
            cellBackMin = []
            cellFrontPlusBackMin = []
            frontandbackres = frontres + backres
            cellRows = len(frontres)  # this is the same as sensorsy.... maybe replace?
            #TODO
            if cellRows != samplecells:
                for i in range(0, samplecells):
                    istart = int(i * cellRows / samplecells)
                    iend = int((i + 1) * cellRows / samplecells)
                    cellFrontAverage.append(np.average(frontres[istart:iend]))
                    cellBackAverage.append(np.average(backres[istart:iend]))
                    cellAverageValues_FrontPlusBack.append(
                        np.average(frontres[istart:iend]) +
                        np.average(backres[istart:iend]))
                    cellFrontandBackMismatch.append(
                        (max(frontandbackres[istart:iend]) -
                         min(frontandbackres[istart:iend])) * 100 /
                        (max(frontandbackres[istart:iend]) +
                         min(frontandbackres[istart:iend])))
                    cellBackMismatch.append(
                        (max(backres[istart:iend]) -
                         min(backres[istart:iend])) * 100 /
                        (max(backres[istart:iend]) +
                         min(backres[istart:iend])))
                    cellFrontMin.append(min(frontres[istart:iend]))
                    cellBackMin.append(min(backres[istart:iend]))
                    cellFrontPlusBackMin.append(min(frontandbackres[istart:iend]))
                cellCenterValFront = np.interp(cellCenterPVM,
                                               list(range(0, cellRows)),
                                               frontres)
                cellCenterValBack = np.interp(cellCenterPVM,
                                              list(range(0, cellRows)),
                                              backres)
            else:
                cellCenterValFront = frontres
                cellCenterValBack = backres

            sunmatDetailed_CellCenter = []
            sunmatAveraged_CellCenter = []
            sunmatMin_CellCenter = []
            sunmatDetailed_AverageValues = []
            sunmatAveraged_AverageValues = []
            sunmatMin_AverageValues = []
            sunmatFrontOnly_Averaged = []
            sunmatFrontOnly_Detailed = []

            # Center of Cell only
            cellCenterValues_FrontPlusBack = cellCenterValFront + cellCenterValBack
            AveFront_CellCenter = cellCenterValFront.mean()
            AveBack_CellCenter = cellCenterValBack.mean()

            # Average of Cell
            #cellAverageValues_FrontPlusBack = sum(cellFrontAverage, cellBackAverage)
            AveFront_AverageValues = np.mean(cellFrontAverage)
            AveBack_AverageValues = np.mean(cellBackAverage)

            # Repeat to create a matrix to pass matrix.
            for j in range(0, len(cellCenterValues_FrontPlusBack)):
                sunmatDetailed_CellCenter.append(
                    [cellCenterValues_FrontPlusBack[j] / 1000] * repeatedcells)
                sunmatDetailed_AverageValues.append(
                    [cellAverageValues_FrontPlusBack[j] / 1000] * repeatedcells)

            for j in range(0, len(cellCenterValFront)):
                sunmatAveraged_CellCenter.append(
                    [(AveFront_CellCenter + AveBack_CellCenter) / 1000] * repeatedcells)
                sunmatAveraged_AverageValues.append(
                    [(AveFront_AverageValues + AveBack_AverageValues) / 1000] * repeatedcells)

            for j in range(0, len(cellCenterValFront)):
                sunmatMin_CellCenter.append(
                    [min(cellCenterValues_FrontPlusBack) / 1000] * repeatedcells)
                sunmatMin_AverageValues.append(
                    [min(cellFrontPlusBackMin) / 1000] * repeatedcells)

            # FRONT MISMATCH
            for j in range(0, len(cellCenterValFront)):
                sunmatFrontOnly_Averaged.append(
                    [cellFrontAverage[j] / 1000] * repeatedcells)
                sunmatFrontOnly_Detailed.append(
                    [cellCenterValFront[j] / 1000] * repeatedcells)

            # Actually do calculations
            pvsys.setSuns({0: {0: [sunmatAveraged_CellCenter, stdpl]}})
            PowerAveraged_CellCenter = pvsys.Pmp
            pvsys.setSuns({0: {0: [sunmatDetailed_CellCenter, stdpl]}})
            PowerDetailed_CellCenter = pvsys.Pmp
            pvsys.setSuns({0: {0: [sunmatMin_CellCenter, stdpl]}})
            PowerMinimum_CellCenter = pvsys.Pmp

            # Actually do calculations
            pvsys.setSuns({0: {0: [sunmatAveraged_AverageValues, stdpl]}})
            PowerAveraged_AverageValues = pvsys.Pmp
            pvsys.setSuns({0: {0: [sunmatDetailed_AverageValues, stdpl]}})
            PowerDetailed_AverageValues = pvsys.Pmp
            pvsys.setSuns({0: {0: [sunmatMin_AverageValues, stdpl]}})
            PowerMinimum_AverageValues = pvsys.Pmp

            # Actually do calculations
            pvsys.setSuns({0: {0: [sunmatFrontOnly_Averaged, stdpl]}})
            PowerFRONT_Averaged = pvsys.Pmp
            pvsys.setSuns({0: {0: [sunmatFrontOnly_Detailed, stdpl]}})
            PowerFRONT_Detailed = pvsys.Pmp

            #flattened = [val for sublist in dictvalues for val in sublist]
            # Append Values
            #cellCenterValFrontFlat = [val for sublist in cellCenterValFront for val in sublist]
            outputvalues = [filelist[z],
                            PowerAveraged_CellCenter,
                            PowerMinimum_CellCenter,
                            PowerDetailed_CellCenter,
                            PowerAveraged_AverageValues,
                            PowerMinimum_AverageValues,
                            PowerDetailed_AverageValues,
                            PowerFRONT_Averaged,
                            PowerFRONT_Detailed,
                            robust.mad(cellCenterValues_FrontPlusBack),
                            robust.mad(cellAverageValues_FrontPlusBack),
                            robust.mad(frontandbackres),
                            min(cellFrontMin),
                            min(cellBackMin),
                            max(cellFrontandBackMismatch),
                            max(cellBackMismatch)]
            outputvalues += cellFrontandBackMismatch    # 12
            outputvalues += cellBackMismatch            # 12
            outputvalues += list(cellCenterValFront)    # 12
            outputvalues += list(cellCenterValBack)     # 12
            outputvalues += list(cellFrontAverage)      # 12
            outputvalues += list(cellBackAverage)       # 12
            outputvalues += list(frontres)              # sensorsy  # 210
            outputvalues += list(backres)               # sensorsy  # 210
            sw.writerow(outputvalues)
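# A minimal, self-contained sketch (not from bifacial_radiance) of the
# per-cell irradiance mismatch metric used above: for the sensor readings
# falling on one cell, mismatch = (max - min) / (max + min) * 100, and
# statsmodels' robust.mad of the front+back profile summarises its
# non-uniformity robustly.  The sample values below are made up.
import numpy as np
from statsmodels import robust

def cell_mismatch_percent(cell_irradiance):
    """(max - min) / (max + min) * 100 for one cell's irradiance samples."""
    hi, lo = max(cell_irradiance), min(cell_irradiance)
    return (hi - lo) * 100.0 / (hi + lo)

frontplusback = np.array([1005., 998., 1010., 950., 1002., 1007.])
print(cell_mismatch_percent(frontplusback))   # percent mismatch in this cell
print(robust.mad(frontplusback))              # robust spread of the profile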
plt.xlim(xlim)
plt.xlabel('true velocity ({:s})'.format(v_string))
plt.ylabel('fit acceleration ({:s})'.format(a_string))
plt.title('(b) acceleration' + subtitle + statistic_title[1])
plt.legend(framealpha=0.5, loc='upper left')
plt.grid()
plt.tight_layout()
if save:
    filename = 'acceleration_mean_{:s}.png'.format(root)
    plt.savefig(os.path.join(image_directory, filename),
                bbox_inches='tight', pad_inches=pad_inches)

#
# Median velocity and acceleration plots
#
v1 = np.median(z1v, axis=1)
v1e = mad(z1v, axis=1, c=1.0)
v2 = np.median(z2v, axis=1)
v2e = mad(z2v, axis=1, c=1.0)
a2 = np.median(z2a, axis=1)
a2e = mad(z2a, axis=1, c=1.0)

v_string = v0.unit.to_string('latex_inline')
a_string = a0.unit.to_string('latex_inline')

"""
plt.figure(3)
plt.errorbar(accs, v1, yerr=v1e, label='polynomial n=1, fit velocity')
plt.errorbar(accs, v2, yerr=v2e, label='polynomial n=2, fit velocity')
plt.xlim(xlim)
plt.axhline(v0.to(u.km/u.s).value,
            label='true velocity ({:n} {:s})'.format(v0.value, v_string),
            color='r')
# Normalise size data by item.
joined = pd.concat([known, unknown], sort=False)
item_sizes = df(joined.groupby('item_id')['item_size'].apply(list))
x_size = list(item_sizes.index)
y_size = list(item_sizes.item_size.values)
zip_dict_size = dict(zip(x_size, y_size))

def mad_normalise(initial_size, sizes_median, mad):
    return (float(initial_size) - sizes_median) / mad

replacement_dict_master = {}
for item in joined['item_id'].unique():
    item_sizes = zip_dict_size[item]
    item_sizes = [float(size) for size in item_sizes]
    sizes_median = np.median(item_sizes)
    mad = robust.mad(item_sizes)
    if mad == 0:
        replacement_dict = {(item, initial_size): 0
                            for initial_size in item_sizes}
    else:
        replacement_dict = {(item, initial_size):
                            round(mad_normalise(initial_size, sizes_median, mad), 2)
                            for initial_size in item_sizes}
    replacement_dict_master.update(replacement_dict)

known['item_size'] = known.set_index(['item_id', 'item_size']).index.map(replacement_dict_master.get)
unknown['item_size'] = unknown.set_index(['item_id', 'item_size']).index.map(replacement_dict_master.get)

# Change date columns to datetime features.
known['order_date'] = pd.to_datetime(known['order_date'])
known['delivery_date'] = pd.to_datetime(known['delivery_date'])
known['user_dob'] = pd.to_datetime(known['user_dob'])
known['user_reg_date'] = pd.to_datetime(known['user_reg_date'])
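# A minimal sketch of the robust z-score used above, on made-up size data:
# each value is centred on the group's median and scaled by its MAD
# (statsmodels' robust.mad, which includes the 1/0.6745 normal-consistency
# factor), so one unusually large size does not distort the scaling.
import numpy as np
from statsmodels import robust

sizes = np.array([38.0, 38.0, 40.0, 42.0, 44.0, 60.0])   # hypothetical sizes
med = np.median(sizes)
scale = robust.mad(sizes)
robust_z = np.round((sizes - med) / scale, 2) if scale > 0 else np.zeros_like(sizes)
print(robust_z)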
ioblk.parm.fitregion = 4.0
ioblk.parm.debugLevel = 3
tmpflx = flx[idxGd]
tmpt = time[idxGd]
tmpttwo = tmpt * tmpt
tmptfour = tmpttwo * tmpttwo
# Remove polynomial fit
pvals = np.polyfit(tmpt, tmpflx, 4, full=False)
tmpy = (pvals[4] + pvals[3] * tmpt + pvals[2] * tmpttwo +
        pvals[1] * tmpttwo * tmpt + pvals[0] * tmptfour)
#plt.plot(tmpt, tmpflx, '.')
#plt.plot(tmpt, tmpy, '-')
#plt.show()
ioblk.normlc = tmpflx / tmpy - 1.0
ioblk.normes = robust.mad(ioblk.normlc)
origstd = np.copy(ioblk.normes)
ioblk.normts = time[idxGd]
ioblk.modellc = np.copy(ioblk.normlc)
ioblk.yData = np.copy(ioblk.normlc)
ioblk.errData = np.full_like(ioblk.normlc, ioblk.normes)
ioblk.timezpt = np.median(ioblk.normts)
ioblk.normts = ioblk.normts - ioblk.timezpt
ioblk.physval_names = ['Per', 'To', 'Amp', 'Zpt']
ioblk.calcval_names = ['Per_c', 'To_c', 'Amp_c', 'Zpt_c']
# Give seed starting values for the minimization
ioblk.origests = np.array([allper[i], 0.0, ioblk.normes * 3.0, 0.0])
# Give integer array for variables you want fixed during fit
# 0 - not fixed (solved for) ; 1 - fixed (not solved for)
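# A small standalone sketch (synthetic data, hypothetical names) of the
# detrend-then-MAD pattern above: divide the flux by a polynomial trend,
# subtract 1, and use statsmodels' robust.mad of the normalised residuals
# as a per-point noise estimate that is not dragged around by outliers.
import numpy as np
from statsmodels import robust

t = np.linspace(0.0, 10.0, 500)
flux = 1000.0 * (1.0 + 0.01 * t) + np.random.normal(0.0, 2.0, t.size)
coeffs = np.polyfit(t, flux, 4)
trend = np.polyval(coeffs, t)
norm_lc = flux / trend - 1.0
noise_est = robust.mad(norm_lc)     # robust sigma of the normalised light curve
errors = np.full_like(norm_lc, noise_est)
print(noise_est)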
def train_lipop(seed: int = 19700101, limit: int = -1, use_cuda: bool = True,
                use_tqdm=True, force_save=False, special_config: dict = None,
                position_encoder_path: str = 'net/pe.pt', tag='std',
                dataset='Lipop'):
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)

    if dataset == 'FreeSolv':
        smiles, info_list, properties = load_freesolv(limit, force_save=force_save)
    elif dataset == 'ESOL':
        smiles, info_list, properties = load_esol(limit, force_save=force_save)
    else:
        smiles, info_list, properties = load_lipop(limit, force_save=force_save)
    molecules = [HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'],
                             info['em'])
                 for info in info_list]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)

    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'],
                                                  cfg['VALIDATE_PER'],
                                                  cfg['TEST_PER'])
    n_seg = int(len(train_mask) / (cfg['BATCH'] + 1))
    n_seg = min(len(train_mask), n_seg)
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (cfg['BATCH'] + 1))
    n_seg = min(len(validate_mask), n_seg)
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (cfg['BATCH'] + 1))
    n_seg = min(len(test_mask), n_seg)
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask[0], validate_mask[0], test_mask[0])
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))

    t_properties = properties[train_mask, :]
    prop_mean = np.mean(t_properties, axis=0)
    print('mean:', prop_mean)
    prop_std = np.std(t_properties.tolist(), axis=0, ddof=1)
    print('std:', prop_std)
    prop_mad = robust.mad(t_properties.tolist(), axis=0)
    print('mad:', prop_mad)
    norm_properties = (properties - prop_mean) / prop_std

    if position_encoder_path and os.path.exists(position_encoder_path):
        position_encoder = torch.load(position_encoder_path)
        position_encoder.eval()
    else:
        print('NO POSITION ENCODER IS BEING USED!!!')
        position_encoder = None
    model = AMPNN(n_dim=n_dim, e_dim=e_dim, config=cfg,
                  position_encoder=position_encoder, use_cuda=use_cuda)
    regression = MLP(cfg['F_DIM'], 1, h_dims=cfg['MLP_DIMS'],
                     dropout=cfg['DROPOUT'])
    if use_cuda:
        model.cuda()
        regression.cuda()
    for name, param in chain(model.named_parameters(),
                             regression.named_parameters()):
        if param.requires_grad:
            print(name, ":", param.shape)
    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  chain(model.parameters(),
                                        regression.parameters())),
                           lr=cfg['LR'], weight_decay=cfg['DECAY'])
    scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1,
                                          gamma=cfg['GAMMA'])
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    loss_fuc = MSELoss()
    logs = []

    def forward(mask: list, name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name,
                                              use_cuda)
        embeddings, _ = model(nfs, efs, us, vs, mm_tuple, name,
                              [smiles[i] for i in mask])
        std_loss = 0
        logits = regression(embeddings)
        target = norm_properties[mask, :]
        target = torch.tensor(target.astype(np.float32), dtype=torch.float32)
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        model.train()
        regression.train()
        u_losses = []
        losses = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_fuc(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            # loss.backward()
            # optimizer.step()
            losses.append(loss)
            if len(losses) >= cfg['PACK'] or i == len(mask_list) - 1:
                (sum(losses) / len(losses)).backward()
                optimizer.step()
                losses.clear()
        u_loss = np.average(u_losses)
        print('\t\tSemi-supervised loss: {:.4f}'.format(u_loss))
        logs[-1].update({'on_train_loss': u_loss})

    def evaluate(mask_list: list, name=None, visualize=None):
        model.eval()
        regression.eval()
        losses = []
        masks = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_fuc(logits, target)
            losses.append(loss.cpu().item())
            if visualize:
                masks.extend(m)
                logits_list.append(logits.cpu().detach().numpy())
                target_list.append(target.cpu().detach().numpy())
        mse_loss = np.average(losses) * (prop_std[0] ** 2)
        rmse_loss = np.average([loss ** 0.5 for loss in losses]) * prop_std[0]
        print('\t\tMSE Loss: {:.3f}'.format(mse_loss))
        print('\t\tRMSE Loss: {:.3f}'.format(rmse_loss))
        logs[-1].update({'{}_loss'.format(name): mse_loss})
        logs[-1].update({'{}_metric'.format(name): rmse_loss})
        if visualize:
            all_logits = np.vstack(logits_list)
            all_target = np.vstack(target_list)
            best_ids, best_ds, worst_ids, worst_ds = \
                plt_multiple_scatter(GRAPH_PATH + visualize, masks,
                                     all_logits, all_target)
            print('\t\tBest performance on:')
            for i, d in zip(best_ids, best_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))
            print('\t\tWorst performance on:')
            for i, d in zip(worst_ids, worst_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        scheduler.step(epoch=epoch)
        print('In iteration {}:'.format(epoch + 1))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(
            train_mask_list,
            name='train',
            # visualize='train_{}'.format(epoch + 1) if (epoch + 1) % cfg['EVAL'] == 0 else None
        )
        print('\tEvaluating validation: ')
        evaluate(
            validate_mask_list,
            name='evaluate',
            # visualize='val_{}'.format(epoch + 1) if (epoch + 1) % cfg['EVAL'] == 0 else None
        )
        print('\tEvaluating test: ')
        evaluate(
            test_mask_list,
            name='test',
            # visualize='test' if epoch + 1 == cfg['ITERATION'] else None
        )
        gc.collect()

    d = {'metric': 'RMSE', 'logs': logs}
    with open('{}{}.json'.format(LOG_PATH, tag), 'w+', encoding='utf-8') as fp:
        json.dump(d, fp)
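# A brief aside (synthetic data, not part of the training script): the code
# above prints both the standard deviation and statsmodels' robust.mad of the
# training targets before standardising by the std.  The MAD is useful as a
# sanity check because a few extreme property values inflate the std but
# barely move the MAD.
import numpy as np
from statsmodels import robust

targets = np.concatenate([np.random.normal(2.5, 0.8, 500), [25.0, -20.0]])
print('std :', np.std(targets, ddof=1))   # inflated by the two outliers
print('mad :', robust.mad(targets))       # close to the bulk's spread (~0.8)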