def group_level1_preds(filtered_data):
    df_list = []
    model_list = [x for x in filtered_data.columns if x.startswith('m')]
    # Log-compressed duration used as the weighting variable below.
    filtered_data['logdur'] = np.log(1 + 0.1 * filtered_data['duration'] / 10000)
    for col in ['ml2']:
        # Per-process weighted median of the level-1 prediction, weighted by logdur / prediction.
        df = filtered_data.groupby('process_id').apply(lambda x: pd.Series({
            col + '_wgmedianlogdur': weighted.median(x[col], x['logdur'] / x[col]),
        }))
        df_list.append(df)
    if 'final_rinse_total_turbidity_liter' in filtered_data.columns:
        # Score every row against the target, mark the best-scoring row per process,
        # then aggregate the target itself per process.
        filtered_data['score'] = mapeu(
            filtered_data['final_rinse_total_turbidity_liter'],
            filtered_data['ml2'])
        df = filtered_data[filtered_data.groupby('process_id').score.transform(
            'min') == filtered_data['score']]
        df['ml2best'] = df['ml2']
        df = filtered_data.groupby('process_id').apply(lambda x: pd.Series({
            'final_rinse_total_turbidity_liter': x['final_rinse_total_turbidity_liter'].max(),
        }))
        df_list.append(df)
    return pd.concat(df_list, axis=1)
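# The helpers `mapeu` and `mape` are used above and in the functions below but are not
# defined in this section. A minimal sketch of what they are assumed to do (per-row
# absolute percentage error and its mean); the real implementations may differ.
import numpy as np


def mapeu(y_true, y_pred):
    # Element-wise ("unaggregated") absolute percentage error, one value per row,
    # which is what the per-process argmin selection above relies on.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.abs(y_true - y_pred) / np.abs(y_true)


def mape(y_true, y_pred):
    # Mean absolute percentage error: the mean of the element-wise errors.
    return np.mean(mapeu(y_true, y_pred))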
def print_submissions_stats(val_ensemble, Y_valid, test_ensemble):
    print(test_ensemble.shape)
    print('val mape', mape(Y_valid, val_ensemble))
    print('-------------------------------------------')
    print('val wg median', weighted.median(val_ensemble, 1 / val_ensemble))
    print('true val wg median', weighted.median(Y_valid, 1 / Y_valid))
    print('test wg median', weighted.median(test_ensemble, 1 / test_ensemble))
    print('-------------------------------------------')
    print('val median', val_ensemble.median())
    print('true val median', Y_valid.median())
    print('test median', test_ensemble.median())
    print('-------------------------------------------')
    print('val min', val_ensemble.min())
    print('true val min', Y_valid.min())
    print('test min', test_ensemble.min())
    print('-------------------------------------------')
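# A small, self-contained illustration of the "wg median" reported above: each value is
# weighted by its own inverse, which pulls the weighted median towards smaller values.
# `weighted` is the same module used throughout these snippets (assumed to be the
# wquantiles package); the numbers are purely illustrative.
import numpy as np
import weighted

values = np.array([300000.0, 400000.0, 1200000.0, 5000000.0])
print(np.median(values))                      # plain median: 800000.0
print(weighted.median(values, 1.0 / values))  # inverse-weighted median: about 3.6e5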
def get_weighted_median(self, weighted=True):
    # Note: the `weighted` argument shadows the module of the same name,
    # so the module is referenced here as `weighted_module`.
    ar_median_weights = self.ar_flux_bins if weighted else self.ar_unweighted_flux_bins
    res = np.zeros(self.ar_z.size)
    for n in range(self.ar_z.size):
        res[n] = weighted_module.median(np.arange(self.flux_res), ar_median_weights[n])
    return res / self.flux_res * self.flux_range + self.flux_offset
def median_distance(self):
    """ Returns the median of the medians of each contact's distances """
    if len(self) == 0:
        return None
    medians = [c.median_distance() for c in self]
    weights = [c.duration() for c in self]
    total_duration = sum(weights)
    weights = [w / total_duration for w in weights]
    df = pd.DataFrame({'medians': medians, 'w': weights})
    return weighted.median(df['medians'], df['w'])
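# Minimal usage sketch for the duration-weighted median above, with hypothetical numbers:
# contacts whose median distances are 10 m, 50 m and 80 m, observed for 60 s, 10 s and 10 s.
# The long 10 m contact dominates, so the weighted median (about 21 m) stays well below the
# plain median (50 m). Normalizing the weights beforehand, as the method does, does not
# change the result, since weighted.median normalizes internally.
import pandas as pd
import weighted

df = pd.DataFrame({'medians': [10.0, 50.0, 80.0], 'w': [60.0, 10.0, 10.0]})
print(weighted.median(df['medians'], df['w']))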
def rolling_weighted_median(ar_data, ar_weights, box_size):
    ar_flux_smoothed = np.zeros_like(ar_data)
    box_size_lower = -(box_size // 2)
    box_size_upper = box_size // 2 + (box_size & 1)
    for j in range(ar_data.size):
        start = max(j + box_size_lower, 0)
        end = min(j + box_size_upper, ar_data.size)
        ar_flux_smoothed[j] = weighted.median(ar_data[start:end], ar_weights[start:end])
    return ar_flux_smoothed
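# Usage sketch for rolling_weighted_median on synthetic data: a noisy ramp smoothed with a
# 5-sample window and uniform weights. Near the array edges the window simply shrinks, as
# the start/end clamping above implies. Values are purely illustrative.
import numpy as np
import weighted

rng = np.random.default_rng(0)
data = np.linspace(0.0, 1.0, 50) + rng.normal(0.0, 0.05, 50)
weights = np.full(50, 1.0)
smoothed = rolling_weighted_median(data, weights, box_size=5)
print(smoothed[:5])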
def _find_edge_vertex_component_medians(self, edge_vertices):
    edge_vertex_component_medians = []
    for edge in range(self.NUMBER_OF_EDGES):
        if edge in [self.TOP_EDGE, self.BOTTOM_EDGE]:
            edge_vertex_components = numpy.array(
                [round(v[0][1]) for v in edge_vertices[edge]])
        elif edge in [self.LEFT_EDGE, self.RIGHT_EDGE]:
            edge_vertex_components = numpy.array(
                [round(v[0][0]) for v in edge_vertices[edge]])
        edge_vertex_weights = self._compute_edge_vertex_weights(edge_vertices[edge], edge)
        edge_vertex_component_medians.append(
            weighted.median(edge_vertex_components, edge_vertex_weights))
    return edge_vertex_component_medians
def median_distance(self):
    """ Returns median distance in metres (a bit more robust towards outliers) """
    t = self.timestamps_in_contact()
    # Handle the cases where there are no GPS points, or only one
    if len(t) == 0:
        # If no GPS points are given, all we can do is return an unrealistic distance value
        logger.error(
            "Requesting median distance from an empty trajectory - returning unphysical value 1e6"
        )
        return 1e6
    if len(t) == 1:
        # For a single GPS point, the median is the distance at that point
        logger.warning(
            "Requesting median distance from a trajectory with a single point"
        )
        return self.cd['dists'][0]
    weights = np.array([t[i] - t[i - 1] for i in range(1, len(t))])
    weights = weights / np.sum(weights)
    # Because len(self.cd['dists']) == len(weights) + 1, average neighbouring distances so
    # that each interval between timestamps gets a single representative distance.
    dists = [(self.cd['dists'][i] + self.cd['dists'][i - 1]) / 2 for i in range(1, len(t))]
    df = pd.DataFrame({'dists': dists, 'w': weights})
    return weighted.median(df['dists'], df['w'])
def display_stats(self):
    # Take inputs from the GUI
    series = self.inputSERIES.text().upper()
    model = self.inputMODEL.text().upper()
    full = series + model
    loc_df = global_df.loc[global_df['Model'] == full]
    # Create separate dataframes for used ('Używany') and new ('Nowy') models
    used = loc_df[loc_df['Stan:'] == 'Używany']
    new = loc_df[loc_df['Stan:'] == 'Nowy']
    del loc_df
    new.reset_index(drop=True, inplace=True)
    used.reset_index(drop=True, inplace=True)
    final_output = ''
    """
    Calculate the variables needed for the weighted mean; if there is no such model,
    report that instead.
    All responses (what should be displayed) are accumulated in final_output rather than
    printed, in case a terminal is used instead of the GUI.
    """
    s_iloczyn = 0  # sum of price * units for the weighted mean
    s_wag = 0      # sum of weights (number of bought units - column 'Kupione')
    if new.empty:
        final_output += "\n New: -----------------------------"
        final_output += '\n\n No such new model'
    else:
        if new['Kupione'].sum() == 0:
            final_output += "\n \nNew: -----------------------------"
            final_output += "\n \nWeighted mean : Null - 0 sold "
            final_output += "\n \nWeighted median : Null - 0 sold "
            final_output += "\n \nStatistics: \n "
            final_output += str(new["Cena"].describe())
        else:
            for i in range(0, len(new)):
                kupione = new['Kupione'][i]
                s_iloczyn += new['Cena'][i] * kupione
                s_wag += kupione
            weighted_average = s_iloczyn / s_wag
            final_output += "\n \nNew: -----------------------------"
            final_output += "\n \nWeighted mean : " + str(weighted_average)
            final_output += "\n \nWeighted median : " + str(weighted.median(new['Cena'], new['Kupione']))
            final_output += "\n \nStatistics: \n\n "
            final_output += str(new["Cena"].describe())
    s_iloczyn = 0
    s_wag = 0
    if used.empty:
        final_output += "\n\n Used: --------------------------"
        final_output += '\n\n No such used model'
    else:
        if used['Kupione'].sum() == 0:
            final_output += "\n \nUsed: -----------------------------"
            final_output += "\n \nWeighted mean : Null - 0 sold "
            final_output += "\n \nWeighted median : Null - 0 sold "
            final_output += "\n \nStatistics: \n "
            final_output += str(used["Cena"].describe())
        else:
            for i in range(0, len(used)):
                kupione = used['Kupione'][i]
                s_iloczyn += used['Cena'][i] * kupione
                s_wag += kupione
            weighted_average = s_iloczyn / s_wag
            final_output += "\n\n Used: -----------------------------"
            final_output += "\n\n Weighted mean : " + str(weighted_average)
            final_output += "\n\n Weighted median : " + str(weighted.median(used['Cena'], used['Kupione']))
            final_output += "\n\n Statistics: \n\n "
            final_output += used["Cena"].describe().to_string()
    # Display output in the GUI display window (text browser)
    self.textBrowser.setText(final_output)
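# The explicit loop in display_stats accumulates a purchase-weighted mean; the same
# quantities can be obtained with vectorized calls. A sketch of the equivalent computation
# on a hypothetical listings frame (the 290-style column names match the code above, the
# numbers do not come from the original data):
import numpy as np
import pandas as pd
import weighted

new = pd.DataFrame({'Cena': [1200.0, 1500.0, 999.0], 'Kupione': [5, 1, 20]})
weighted_average = np.average(new['Cena'], weights=new['Kupione'])
weighted_med = weighted.median(new['Cena'], new['Kupione'])
print(weighted_average, weighted_med)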
def fit3(data_dict={}, params=None):
    params = params.copy()
    uselog = params.pop('uselog')
    usewg = params.pop('usewg', False)
    train_dataset = params.get('train_on')
    val_dataset = params.get('validate_on')
    print('training on', train_dataset)
    print('validate on', val_dataset)
    if params.get('from') is not None:
        to_transform = params.pop('to')
        from_transform = params.pop('from')
    elif uselog:
        to_transform = to_log
        from_transform = from_log
    x_dict = dict()
    y_dict = dict()
    y_true_dict = dict()
    pred_dict = dict()
    for dataset_name, dataset in data_dict.items():
        x, y = generate_target(dataset)
        print(dataset_name + ' post target gen shape', x.shape)
        y_true_dict.update({dataset_name: y})
        x_dict.update({dataset_name: x})
        if uselog and y is not None:
            y = to_transform(y)
        y_dict.update({dataset_name: y})
    if usewg:
        print('using wg')
        # Take the labels, clip them from below at 290000,
        # and weight each sample by the inverse of the clipped label.
        div_train = y_true_dict[train_dataset].copy()
        div_train[div_train < 290000] = 290000
        div_val = y_true_dict[val_dataset].copy()
        div_val[div_val < 290000] = 290000
        # Create LightGBM datasets carrying the sample weights
        d_train = lgbm.Dataset(x_dict[train_dataset], y_dict[train_dataset],
                               weight=1 / div_train)
        d_valid = lgbm.Dataset(x_dict[val_dataset], y_dict[val_dataset],
                               weight=1 / div_val)
    else:
        d_train = lgbm.Dataset(x_dict[train_dataset], y_dict[train_dataset])
        d_valid = lgbm.Dataset(x_dict[val_dataset], y_dict[val_dataset])
    model = lgbm.train(params, d_train, 5000,
                       valid_sets=[d_train, d_valid],
                       verbose_eval=100,
                       early_stopping_rounds=100)
    for dataset_name, dataset in x_dict.items():
        if dataset_name != train_dataset:
            pred = model.predict(dataset)
            if uselog:
                pred = from_transform(pred)
            pred_dict.update({dataset_name: pred})
    print('mape validation score', mape(y_true_dict[val_dataset], pred_dict[val_dataset]))
    print('val wg median', weighted.median(pred_dict[val_dataset], 1 / pred_dict[val_dataset]))
    for dataset_name, dataset in x_dict.items():
        if dataset_name != train_dataset:
            if y_dict[dataset_name] is not None:
                print(dataset_name + ' mape score ' + str(
                    mape(y_true_dict[dataset_name], pred_dict[dataset_name])))
                print(dataset_name + ' val wg median and median',
                      weighted.median(pred_dict[dataset_name], 1 / pred_dict[dataset_name]),
                      np.median(pred_dict[dataset_name]))
    return pred_dict, y_true_dict, model
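# `to_log` / `from_log` (and `generate_target`) are assumed to be defined elsewhere; a
# minimal sketch of the transform pair fit3 relies on is given here: log-transform the
# target before training, invert the transform on the predictions. The exact transform
# used originally is unknown, so treat this as an assumption.
import numpy as np


def to_log(y):
    # Forward transform applied to the target before training.
    return np.log1p(y)


def from_log(pred):
    # Inverse transform applied to model predictions.
    return np.expm1(pred)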
def fit4(data_dict={}, params=None):
    params = params.copy()
    uselog = params.pop('uselog')
    usewg = params.pop('usewg', False)
    train_dataset = params.get('train_on')
    val_dataset = params.get('validate_on')
    if params.get('from') is not None:
        to_transform = params.pop('to')
        from_transform = params.pop('from')
    elif uselog:
        to_transform = to_log
        from_transform = from_log
    x_dict = {}
    y_dict = {}
    y_true_dict = {}
    pred_dict = {}
    for dataset_name, dataset in data_dict.items():
        x, y = generate_target2(dataset)
        mono_const = [1 if a.startswith('m') else 0 for a in x.columns]
        print(dataset_name + ' post target gen shape', x.shape)
        y_true_dict.update({dataset_name: y})
        x_dict.update({dataset_name: x})
        if uselog and y is not None:
            y = to_transform(y)
        y_dict.update({dataset_name: y})
    # params['mc'] = mono_const
    if usewg:
        print('using wg')
        # Clip the labels from below at 290000 and weight each sample by the inverse
        # of the clipped label.
        div_train = y_true_dict[train_dataset].copy()
        div_train[div_train < 290000] = 290000
        div_val = y_true_dict[val_dataset].copy()
        div_val[div_val < 290000] = 290000
        d_train = lgbm.Dataset(
            x_dict[train_dataset],
            y_dict[train_dataset],
            weight=1 / div_train,
            # init_score=get_init_score(x_dict[train_dataset])
        )
        d_valid = lgbm.Dataset(
            x_dict[val_dataset],
            y_dict[val_dataset],
            weight=1 / div_val,
            # init_score=get_init_score(x_dict[val_dataset])
        )
    else:
        d_train = lgbm.Dataset(x_dict[train_dataset], y_dict[train_dataset])
        d_valid = lgbm.Dataset(x_dict[val_dataset], y_dict[val_dataset])
    model = lgbm.train(
        params,
        d_train,
        50000,
        valid_sets=[d_train, d_valid],
        verbose_eval=100,
        early_stopping_rounds=100,
        # feval=mapelgbm
    )
    for dataset_name, dataset in x_dict.items():
        if dataset_name != train_dataset:
            # ds = lgbm.Dataset(dataset, init_score=get_init_score(dataset))
            pred = model.predict(dataset)  # + get_init_score(dataset)
            if uselog:
                pred = from_transform(pred)
            pred_dict.update({dataset_name: pred})
    print('mape validation score', mape(y_true_dict[val_dataset], pred_dict[val_dataset]))
    print('val wg median', weighted.median(pred_dict[val_dataset], 1 / pred_dict[val_dataset]))
    for dataset_name, dataset in x_dict.items():
        if dataset_name != train_dataset:
            if y_dict[dataset_name] is not None:
                print(dataset_name + ' mape score ' + str(
                    mape(y_true_dict[dataset_name], pred_dict[dataset_name])))
                print(dataset_name + ' val wg median and median',
                      weighted.median(pred_dict[dataset_name], 1 / pred_dict[dataset_name]),
                      np.median(pred_dict[dataset_name]))
    return pred_dict, y_true_dict, model
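# Why the 1 / max(y, 290000) sample weights used in fit3/fit4: with weights w_i = 1 / y_i,
# a weighted absolute error equals the absolute percentage error, so an L1-style objective
# trained with these weights approximately optimizes MAPE; the 290000 floor only caps the
# weight given to very small targets. A small numeric check (values are illustrative):
import numpy as np

y = np.array([300000.0, 600000.0, 1200000.0])
pred = np.array([330000.0, 540000.0, 1500000.0])
w = 1.0 / y
mape_direct = np.mean(np.abs(y - pred) / y)
mape_via_weights = np.mean(w * np.abs(y - pred))
print(mape_direct, mape_via_weights)  # identical by construction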
def main(config_file=None, logging_params=DEFAULT_LOGGING): # Load config config = DEFAULTS.deepcopy() if config_file is not None: config.merge(NameSpace(load_yaml_config(config_file))) # Setup logging log.setup_logging(logging_params) logger = log.get_logger(__name__) ## Load data for flagging # Load fpga restarts time_fpga_restart = [] if config.fpga_restart_file is not None: with open(config.fpga_restart_file, 'r') as handler: for line in handler: time_fpga_restart.append( ephemeris.datetime_to_unix( ephemeris.timestr_to_datetime(line.split('_')[0]))) time_fpga_restart = np.array(time_fpga_restart) # Load housekeeping flag if config.housekeeping_file is not None: ftemp = TempData.from_acq_h5(config.housekeeping_file, datasets=["time_flag"]) else: ftemp = None # Load jump data if config.jump_file is not None: with h5py.File(config.jump_file, 'r') as handler: jump_time = handler["time"][:] jump_size = handler["jump_size"][:] else: jump_time = None jump_size = None # Load rain data if config.rain_file is not None: with h5py.File(config.rain_file, 'r') as handler: rain_ranges = handler["time_range_conservative"][:] else: rain_ranges = [] # Load data flags data_flags = {} if config.data_flags: finder.connect_database() flag_types = finder.DataFlagType.select() possible_data_flags = [] for ft in flag_types: possible_data_flags.append(ft.name) if ft.name in config.data_flags: new_data_flags = finder.DataFlag.select().where( finder.DataFlag.type == ft) data_flags[ft.name] = list(new_data_flags) # Set desired range of time start_time = (ephemeris.datetime_to_unix( datetime.datetime( *config.start_date)) if config.start_date is not None else None) end_time = (ephemeris.datetime_to_unix(datetime.datetime( *config.end_date)) if config.end_date is not None else None) ## Find gain files files = {} for src in config.sources: files[src] = sorted( glob.glob( os.path.join(config.directory, src.lower(), "%s_%s_lsd_*.h5" % ( config.prefix, src.lower(), )))) csd = {} for src in config.sources: csd[src] = np.array( [int(os.path.splitext(ff)[0][-4:]) for ff in files[src]]) for src in config.sources: logger.info("%s: %d files" % (src, len(csd[src]))) ## Remove files that occur during flag csd_flag = {} for src in config.sources: body = ephemeris.source_dictionary[src] csd_flag[src] = np.ones(csd[src].size, dtype=np.bool) for ii, cc in enumerate(csd[src][:]): ttrans = ephemeris.transit_times(body, ephemeris.csd_to_unix(cc))[0] if (start_time is not None) and (ttrans < start_time): csd_flag[src][ii] = False continue if (end_time is not None) and (ttrans > end_time): csd_flag[src][ii] = False continue # If requested, remove daytime transits if not config.include_daytime.get( src, config.include_daytime.default) and daytime_flag( ttrans)[0]: logger.info("%s CSD %d: daytime transit" % (src, cc)) csd_flag[src][ii] = False continue # Remove transits during HKP drop out if ftemp is not None: itemp = np.flatnonzero( (ftemp.time[:] >= (ttrans - config.transit_window)) & (ftemp.time[:] <= (ttrans + config.transit_window))) tempflg = ftemp['time_flag'][itemp] if (tempflg.size == 0) or ((np.sum(tempflg, dtype=np.float32) / float(tempflg.size)) < 0.50): logger.info("%s CSD %d: no housekeeping" % (src, cc)) csd_flag[src][ii] = False continue # Remove transits near jumps if jump_time is not None: njump = np.sum((jump_size > config.min_jump_size) & (jump_time > (ttrans - config.jump_window)) & (jump_time < ttrans)) if njump > config.max_njump: logger.info("%s CSD %d: %d jumps before" % (src, cc, njump)) csd_flag[src][ii] = 
False continue # Remove transits near rain for rng in rain_ranges: if (((ttrans - config.transit_window) <= rng[1]) and ((ttrans + config.transit_window) >= rng[0])): logger.info("%s CSD %d: during rain" % (src, cc)) csd_flag[src][ii] = False break # Remove transits during data flag for name, flag_list in data_flags.items(): if csd_flag[src][ii]: for flg in flag_list: if (((ttrans - config.transit_window) <= flg.finish_time) and ((ttrans + config.transit_window) >= flg.start_time)): logger.info("%s CSD %d: %s flag" % (src, cc, name)) csd_flag[src][ii] = False break # Print number of files left after flagging for src in config.sources: logger.info("%s: %d files (after flagging)" % (src, np.sum(csd_flag[src]))) ## Construct pair wise differences npair = len(config.diff_pair) shift = [nd * 24.0 * 3600.0 for nd in config.nday_shift] calmap = [] calpair = [] for (tsrc, csrc), sh in zip(config.diff_pair, shift): body_test = ephemeris.source_dictionary[tsrc] body_cal = ephemeris.source_dictionary[csrc] for ii, cc in enumerate(csd[tsrc]): if csd_flag[tsrc][ii]: test_transit = ephemeris.transit_times( body_test, ephemeris.csd_to_unix(cc))[0] cal_transit = ephemeris.transit_times(body_cal, test_transit + sh)[0] cal_csd = int(np.fix(ephemeris.unix_to_csd(cal_transit))) ttrans = np.sort([test_transit, cal_transit]) if cal_csd in csd[csrc]: jj = list(csd[csrc]).index(cal_csd) if csd_flag[csrc][jj] and not np.any( (time_fpga_restart >= ttrans[0]) & (time_fpga_restart <= ttrans[1])): calmap.append([ii, jj]) calpair.append([tsrc, csrc]) calmap = np.array(calmap) calpair = np.array(calpair) ntransit = calmap.shape[0] logger.info("%d total transit pairs" % ntransit) for ii in range(ntransit): t1 = ephemeris.transit_times( ephemeris.source_dictionary[calpair[ii, 0]], ephemeris.csd_to_unix(csd[calpair[ii, 0]][calmap[ii, 0]]))[0] t2 = ephemeris.transit_times( ephemeris.source_dictionary[calpair[ii, 1]], ephemeris.csd_to_unix(csd[calpair[ii, 1]][calmap[ii, 1]]))[0] logger.info("%s (%d) - %s (%d): %0.1f hr" % (calpair[ii, 0], csd_flag[calpair[ii, 0]][calmap[ii, 0]], calpair[ii, 1], csd_flag[calpair[ii, 1]][calmap[ii, 1]], (t1 - t2) / 3600.0)) # Determine unique diff pairs diff_name = np.array(['%s/%s' % tuple(cp) for cp in calpair]) uniq_diff, lbl_diff, cnt_diff = np.unique(diff_name, return_inverse=True, return_counts=True) ndiff = uniq_diff.size for ud, udcnt in zip(uniq_diff, cnt_diff): logger.info("%s: %d transit pairs" % (ud, udcnt)) ## Load gains inputmap = tools.get_correlator_inputs(datetime.datetime.utcnow(), correlator='chime') ninput = len(inputmap) nfreq = 1024 # Set up gain arrays gain = np.zeros((2, nfreq, ninput, ntransit), dtype=np.complex64) weight = np.zeros((2, nfreq, ninput, ntransit), dtype=np.float32) input_sort = np.zeros((2, ninput, ntransit), dtype=np.int) kcsd = np.zeros((2, ntransit), dtype=np.float32) timestamp = np.zeros((2, ntransit), dtype=np.float64) is_daytime = np.zeros((2, ntransit), dtype=np.bool) for tt in range(ntransit): for kk, (src, ind) in enumerate(zip(calpair[tt], calmap[tt])): body = ephemeris.source_dictionary[src] filename = files[src][ind] logger.info("%s: %s" % (src, filename)) temp = containers.StaticGainData.from_file(filename) freq = temp.freq[:] inputs = temp.input[:] isort = reorder_inputs(inputmap, inputs) inputs = inputs[isort] gain[kk, :, :, tt] = temp.gain[:, isort] weight[kk, :, :, tt] = temp.weight[:, isort] input_sort[kk, :, tt] = isort kcsd[kk, tt] = temp.attrs['lsd'] timestamp[kk, tt] = ephemeris.transit_times( body, ephemeris.csd_to_unix(kcsd[kk, 
tt]))[0] is_daytime[kk, tt] = daytime_flag(timestamp[kk, tt])[0] if np.any(isort != np.arange(isort.size)): logger.info("Input ordering has changed: %s" % ephemeris.unix_to_datetime( timestamp[kk, tt]).strftime("%Y-%m-%d")) logger.info("") inputs = np.array([(inp.id, inp.input_sn) for inp in inputmap], dtype=[('chan_id', 'u2'), ('correlator_input', 'S32')]) ## Load input flags inpflg = np.ones((2, ninput, ntransit), dtype=np.bool) min_flag_time = np.min(timestamp) - 7.0 * 24.0 * 60.0 * 60.0 max_flag_time = np.max(timestamp) + 7.0 * 24.0 * 60.0 * 60.0 flaginput_files = sorted( glob.glob( os.path.join(config.flaginput_dir, "*" + config.flaginput_suffix, "*.h5"))) if flaginput_files: logger.info("Found %d flaginput files." % len(flaginput_files)) tmp = andata.FlagInputData.from_acq_h5(flaginput_files, datasets=()) start, stop = [ int(yy) for yy in np.percentile( np.flatnonzero((tmp.time[:] >= min_flag_time) & (tmp.time[:] <= max_flag_time)), [0, 100]) ] cont = andata.FlagInputData.from_acq_h5(flaginput_files, start=start, stop=stop, datasets=['flag']) for kk in range(2): inpflg[kk, :, :] = cont.resample('flag', timestamp[kk], transpose=True) logger.info("Flaginput time offsets in minutes (pair %d):" % kk) logger.info( str( np.fix((cont.time[cont.search_update_time(timestamp[kk])] - timestamp[kk]) / 60.0).astype(np.int))) # Sort flags so they are in same order for tt in range(ntransit): for kk in range(2): inpflg[kk, :, tt] = inpflg[kk, input_sort[kk, :, tt], tt] # Do not apply input flag to phase reference for ii in config.index_phase_ref: inpflg[:, ii, :] = True ## Flag out gains with high uncertainty and frequencies with large fraction of data flagged frac_err = tools.invert_no_zero(np.sqrt(weight) * np.abs(gain)) flag = np.all((weight > 0.0) & (np.abs(gain) > 0.0) & (frac_err < config.max_uncertainty), axis=0) freq_flag = ((np.sum(flag, axis=(1, 2), dtype=np.float32) / float(np.prod(flag.shape[1:]))) > config.freq_threshold) if config.apply_rfi_mask: freq_flag &= np.logical_not(rfi.frequency_mask(freq)) flag = flag & freq_flag[:, np.newaxis, np.newaxis] good_freq = np.flatnonzero(freq_flag) logger.info("Number good frequencies %d" % good_freq.size) ## Generate flags with more conservative cuts on frequency c_flag = flag & np.all(frac_err < config.conservative.max_uncertainty, axis=0) c_freq_flag = ((np.sum(c_flag, axis=(1, 2), dtype=np.float32) / float(np.prod(c_flag.shape[1:]))) > config.conservative.freq_threshold) if config.conservative.apply_rfi_mask: c_freq_flag &= np.logical_not(rfi.frequency_mask(freq)) c_flag = c_flag & c_freq_flag[:, np.newaxis, np.newaxis] c_good_freq = np.flatnonzero(c_freq_flag) logger.info("Number good frequencies (conservative thresholds) %d" % c_good_freq.size) ## Apply input flags flag &= np.all(inpflg[:, np.newaxis, :, :], axis=0) ## Update flags based on beam flag if config.beam_flag_file is not None: dbeam = andata.BaseData.from_acq_h5(config.beam_flag_file) db_csd = np.floor(ephemeris.unix_to_csd(dbeam.index_map['time'][:])) for ii, name in enumerate(config.beam_flag_datasets): logger.info("Applying %s beam flag." % name) if not ii: db_flag = dbeam.flags[name][:] else: db_flag &= dbeam.flags[name][:] cnt = 0 for ii, dbc in enumerate(db_csd): this_csd = np.flatnonzero(np.any(kcsd == dbc, axis=0)) if this_csd.size > 0: logger.info("Beam flag for %d matches %s." 
% (dbc, str(kcsd[:, this_csd]))) flag[:, :, this_csd] &= db_flag[np.newaxis, :, ii, np.newaxis] cnt += 1 logger.info("Applied %0.1f percent of the beam flags" % (100.0 * cnt / float(db_csd.size), )) ## Flag inputs with large amount of missing data input_frac_flagged = ( np.sum(flag[good_freq, :, :], axis=(0, 2), dtype=np.float32) / float(good_freq.size * ntransit)) input_flag = input_frac_flagged > config.input_threshold for ii in config.index_phase_ref: logger.info("Phase reference %d has %0.3f fraction of data flagged." % (ii, input_frac_flagged[ii])) input_flag[ii] = True good_input = np.flatnonzero(input_flag) flag = flag & input_flag[np.newaxis, :, np.newaxis] logger.info("Number good inputs %d" % good_input.size) ## Calibrate gaincal = gain[0] * tools.invert_no_zero(gain[1]) frac_err_cal = np.sqrt(frac_err[0]**2 + frac_err[1]**2) count = np.sum(flag, axis=-1, dtype=np.int) stat_flag = count > config.min_num_transit ## Calculate phase amp = np.abs(gaincal) phi = np.angle(gaincal) ## Calculate polarisation groups pol_dict = {'E': 'X', 'S': 'Y'} cyl_dict = {2: 'A', 3: 'B', 4: 'C', 5: 'D'} if config.group_by_cyl: group_id = [ (inp.pol, inp.cyl) if tools.is_chime(inp) and (ii in good_input) else None for ii, inp in enumerate(inputmap) ] else: group_id = [ inp.pol if tools.is_chime(inp) and (ii in good_input) else None for ii, inp in enumerate(inputmap) ] ugroup_id = sorted([uidd for uidd in set(group_id) if uidd is not None]) ngroup = len(ugroup_id) group_list_noref = [ np.array([ gg for gg, gid in enumerate(group_id) if (gid == ugid) and gg not in config.index_phase_ref ]) for ugid in ugroup_id ] group_list = [ np.array([gg for gg, gid in enumerate(group_id) if gid == ugid]) for ugid in ugroup_id ] if config.group_by_cyl: group_str = [ "%s-%s" % (pol_dict[pol], cyl_dict[cyl]) for pol, cyl in ugroup_id ] else: group_str = [pol_dict[pol] for pol in ugroup_id] index_phase_ref = [] for gstr, igroup in zip(group_str, group_list): candidate = [ii for ii in config.index_phase_ref if ii in igroup] if len(candidate) != 1: index_phase_ref.append(None) else: index_phase_ref.append(candidate[0]) logger.info( "Phase reference: %s" % ', '.join(['%s = %s' % tpl for tpl in zip(group_str, index_phase_ref)])) ## Apply thermal correction to amplitude if config.amp_thermal.enabled: logger.info("Applying thermal correction.") # Load the temperatures tdata = TempData.from_acq_h5(config.amp_thermal.filename) index = tdata.search_sensors(config.amp_thermal.sensor)[0] temp = tdata.datasets[config.amp_thermal.field][index] temp_func = scipy.interpolate.interp1d(tdata.time, temp, **config.amp_thermal.interp) itemp = temp_func(timestamp) dtemp = itemp[0] - itemp[1] flag_func = scipy.interpolate.interp1d( tdata.time, tdata.datasets['flag'][index].astype(np.float32), **config.amp_thermal.interp) dtemp_flag = np.all(flag_func(timestamp) == 1.0, axis=0) flag &= dtemp_flag[np.newaxis, np.newaxis, :] for gstr, igroup in zip(group_str, group_list): pstr = gstr[0] thermal_coeff = np.polyval(config.amp_thermal.coeff[pstr], freq) gthermal = 1.0 + thermal_coeff[:, np.newaxis, np.newaxis] * dtemp[ np.newaxis, np.newaxis, :] amp[:, igroup, :] *= tools.invert_no_zero(gthermal) ## Compute common mode if config.subtract_common_mode_before: logger.info("Calculating common mode amplitude and phase.") cmn_amp, flag_cmn_amp = compute_common_mode(amp, flag, group_list_noref, median=False) cmn_phi, flag_cmn_phi = compute_common_mode(phi, flag, group_list_noref, median=False) # Subtract common mode (from phase only) 
logger.info("Subtracting common mode phase.") group_flag = np.zeros((ngroup, ninput), dtype=np.bool) for gg, igroup in enumerate(group_list): group_flag[gg, igroup] = True phi[:, igroup, :] = phi[:, igroup, :] - cmn_phi[:, gg, np.newaxis, :] for iref in index_phase_ref: if (iref is not None) and (iref in igroup): flag[:, iref, :] = flag_cmn_phi[:, gg, :] ## If requested, determine and subtract a delay template if config.fit_delay_before: logger.info("Fitting delay template.") omega = timing.FREQ_TO_OMEGA * freq tau, tau_flag, _ = construct_delay_template( omega, phi, c_flag & flag, min_num_freq_for_delay_fit=config.min_num_freq_for_delay_fit) # Compute residuals logger.info("Subtracting delay template.") phi = phi - tau[np.newaxis, :, :] * omega[:, np.newaxis, np.newaxis] ## Normalize by median over time logger.info("Calculating median amplitude and phase.") med_amp = np.zeros((nfreq, ninput, ndiff), dtype=amp.dtype) med_phi = np.zeros((nfreq, ninput, ndiff), dtype=phi.dtype) count_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=np.int) stat_flag_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=np.bool) def weighted_mean(yy, ww, axis=-1): return np.sum(ww * yy, axis=axis) * tools.invert_no_zero( np.sum(ww, axis=axis)) for dd in range(ndiff): this_diff = np.flatnonzero(lbl_diff == dd) this_flag = flag[:, :, this_diff] this_amp = amp[:, :, this_diff] this_amp_err = this_amp * frac_err_cal[:, :, this_diff] * this_flag.astype( np.float32) this_phi = phi[:, :, this_diff] this_phi_err = frac_err_cal[:, :, this_diff] * this_flag.astype( np.float32) count_by_diff[:, :, dd] = np.sum(this_flag, axis=-1, dtype=np.int) stat_flag_by_diff[:, :, dd] = count_by_diff[:, :, dd] > config.min_num_transit if config.weighted_mean == 2: logger.info("Calculating inverse variance weighted mean.") med_amp[:, :, dd] = weighted_mean(this_amp, tools.invert_no_zero(this_amp_err**2), axis=-1) med_phi[:, :, dd] = weighted_mean(this_phi, tools.invert_no_zero(this_phi_err**2), axis=-1) elif config.weighted_mean == 1: logger.info("Calculating uniform weighted mean.") med_amp[:, :, dd] = weighted_mean(this_amp, this_flag.astype(np.float32), axis=-1) med_phi[:, :, dd] = weighted_mean(this_phi, this_flag.astype(np.float32), axis=-1) else: logger.info("Calculating median value.") for ff in range(nfreq): for ii in range(ninput): if np.any(this_flag[ff, ii, :]): med_amp[ff, ii, dd] = wq.median( this_amp[ff, ii, :], this_flag[ff, ii, :].astype(np.float32)) med_phi[ff, ii, dd] = wq.median( this_phi[ff, ii, :], this_flag[ff, ii, :].astype(np.float32)) damp = np.zeros_like(amp) dphi = np.zeros_like(phi) for dd in range(ndiff): this_diff = np.flatnonzero(lbl_diff == dd) damp[:, :, this_diff] = amp[:, :, this_diff] * tools.invert_no_zero( med_amp[:, :, dd, np.newaxis]) - 1.0 dphi[:, :, this_diff] = phi[:, :, this_diff] - med_phi[:, :, dd, np.newaxis] # Compute common mode if not config.subtract_common_mode_before: logger.info("Calculating common mode amplitude and phase.") cmn_amp, flag_cmn_amp = compute_common_mode(damp, flag, group_list_noref, median=True) cmn_phi, flag_cmn_phi = compute_common_mode(dphi, flag, group_list_noref, median=True) # Subtract common mode (from phase only) logger.info("Subtracting common mode phase.") group_flag = np.zeros((ngroup, ninput), dtype=np.bool) for gg, igroup in enumerate(group_list): group_flag[gg, igroup] = True dphi[:, igroup, :] = dphi[:, igroup, :] - cmn_phi[:, gg, np.newaxis, :] for iref in index_phase_ref: if (iref is not None) and (iref in igroup): flag[:, iref, :] = flag_cmn_phi[:, gg, 
:] ## Compute RMS logger.info("Calculating RMS of amplitude and phase.") mad_amp = np.zeros((nfreq, ninput), dtype=amp.dtype) std_amp = np.zeros((nfreq, ninput), dtype=amp.dtype) mad_phi = np.zeros((nfreq, ninput), dtype=phi.dtype) std_phi = np.zeros((nfreq, ninput), dtype=phi.dtype) mad_amp_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=amp.dtype) std_amp_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=amp.dtype) mad_phi_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=phi.dtype) std_phi_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=phi.dtype) for ff in range(nfreq): for ii in range(ninput): this_flag = flag[ff, ii, :] if np.any(this_flag): std_amp[ff, ii] = np.std(damp[ff, ii, this_flag]) std_phi[ff, ii] = np.std(dphi[ff, ii, this_flag]) mad_amp[ff, ii] = 1.48625 * wq.median( np.abs(damp[ff, ii, :]), this_flag.astype(np.float32)) mad_phi[ff, ii] = 1.48625 * wq.median( np.abs(dphi[ff, ii, :]), this_flag.astype(np.float32)) for dd in range(ndiff): this_diff = this_flag & (lbl_diff == dd) if np.any(this_diff): std_amp_by_diff[ff, ii, dd] = np.std(damp[ff, ii, this_diff]) std_phi_by_diff[ff, ii, dd] = np.std(dphi[ff, ii, this_diff]) mad_amp_by_diff[ff, ii, dd] = 1.48625 * wq.median( np.abs(damp[ff, ii, :]), this_diff.astype(np.float32)) mad_phi_by_diff[ff, ii, dd] = 1.48625 * wq.median( np.abs(dphi[ff, ii, :]), this_diff.astype(np.float32)) ## Construct delay template if not config.fit_delay_before: logger.info("Fitting delay template.") omega = timing.FREQ_TO_OMEGA * freq tau, tau_flag, _ = construct_delay_template( omega, dphi, c_flag & flag, min_num_freq_for_delay_fit=config.min_num_freq_for_delay_fit) # Compute residuals logger.info("Subtracting delay template from phase.") resid = (dphi - tau[np.newaxis, :, :] * omega[:, np.newaxis, np.newaxis]) * flag.astype(np.float32) else: resid = dphi tau_count = np.sum(tau_flag, axis=-1, dtype=np.int) tau_stat_flag = tau_count > config.min_num_transit tau_count_by_diff = np.zeros((ninput, ndiff), dtype=np.int) tau_stat_flag_by_diff = np.zeros((ninput, ndiff), dtype=np.bool) for dd in range(ndiff): this_diff = np.flatnonzero(lbl_diff == dd) tau_count_by_diff[:, dd] = np.sum(tau_flag[:, this_diff], axis=-1, dtype=np.int) tau_stat_flag_by_diff[:, dd] = tau_count_by_diff[:, dd] > config.min_num_transit ## Calculate statistics of residuals std_resid = np.zeros((nfreq, ninput), dtype=phi.dtype) mad_resid = np.zeros((nfreq, ninput), dtype=phi.dtype) std_resid_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=phi.dtype) mad_resid_by_diff = np.zeros((nfreq, ninput, ndiff), dtype=phi.dtype) for ff in range(nfreq): for ii in range(ninput): this_flag = flag[ff, ii, :] if np.any(this_flag): std_resid[ff, ii] = np.std(resid[ff, ii, this_flag]) mad_resid[ff, ii] = 1.48625 * wq.median( np.abs(resid[ff, ii, :]), this_flag.astype(np.float32)) for dd in range(ndiff): this_diff = this_flag & (lbl_diff == dd) if np.any(this_diff): std_resid_by_diff[ff, ii, dd] = np.std(resid[ff, ii, this_diff]) mad_resid_by_diff[ff, ii, dd] = 1.48625 * wq.median( np.abs(resid[ff, ii, :]), this_diff.astype(np.float32)) ## Calculate statistics of delay template mad_tau = np.zeros((ninput, ), dtype=phi.dtype) std_tau = np.zeros((ninput, ), dtype=phi.dtype) mad_tau_by_diff = np.zeros((ninput, ndiff), dtype=phi.dtype) std_tau_by_diff = np.zeros((ninput, ndiff), dtype=phi.dtype) for ii in range(ninput): this_flag = tau_flag[ii] if np.any(this_flag): std_tau[ii] = np.std(tau[ii, this_flag]) mad_tau[ii] = 1.48625 * wq.median(np.abs(tau[ii]), this_flag.astype(np.float32)) for dd in 
range(ndiff): this_diff = this_flag & (lbl_diff == dd) if np.any(this_diff): std_tau_by_diff[ii, dd] = np.std(tau[ii, this_diff]) mad_tau_by_diff[ii, dd] = 1.48625 * wq.median( np.abs(tau[ii]), this_diff.astype(np.float32)) ## Define output res = { "timestamp": { "data": timestamp, "axis": ["div", "time"] }, "is_daytime": { "data": is_daytime, "axis": ["div", "time"] }, "csd": { "data": kcsd, "axis": ["div", "time"] }, "pair_map": { "data": lbl_diff, "axis": ["time"] }, "pair_count": { "data": cnt_diff, "axis": ["pair"] }, "gain": { "data": gaincal, "axis": ["freq", "input", "time"] }, "frac_err": { "data": frac_err_cal, "axis": ["freq", "input", "time"] }, "flags/gain": { "data": flag, "axis": ["freq", "input", "time"], "flag": True }, "flags/gain_conservative": { "data": c_flag, "axis": ["freq", "input", "time"], "flag": True }, "flags/count": { "data": count, "axis": ["freq", "input"], "flag": True }, "flags/stat": { "data": stat_flag, "axis": ["freq", "input"], "flag": True }, "flags/count_by_pair": { "data": count_by_diff, "axis": ["freq", "input", "pair"], "flag": True }, "flags/stat_by_pair": { "data": stat_flag_by_diff, "axis": ["freq", "input", "pair"], "flag": True }, "med_amp": { "data": med_amp, "axis": ["freq", "input", "pair"] }, "med_phi": { "data": med_phi, "axis": ["freq", "input", "pair"] }, "flags/group_flag": { "data": group_flag, "axis": ["group", "input"], "flag": True }, "cmn_amp": { "data": cmn_amp, "axis": ["freq", "group", "time"] }, "cmn_phi": { "data": cmn_phi, "axis": ["freq", "group", "time"] }, "amp": { "data": damp, "axis": ["freq", "input", "time"] }, "phi": { "data": dphi, "axis": ["freq", "input", "time"] }, "std_amp": { "data": std_amp, "axis": ["freq", "input"] }, "std_amp_by_pair": { "data": std_amp_by_diff, "axis": ["freq", "input", "pair"] }, "mad_amp": { "data": mad_amp, "axis": ["freq", "input"] }, "mad_amp_by_pair": { "data": mad_amp_by_diff, "axis": ["freq", "input", "pair"] }, "std_phi": { "data": std_phi, "axis": ["freq", "input"] }, "std_phi_by_pair": { "data": std_phi_by_diff, "axis": ["freq", "input", "pair"] }, "mad_phi": { "data": mad_phi, "axis": ["freq", "input"] }, "mad_phi_by_pair": { "data": mad_phi_by_diff, "axis": ["freq", "input", "pair"] }, "tau": { "data": tau, "axis": ["input", "time"] }, "flags/tau": { "data": tau_flag, "axis": ["input", "time"], "flag": True }, "flags/tau_count": { "data": tau_count, "axis": ["input"], "flag": True }, "flags/tau_stat": { "data": tau_stat_flag, "axis": ["input"], "flag": True }, "flags/tau_count_by_pair": { "data": tau_count_by_diff, "axis": ["input", "pair"], "flag": True }, "flags/tau_stat_by_pair": { "data": tau_stat_flag_by_diff, "axis": ["input", "pair"], "flag": True }, "std_tau": { "data": std_tau, "axis": ["input"] }, "std_tau_by_pair": { "data": std_tau_by_diff, "axis": ["input", "pair"] }, "mad_tau": { "data": mad_tau, "axis": ["input"] }, "mad_tau_by_pair": { "data": mad_tau_by_diff, "axis": ["input", "pair"] }, "resid_phi": { "data": resid, "axis": ["freq", "input", "time"] }, "std_resid_phi": { "data": std_resid, "axis": ["freq", "input"] }, "std_resid_phi_by_pair": { "data": std_resid_by_diff, "axis": ["freq", "input", "pair"] }, "mad_resid_phi": { "data": mad_resid, "axis": ["freq", "input"] }, "mad_resid_phi_by_pair": { "data": mad_resid_by_diff, "axis": ["freq", "input", "pair"] }, } ## Create the output container logger.info("Creating StabilityData container.") data = StabilityData() data.create_index_map( "div", np.array(["numerator", "denominator"], dtype=np.string_)) 
data.create_index_map("pair", np.array(uniq_diff, dtype=np.string_)) data.create_index_map("group", np.array(group_str, dtype=np.string_)) data.create_index_map("freq", freq) data.create_index_map("input", inputs) data.create_index_map("time", timestamp[0, :]) logger.info("Writing datsets to container.") for name, dct in res.iteritems(): is_flag = dct.get('flag', False) if is_flag: dset = data.create_flag(name.split('/')[-1], data=dct['data']) else: dset = data.create_dataset(name, data=dct['data']) dset.attrs['axis'] = np.array(dct['axis'], dtype=np.string_) data.attrs['phase_ref'] = np.array( [iref for iref in index_phase_ref if iref is not None]) # Determine the output filename and save results start_time, end_time = ephemeris.unix_to_datetime( np.percentile(timestamp, [0, 100])) tfmt = "%Y%m%d" night_str = 'night_' if not np.any(is_daytime) else '' output_file = os.path.join( config.output_dir, "%s_%s_%sraw_stability_data.h5" % (start_time.strftime(tfmt), end_time.strftime(tfmt), night_str)) logger.info("Saving results to %s." % output_file) data.save(output_file)
def update_mean(delta_t_file):
    n = 0
    ar_z = np.arange(1.9, 3.5, 0.0005)
    # weighted mean
    ar_delta_t_sum = np.zeros_like(ar_z)
    ar_delta_t_count = np.zeros_like(ar_z)
    ar_delta_t_weighted = np.zeros_like(ar_z)
    # histogram median
    delta_t_min, delta_t_max = (-10, 10)
    delta_t_num_buckets = 1000
    ar_delta_t_histogram = np.zeros(shape=(ar_z.size, delta_t_num_buckets))
    ar_ivar_total = np.zeros_like(ar_z)
    # Calculate the weighted sum of the delta transmittance per redshift bin.
    for i in range(delta_t_file.num_spectra):
        ar_z_unbinned = delta_t_file.get_wavelength(i)
        ar_delta_t_unbinned = delta_t_file.get_flux(i)
        ar_ivar_unbinned = delta_t_file.get_ivar(i)
        if ar_z_unbinned.size > 2:
            f_delta_t = interpolate.interp1d(ar_z_unbinned, ar_delta_t_unbinned,
                                             kind='nearest', bounds_error=False,
                                             fill_value=0, assume_sorted=True)
            ar_delta_t = f_delta_t(ar_z)
            f_ivar = interpolate.interp1d(ar_z_unbinned, ar_ivar_unbinned,
                                          kind='nearest', bounds_error=False,
                                          fill_value=0, assume_sorted=True)
            ar_ivar = f_ivar(ar_z)
            ar_delta_t_sum += ar_delta_t
            ar_delta_t_weighted += ar_delta_t * ar_ivar
            ar_delta_t_count += ar_delta_t != 0
            ar_ivar_total += ar_ivar
            # Accumulate an ivar-weighted histogram of delta_t per redshift bin,
            # from which the weighted median is computed below.
            ar_delta_t_clipped = np.clip(ar_delta_t, delta_t_min, delta_t_max)
            ar_delta_t_buckets = rescale(ar_delta_t_clipped,
                                         (delta_t_min, delta_t_max),
                                         (0, delta_t_num_buckets))
            ar_delta_t_buckets = np.clip(ar_delta_t_buckets.astype(np.int32),
                                         0, delta_t_num_buckets - 1)
            for j in range(ar_z.size):
                ar_delta_t_histogram[j, ar_delta_t_buckets[j]] += ar_ivar[j]
            n += 1
    # Save intermediate result (the mean delta_t before removal)
    np.save(settings.get_mean_delta_t_npy(),
            np.vstack((ar_z, ar_delta_t_weighted, ar_ivar_total,
                       ar_delta_t_sum, ar_delta_t_count)))
    # Weighted median per redshift bin, computed from the histogram and rescaled
    # back from bucket indices to delta_t values.
    ar_delta_t_median = np.zeros_like(ar_z)
    for i in range(ar_z.size):
        ar_delta_t_median[i] = weighted.median(np.arange(delta_t_num_buckets),
                                               ar_delta_t_histogram[i])
    ar_delta_t_median = rescale(ar_delta_t_median,
                                (0, delta_t_num_buckets),
                                (delta_t_min, delta_t_max))
    np.save(settings.get_median_delta_t_npy(), np.vstack((ar_z, ar_delta_t_median)))
    return ar_delta_t_weighted, ar_ivar_total, ar_z, n, ar_delta_t_median
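# Both get_weighted_median above and the median loop in update_mean use the same trick:
# accumulate a weighted histogram per bin, take weighted.median over the bucket indices with
# the histogram row as weights, then rescale the index back to physical units. A stripped-down
# sketch of that step with hypothetical bucket weights (same linear rescale as in the code):
import numpy as np
import weighted

num_buckets = 1000
value_min, value_max = -10.0, 10.0
hist_row = np.zeros(num_buckets)
hist_row[480:520] = np.linspace(1.0, 2.0, 40)  # weight piled up near the centre of the range
idx_median = weighted.median(np.arange(num_buckets), hist_row)
value_median = idx_median / num_buckets * (value_max - value_min) + value_min
print(value_median)  # lands close to the centre of the weighted buckets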
print("\n Nowe: -----------------------------") print("\n Średnia ważona : Brak - 0 sprzedanych ") print("\n Mediana ważona : Brak - 0 sprzedanych") print("\n Statystyki: \n ") print(new["Cena"].describe()) else: for i in range(0, len(new)): kupione = new['Kupione'][i] s_iloczyn += new['Cena'][i] * kupione s_wag += kupione weighted_average = s_iloczyn / s_wag print("\n Nowe: -----------------------------") print("\n Średnia ważona : " + str(weighted_average)) print("\n Mediana ważona : " + str(weighted.median(new['Cena'], new['Kupione']))) print("\n Statystyki: \n ") print(new["Cena"].describe()) ### s_iloczyn = 0 s_wag = 0 if used.empty: print("\n Używane: --------------------------") print('\nBrak takiego używanego modelu') else: if used['Kupione'].sum() == 0: print("\n Używane: --------------------------") print("\n Średnia ważona : Brak - 0 sprzedanych ")