def serialize_kmeans_products(self, km, alpha):
    if alpha:
        arr_path = self.alpha_standardized_stacked_arr_path
        uniq_markers = self.tl_model.uniq_markers
        destination = self.alpha_cluster_dir
    else:
        arr_path = self.standardized_stacked_arr_path
        uniq_markers = self.uniq_markers
        destination = self.cluster_dir
    print(f'arr_path: "{arr_path}", uniq_markers: "{uniq_markers}", destination: "{destination}"')

    standardized_stacked_arr = utils.open_pickle(arr_path)
    target_ds = utils.open_pickle(self.target_ds_preprocessed_path)
    rf_ds_preprocessed = utils.open_pickle(self.rf_ds_preprocessed_path)

    labels_ar = km.labels_
    labels_to_coords = np.zeros([len(labels_ar), 2])
    for i, var in enumerate(labels_ar):
        labels_to_coords[i] = i % self.gridsize, i // self.gridsize

    try:
        label_markers = np.array([uniq_markers[var] for i, var in enumerate(labels_ar)])
    except IndexError:  # more than 12 clusters
        label_markers = np.array([(uniq_markers * 3)[var] for i, var in enumerate(labels_ar)])

    target_ds_withClusterLabels = target_ds.assign_coords(
        cluster=("time", km.predict(standardized_stacked_arr.astype(float))))  # np.float is deprecated; plain float keeps the float64 dtype
    dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
    RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])

    self.labels_ar_path = utils.to_pickle(f'{self.RUN_datetime}_labels_ar', labels_ar, destination)
    self.labels_to_coords_path = utils.to_pickle(f'{self.RUN_datetime}_labels_to_coords', labels_to_coords, destination)
    self.label_markers_path = utils.to_pickle(f'{self.RUN_datetime}_label_markers', label_markers, destination)
    self.target_ds_withClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_target_ds_withClusterLabels', target_ds_withClusterLabels, destination)
    self.dates_to_ClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_dates_to_ClusterLabels', dates_to_ClusterLabels, destination)
    self.RFprec_to_ClusterLabels_dataset_path = utils.to_pickle(f'{self.RUN_datetime}_RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, destination)
def detect_prepared_datasets(self):
    """
    Pre-processing, including time-slicing, removal of NAs, stacking & standardizing.

    Calls:
    1. prepare.preprocess_time_series
    2. prepare.flatten_and_standardize_dataset
    """
    if utils.find('*target_ds_preprocessed.pkl', self.prepared_data_dir) and \
            utils.find('*rf_ds_preprocessed.pkl', self.prepared_data_dir) and \
            utils.find('*standardized_stacked_arr.pkl', self.prepared_data_dir):
        print('Pickles (preprocessed) found.')
        for pkl in utils.find('*preprocessed.pkl', self.prepared_data_dir):
            if "target_ds" in pkl:
                self.target_ds_preprocessed_path = pkl
            elif "rf_ds" in pkl:
                self.rf_ds_preprocessed_path = pkl

        LocalModelParams(self, utils.open_pickle(self.target_ds_preprocessed_path))

        for pkl in utils.find('*standardized_stacked_arr.pkl', self.prepared_data_dir):
            self.standardized_stacked_arr_path = pkl
    else:
        print('Pickles of pre-processed data incomplete. Proceeding to load & process raw dataset pickles.')
        self.target_ds_preprocessed_path, self.rf_ds_preprocessed_path = prepare.preprocess_time_series(
            self, self.prepared_data_dir, self.ALPHAs)
        LocalModelParams(self, utils.open_pickle(self.target_ds_preprocessed_path))  # generate new local model params
        self.standardized_stacked_arr_path = prepare.flatten_and_standardize_dataset(self, self.prepared_data_dir)
    print(f'--> Months for this dataset are: {self.month_names}')
def prepare_dataset(model, dest):
    """
    - xr.open_mfdataset() = loading
    - restricting to certain variables + "levels" of variables
    - combining the variables' xarrays into one
    - restricting to 1999-2019 only
    - slicing domain dimensions up to required specs (i.e. model.LON_S, model.LON_N, etc...)
    - slicing up to the chosen period only
    - pickling the datasets (both input & rainfall) & returning them
    """
    # searching for raw data pickles
    preloaded_input_pickles = utils.find('*.pkl', model.raw_input_dir)
    if preloaded_input_pickles:
        print('Preloaded raw INPUT data pickles found...')
        ds_CHOSEN_VARS_renamed = utils.open_pickle(utils.find('*.pkl', model.raw_input_dir)[0])
    else:
        print('Creating pickles of raw input data...')
        ds_CHOSEN_VARS_renamed = save_preloaded_raw_input_data(model)

    preloaded_rf_pickles = utils.find('*.pkl', model.raw_rf_dir)
    if preloaded_rf_pickles:
        print('Preloaded raw rainfall data pickles found...')
        ds_RAINFALL = utils.open_pickle(utils.find('*.pkl', model.raw_rf_dir)[0])
    else:
        print('Creating pickles of raw rainfall data...')
        ds_RAINFALL = save_preloaded_raw_rf_data(model)

    print("Proceeding to do preliminary data cleaning...")
    ds_sliced = ds_CHOSEN_VARS_renamed.sel(
        level=slice(np.min(model.unique_pressure_lvls), np.max(model.unique_pressure_lvls)),
        lat=slice(model.LAT_N, model.LAT_S),
        lon=slice(model.LON_W, model.LON_E),
        time=slice('1999', '2019'))

    ds_sliced_rhum = ds_sliced.rhum
    ds_sliced_rhum_no925 = ds_sliced_rhum.drop_sel({"level": 925})
    ds_sliced_uwnd_only = ds_sliced.uwnd
    ds_sliced_vwnd_only = ds_sliced.vwnd
    ds_combined_sliced = xr.merge(
        [ds_sliced_rhum_no925, ds_sliced_uwnd_only, ds_sliced_vwnd_only], compat='override')

    rf_ds_sliced = ds_RAINFALL.sel(
        lat=slice(model.LAT_S, model.LAT_N),
        lon=slice(model.LON_W, model.LON_E))

    print('Pickling domain- & feature-constrained input & RF datasets...')
    if model.period == "NE_mon":
        input_ds = ds_combined_sliced.sel(time=is_NE_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_NE_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_NE_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_NE_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
    elif model.period == "SW_mon":
        input_ds = ds_combined_sliced.sel(time=is_SW_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_SW_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_SW_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_SW_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
    elif model.period == "inter_mon":
        input_ds = ds_combined_sliced.sel(time=is_inter_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_inter_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_inter_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_inter_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
def cut_dataset(model, alpha, dest, dataset_path, ds_name):
    dataset = utils.open_pickle(dataset_path)
    try:
        dataset = dataset.sel(
            level=slice(np.min(model.tl_model.unique_pressure_lvls), np.max(model.tl_model.unique_pressure_lvls)),
            lat=slice(model.tl_model.LAT_N, model.tl_model.LAT_S),
            lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))
    except ValueError:  # rainfall dataset: no "level" dimension, latitudes run S -> N
        dataset = dataset.sel(
            lat=slice(model.tl_model.LAT_S, model.tl_model.LAT_N),
            lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))

    if model.tl_model.period == "NE_mon":
        dataset = dataset.sel(time=is_NE_mon(dataset['time.month']))
    elif model.tl_model.period == "SW_mon":
        dataset = dataset.sel(time=is_SW_mon(dataset['time.month']))
    elif model.tl_model.period == "inter_mon":
        dataset = dataset.sel(time=is_inter_mon(dataset['time.month']))

    if alpha != model.ALPHAs:
        gt_years = model.tl_model.years[(alpha - 1) * model.PSI : alpha * model.PSI]
        train_years = np.delete(model.tl_model.years, np.arange((alpha - 1) * model.PSI, alpha * model.PSI))
    else:  # the last fold also absorbs the runoff years
        gt_years = model.tl_model.years[(alpha - 1) * model.PSI : alpha * model.PSI + model.runoff_years]
        train_years = np.delete(model.tl_model.years, np.arange((alpha - 1) * model.PSI, alpha * model.PSI + model.runoff_years))
    test = utils.cut_year(dataset, np.min(gt_years), np.max(gt_years))
    train = utils.cut_year(dataset, np.min(train_years), np.max(train_years))

    time.sleep(1); gc.collect()
    utils.to_pickle(f'{ds_name}_test_alpha_{alpha}_preprocessed', test, dest)
    utils.to_pickle(f'{ds_name}_train_alpha_{alpha}_preprocessed', train, dest)
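# --- Illustration only (not part of the pipeline): a minimal sketch of the alpha-fold year
# split used in cut_dataset above, with assumed toy values for PSI / ALPHAs / runoff_years.
# Fold `alpha` supplies the ground-truth/test years; every other year stays in training.
import numpy as np

years = np.arange(1999, 2020)          # 21 years, matching the 1999-2019 slice above
PSI, ALPHAs, runoff_years = 4, 5, 1    # assumed values for this sketch
alpha = 2
gt_years = years[(alpha - 1) * PSI : alpha * PSI]                          # [2003 2004 2005 2006]
train_years = np.delete(years, np.arange((alpha - 1) * PSI, alpha * PSI))  # remaining 17 years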
def train_kmeans(self, alpha=None):
    if alpha:
        optimal_k = self.tl_model.optimal_k
        print(f'>> self.alpha_model_dir: {self.alpha_model_dir}')
        print(f'>> optimal_k: {optimal_k}')
        found = [i for i in Path(self.alpha_model_dir).glob(f'k-{optimal_k}_*')]
        if found:
            self.alpha_cluster_dir = found[0]
        else:
            self.alpha_cluster_dir = str(
                Path(self.alpha_model_dir) / f"k-{optimal_k}_NOT-singled-out-as-potential-cluster-for-this-split")
            os.makedirs(self.alpha_cluster_dir, exist_ok=True)
        print(f'>> self.alpha_cluster_dir: {self.alpha_cluster_dir}')
        destination = self.alpha_cluster_dir
        prefix = f'alpha_{alpha}_'
    else:
        optimal_k = self.optimal_k
        destination = self.cluster_dir
        prefix = ''
    print(f'optimal_k: "{optimal_k}", destination: "{destination}", prefix: "{prefix}"')

    for phrase in ('kmeans_model', 'labels_ar', 'labels_to_coords', 'label_markers',
                   'target_ds_withClusterLabels', 'dates_to_ClusterLabels',
                   'RFprec_to_ClusterLabels_dataset'):
        if utils.find(f'*{phrase}*.pkl', destination):
            print(f'>>>>>>>>> "self.{phrase}_path" initialized.')
            setattr(self, f'{phrase}_path', utils.find(f'*{phrase}*.pkl', destination)[0])  # setattr instead of exec: same effect, no string evaluation
        else:
            print(f'{utils.time_now()} - No KMeans model trained for {self.domain}, {self.period}, for {self.hyperparameters}, doing so now...')
            som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)
            samples, features = som_weights_to_nodes.shape
            km = KMeans(n_clusters=optimal_k).fit(som_weights_to_nodes)
            print(f"\n{utils.time_now()} - K-means estimator fitted, sample size is {samples} and number of features is {features}.")

            self.kmeans_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}kmeans_model', km, destination)
            self.serialize_kmeans_products(km, alpha)
            break
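# --- Illustration only: a minimal sketch of the clustering step above. K-means is fitted on the
# flattened SOM node-weight vectors (one row per node), so every SOM node gets a cluster id.
# Shapes and n_clusters are toy assumptions, not the project's real configuration.
import numpy as np
from sklearn.cluster import KMeans

som_weights_to_nodes = np.random.rand(64, 120)   # e.g. an 8x8 SOM grid, 120 features per node
km = KMeans(n_clusters=5).fit(som_weights_to_nodes)
labels = km.labels_                               # shape (64,), one cluster id per SOM node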
def prepare_alphafold_dataset(self, alpha):
    print(f'Preparing dataset for alpha-{alpha}')
    if alpha != self.ALPHAs:
        self.gt_years = np.array2string(self.tl_model.years[(alpha - 1) * self.PSI : alpha * self.PSI], separator='-')
    else:
        self.gt_years = np.array2string(
            self.tl_model.years[(alpha - 1) * self.PSI : alpha * self.PSI + self.runoff_years], separator='-')

    self.alpha_prepared_dir = str(Path(self.tl_model.prepared_data_dir) / f'alpha_{alpha}')
    self.alpha_model_dir = str(Path(self.tl_model.cluster_dir) / f'alpha_{alpha}_GT-{self.gt_years}')

    for pkl in utils.find(f'*alpha_{alpha}_preprocessed.pkl', self.alpha_prepared_dir):
        if "target_ds_train" in pkl:
            self.target_ds_preprocessed_path = pkl
        elif "rf_ds_train" in pkl:
            self.rf_ds_preprocessed_path = pkl
        elif "target_ds_test" in pkl:
            self.x_test_path = pkl
        elif "rf_ds_test" in pkl:
            self.y_test_path = pkl

    LocalModelParams(self, utils.open_pickle(self.target_ds_preprocessed_path))

    if utils.find('*standardized_stacked_arr.pkl', self.alpha_prepared_dir):
        self.alpha_standardized_stacked_arr_path = utils.find('*standardized_stacked_arr.pkl', self.alpha_prepared_dir)[0]
    else:
        self.alpha_standardized_stacked_arr_path = prepare.flatten_and_standardize_dataset(self, self.alpha_prepared_dir)

    print(f'--> Months for this dataset are: {self.month_names}')
    print(
        f'paths created @ prepare_alphafold_dataset():'
        f'\nself.alpha_prepared_dir: "{self.alpha_prepared_dir}", \nself.alpha_model_dir: "{self.alpha_model_dir}"'
        f'\nself.target_ds_preprocessed_path: "{self.target_ds_preprocessed_path}", \nself.rf_ds_preprocessed_path: "{self.rf_ds_preprocessed_path}"'
        f'\nself.x_test_path: "{self.x_test_path}", \nself.y_test_path: "{self.y_test_path}"'
        f'\nself.alpha_standardized_stacked_arr_path: "{self.alpha_standardized_stacked_arr_path}", \nself.gt_years: {self.gt_years}'
    )
def preprocess_time_series(model, dest, nfold_ALPHA=None, desired_res=0.75):
    """
    Preparing datasets for use in training algorithms:
    - dropping missing values
    - ensuring both target & input datasets have the same dates
    - coarsening the spatial resolution of the rainfall (target) dataset to the desired resolution
    - pickling these "preprocessed" datasets
    """
    target_ds = utils.open_pickle(model.input_ds_serialized_path)
    rf_target_ds = utils.open_pickle(model.rf_ds_serialized_path)

    # removing NA rows and extraneous dates, & coarsening dates accordingly
    print(f'{utils.time_now()} - Preprocessing data now.')

    try:
        rf_target_ds['time'] = rf_target_ds.indexes['time'].to_datetimeindex()  # converting CFTimeIndex -> DatetimeIndex
    except AttributeError:
        print('AttributeError: \'DatetimeIndex\' object has no attribute \'to_datetimeindex\', continuing regardless...')

    earliest_rf_reading, latest_rf_reading = rf_target_ds.isel(time=0).time.values, rf_target_ds.isel(time=-1).time.values
    earliest_target_ds_reading, latest_target_ds_reading = target_ds.isel(time=0).time.values, target_ds.isel(time=-1).time.values
    earliest_date = earliest_target_ds_reading if earliest_target_ds_reading > earliest_rf_reading else earliest_rf_reading
    latest_date = latest_target_ds_reading if latest_target_ds_reading < latest_rf_reading else latest_rf_reading

    rf_ds_preprocessed = rf_target_ds.sel(time=slice(earliest_date, latest_date))
    target_ds = target_ds.sel(time=slice(earliest_date, latest_date))

    more_time_gaps = [i for i in target_ds.time.data if i not in rf_ds_preprocessed.time.data]
    more_time_gaps = more_time_gaps + [i for i in rf_ds_preprocessed.time.data if i not in target_ds.time.data]
    valid_dates = [date for date in target_ds.time.data if date not in more_time_gaps]
    target_ds = target_ds.sel(time=valid_dates)

    coarsen_magnitude = int(desired_res / np.ediff1d(target_ds.isel(lon=slice(0, 2)).lon.data)[0])
    print(f'Coarsen magnitude set at: {coarsen_magnitude} toward desired spatial resolution of {desired_res}')
    target_ds_preprocessed = target_ds.coarsen(lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()

    target_ds_preprocessed_path = utils.to_pickle('target_ds_preprocessed', target_ds_preprocessed, dest)
    rf_ds_preprocessed_path = utils.to_pickle('rf_ds_preprocessed', rf_ds_preprocessed, dest)

    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    if nfold_ALPHA:
        for alpha in range(nfold_ALPHA):
            pass

    return target_ds_preprocessed_path, rf_ds_preprocessed_path
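# --- Illustration only: the date-alignment step above keeps only timestamps present in BOTH
# datasets. With plain numpy that is an intersection of the two time axes (toy dates below);
# the result could then be passed to .sel(time=valid_dates) on either dataset.
import numpy as np

t_target = np.array(['2000-01-01', '2000-01-02', '2000-01-04'], dtype='datetime64[D]')
t_rf = np.array(['2000-01-02', '2000-01-03', '2000-01-04'], dtype='datetime64[D]')
valid_dates = np.intersect1d(t_target, t_rf)   # -> ['2000-01-02', '2000-01-04']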
def TEST_all():
    dataset_in = open_pickle('../datasets/DSET_argentina.pkl')
    my_generator = get_enhansed_generator(segment_len=512, batch_size=20, dataset_in=dataset_in)
    batch = next(my_generator)
    print("shape of batch x = " + str(batch[0].shape))
    print("shape of batch y = " + str(batch[1].shape))
def get_test_batch():
    from annotation.ann_generator import delete_baseline_wander, extract_first_leads, shrink_dataset
    dataset_in = open_pickle(dataset_path)
    _, test_dset = split_dict_annotations(dataset_in)
    dataset_only_one_channel = extract_first_leads(test_dset)
    delete_baseline_wander(dataset_only_one_channel['x'])
    dataset_shrinked = shrink_dataset(dataset_only_one_channel)
    return dataset_shrinked
def train_SOM(self, alpha=None):
    d_hp_dir_path = str(utils.models_dir / self.dir_hp_str)
    self.d_hp_dir_path = d_hp_dir_path
    os.makedirs(d_hp_dir_path, exist_ok=True)
    if not utils.find(f'*extent_{self.dir_str}.png', self.d_hp_dir_path):
        visualization.get_domain_geometry(self, self.d_hp_dir_path)

    models_dir_path = str(utils.models_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
    os.makedirs(models_dir_path, exist_ok=True)
    self.models_dir_path = models_dir_path
    # utils.update_cfgfile('Paths', 'models_dir_path', self.models_dir_path)

    if alpha:
        destination = self.alpha_model_dir
        arr_path = self.alpha_standardized_stacked_arr_path
        prefix = f'alpha_{alpha}_'
        prompt = f'< alpha-{alpha} >'
    else:
        destination = self.models_dir_path
        arr_path = self.standardized_stacked_arr_path
        prefix = ''
        prompt = ''
    print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}"')

    if utils.find(f'*{prefix}som_model.pkl', destination):
        print(f'{utils.time_now()} - SOM model trained before, skipping...')
        self.som_model_path = utils.find(f'*{prefix}som_model.pkl', destination)[0]
    else:
        print(f'{utils.time_now()} - {prompt} No SOM model trained for {self.domain}, {self.period}, for {self.hyperparameters}, doing so now...')

        standardized_stacked_arr = utils.open_pickle(arr_path)

        sominitstarttime = timer(); print(f'{utils.time_now()} - Initializing MiniSom... ')
        som = MiniSom(self.gridsize, self.gridsize,  # square grid
                      standardized_stacked_arr.shape[1],
                      sigma=self.sigma, learning_rate=self.learning_rate,
                      neighborhood_function='gaussian', random_seed=self.random_seed)
        """
        Note: initializing PCA for weights is faster (~1/2 hour), but for serialized arrays > 300mb,
        chances are this will kill the RAM and halt the entire process.
        """
        ## try:
        ##     som.pca_weights_init(standardized_stacked_arr)
        ## except MemoryError as e:
        ##     print(f'Memory error has occurred: \n{e}')
        print(f"Initialization took {utils.time_since(sominitstarttime)}.\n")

        trainingstarttime = timer(); print(f"{utils.time_now()} - Beginning training.")
        getattr(som, self.training_mode)(standardized_stacked_arr, self.iterations, verbose=True)
        q_error = np.round(som.quantization_error(standardized_stacked_arr), 2)
        print(f"Training complete. Q error is {q_error}, time taken for training is {utils.time_since(trainingstarttime)}s\n")

        # the save path is the same whether or not this is an alpha split
        self.som_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}som_model', som, destination)
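# --- Illustration only: a minimal MiniSom sketch mirroring the training call above, on toy data.
# The grid size, sigma, learning rate and iteration count are assumptions; `verbose` is available
# in recent minisom releases, matching the getattr(..., verbose=True) call in train_SOM.
import numpy as np
from minisom import MiniSom

data = np.random.rand(500, 30)                      # (samples, flattened features)
som = MiniSom(8, 8, data.shape[1], sigma=4, learning_rate=0.15,
              neighborhood_function='gaussian', random_seed=42)
som.train_batch(data, 1000, verbose=True)           # self.training_mode resolves to a method name like this
print(som.quantization_error(data))                 # the Q error reported after training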
def normalize_data(X_train, X_test, folder_name):
    if os.path.isfile(folder_name + "mean.pkl"):
        mean = utils.open_pickle(folder_name + "mean.pkl")
        std = utils.open_pickle(folder_name + "std.pkl")
    else:
        train_matrix = []
        for x in X_train:
            train_matrix.extend(x)
        train_matrix = np.array(train_matrix)
        mean = train_matrix.mean(0)
        std = train_matrix.std(0)
        utils.save_pickle(folder_name + "mean.pkl", mean)
        utils.save_pickle(folder_name + "std.pkl", std)
    X_train = [(x - mean) / std for x in X_train]
    X_test = [(x - mean) / std for x in X_test]
    return np.array(X_train), np.array(X_test)
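# --- Hypothetical usage sketch for normalize_data (names and paths are illustrative):
# X_train / X_test are lists of (timesteps, features) arrays; the feature-wise mean/std are
# cached to <folder_name>mean.pkl / std.pkl via this repo's utils.save_pickle, so folder_name
# should end with a path separator and is assumed to already exist on disk.
import numpy as np

X_train = [np.random.rand(100, 3) for _ in range(4)]
X_test = [np.random.rand(100, 3) for _ in range(2)]
X_train_n, X_test_n = normalize_data(X_train, X_test, "normalization_cache/")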
def assign_test_clusters_to_datasets(self):
    target_ds_preprocessed = utils.open_pickle(utils.find('*target_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
    rf_ds_preprocessed = utils.open_pickle(utils.find('*rf_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
    standardized_stacked_arr = utils.open_pickle(utils.find('*standardized_stacked_arr.pkl', self.test_prepared_data_dir)[0])

    self.n_datapoints = target_ds_preprocessed.time.shape[0]  # length of xr_dataset
    self.lat_size = target_ds_preprocessed.lat.shape[0]
    self.lon_size = target_ds_preprocessed.lon.shape[0]
    self.months = np.unique(target_ds_preprocessed['time.month'].values)  # month numbers
    self.month_names = [calendar.month_name[m][:3] for m in np.unique(target_ds_preprocessed['time.month'])]
    self.month_names_joined = '_'.join(self.month_names).upper()  # to print months properly
    self.years = np.unique(target_ds_preprocessed['time.year'].values)  # unique years
    self.X, self.Y = target_ds_preprocessed.lon, target_ds_preprocessed.lat

    km = utils.open_pickle(self.kmeans_model_path)
    predicted_clusters = km.predict(standardized_stacked_arr.astype(float))  # np.float is deprecated; float keeps the float64 dtype
    target_ds_withClusterLabels = target_ds_preprocessed.assign_coords(cluster=("time", predicted_clusters))
    dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
    RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])

    utils.to_pickle('target_ds_withClusterLabels', target_ds_withClusterLabels, self.test_prepared_data_dir)
    utils.to_pickle('RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, self.test_prepared_data_dir)
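# --- Illustration only: once cluster labels ride on the time axis (as above), the dates belonging
# to one cluster can be pulled out with .where(..., drop=True), which is what the plotting helpers
# further below do. Toy xarray dataset; variable names mirror the pipeline but values are fake.
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2020-01-01", periods=6)
ds = xr.Dataset({"precipitationCal": ("time", np.random.rand(6))},
                coords={"time": times, "cluster": ("time", np.array([0, 1, 0, 2, 1, 0]))})
cluster0_dates = ds.where(ds.cluster == 0, drop=True)   # keeps only the 3 dates labelled 0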
def get_generators_permute(train_batch, test_batch):
    """So that (None, 4, 512) is returned rather than the usual (None, 513, 3)."""
    dataset_in = open_pickle(dataset_path)
    train_dset, test_dset = split_dict_annotations(dataset_in)
    my_generator_train = get_mulimask_generator_addon(segment_len, batch_size=train_batch, dataset_in=train_dset)
    my_generator_test = get_mulimask_generator_addon(segment_len, batch_size=test_batch, dataset_in=test_dset)
    return my_generator_train, my_generator_test
def retrieve_and_insert_actual_RF_array(conn, all_test_prepared_data_dir, period, domain, test_date,
                                        sn, cluster, w_lim, e_lim, s_lim, n_lim):
    test_ds = utils.open_pickle(
        Path(all_test_prepared_data_dir) / f'{period}_mon_{domain}_prepared/RFprec_to_ClusterLabels_dataset.pkl')

    wholegrid_gt1mm_actual = (test_ds.sel(time=test_date).precipitationCal > 1).values
    SG_only_gt1mm_actual = (test_ds.precipitationCal.sel(
        lon=slice(w_lim, e_lim), lat=slice(s_lim, n_lim), time=test_date) > 1).values

    insert_actual_rf_array(conn, sn, period, domain, cluster, test_date,
                           SG_only_gt1mm_actual, wholegrid_gt1mm_actual)
def detect_som_products(self, alpha=None):
    if alpha:
        destination = self.alpha_model_dir
        arr_path = self.alpha_standardized_stacked_arr_path
        prefix = f'alpha_{alpha}_'
        prompt = f'< alpha-{alpha} >'
    else:
        destination = self.models_dir_path
        arr_path = self.standardized_stacked_arr_path
        prefix = ''
        prompt = ''
    print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}", prompt: "{prompt}"')

    for phrase in ('winner_coordinates', 'dmap', 'ar', 'som_weights_to_nodes'):
        if utils.find(f'*{prefix}{phrase}.pkl', destination):
            p = utils.find(f'*{prefix}{phrase}.pkl', destination)
            print(f'{utils.time_now()} - {prefix}{phrase} is found @: \n{p[0]}')
            setattr(self, f'{phrase}_path', p[0])  # setattr instead of exec: same effect, no string evaluation
        else:
            print(f'{utils.time_now()} - {prompt} Some SOM products are missing in {destination}, generating all products now...')
            som = utils.open_pickle(self.som_model_path)
            standardized_stacked_arr = utils.open_pickle(arr_path)

            winner_coordinates = np.array([som.winner(x) for x in standardized_stacked_arr])
            dmap = som.distance_map()
            ar = som.activation_response(standardized_stacked_arr)
            som_weights = som.get_weights()  # weights for training via k-means
            som_weights_to_nodes = np.array(
                [som_weights[c, r] for r in range(self.gridsize) for c in range(self.gridsize)])  # for k-means clustering

            self.winner_coordinates_path = utils.to_pickle(f'{prefix}winner_coordinates', winner_coordinates, destination)
            self.dmap_path = utils.to_pickle(f'{prefix}dmap', dmap, destination)
            self.ar_path = utils.to_pickle(f'{prefix}ar', ar, destination)
            self.som_weights_to_nodes_path = utils.to_pickle(f'{prefix}som_weights_to_nodes', som_weights_to_nodes, destination)
            break

    print('SOM products serialized.')
def draw_shrinked():
    dataset_in = open_pickle('../datasets/DSET_argentina.pkl')
    dataset_only_one_channel = extract_first_lines(dataset_in)
    dset_shrinked = shrink_dataset(dataset_only_one_channel)
    before_x = dataset_only_one_channel['x'][0, 0, 0:30]
    after_x = dset_shrinked['x'][0, 0, 0:30]
    figname = "shrinked_ecg.png"
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=False)
    ax1.plot(before_x, 'k-', label="unshrunk")
    ax2.plot(after_x, 'm-', label="shrunk by a factor of " + str(SHRINK_FACTOR))
    plt.legend(loc=2)
    plt.savefig(figname)
def get_test_ds_params(period, domain):
    path = Path(__file__).resolve().parents[1] / rf"data/external/casestudytesting_29_Jan/{period}_mon_{domain}_prepared"
    print(path / 'RFprec_to_ClusterLabels_dataset.pkl')
    test_ds = utils.open_pickle(str(path / 'RFprec_to_ClusterLabels_dataset.pkl'))

    # hard-coded Singapore bounding box (the "SG_only" limits used elsewhere)
    w_lim = 103.5
    e_lim = 104.055
    s_lim = 1.1
    n_lim = 1.55

    rf_ds_lon = test_ds.sel(lon=slice(w_lim, e_lim), lat=slice(s_lim, n_lim)).lon.values
    rf_ds_lat = test_ds.sel(lon=slice(w_lim, e_lim), lat=slice(s_lim, n_lim)).lat.values
    return rf_ds_lon, rf_ds_lat
def get_generators(train_batch, test_batch):
    """
    Build two generators: one for training and one for testing.
    :param train_batch: batch size for the training generator
    :param test_batch: batch size for the test generator
    :return:
    """
    dataset_in = open_pickle('./DSET_argentina.pkl')
    train_dset, test_dset = split_dict_annotations(dataset_in)
    my_generator_train = get_enhansed_generator(segment_len, batch_size=train_batch, dataset_in=train_dset)
    my_generator_test = get_enhansed_generator(segment_len, batch_size=test_batch, dataset_in=test_dset)
    return my_generator_train, my_generator_test
def test_random_dates(self, dates_to_test, plots=5):
    test_dir = str(Path(__file__).resolve().parents[1] / 'test/2021_Jan_28_testing2020randomdates')
    self.test_RF_raw_data_dir = str(Path(__file__).resolve().parents[1] / "data/external/casestudytesting_29_Jan/raw/GPM_L3")
    self.test_indp_vars_raw_data_dir = str(Path(__file__).resolve().parents[1] / "data/external/casestudytesting_29_Jan/raw/downloadERA")
    self.test_prepared_data_dir = str(Path(__file__).resolve().parents[1] / f"data/external/casestudytesting_29_Jan/{self.period}_{self.dir_str}_prepared")
    os.makedirs(self.test_prepared_data_dir, exist_ok=True)

    number_of_test_plots_created = len(utils.find(f'*{self.period}_{self.dir_str}*_test_zscore_against_fullmodel*.png', test_dir))
    # number_of_test_plots_needed = dates_to_test*plots
    if number_of_test_plots_created >= dates_to_test:
        print(f"{number_of_test_plots_created} random dates have already been tested, please review at {test_dir}")
        return
    else:
        print(f"{number_of_test_plots_created} dates tested so far.")

    if not utils.find('*target_ds_preprocessed.pkl', self.test_prepared_data_dir) \
            or not utils.find('*rf_ds_preprocessed.pkl', self.test_prepared_data_dir) \
            or not utils.find('*standardized_stacked_arr.pkl', self.test_prepared_data_dir):
        prepare.prep_for_testing_random_dates(self)

    if not utils.find('*target_ds_withClusterLabels.pkl', self.test_prepared_data_dir) \
            or not utils.find('*RFprec_to_ClusterLabels_dataset.pkl', self.test_prepared_data_dir):
        self.assign_test_clusters_to_datasets()

    target_ds_withClusterLabels = utils.open_pickle(utils.find('*target_ds_withClusterLabels.pkl', self.test_prepared_data_dir)[0])
    if self.period == "NE_mon":
        target_ds_withClusterLabels = target_ds_withClusterLabels.sel(time=prepare.is_NE_mon(target_ds_withClusterLabels['time.month']))
    elif self.period == "SW_mon":
        target_ds_withClusterLabels = target_ds_withClusterLabels.sel(time=prepare.is_SW_mon(target_ds_withClusterLabels['time.month']))
    elif self.period == "inter_mon":
        target_ds_withClusterLabels = target_ds_withClusterLabels.sel(time=prepare.is_inter_mon(target_ds_withClusterLabels['time.month']))

    if target_ds_withClusterLabels.time.size == 0:
        print(f'There are no dates available in your test dataset to use for this {self.period} monsoon period, please verify. Ending testing here.')
        return

    random_sampled_dates = np.array(np.random.choice(
        target_ds_withClusterLabels.time.data, dates_to_test - number_of_test_plots_created, replace=False))
    random_sampled_dates.sort()
    print(random_sampled_dates)

    for i, sn in enumerate(range(number_of_test_plots_created + 1, dates_to_test + 1)):
        # for sn in range(number_of_test_plots_created+1, dates_to_test+1):
        print(f'Printing {sn} out of {dates_to_test} test plots now:')
        # random_sampled_date = np.random.choice(target_ds_withClusterLabels.time.data, 1)
        random_sampled_date = [random_sampled_dates[i]]
        cluster = int(target_ds_withClusterLabels.sel(time=random_sampled_date).cluster.data) + 1

        # run_test.print_test_date_abv_1mm_bool(self, test_dir, sn, random_sampled_date, cluster)
        run_test.print_test_date_abv_1mm_to500mm(self, test_dir, sn, random_sampled_date, cluster)
        run_test.print_brier_gt1mm(self, test_dir, sn, random_sampled_date, cluster)
        run_test.print_heavyrfforecastcomparison_gt50mm(self, test_dir, sn, random_sampled_date, cluster)
        run_test.print_test_date_zscore_against_fullmodel(self, test_dir, sn, random_sampled_date, cluster)
def flatten_and_standardize_dataset(model, dest):
    target_ds_preprocessed = utils.open_pickle(model.target_ds_preprocessed_path)
    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    # reshaping
    reshapestarttime = timer(); print(f"{utils.time_now()} - Reshaping data now...")
    print(f"\n{utils.time_now()} - reshaping rhum dataarrays now, total levels to loop: {model.rhum_pressure_levels}.")

    reshaped_unnorma_darrays = {}
    reshaped_unnorma_darrays['rhum'], reshaped_unnorma_darrays['uwnd'], reshaped_unnorma_darrays['vwnd'] = {}, {}, {}

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))

    reshapetime = timer() - reshapestarttime
    reshapetime = str(datetime.timedelta(seconds=reshapetime)).split(".")[0]
    print(f'Time taken: {reshapetime}s.\n')

    # stacking unstandardized dataarrays
    stackingstarttime = timer(); print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack(
        [reshaped_unnorma_darrays[var][lvl] for var in reshaped_unnorma_darrays for lvl in reshaped_unnorma_darrays[var]])
    stackingtime = timer() - stackingstarttime
    stackingtime = str(datetime.timedelta(seconds=stackingtime)).split(".")[0]
    print(f'Time taken: {stackingtime}s.\n')

    # standardizing the stacked dataarrays
    standardizestarttime = timer(); print("Standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds)  # used for SOM & k-means training
    transformer.get_params()
    standardizetime = timer() - standardizestarttime
    standardizetime = str(datetime.timedelta(seconds=standardizetime)).split(".")[0]
    print(f'That took {standardizetime}s to complete.\n')

    standardized_stacked_arr_path = utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr, dest)

    return standardized_stacked_arr_path
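# --- Illustration only: a standalone sketch of the standardization step above. RobustScaler
# centres each column on its median and scales by its interquartile range (25th-75th percentile),
# which keeps extreme rainfall/wind values from dominating the scaling. Shapes are toy values.
import numpy as np
from sklearn.preprocessing import RobustScaler

stacked = np.random.rand(365, 6)                 # (days, flattened grid cells x levels)
scaler = RobustScaler(quantile_range=(25, 75))
standardized = scaler.fit_transform(stacked)     # same shape, column-wise robust scaling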
def retrieve_and_insert_predicted_RF_array(conn, period, domain, cluster, w_lim, e_lim, s_lim, n_lim):
    print(f'Inserting predicted_rf arr: {period}, {domain} - cluster: {cluster}')

    # pred_ds = utils.open_pickle(model.RFprec_to_ClusterLabels_dataset_path)  ## NOT possible, since that is only for ONE domain-period
    pred_ds = utils.open_pickle(
        [*utils.models_dir.glob(f'**/{domain}*/{period}*/k-*/*RFprec*.pkl')][0])

    data_pred_wholegrid = (pred_ds.where(pred_ds.cluster == int(cluster) - 1, drop=True).precipitationCal > 1)
    data_pred_sgonly = (pred_ds.where(pred_ds.cluster == int(cluster) - 1, drop=True).precipitationCal > 1).sel(
        lon=slice(w_lim, e_lim), lat=slice(s_lim, n_lim))

    whole_grid_gt1mm_pred = np.mean(data_pred_wholegrid, axis=0).values
    SG_only_gt1mm_pred = np.mean(data_pred_sgonly, axis=0).values

    insert_predicted_rf_array(conn, period, domain, cluster, SG_only_gt1mm_pred, whole_grid_gt1mm_pred)
import os
import utils
import lightgbm as lgb
import numpy as np
import math
from google.cloud import storage
from flask import Flask, render_template, request

app = Flask(__name__)

CLOUD_STORAGE_BUCKET = os.environ['NYPAB_BUCKET']
gcs = storage.Client()
bucket = gcs.get_bucket(CLOUD_STORAGE_BUCKET)

'''Prediction Files Here'''
model = lgb.Booster(model_file='data/lgb_classifier.txt')
scalar = utils.open_pickle('data/scalar1.pkl')

ethnicity_mapping = {
    'Black or African American': 0,
    'Asian': 1,
    'Hispanic or Latinx': 2,
    'White': 3,
    'American Indian': 4,
    'Refused': 5,
    'Other Race': 6,
    'Unknown': 7
}

gender_mapping_complainant = {
    'Male': 0,
    'Female': 1,
    # 15 - the number of keywords in the inspec labels is between 5 and 10, and the model labels are 5, 10 or 15, so extract 15.
    #      This number will be adjusted later (just like the number of keyphrases) when the features are extracted.
    # 50 - the number of keywords in the news labels is 50.
    if data == 'default':
        n_topics = 15
    elif data == 'news':
        n_topics = 50

    # store the keywords in a list
    all_topics = []
    for n_doc in corpus:
        all_topics.append(TopicRank(n_doc).get_top_n(n=n_topics))
    return all_topics


# Read pickle files for the model data
txt_train_data = utils.open_pickle('./pickle/txt train data')
txt_test_data = utils.open_pickle('./pickle/txt test data')
xml_train_data = utils.open_pickle('./pickle/xml train data')
xml_test_data = utils.open_pickle('./pickle/xml test data')

# Read pickle files for the news dataset
news_train_data = utils.open_pickle('./pickle/500N-KPCrowd/train data')
news_test_data = utils.open_pickle('./pickle/500N-KPCrowd/test data')

# Read pickle files for the inspec dataset
inspec_train_data = utils.open_pickle('./pickle/inspec/train data')
inspec_test_data = utils.open_pickle('./pickle/inspec/test data')

# Processing on the model data
txt_train_topics = calculate_topic_rank(txt_train_data, data='default')
def generate_k(self, alpha=None):
    """
    - detection of the metrics used to infer "k", i.e. the optimal_k value
    - creation of metric pickles
    - creation of folders in models_dir to indicate potential k values/cluster combinations
    """
    metrics_dir = str(utils.metrics_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
    os.makedirs(metrics_dir, exist_ok=True)
    self.metrics_dir_path = metrics_dir

    if alpha:
        self.alpha_metrics_dir_path = str(Path(self.tl_model.metrics_dir_path) / f'alpha_{alpha}')
        metric_destination = self.alpha_metrics_dir_path
        os.makedirs(metric_destination, exist_ok=True)
        model_destination = self.alpha_model_dir
        prefix = f'alpha_{alpha}_'
        prompt = f'< alpha-{alpha} >'
    else:
        metric_destination = self.metrics_dir_path
        model_destination = self.models_dir_path
        prefix = ''
        prompt = ''
    print(f'metric_destination: "{metric_destination}", model_destination: "{model_destination}", prefix: "{prefix}", prompt: "{prompt}"')

    for phrase in ('sil_peaks', 'ch_max', 'dbi_min', 'reasonable_sil', 'ch_dbi_tally', 'n_expected_clusters', 'dbs_err_dict'):
        if utils.find(f'*{prefix}{phrase}*.pkl', metric_destination):
            pass
        else:
            print(f'{utils.time_now()} - {prompt} Not all metrics have been found in {metric_destination}, generating them now...')

            # generate all metrics, even if only one is missing
            som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)
            ch_scores, dbi_scores = validation.print_elbow_CH_DBI_plot(self, som_weights_to_nodes, metric_destination)
            yellowbrick_expected_k = validation.print_yellowbrickkelbow(self, som_weights_to_nodes, metric_destination)
            silhouette_avgs, reasonable_silhoutte_scores_mt50 = validation.print_silhoutte_plots(self, som_weights_to_nodes, metric_destination)
            dbstop10 = validation.print_dbs_plots(self, som_weights_to_nodes, metric_destination)

            eps_ls, dbs_k_ls, dbs_noisepts_ls, dbs_labels = [], [], [], []
            for i in dbstop10:
                eps_ls.append(i[0])
                dbs_k_ls.append(i[1])
                dbs_noisepts_ls.append(i[2])
                dbs_labels.append(i[3])

            sil_peaks, ch_max, dbi_min, reasonable_sil, ch_dbi_tally, n_expected_clusters, dbs_err_dict = \
                validation.get_cluster_determination_vars(
                    silhouette_avgs, ch_scores, dbi_scores, reasonable_silhoutte_scores_mt50,
                    dbs_k_ls, dbs_noisepts_ls, yellowbrick_expected_k)

            for cluster_num in n_expected_clusters:
                if alpha:
                    save_dir = fr"{self.alpha_model_dir}/k-{cluster_num}"
                else:
                    save_dir = fr"{self.models_dir_path}/k-{cluster_num}"
                if cluster_num == ch_max:
                    save_dir += '_CHhighestPeak'
                if cluster_num == dbi_min:
                    save_dir += '_lowestDBItrough'
                if cluster_num in sil_peaks:
                    save_dir += '_SilhouetteAVG-peak'
                if cluster_num == reasonable_sil:
                    save_dir += '_mostReasonable-basedon-Silhouetteplot'
                if cluster_num in ch_dbi_tally:
                    save_dir += '_CHpeak-and-DBItrough'
                if cluster_num == yellowbrick_expected_k:
                    save_dir += '_Yellowbrickexpected-K'
                if cluster_num in dbs_err_dict:
                    save_dir += f'_DBSCANclusterErrorValsExpected-{dbs_err_dict[cluster_num]}'

                os.makedirs(save_dir, exist_ok=True)
                print(f'save_dir: {save_dir}')

            self.ch_max_path = utils.to_pickle(f"{prefix}ch_max", ch_max, metric_destination)
            self.dbi_min_path = utils.to_pickle(f"{prefix}dbi_min", dbi_min, metric_destination)
            self.sil_peaks_path = utils.to_pickle(f"{prefix}sil_peaks", sil_peaks, metric_destination)
            self.reasonable_sil_path = utils.to_pickle(f"{prefix}reasonable_sil", reasonable_sil, metric_destination)
            self.ch_dbi_tally_path = utils.to_pickle(f"{prefix}ch_dbi_tally", ch_dbi_tally, metric_destination)
            self.yellowbrick_expected_k_path = utils.to_pickle(f"{prefix}yellowbrick_expected_k", yellowbrick_expected_k, metric_destination)
            self.dbs_err_dict_path = utils.to_pickle(f"{prefix}dbs_err_dict", dbs_err_dict, metric_destination)
            self.n_expected_clusters_path = utils.to_pickle(f"{prefix}n_expected_clusters", n_expected_clusters, metric_destination)
            break

    print(f'{utils.time_now()} - Internal validation of clusters has been run, please view the metrics folder @:\n{metric_destination} to determine the optimal cluster number.\n'
          f'\nYou can view the separate folders constructed for each discovered cluster combination. See @: \n{model_destination}.')
def get_all_subject_data():
    overall_train_meanEEG = np.zeros((1, 14))
    overall_train_minEEG = np.zeros((1, 14))
    overall_train_maxEEG = np.zeros((1, 14))
    overall_train_stdEEG = np.zeros((1, 14))
    overall_train_meanPeaks = np.zeros((1, 14))
    overall_train_numPeaks = np.zeros((1, 14))
    overall_train_q25Peaks = np.zeros((1, 14))
    overall_train_q50Peaks = np.zeros((1, 14))
    overall_train_q75Peaks = np.zeros((1, 14))
    overall_train_yTruth = np.zeros(1)

    overall_test_meanEEG = np.zeros((1, 14))
    overall_test_minEEG = np.zeros((1, 14))
    overall_test_maxEEG = np.zeros((1, 14))
    overall_test_stdEEG = np.zeros((1, 14))
    overall_test_meanPeaks = np.zeros((1, 14))
    overall_test_numPeaks = np.zeros((1, 14))
    overall_test_q25Peaks = np.zeros((1, 14))
    overall_test_q50Peaks = np.zeros((1, 14))
    overall_test_q75Peaks = np.zeros((1, 14))
    overall_test_yTruth = np.zeros(1)

    overall_test_sub_meanEEG = np.zeros((1, 14))
    overall_test_sub_minEEG = np.zeros((1, 14))
    overall_test_sub_maxEEG = np.zeros((1, 14))
    overall_test_sub_stdEEG = np.zeros((1, 14))
    overall_test_sub_meanPeaks = np.zeros((1, 14))
    overall_test_sub_numPeaks = np.zeros((1, 14))
    overall_test_sub_q25Peaks = np.zeros((1, 14))
    overall_test_sub_q50Peaks = np.zeros((1, 14))
    overall_test_sub_q75Peaks = np.zeros((1, 14))
    overall_test_sub_yTruth = np.zeros(1)

    X_train, Y_train = utils.open_pickle('eng_train')
    X_val, Y_val = utils.open_pickle('eng_val')
    X_test, Y_test = utils.open_pickle('eng_test')
    X_sub_test, Y_sub_test = utils.open_pickle('sub_test')

    for i in range(0, len(X_train)):
        data_point = X_train[i]
        y_val = Y_train[i]
        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(data_point, y_val)
        # print(meanEEG.shape)
        # print(q50Peaks.shape)
        overall_train_meanEEG = np.vstack((overall_train_meanEEG, meanEEG))
        overall_train_minEEG = np.vstack((overall_train_minEEG, minEEG))
        overall_train_maxEEG = np.vstack((overall_train_maxEEG, maxEEG))
        overall_train_stdEEG = np.vstack((overall_train_stdEEG, stdEEG))
        overall_train_q25Peaks = np.vstack((overall_train_q25Peaks, q25Peaks))
        overall_train_q50Peaks = np.vstack((overall_train_q50Peaks, q50Peaks))
        overall_train_q75Peaks = np.vstack((overall_train_q75Peaks, q75Peaks))
        overall_train_meanPeaks = np.vstack((overall_train_meanPeaks, meanPeaks))
        overall_train_numPeaks = np.vstack((overall_train_numPeaks, numPeaks))
        overall_train_yTruth = np.vstack((overall_train_yTruth, yTruth))

    for i in range(0, len(X_val)):
        data_point = X_val[i]
        y_val = Y_val[i]
        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(data_point, y_val)
        overall_train_meanEEG = np.vstack((overall_train_meanEEG, meanEEG))
        overall_train_minEEG = np.vstack((overall_train_minEEG, minEEG))
        overall_train_maxEEG = np.vstack((overall_train_maxEEG, maxEEG))
        overall_train_stdEEG = np.vstack((overall_train_stdEEG, stdEEG))
        overall_train_q25Peaks = np.vstack((overall_train_q25Peaks, q25Peaks))
        overall_train_q50Peaks = np.vstack((overall_train_q50Peaks, q50Peaks))
        overall_train_q75Peaks = np.vstack((overall_train_q75Peaks, q75Peaks))
        overall_train_meanPeaks = np.vstack((overall_train_meanPeaks, meanPeaks))
        overall_train_numPeaks = np.vstack((overall_train_numPeaks, numPeaks))
        overall_train_yTruth = np.vstack((overall_train_yTruth, yTruth))

    for i in range(0, len(X_test)):
        data_point = X_test[i]
        y_val = Y_test[i]
        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(data_point, y_val)
        overall_test_meanEEG = np.vstack((overall_test_meanEEG, meanEEG))
        overall_test_minEEG = np.vstack((overall_test_minEEG, minEEG))
        overall_test_maxEEG = np.vstack((overall_test_maxEEG, maxEEG))
        overall_test_stdEEG = np.vstack((overall_test_stdEEG, stdEEG))
        overall_test_q25Peaks = np.vstack((overall_test_q25Peaks, q25Peaks))
        overall_test_q50Peaks = np.vstack((overall_test_q50Peaks, q50Peaks))
        overall_test_q75Peaks = np.vstack((overall_test_q75Peaks, q75Peaks))
        overall_test_meanPeaks = np.vstack((overall_test_meanPeaks, meanPeaks))
        overall_test_numPeaks = np.vstack((overall_test_numPeaks, numPeaks))
        overall_test_yTruth = np.vstack((overall_test_yTruth, yTruth))

    for i in range(0, len(X_sub_test)):
        data_point = X_sub_test[i]
        y_val = Y_sub_test[i]
        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(data_point, y_val)
        overall_test_sub_meanEEG = np.vstack((overall_test_sub_meanEEG, meanEEG))
        overall_test_sub_minEEG = np.vstack((overall_test_sub_minEEG, minEEG))
        overall_test_sub_maxEEG = np.vstack((overall_test_sub_maxEEG, maxEEG))
        overall_test_sub_stdEEG = np.vstack((overall_test_sub_stdEEG, stdEEG))
        overall_test_sub_q25Peaks = np.vstack((overall_test_sub_q25Peaks, q25Peaks))
        overall_test_sub_q50Peaks = np.vstack((overall_test_sub_q50Peaks, q50Peaks))
        overall_test_sub_q75Peaks = np.vstack((overall_test_sub_q75Peaks, q75Peaks))
        overall_test_sub_meanPeaks = np.vstack((overall_test_sub_meanPeaks, meanPeaks))
        overall_test_sub_numPeaks = np.vstack((overall_test_sub_numPeaks, numPeaks))
        overall_test_sub_yTruth = np.vstack((overall_test_sub_yTruth, yTruth))

    # overall_train = np.dstack((overall_train_q75Peaks, overall_train_numPeaks, overall_train_meanPeaks, overall_train_stdEEG, overall_train_maxEEG, overall_train_minEEG))
    # overall_test = np.dstack((overall_test_q75Peaks, overall_test_numPeaks, overall_test_meanPeaks, overall_test_stdEEG, overall_test_maxEEG, overall_test_minEEG))
    return overall_train_maxEEG, overall_train_yTruth, overall_test_maxEEG, overall_test_yTruth, overall_test_sub_maxEEG, overall_test_sub_yTruth
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import utils
from tensorflow.contrib.tensorboard.plugins import projector

PATH = os.getcwd()
LOG_DIR = PATH + '/project-tensorboard/log-1'
MODEL_DIR = os.path.join("F:/", "DL-code", "text2shape-data", "nrrd_256_filter_div_32_solid")
# METADATA_DIR = os.path.join('project-tensorboard', 'log-1', 'metadata.tsv')
# SPRITE_DIR = os.path.join('project-tensorboard', 'log-1', 'sprite.jpg')

embeddings = utils.open_pickle(
    os.path.join("F:/", "DL-code", "text2shape-data", "shapenet",
                 "shapenet-embeddings", "text_embeddings_train.p"))
sample_tuple = embeddings['caption_embedding_tuples'][0]
embedding_shape = list(sample_tuple[3].shape)
assert len(embedding_shape) == 1

embedding_data = [item[3] for item in embeddings['caption_embedding_tuples']]
model_id = [item[2] for item in embeddings['caption_embedding_tuples']]
print("embedding shape: ", np.shape(embedding_data))
print("model_id shape: ", np.shape(model_id))

thumb_dir = os.path.join(MODEL_DIR, "resize_models")
image_data = utils.get_images(thumb_dir)

feat_cols = ['col' + str(i) for i in range(np.shape(embedding_data)[1])]
df = pd.DataFrame(embedding_data, columns=feat_cols)
def __init__(self, filename, uncertainty):
    self.uncertainty = uncertainty  # this is the central parameter of the classifier; 5 shouldn't be too aggressive
    self.filename = filename
    from utils import open_pickle
    self.dic, self.total_ngood, self.total_links = open_pickle(filename, ({}, 0, 0))
import datamodel
import utils
import cPickle
from bayesian import Bayesian

old_days = 30
novel_days = 1
hist_bins = 8

chronology = utils.open_pickle("novelty.pck", {})
filter = Bayesian('novelty_bayes.pck', 5)


def predict(link):
    words = title_words(link.title)
    if not words:
        return 0.
    novelty = sum(1. for w in words if isnovel(w, link.date)) / len(words)
    return filter.predict(["novelty_%d" % int(novelty * hist_bins)]) > 0.


def train(links):
    chronology = {}
    for l in links:
        words = title_words(l.title)
        for w in words:
            if chronology.has_key(w):
                chronology[w].append(l.date)
            else:
                chronology[w] = [l.date]
    cPickle.dump(chronology, open("novelty.pck", "wb", -1))

    training_set = []
    for l in links:
        words = title_words(l.title)
        novelty = sum(1. for w in words if isnovel(w, l.date)) / len(words)
def print_test_date_abv_1mm_to500mm(model, dest, sn, random_sampled_date, cluster):
    RFprec_to_ClusterLabels_dataset = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl', model.test_prepared_data_dir)[0])

    date_split = pd.DatetimeIndex(random_sampled_date).strftime("%Y-%m-%d").values
    print(f'{utils.time_now()} - printing >1mm plot for {date_split}')

    rf_random_choice = RFprec_to_ClusterLabels_dataset.sel(time=random_sampled_date).precipitationCal[0]
    rf_random_choice_gt1mm = np.ma.masked_where(rf_random_choice <= 1, rf_random_choice)

    rf_ds_lon = RFprec_to_ClusterLabels_dataset.lon
    rf_ds_lat = RFprec_to_ClusterLabels_dataset.lat

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(f"RF received {date_split[0]} over 1mm", fontweight='bold', fontsize=16, y=.95)
    ax.set_title(f"Predicted cluster: {cluster}. \n"
                 f"Areas in grey denote 0.0-0.99mm RF and are treated as no rain having occurred.",
                 fontsize=14, y=1.04)
    ax.set_facecolor('silver')
    ax.set_extent([model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines("50m", linewidth=.8, color='k')
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    a = plt.cm.pink(np.linspace(.9, .2, 2))
    b = plt.cm.gnuplot2(np.linspace(0.4, .9, 6))
    all_colors = np.vstack((a, b))
    terrain_map = colors.LinearSegmentedColormap.from_list('terrain_map', all_colors)

    RF = ax.contourf(rf_ds_lon, rf_ds_lat, rf_random_choice_gt1mm.T,
                     np.linspace(0, 500, 501), cmap=terrain_map, extend='max')
    cbar_rf = fig.colorbar(RF, label='RF (mm)', orientation='horizontal',
                           pad=0.05, shrink=.8, ticks=np.arange(0, 500, 50))
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')

    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)), crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')
    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)), crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_abv1mm_to500_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
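# --- Illustration only: the masking step used above. Values at or below 1 mm are masked so that
# contourf leaves those cells unfilled and the grey axes facecolor shows through as "no rain".
import numpy as np

rf = np.array([[0.2, 3.5],
               [12.0, 0.9]])
rf_gt1mm = np.ma.masked_where(rf <= 1, rf)   # 0.2 and 0.9 become masked entries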
    def __repr__(self):
        return ",".join(self.words).encode('utf-8')


class PredicateClassifier:
    def __init__(self, predicate):
        self.predicate = predicate

    def train(self, titles, weights, evaluations):
        """titles is [[words]]"""
        total = sum(weights[n] * evaluations[n] for n, words in enumerate(titles) if self.predicate(words))
        self.wordgood = 1. if total >= 0 else -1.

    def predict(self, title):
        if self.predicate(title):
            return self.wordgood
        else:
            return 0.


trained = open_pickle("adaboost.pck", [])


def predict(link):
    # words = tokenize(link.title)
    words = mash_post(link)
    if sum(alpha * c.predict(words) for c, alpha in trained) >= 0:
        return 1.
    else:
        return -1.


def train(links):
    from math import exp, fabs, log
    fwords = most_frequent_words()
    classifiers = [PredicateClassifier(HasWordsPredicate([w])) for w in fwords]
    # classifiers.extend(PredicateClassifier(HasWordsPredicate(duo)) for duo in most_frequent_duos(fwords))
    titles = [mash_post(l) for l in links]
def print_heavyrfforecastcomparison_gt50mm(model, dest, sn, random_sampled_date, cluster):
    date_split = pd.DatetimeIndex(random_sampled_date).strftime("%Y-%m-%d").values
    print(f'{utils.time_now()} - printing heavyrfforecastcomparison >50mm plot for {date_split}')

    test_ds = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl', model.test_prepared_data_dir)[0])
    rf_ds_lon = test_ds.lon
    rf_ds_lat = test_ds.lat
    test_ds_random_date = test_ds.sel(time=random_sampled_date)

    training_rf_ds = utils.open_pickle(model.RFprec_to_ClusterLabels_dataset_path)
    # NOTE: dataset cluster labels are zero-based while `cluster` is one-based (see test_random_dates),
    # which is why the prediction below filters on `cluster - 1`.
    clus_size = training_rf_ds.where(training_rf_ds.cluster == cluster, drop=True).time.size

    pred_gt50mm = np.mean(
        training_rf_ds.where(training_rf_ds.cluster == cluster - 1, drop=True).sel(
            lon=slice(model.LON_W, model.LON_E),
            lat=slice(model.LAT_S, model.LAT_N)).precipitationCal > 50,
        axis=0).values * 100
    pred_gt50mm = np.ma.masked_where(pred_gt50mm == 0, pred_gt50mm)

    gt_arr_gt50mm = (test_ds_random_date.precipitationCal > 50)[0].values
    gt_arr_gt150mm = (test_ds_random_date.precipitationCal > 150)[0].values
    gt_arr_gt250mm = (test_ds_random_date.precipitationCal > 250)[0].values
    gt_arr_gt500mm = (test_ds_random_date.precipitationCal > 500)[0].values

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(
        f"Comparison of actual heavy-RF occurrences on {date_split[0]} to the \npredicted forecast of heavy RF in cluster {cluster}.",
        fontweight='bold', fontsize=16, y=1.02)
    ax.set_title(f'Number of training dates in cluster {cluster}: {clus_size}. \n'
                 'Areas in white indicate a 0.0% predicted chance of heavy RF. \n'
                 'Hatched patterns in blue represent RF above 50mm -- in lime-green: >150mm, \n'
                 'in pink: >250mm -- in cyan: >500mm.', y=1.07)
    ax.set_facecolor('w')
    ax.set_extent([model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines("50m", linewidth=.8, color='k')
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    zero_to_ten = plt.cm.pink(np.linspace(1, .2, 3))
    eleven_to_25 = plt.cm.gist_earth(np.linspace(0.75, 0.2, 5))
    twnty5_to_40 = plt.cm.gist_stern(np.linspace(0.3, 0.1, 5))
    all_colors = np.vstack((zero_to_ten, eleven_to_25, twnty5_to_40))
    terrain_map = colors.LinearSegmentedColormap.from_list('terrain_map', all_colors)

    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)), crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')
    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)), crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    # predicted chance of heavy RF
    contf_predictions = ax.contourf(rf_ds_lon, rf_ds_lat, pred_gt50mm.T,
                                    np.arange(0, 50, 5), cmap=terrain_map, extend='max')
    cbar_rf = fig.colorbar(contf_predictions, label='Predicted chance of heavy RF (%)',
                           orientation='horizontal', pad=0.08, shrink=.8, ticks=np.arange(0, 50, 5))
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')

    # actual zones of heavy RF (>50mm)
    rf_gt50mm = ax.contourf(rf_ds_lon, rf_ds_lat, gt_arr_gt50mm.T,
                            levels=[-1, 0, 1], colors='none', hatches=[None, '///'])
    rf_gt50mm.collections[1].set_edgecolor('royalblue')
    rf_gt50mm.collections[1].set_linewidth(0.05)

    # actual zones of RF >150mm
    rf_gt150mm = ax.contourf(rf_ds_lon, rf_ds_lat, gt_arr_gt150mm.T,
                             levels=[-1, 0, 1], colors='none', hatches=[None, '\\\\\\'])
    rf_gt150mm.collections[1].set_edgecolor('lime')
    rf_gt150mm.collections[1].set_linewidth(0.05)

    # actual zones of RF >250mm
    rf_gt250mm = ax.contourf(rf_ds_lon, rf_ds_lat, gt_arr_gt250mm.T,
                             levels=[-1, 0, 1], colors='none', hatches=[None, '...XX'])
    rf_gt250mm.collections[1].set_edgecolor('magenta')
    rf_gt250mm.collections[1].set_linewidth(0.1)

    # actual zones of RF >500mm
    rf_gt500mm = ax.contourf(rf_ds_lon, rf_ds_lat, gt_arr_gt500mm.T,
                             levels=[-1, 0, 1], colors='none', hatches=[None, 'XX*'])
    rf_gt500mm.collections[1].set_edgecolor('aqua')
    rf_gt500mm.collections[1].set_linewidth(0.1)

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_heavyrfforecastcomparison_gt50mm_v2_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
def print_brier_gt1mm(model, dest, sn, random_sampled_date, cluster):
    date_split = pd.DatetimeIndex(random_sampled_date).strftime("%Y-%m-%d").values
    print(f'{utils.time_now()} - printing Brier >1mm plot for {date_split}')

    test_ds = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl', model.test_prepared_data_dir)[0])
    rf_ds_lon = test_ds.lon
    rf_ds_lat = test_ds.lat

    test_ds_random_date = test_ds.sel(time=random_sampled_date)
    test_ds_random_date_gt1mm = test_ds_random_date.precipitationCal > 1
    gt_arr = test_ds_random_date_gt1mm[0].values

    training_rf_ds = utils.open_pickle(model.RFprec_to_ClusterLabels_dataset_path)
    clus_size = training_rf_ds.where(training_rf_ds.cluster == cluster, drop=True).time.size

    pred = np.mean(
        training_rf_ds.where(training_rf_ds.cluster == cluster - 1, drop=True).sel(
            lon=slice(model.LON_W, model.LON_E),
            lat=slice(model.LAT_S, model.LAT_N)).precipitationCal > 1,
        axis=0).values

    gt_arr_flat = np.reshape(gt_arr, (gt_arr.shape[0] * gt_arr.shape[1]))[:, None]
    pred_flat = np.reshape(pred, (pred.shape[0] * pred.shape[1]))[:, None]
    gridded_brier_flat = np.array([
        np.apply_along_axis(func1d=brier_score_loss, axis=0, arr=e, y_prob=f)
        for e, f in zip(gt_arr_flat, pred_flat)
    ])
    gridded_brier = gridded_brier_flat.reshape(gt_arr.shape)

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(
        f"Brier scores for {date_split[0]} compared to the \npredicted forecast of a rainday (>1mm) for cluster {cluster}.",
        fontweight='bold', fontsize=16, y=1)
    ax.set_title(f'Number of training dates in cluster {cluster}: {clus_size}. \n'
                 'Scores approaching 0 indicate better-calibrated predictive models, '
                 'while 0.25 likely represents forecasts of 50%, regardless of outcome. \n'
                 'Areas covered by the white grid did NOT receive any RF (i.e. <1mm), '
                 'while areas not covered by the white grid received >1mm of RF.', y=1.06)
    ax.set_facecolor('w')
    ax.set_extent([model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines("50m", linewidth=.8, color='k')
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    a = plt.cm.summer(np.linspace(0, 1, 6))
    b = plt.cm.autumn(np.linspace(1, 0, 4))
    all_colors = np.vstack((a, b))
    terrain_map = colors.LinearSegmentedColormap.from_list('terrain_map', all_colors)

    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)), crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')
    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)), crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    # Brier: comparing >1mm predictions to GT >1mm
    briers = ax.contourf(rf_ds_lon, rf_ds_lat, gridded_brier.T,
                         np.linspace(0, 1, 11), cmap=terrain_map, extend='neither')
    cbar_rf = fig.colorbar(briers, label='Brier score', orientation='horizontal',
                           pad=0.07, shrink=.8, ticks=np.arange(0, 1.1, .1))
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')

    # actual no-rain (<=1mm) zones
    rf_dots = ax.contourf(rf_ds_lon, rf_ds_lat, gt_arr.T,
                          levels=[-1, 0, 1], colors='none', hatches=['/-', None])
    rf_dots.collections[0].set_edgecolor('white')
    # rf_dots.collections[0].set_linewidth(0.)

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_brier_gt1mm_v2_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
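# --- Illustration only: an equivalent per-grid-cell Brier computation. For a binary outcome
# y in {0, 1} and a forecast probability p, the Brier score reduces to (p - y)**2, so the loop
# over sklearn's brier_score_loss above can be expressed as one vectorised operation (toy grids).
import numpy as np

gt = np.random.rand(10, 12) > 0.5        # observed rainday mask (bool grid)
pred = np.random.rand(10, 12)            # predicted rainday probability per cell
gridded_brier = (pred - gt.astype(float)) ** 2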
def print_test_date_zscore_against_fullmodel(model, dest, sn, random_sampled_date, cluster):
    RFprec_to_ClusterLabels_dataset = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl', model.test_prepared_data_dir)[0])
    rf_ds_lon = RFprec_to_ClusterLabels_dataset.lon
    rf_ds_lat = RFprec_to_ClusterLabels_dataset.lat

    date_split = pd.DatetimeIndex(random_sampled_date).strftime("%Y-%m-%d").values
    print(f'{utils.time_now()} - printing z-score plot for {date_split}')

    rf_random_choice = RFprec_to_ClusterLabels_dataset.sel(time=random_sampled_date).precipitationCal[0]

    training_rf_ds = utils.open_pickle(model.RFprec_to_ClusterLabels_dataset_path)
    rf_for_random_choice_cluster = training_rf_ds.precipitationCal.where(
        training_rf_ds.cluster == cluster - 1, drop=True)

    gridmean = np.mean(rf_for_random_choice_cluster, axis=0)
    gridstd = np.std(rf_for_random_choice_cluster, axis=0)
    stdardized_rf_random_choice = ((rf_random_choice - gridmean) / gridstd).values

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(f"Z-Score for {date_split[0]}. Predicted cluster: {cluster}",
                 fontweight='bold', fontsize=18, y=.99, ha='center')
    ax.set_title(f"Total dates/datapoints per grid in cluster {cluster}: "
                 f"{training_rf_ds.where(training_rf_ds.cluster==cluster, drop=True).time.size}"
                 f"\n-1.65<=Z<=1.65 == 90%\n-1.96<=Z<=1.96 == 95%\n-2.58<=Z<=2.58 == 99%",
                 fontsize=14, y=1.04)
    ax.set_facecolor('w')
    ax.set_extent([model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines("50m", linewidth=.5, color='w', alpha=1)
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    two58_to_196 = plt.cm.gist_ncar(np.linspace(.75, .8, 3))
    one96_to_0 = plt.cm.copper(np.linspace(1, 0, 4))
    zero_to_196 = plt.cm.twilight_shifted(np.linspace(0, .4, 4))
    one96_to_258 = plt.cm.gist_rainbow(np.linspace(.55, .3, 3))
    all_colors = np.vstack((two58_to_196, one96_to_0, zero_to_196, one96_to_258))
    terrain_map = colors.LinearSegmentedColormap.from_list('terrain_map', all_colors)

    RF = ax.contourf(rf_ds_lon, rf_ds_lat, stdardized_rf_random_choice.T,
                     np.linspace(-3.3333, 3.3333, 21), alpha=1, cmap=terrain_map, extend='both')
    cbar_rf = fig.colorbar(RF, label='Z-Score of grid computed against grid-mean & grid-SD of whole model.',
                           orientation='horizontal', pad=0.05, shrink=.9,
                           ticks=[-2.58, -1.96, -1.65, 0, 1.65, 1.96, 2.58])
    cbar_rf.ax.tick_params(size=5)
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')

    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)), crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')
    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)), crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_zscore_against_fullmodel_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
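# --- Illustration only: the per-grid-cell z-scoring used above, on toy arrays. Each cell of the
# sampled date is standardized against that cell's mean and SD across all training dates that
# fall in the predicted cluster.
import numpy as np

cluster_rf = np.random.rand(200, 10, 12) * 20    # (training dates in cluster, lat, lon)
sampled = np.random.rand(10, 12) * 20            # rainfall grid for one test date
z = (sampled - cluster_rf.mean(axis=0)) / cluster_rf.std(axis=0)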