def add_outage_longitudes_latitudes(df_outages,
                                    postcodes_fp: str = '../data/postcodes.csv',
                                    null_threshold=1):
    pcl = PostCodeLocator(postcodes_fp)

    s_incident_longitudes = pd.Series(index=df_outages.index, dtype='float64')
    s_incident_latitudes = pd.Series(index=df_outages.index, dtype='float64')

    for outage_idx, s_outage in track(df_outages.iterrows(), total=df_outages.shape[0]):
        incident_longitudes = []
        incident_latitudes = []

        for postcode_impacted in s_outage['postcodes_impacted']:
            longitude, latitude = pcl.get_postcode_location(postcode_impacted)
            incident_longitudes += [longitude]
            incident_latitudes += [latitude]

        s_incident_longitudes.loc[outage_idx] = pd.Series(incident_longitudes).mean()
        s_incident_latitudes.loc[outage_idx] = pd.Series(incident_latitudes).mean()

    df_outages = (df_outages
                  .assign(longitude=s_incident_longitudes)
                  .assign(latitude=s_incident_latitudes))

    idxs_to_keep = (df_outages.isnull().sum(axis=1) <= null_threshold).replace(False, np.nan).dropna().index
    df_outages = df_outages.loc[idxs_to_keep]

    return df_outages

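# The final filtering step above keeps rows with at most `null_threshold` missing
# values; a simplified, self-contained illustration of the same row filter:
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, np.nan, np.nan], 'b': [1, 2, np.nan]})
keep = df.isnull().sum(axis=1) <= 1   # boolean mask, True where at most one value is missing
print(df.loc[keep])                   # drops the row with two missing values
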
def sketch_with_PPR(sketchers, PPR):
    n = PPR.shape[0]

    for i in track(range(n)):
        for s in sketchers:
            s.append(PPR[i])

    return PPR

def compute_predictions(*, model, likelihood, test_X):
    # Time a single-sample prediction to estimate the total runtime
    t = -time()
    with torch.no_grad():
        batch = test_X[0:1]
        pred = likelihood(model(batch))
        del pred
    t += time()
    print('Predictions will take %.1f minutes' % (t * len(test_X) / 60))

    t = -time()
    u = []
    s = []
    with torch.no_grad():
        n = len(test_X)
        for i in track(range(n), total=n):
            batch = test_X[i:i + 1]
            pred = likelihood(model(batch))
            u.append(pred.mean.detach().cpu().numpy())
            s.append(pred.covariance_matrix.detach().cpu().numpy())
            del pred
    u = np.concatenate(u, axis=0)
    s = np.stack(s, axis=0)
    t += time()

    return u, s

def plot_worst(
    self,
    model: tf.keras.Model,
    cadastre_indeces: Collection[int] = range(0, 10),
    metric: Tuple[Mapping, str] = (min, "iou"),
    number: int = 5,
):
    """Plot the worst predictions according to a given metric."""
    cadastre_metrics = {}
    optimizer, metric = metric

    for cadastre_index in track(cadastre_indeces):
        try:
            x, y = self[cadastre_index:cadastre_index + 1]
        except ValueError:
            continue
        if x is None or x.shape[0] > 1:
            continue

        evaluation = model.evaluate(x=x, y=y, verbose=0)
        metrics = {
            name: value
            for name, value in zip(model.metrics_names, evaluation)
        }
        cadastre_metrics[cadastre_index] = metrics

    number = min(number, len(cadastre_metrics))
    for _ in range(number):
        worst_cadastre = optimizer(
            cadastre_metrics.keys(),
            key=lambda key: cadastre_metrics[key][metric],
        )
        self.plot_prediction(model=model, cadastre_index=worst_cadastre)
        del cadastre_metrics[worst_cadastre]

def first_time_setup(self):
    """Initialize the cadastre cache for the first time."""
    self.directory.mkdir(parents=True, exist_ok=True)

    # Save pertinent cadastre cache metadata
    metadata = {
        "cadastre_path": str(self.cadastre_path.resolve().absolute()),
        "layer_name": self.layer_name,
    }
    self.metadata_path.write_text(json.dumps(metadata))
    metadata = json.loads(self.metadata_path.read_text())

    geometries = []
    indeces = []
    with fiona.open(
        metadata["cadastre_path"],
        layer=metadata["layer_name"],
        mode="r",
    ) as src:
        # srid = int(src.crs["init"].split(":")[0])
        crs = src.crs["init"]
        for index, obj in track(enumerate(src), total=len(src)):
            geometries.append(vector.fiona_polygon(obj))
            indeces.append(index + 1)

    self._dataframe = geopandas.GeoDataFrame(
        crs=crs,
        geometry=geopandas.GeoSeries(geometries),
    )
    self._dataframe.set_index(pd.Index(indeces), inplace=True)
    self._dataframe.to_pickle(
        self.directory / "cadastre.pkl",
        protocol=-1,
    )

def sketch(sketchers, G, rw=True, nthreads=1):
    n = G.shape[0]
    x = np.zeros(n, dtype=np.float32)
    PPR = np.zeros((n, n), dtype=np.float32)
    inds = G.indptr
    inds = np.append(inds, n).astype(np.int32)
    degrees = G.sum(axis=0).A1.astype(np.int32)

    for i in track(range(n)):
        if rw:
            libc.ppr_row_rw(x.ctypes.data_as(POINTER(c_float)),
                            inds.ctypes.data_as(POINTER(c_int)),
                            G.indices.ctypes.data_as(POINTER(c_int)),
                            degrees.ctypes.data_as(POINTER(c_int)),
                            n, i, c_float(0.85), nthreads, 100000)
        else:
            libc.ppr_row_matmul(x.ctypes.data_as(POINTER(c_float)),
                                inds.ctypes.data_as(POINTER(c_int)),
                                G.indices.ctypes.data_as(POINTER(c_int)),
                                degrees.ctypes.data_as(POINTER(c_int)),
                                n, i, c_float(0.85), nthreads,
                                c_float(1e-6), 100, 2048)

        row = np.zeros_like(x, dtype=np.float32)
        nnz = x.nonzero()
        row[nnz] = np.log(n * x[nnz])
        PPR[i] = row
        for s in sketchers:
            s.append(row)

    return PPR

def cache_mask(
    self,
    ogr_path: Path,
    layer_name: str,
    mask_name: str,
) -> None:
    """Cache given mask to disk."""
    mask_directory = self.directory / "mask" / mask_name
    mask_pickle = mask_directory / "mask.pkl"
    if mask_pickle.exists():
        print("Already cached!")
        return
    else:
        mask_directory.mkdir(parents=True, exist_ok=True)

    with fiona.open(ogr_path, "r", layer=layer_name) as src:
        mask = MultiPolygon(
            [vector.fiona_polygon(feature) for feature in track(src)]
        )

    print("Buffering multipolygon pickle... ", end="")
    mask = mask.buffer(0.0)
    print("Done!")

    print("Writing pickle file... ", end="")
    mask_pickle.write_bytes(
        pickle.dumps(mask, protocol=pickle.HIGHEST_PROTOCOL),
    )
    print("Done!")

def download_datasets(self,
                      datasets,
                      product_id='EO:EUM:DAT:MSG:MSG15-RSS',
                      download_all=True):
    """
    Downloads a set of datasets from the EUMETSAT API
    in the defined date range for the specified product

    Parameters:
        datasets: list of datasets returned by `identify_available_datasets`
    """
    # Identifying dataset ids to download
    dataset_ids = sorted([dataset['id'] for dataset in datasets])

    # Check which datasets to download
    download_ids, local_ids = self.check_if_downloaded(dataset_ids)

    # Downloading specified datasets
    if not dataset_ids:
        self.logger.info(
            'No files will be downloaded. Set DownloadManager bucket_name argument for local download'
        )
        return

    all_metadata = []

    for dataset_id in track(dataset_ids):
        dataset_link = dataset_id_to_link(dataset_id)

        # Download the raw data
        if (dataset_id in download_ids) or (download_all == True):
            try:
                self.download_single_dataset(dataset_link)
            except:
                self.logger.info('The EUMETSAT access token has been refreshed')
                self.request_access_token()
                self.download_single_dataset(dataset_link)

        # Extract and save metadata
        dataset_metadata = extract_metadata(self.data_dir, product_id=product_id)
        dataset_metadata.update({'downloaded': pd.Timestamp.now()})
        all_metadata += [dataset_metadata]
        self.metadata_table.insert(dataset_metadata)

        # Delete old metadata files
        for xml_file in ['EOPMetadata.xml', 'manifest.xml']:
            xml_filepath = f'{self.data_dir}/{xml_file}'
            if os.path.isfile(xml_filepath):
                os.remove(xml_filepath)

    df_new_metadata = pd.DataFrame(all_metadata)

    return df_new_metadata

def get_ssen_incidents_info(incidents_url='https://www.ssen.co.uk/Sse_Components/Views/Controls/FormControls/PowerTrackHandler.ashx'):
    raw_incidents_info = requests.get(incidents_url, verify=False).json()
    cleaned_incidents_info = dict()

    for incident in track(raw_incidents_info['Faults']):
        incident_ref = incident['Reference']
        cleaned_incidents_info[incident_ref] = clean_ssen_incident_info(incident)

    return raw_incidents_info, cleaned_incidents_info

def convert_nbs_to_md(nbs_dir, docs_nb_img_dir, docs_dir):
    nb_files = [f for f in os.listdir(nbs_dir) if f[-6:] == '.ipynb']

    for nb_file in track(nb_files):
        nb_fp = f'{nbs_dir}/{nb_file}'
        junix.export_images(nb_fp, docs_nb_img_dir)
        convert_md(nb_fp, docs_dir, img_path=f'{docs_nb_img_dir}/', jekyll=False)

        md_fp = docs_dir + '/' + nb_file.replace('.ipynb', '') + '.md'
        encode_file_as_utf8(md_fp)

def get_full_search_df(num_pages):
    df_search = pd.DataFrame()

    for page_num in track(range(num_pages), label='Accounts'):
        r = get_accounts_raw_search(page_num=page_num)
        df_search_page = extract_search_df(r)
        df_search = df_search.append(df_search_page)

    df_search = (df_search
                 .reset_index(drop=True)
                 .drop_duplicates())

    return df_search

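# Note: DataFrame.append (used above) was removed in pandas 2.0; the same
# accumulate-then-combine pattern can be written with pd.concat, shown on toy frames:
import pandas as pd

pages = [pd.DataFrame({'account': [1]}), pd.DataFrame({'account': [2]})]
df_search = pd.concat(pages, ignore_index=True).drop_duplicates()
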
def query_API(self, from_date=None, to_date=None, data_stream='emissions', level='national'):
    """
    Queries the National Grid ESO Carbon Intensity API, enabling data to be
    easily retrieved for emissions and fuel generation at both a regional
    and national level

    N.b. the regional emissions data only includes forecast levels as the API
    does not have an observed regional emissions data stream at the time of writing

    Parameters
    ----------
    from_date : str
        Start date for the queried data
    to_date : str
        End date for the queried data
    data_stream : str
        One of 'emissions' or 'generation'
    level : str
        Spatial aggregation level, one of 'national' or 'regional' (DNOs)

    Returns
    -------
    df : pd.DataFrame
        The queried emissions or generation data

    """

    # Input checks
    assert data_stream in self.available_data_streams, f"Data stream must be one of: {', '.join(self.available_data_streams)}"
    assert level in self.available_levels, f"Level must be one of: {', '.join(self.available_levels)}"

    query_func = self.query_funcs[level][data_stream]

    if from_date is None or to_date is None:
        df = query_func(from_date, to_date)
    elif (pd.to_datetime(to_date) - pd.to_datetime(from_date)) > pd.Timedelta(weeks=2):
        # Break long date ranges into smaller pairs of sub-queries
        df = pd.DataFrame()
        dt_rng_pairs = construct_date_range_pairs(from_date, to_date)

        for pair_from_date, pair_to_date in track(dt_rng_pairs):
            df_pair = query_func(pair_from_date, pair_to_date)
            df = df.append(df_pair)
    else:
        df = query_func(from_date, to_date)

    return df

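# construct_date_range_pairs is defined elsewhere; a minimal sketch of the behaviour
# query_API relies on (an assumption, not necessarily the project's implementation):
import pandas as pd

def construct_date_range_pairs(from_date, to_date, freq='14D'):
    # Split [from_date, to_date] into consecutive (start, end) chunks of at most `freq`
    edges = list(pd.date_range(from_date, to_date, freq=freq))
    if edges[-1] < pd.to_datetime(to_date):
        edges.append(pd.to_datetime(to_date))
    return list(zip(edges[:-1], edges[1:]))

# construct_date_range_pairs('2020-01-01', '2020-03-01') -> list of (start, end) Timestamp pairs
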
def extract_ukpn_multiple_incident_ids(multiple_incident_urls):
    incident_ids = []

    for multiple_incident_url in track(multiple_incident_urls, label='Multiple Ids'):
        r = requests.get(multiple_incident_url)
        soup = bs(r.text, features='lxml')

        incident_ids += [
            link['data-url'].split('incidentId=')[1]
            for link in soup.find('div', {'class': 'multiple-incidents--wrapper mb-4'}).findAll('a')
        ]

    return incident_ids

def show_id_2_ratings_df(show_id):
    df_ratings = pd.DataFrame(index=range(100))
    most_recent_season = show_id_2_most_recent_season(show_id)

    for season in track(range(1, most_recent_season + 1)):
        ratings = season_2_episode_ratings(show_id, season)
        df_ratings[season] = pd.Series(ratings, dtype='float64')

    df_ratings = (df_ratings
                  .dropna(how='all', axis=0)
                  .dropna(how='all', axis=1))

    df_ratings.index += 1

    return df_ratings

def collate_cleaned_incidents_info(dnos=['ukpn', 'ssen', 'wpd', 'sp', 'np', 'enw']):
    cleaned_incidents_info = dict()

    for dno in track(dnos, label='DNOs'):
        try:
            raw_dno_incidents_info, cleaned_dno_incidents_info = getattr(
                retrieval, f'get_{dno}_incidents_info')()
            cleaned_incidents_info[dno] = cleaned_dno_incidents_info
            retrieval.save_json_data(raw_dno_incidents_info, f'{dno}_incidents_info')
        except:
            warn(f'Failed to process {dno}')

    return cleaned_incidents_info

def construct_ets_unit_dfs(account_ids, owners_col_rename_map, units_col_rename_map, label=None):
    df_owners = pd.DataFrame(index=account_ids, columns=owners_col_rename_map.keys())
    df_units = pd.DataFrame(index=account_ids, columns=units_col_rename_map.keys())
    ts_dfs = {}

    for account_id in track(account_ids, label=label):
        page_info, df_ts = extract_page_info(account_id)

        df_owners.loc[account_id] = pd.Series(collate_owner_info(page_info))
        df_units.loc[account_id] = pd.Series(collate_unit_info(page_info))
        ts_dfs[account_id] = df_ts

    df_owners = df_owners.rename(columns=owners_col_rename_map)
    df_units = df_units.rename(columns=units_col_rename_map)

    return df_owners, df_units, ts_dfs

def nonlinearities(paths, reduction='none', batch_size=8, device='cpu', verbose=0):
    nonlinearity = {
        'complexity': [],
        'gain change': [],
        'temporal hold': [],
        'shape change': []
    }

    if isinstance(paths, str) and os.path.isdir(paths):
        paths = sorted(glob.glob(os.path.join(paths, 'dSTRF-*.pt')))
        if verbose >= 1:
            print(f'Found {len(paths)} dSTRF files in specified directory.', flush=True)

    if verbose >= 1:
        paths = ipypb.track(paths)

    for path in paths:
        dstrf = torch.load(path).to(device)

        nonlinearity['complexity'].append(
            complexity(dstrf, batch_size=batch_size))
        nonlinearity['gain change'].append(
            gain_change(dstrf, batch_size=batch_size))
        nonlinearity['temporal hold'].append(
            temporal_hold(dstrf, batch_size=batch_size))
        nonlinearity['shape change'].append(
            shape_change(dstrf, batch_size=batch_size, verbose=verbose))

        del dstrf

    for k in nonlinearity:
        nonlinearity[k] = torch.stack(nonlinearity[k], dim=1)
        if reduction == 'mean':
            nonlinearity[k] = nonlinearity[k].mean(dim=1)
        elif reduction == 'median':
            # torch's median along a dim returns (values, indices); keep the values
            nonlinearity[k] = nonlinearity[k].median(dim=1).values
        elif reduction != 'none':
            raise NotImplementedError()

    return nonlinearity

def calc_psqi(*, u, s, norms, R2, df_Y):
    # Consider only elements with bounded norms
    inds = [i for i, (lower, upper) in enumerate(norms) if np.isinf(upper)]
    cols = [col for i, col in enumerate(df_Y.columns) if i in inds]
    print('PSQI: consider only %s' % cols)

    # Average R^2 across splits
    R2 = R2.mean(axis=0).round(3)

    u = u[:, inds]
    s = s[:, :, inds][:, inds, :]
    R2 = R2[inds]
    df_Y = df_Y.iloc[:, inds]
    norms = norms[inds]

    assert len(u.shape) == 2
    assert len(s.shape) == 3

    ret_ps = []
    ret_cs = []

    train_mean = df_Y.mean(axis=0)
    train_var = df_Y.var(axis=0)
    ws = softmax(np.clip(R2, 0, None))

    t = -time()
    for i, (mean, cov) in track(enumerate(zip(u, s)), total=len(u)):
        if i % 1000 == 0:
            print('PSQI: processed %d / %d' % (i, len(u)))
        ps, cs = _calc_psqi(
            mean,
            cov,
            norms,
            R2=R2,
            train_mean=train_mean,
            train_var=train_var,
        )
        ret_ps.append(ps)
        ret_cs.append(cs)
    t += time()
    print('PSQI: computations took %.1f seconds' % t)

    return np.array(ret_ps), np.array(ret_cs), ws

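# The weights `ws` above are a softmax over the clipped (non-negative) R^2 values;
# a tiny illustration, assuming `softmax` is scipy.special.softmax:
import numpy as np
from scipy.special import softmax

R2 = np.array([0.6, 0.1, -0.3])
ws = softmax(np.clip(R2, 0, None))  # negative R^2 is clipped to 0 before weighting
print(ws, ws.sum())                 # weights are positive and sum to 1
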
def evaluate_models(X, y, models, post_pred_proc_func=None, index=None):
    model_scores = dict()

    for model_name, model in track(models.items()):
        df_pred = generate_kfold_preds(X, y, model, index=index)

        if post_pred_proc_func is not None:
            df_pred['pred'] = post_pred_proc_func(df_pred['pred'])

        model_scores[model_name] = {
            'mae': mean_absolute_error(df_pred['true'], df_pred['pred']),
            'rmse': np.sqrt(mean_squared_error(df_pred['true'], df_pred['pred']))
        }

    df_model_scores = pd.DataFrame(model_scores)
    df_model_scores.index.name = 'metric'
    df_model_scores.columns.name = 'model'

    return df_model_scores

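# generate_kfold_preds is defined elsewhere in the module; a minimal sketch of the
# interface evaluate_models relies on (assumed behaviour: out-of-fold predictions
# returned in a dataframe with 'true' and 'pred' columns):
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def generate_kfold_preds(X, y, model, index=None, n_splits=5):
    preds = np.zeros(len(y), dtype=float)
    for train_idx, test_idx in KFold(n_splits=n_splits, shuffle=True, random_state=42).split(X):
        model.fit(X[train_idx], y[train_idx])
        preds[test_idx] = model.predict(X[test_idx])
    index = np.arange(len(y)) if index is None else index
    return pd.DataFrame({'true': y, 'pred': preds}, index=index)
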
def feature_selection(x_train,
                      y_train,
                      groups=None,
                      model=RandomForestRegressor(),
                      min_num_features=1,
                      max_num_features=None,
                      **sfs_kwargs):
    if max_num_features is None:
        max_num_features = 1 + x_train.shape[1]

    result_features = dict()
    result_scores = dict()

    for num_features in track(range(min_num_features, max_num_features)):
        sfs = SFS(
            model,
            k_features=num_features,
            **sfs_kwargs
        )
        sfs.fit(x_train, y_train, groups=groups)

        result_features[num_features] = sfs.k_feature_names_
        result_scores[num_features] = sfs.k_score_

    return result_features, result_scores

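# Example call of feature_selection above, assuming SFS is mlxtend's
# SequentialFeatureSelector (imported as SFS) and scikit-learn is available;
# the data here is synthetic and purely illustrative:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=0)
x_train = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])

result_features, result_scores = feature_selection(
    x_train, y,
    model=RandomForestRegressor(n_estimators=25, random_state=0),
    min_num_features=1, max_num_features=4,
    forward=True, scoring='neg_mean_absolute_error', cv=3)
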
def get_ukpn_incidents_info(incidents_url='https://www.ukpowernetworks.co.uk/Incidents/GetIncidents'):
    r_json = requests.get(incidents_url).json()

    incident_ids = (
        extract_ukpn_single_incident_ids(r_json) +
        extract_ukpn_multiple_incident_ids(extract_ukpn_multiple_incident_urls(r_json))
    )

    raw_incidents_info = dict()
    cleaned_incidents_info = dict()

    for incident_id in track(incident_ids, label='Details'):
        try:
            incident_detail_url = get_ukpn_incident_detail_url(incident_id)
            r_json = requests.get(incident_detail_url).json()

            raw_incidents_info[incident_id] = r_json
            cleaned_incidents_info[incident_id] = extract_ukpn_relevant_info(r_json)
        except:
            warn(f'Failed to retrieve incident details for: {incident_id}')

    return raw_incidents_info, cleaned_incidents_info

def evaluate_discharge_models(df, models, features_kwargs={}):
    df_features = construct_df_discharge_features(df, **features_kwargs)
    s_discharge = construct_discharge_s(df['demand_MW'], start_time='15:30', end_time='20:30')

    evening_datetimes = extract_evening_datetimes(df_features)

    X = df_features.loc[evening_datetimes].values
    y = s_discharge.loc[evening_datetimes].values

    model_scores = dict()
    peak_reduction_calc = construct_peak_reduction_calculator(
        s_demand=df['demand_MW'], evening_datetimes=evening_datetimes)

    for model_name, model in track(models.items()):
        df_pred = clean.generate_kfold_preds(X, y, model, index=evening_datetimes)
        df_pred['pred'] = post_pred_discharge_proc_func(df_pred['pred'])

        model_scores[model_name] = {
            'pct_optimal_reduction': peak_reduction_calc(df_pred['true'], df_pred['pred']),
            'optimal_discharge_mae': mean_absolute_error(df_pred['true'], df_pred['pred']),
            'optimal_discharge_rmse': np.sqrt(mean_squared_error(df_pred['true'], df_pred['pred']))
        }

    df_model_scores = pd.DataFrame(model_scores)
    df_model_scores.index.name = 'metric'
    df_model_scores.columns.name = 'model'

    return df_model_scores

geom_f = "/data/SciBigData/radargrams/geom/" pattern = geom_f + "/**/*_geom.lbl" log.info(pattern) allgeoms = glob.glob(pattern) log.info(f"found {len(allgeoms)} radargrams geom-files") # %% # %% # this is the actual loop injesting the geom files and simplyfying them out = [] ids = [] for f in track(allgeoms): tab = lbl_to_geom(f) df = read_geom_lbl(tab) geometry = LineString([Point(xy) for xy in zip(df.longitude, df.latitude)]) geometry = geometry.simplify(0.001) i = ids.append(match_sharad_id(f)) out.append(geometry) # %% m # %% df = DataFrame() df["track_id"] = ids # %%
def plot_raster_spread(cache):
    minima, mean, maxima = [], [], []
    for index in track(cache.cadastre_indeces):
        if index > 10_000:
            break
        tiles = cache.lidar_tiles(cadastre_index=index)
        masks = cache.mask_tiles(cadastre_index=index)
        for (tile, mask) in zip(tiles, masks):
            mask = mask.flatten()
            if mask.sum() <= 64:
                continue
            tile = tile.flatten()
            tile = np.ma.masked_where(tile < -3.4e38, tile)
            minima.append(tile.min())
            mean.append(tile.mean())
            maxima.append(tile.max())

    minima = np.array(minima)
    mean = np.array(mean)
    maxima = np.array(maxima)

    sorter = mean.argsort()
    minima = minima[sorter]
    mean = mean[sorter]
    maxima = maxima[sorter]

    configure_latex(width_scaler=2)
    fig, (sorted_ax, dist_ax) = plt.subplots(1, 2)

    x = np.arange(len(minima))
    sorted_ax.fill_between(
        x,
        mean,
        minima,
        label="Minima",
        color="g",
        facecolor="none",
        alpha=0.5,
    )
    sorted_ax.fill_between(
        x,
        mean,
        maxima,
        label="Maxima",
        color="r",
        facecolor="none",
        alpha=0.5,
    )
    sorted_ax.plot(x, mean, label="Mean", color="xkcd:black", linewidth=1.5)
    sorted_ax.legend(loc="upper left")
    sorted_ax.set_xlabel("Tiles sorted by mean elevation")
    sorted_ax.set_ylabel("Tile elevation [m]")

    elevation_range = maxima - minima
    dist_ax.hist(
        elevation_range,
        bins=200,
        density=True,
        label=(r"\textbf{Mean:} "
               f"{elevation_range.mean():.2f}m, "
               r"\textbf{SD:} "
               f"{elevation_range.std():.2f}m"),
    )
    dist_ax.set_xlabel(r"Range ($\max - \min$) [m]")
    dist_ax.legend()
    dist_ax.set_ylabel(r"Frequency")
    dist_ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
    sorted_ax.get_xaxis().set_ticks([])

    fig.savefig("/code/tex/img/elevation-spread.pdf")
    return fig, sorted_ax

def get_all_installation_allocations_df(df_search):
    col_renaming_map = {
        'Installation ID': 'installation_id',
        'Installation Name': 'installation_name',
        'Address City': 'installation_city',
        'Account Holder Name': 'account_holder',
        'Account Status': 'account_status',
        'Permit ID': 'permit_id',
        'Status': 'status'
    }

    df_installation_allocations = pd.DataFrame()

    # Retrieving raw data
    for country in track(df_search['country'].unique()):
        df_installation_allocations_country = pd.DataFrame()
        country_installations_links = df_search.loc[
            df_search['country'] == country, 'installations_link']

        for installations_link in track(country_installations_links, label=country):
            url_root, params = get_url_root_and_params(installations_link)
            df_installation_allocations_country_phase = get_installation_allocations_df(
                url_root, params)

            if df_installation_allocations_country.size > 0:
                df_installation_allocations_country = pd.merge(
                    df_installation_allocations_country,
                    df_installation_allocations_country_phase,
                    how='outer',
                    on=list(col_renaming_map.keys()))
            else:
                df_installation_allocations_country = df_installation_allocations_country_phase

        df_installation_allocations_country['country'] = country
        df_installation_allocations = df_installation_allocations.append(
            df_installation_allocations_country)

    # Collating update datetimes
    update_cols = df_installation_allocations.columns[
        df_installation_allocations.columns.str.contains('Latest Update')]
    df_installation_allocations['latest_update'] = df_installation_allocations[
        update_cols].fillna('').max(axis=1)
    df_installation_allocations = df_installation_allocations.drop(columns=update_cols)

    # Renaming columns
    df_installation_allocations = (df_installation_allocations
                                   .reset_index(drop=True)
                                   .rename(columns=col_renaming_map))

    # Sorting column order
    non_year_cols = ['country'] + list(col_renaming_map.values()) + ['latest_update']
    year_cols = sorted(list(set(df_installation_allocations.columns) - set(non_year_cols)))
    df_installation_allocations = df_installation_allocations[non_year_cols + year_cols]

    # Dropping header rows
    idxs_to_drop = df_installation_allocations['permit_id'].str.contains(
        r'\*').replace(False, np.nan).dropna().index
    df_installation_allocations = df_installation_allocations.drop(idxs_to_drop)

    return df_installation_allocations

def query_orchestrator(self, stream, query_args, service_type='xml', track_label=None, wait_time=0):
    check_date_rng_args = lambda query_args: all(
        x in query_args.keys() for x in ['start_date', 'end_date'])

    df = pd.DataFrame()

    if stream_info.streams[stream]['request_type'] == 'date_range':
        ## Dealing with date range requests - main concern is whether capping has been applied
        assert check_date_rng_args(
            query_args
        ), 'All date range queries should be provided with a "start_date" and "end_date".'

        self.capping_applied = True
        date_col = stream_info.streams[stream]['date_col']

        start_date, end_date = query_args['start_date'], query_args['end_date']
        absolute_start_date = start_date

        while self.capping_applied == True:
            response = self.make_request(stream, query_args, service_type)
            df_new = self.parse_response(response, stream)
            df = df.append(df_new)

            assert self.capping_applied is not None, 'Whether or not capping limits had been breached could not be found in the response metadata'

            if self.capping_applied == True:
                start_date = pd.to_datetime(df[date_col]).max().tz_localize(None)
                warnings.warn(
                    f'Response was capped, request is rerunning for missing data from {start_date}'
                )

                if pd.to_datetime(start_date) >= pd.to_datetime(end_date):
                    warnings.warn(
                        f'End date ({end_date}) was earlier than start date ({start_date})\nThe start date will be set to one day before the end date.'
                    )
                    start_date = (pd.to_datetime(end_date) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')

                query_args['start_date'] = start_date

                response = self.make_request(stream, query_args, service_type)
                df_new = self.parse_response(response, stream)
                df = df.append(df_new)

    elif stream_info.streams[stream]['request_type'] == 'SP_by_SP':
        ## Dealing with SP by SP requests - main concern is handling daylight savings
        assert check_date_rng_args(
            query_args
        ), 'All date range queries should be provided with a "start_date" and "end_date".'

        start_date, end_date = query_args['start_date'], query_args['end_date']
        absolute_start_date = start_date

        for key in ['start_date', 'end_date']:
            query_args.pop(key, None)

        df_dates_SPs = self.dt_rng_2_SPs(start_date, end_date)
        date_SP_tuples = list(df_dates_SPs.reset_index().itertuples(index=False, name=None))

        for datetime, query_date, SP in track(date_SP_tuples, label=track_label, total=len(date_SP_tuples)):
            query_args['query_date'] = query_date
            query_args['SP'] = SP

            # request and parse
            response = self.make_request(stream, query_args, service_type)
            df_SP = self.parse_response(response, stream)

            if df_SP.shape[0] != 0:
                # processing
                df_SP = self.expand_cols(df_SP)
                df_SP['datetime'] = datetime

                # saving
                df = df.append(df_SP, sort=False)

            time.sleep(wait_time)

    df = df.reset_index(drop=True)
    df = df[~df.duplicated()]

    ## Assigning local_datetime for relevant dt_rng streams
    if all(col in stream_info.streams[stream].keys() for col in ['date_col', 'SP_col']):
        df = self.add_local_datetime(df, absolute_start_date, end_date, stream)

    return df

def intersectGDFMP(gdf1, keyfield1, gdf2, keyfield2, areas_in=None, verbosity=1,
                   area_epsg=6931, apply_buffer=False, threshold=None, vertex_limit=10000,
                   check_inputs=True, num_process=None, num_block=None):
    '''Multiprocess overlap between geometries in two geodataframes

    Parameters
    ----------
    gdf1: GeoDataFrame (must match crs of gdf2, and have valid geometries if apply_buffer is False)
    keyfield1: column name in gdf1 which uniquely identifies each row and will be used to label the results
    gdf2: GeoDataFrame (must match crs of gdf1, and have valid geometries if apply_buffer is False)
    keyfield2: column name in gdf2 which uniquely identifies each row and will be used to label the results
    areas_in: list of lists of overlap areas between geometries in gdf1 and gdf2, with dimensions
        [gdf2.shape[0]][gdf1.shape[0]]; intersections will be calculated only for geometry pairs
        with nonzero entries (default None)
    verbosity: int, detail level of reporting during execution: 0=none, 1+=announce computation
        sections (default 1)
    area_epsg: int, convert to this epsg for area calculation (default 6931)
    apply_buffer: bool, use .buffer(0) to coerce valid geometries in gdf1 and gdf2
        (often works, sometimes corrupts geometry) (default False)
    threshold: float or None, a float value discards overlaps whose area is less than the threshold
        fraction of the largest overlap for each geometry in gdf2 (default None)
    vertex_limit: int>3, limit geometry elements to at most this number of vertices when
        performing internal calculations (default 10000)
    check_inputs: bool, set to False to circumvent input checks (crs match, geometry validity,
        areas_in shape) for speedup (default True)
    num_process: int or None, number of processes to use for calculation (coerced up to 1 and
        down to the number of CPUs in the system, default is the number of CPUs in the system
        minus two) (default None)
    num_block: int or None, number of rows of gdf2 to send to each process (coerced up to 1 and
        down to gdf2.shape[0], default is approximately gdf2.shape[0]/(3 times the number of
        CPUs in the system)) (default None)

    Returns
    -------
    GeoDataFrame: each row contains an overlap geometry between one row in gdf1 and one row in gdf2,
    with the following fields:
        keyfield1: entry from gdf1[keyfield1]
        keyfield1+' Index': index of row from gdf1
        keyfield2: entry from gdf2[keyfield2]
        keyfield2+' Index': index of row from gdf2
        'Overlap Geometry': intersection as shapely Polygon or MultiPolygon
        'Overlap Area': area of intersection geometry in units of the area_epsg coordinate reference system
        'Overlap Ratio': fraction of area that this geometry represents out of all intersection
            geometries involving the source row in gdf2

    Notes
    -----
    Relies on multiprocess, pandas, geopandas, numpy, and time
    Returns None on input error after printing error description
    '''
    if verbosity >= 1:
        print('Executing intersectGDFMP')
    begin_time = time.time()

    # Input checks
    NCPU = multiprocess.cpu_count()
    N2 = gdf2.shape[0]
    if check_inputs:
        if num_process is None:
            # empirical factor for circa 2020 multicore computers, prevents lockup by leaving
            # processes available for the system and other applications
            num_process = max(1, NCPU - 2)
        else:
            try:
                num_process = int(num_process)
            except (ValueError, TypeError):
                print(f'Input Error: num_process must be an integer (there are {NCPU} CPUs detected, input was: {num_process})')
                return None
            if num_process < 0:
                tmp = num_process
                num_process = max(1, NCPU - num_process)
            else:
                num_process = min(NCPU, num_process)
        if verbosity >= 1:
            print(f'  num_process set to {num_process} (of {NCPU})')

        if num_block is None:
            num_block = N2
            if num_process > 1:
                # empirical factor based on Canadian census CFSA and DA cartographic geometry
                # overlap calculation
                num_block = max(1, num_block // (3 * num_process))
        else:
            try:
                num_block = int(num_block)
            except (ValueError, TypeError):
                print(f'Input Error: num_block must be an integer (input was: {num_block})')
                return None
            num_block = max(1, min(num_block, N2))
        if verbosity >= 1:
            print(f'  num_block set to {num_block} (of {N2})')

        # Ensure coordinate reference systems match
        if gdf1.crs != gdf2.crs:
            print(f"Input coordinate reference systems must be the same: gdf1.crs={gdf1.crs}, gdf2.crs={gdf2.crs}")
            return None

        # Ensure geometries are valid
        n_invalid1 = gdf1.shape[0] - sum(gdf1.geometry.is_valid)
        n_invalid2 = gdf2.shape[0] - sum(gdf2.geometry.is_valid)
        if not apply_buffer and (n_invalid1 > 0 or n_invalid2 > 0):
            print(f"Input geometries must be valid: {n_invalid1} invalid geometries in gdf1, {n_invalid2} invalid geometries in gdf2")
            return None

        # Ensure areas_in is the proper size
        if not areas_in is None:
            N2 = len(areas_in)
            if N2 != gdf2.shape[0]:
                print(f"Input areas_in must have length {gdf2.shape[0]}, actual length is {N2}")
                return None
            N1 = len(areas_in[0])
            irreg = not all([len(a) == N1 for a in areas_in])
            if len(areas_in) != gdf2.shape[0] or N1 != gdf1.shape[0] or irreg:
                print(f"Input areas_in must have size [{gdf2.shape[0]}][{gdf1.shape[0]}], actual shape is [{N2}][{N1}]{' with irregular second dimension length' if irreg else ''}")
                return None

        # Checks performed here, no need to do so in intersectGDF
        check_inputs = False

    NDIV = N2 // num_block + (0 if N2 % num_block == 0 else 1)
    ITERIND = lambda ind: slice(ind * num_block, ((ind + 1) * num_block if (ind + 1) * num_block < N2 else N2))

    results = [None] * NDIV
    start_time = time.time()
    completed = 0
    with multiprocess.Pool(num_process) as pool:
        print(f'Generating pool, P={num_process}, N={N2}, DIV={num_block}, NDIV={NDIV}')
        ret = [
            pool.apply_async(
                intersectGDF,
                (gdf1, keyfield1, gdf2.iloc[ITERIND(ind), :], keyfield2,
                 areas_in if areas_in is None else areas_in[ITERIND(ind)],
                 verbosity, area_epsg, apply_buffer, threshold, vertex_limit, check_inputs))
            for ind in range(NDIV)
        ]
        print('Processing pool')
        # Alternative: initialize pb and call next(pb) in loop, instead of having a while loop
        # to process all updates since last loop
        for i in ipypb.track(range(NDIV)):
            while True:
                indb_finished = [r.ready() for r in ret]
                indb_empty = [r is None for r in results]
                indb_update = [f and e for f, e in zip(indb_finished, indb_empty)]
                if any(indb_update):
                    ind_update = indb_update.index(True)
                    # Set finite timeout so that it returns properly
                    results[ind_update] = ret[ind_update].get(999)
                    completed += 1
                    if completed % 10 == 0 or completed == 1 or completed == NDIV:
                        print(f'Finished {completed}/{NDIV}, wall time {time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time))}')
                    break
                # Make the infinite loop run slower; might be improved with explicit async
                time.sleep(1)

    wall_time = time.time() - start_time
    processor_time = sum([r[1] for r in results])
    gdf_joined = pandas.concat([r[0] for r in results]).sort_values(keyfield2, ignore_index=True)
    print(f'Pool processing concluded, process count {sum([r!=None for r in results])}/{NDIV}, wall time {time.strftime("%H:%M:%S", time.gmtime(wall_time))}, processor time {time.strftime("%H:%M:%S", time.gmtime(processor_time))}, processor:wall ratio {processor_time/wall_time:.3}x')
    print(f'Total time: {time.strftime("%H:%M:%S", time.gmtime(time.time()-begin_time))}')

    return gdf_joined

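# Hypothetical call of intersectGDFMP on two tiny GeoDataFrames (the single-process
# worker intersectGDF that it dispatches to is defined elsewhere in the module):
import geopandas
from shapely.geometry import box

gdf1 = geopandas.GeoDataFrame({'region': ['A', 'B']},
                              geometry=[box(0, 0, 2, 2), box(2, 0, 4, 2)],
                              crs='EPSG:4326')
gdf2 = geopandas.GeoDataFrame({'zone': ['Z1']},
                              geometry=[box(1, 0, 3, 2)],
                              crs='EPSG:4326')

gdf_overlaps = intersectGDFMP(gdf1, 'region', gdf2, 'zone', verbosity=1, num_process=2)
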
def mkl_set_num_threads(cores):
    mkl_rt.mkl_set_num_threads(ctypes.byref(ctypes.c_int(cores)))

mkl_set_num_threads(20)
print('CPUs used now: ', mkl_get_max_threads())

NTHREADS = 20  # !

libppr = cdll.LoadLibrary('cpp/pprlib.so')
libppr.init_rand()

datapath = '/data/frededata/'

for mat in track([
    'academic_confs.mat', 'academic_coa_2014.mat', 'flickr.mat', 'vk2016.mat'
]):
    tp.send_text(mat[:-4] + ' started ppr evaluation')
    matf = loadmat(datapath + mat)
    G = matf['network']
    n = G.shape[0]
    print(mat, 'n = ', n)

    inds = G.indptr
    inds = np.append(inds, n).astype(np.int32)
    degrees = G.sum(axis=0).A1.astype(np.int32)

    ppr = np.zeros(n * n, dtype=np.float32)
    libppr.ppr_mat_matmul(ppr.ctypes.data_as(POINTER(c_float)),
                          inds.ctypes.data_as(POINTER(c_int)),
                          G.indices.ctypes.data_as(POINTER(c_int)),
                          degrees.ctypes.data_as(POINTER(c_int)),
                          n, c_float(0.85), 1, c_double(1e-6), 100,