def __init__(self):
    self.data = read_carto('meuse')
    self.data['log_zinc'] = np.log(self.data['zinc'])
    self.data = self.data.to_crs({'init': 'epsg:28992'})
    self.data_lonlat = self.data.to_crs({'init': 'epsg:4326'})
    self.data_grid = read_carto('meuse_grid')
    self.data_grid = self.data_grid.to_crs({'init': 'epsg:28992'})
    self.data_grid_lonlat = self.data_grid.to_crs({'init': 'epsg:4326'})
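Note that the `{'init': 'epsg:XXXX'}` dict form passed to `to_crs` is deprecated in GeoPandas 0.7+/pyproj 2+. A minimal sketch of the equivalent keyword form, assuming those newer versions are installed (an assumption about the environment, not part of the original snippet):

# Equivalent reprojection on newer GeoPandas/pyproj, where the
# {'init': ...} dict syntax emits a deprecation warning:
self.data = self.data.to_crs(epsg=28992)
self.data_lonlat = self.data.to_crs(epsg=4326)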
def test_augment_with_persist_as(self):
    """DataObsClient.augment with persist_as"""
    do = DataObsClient(self.credentials)
    meta = do.discovery(self.test_read_table,
                        keywords=('poverty', ),
                        time=('2010 - 2014', ))
    gdf = do.augment(self.test_data_table, meta)
    anscols = set(meta['suggested_name'])
    origcols = set(
        read_carto(self.test_data_table,
                   credentials=self.credentials,
                   limit=1,
                   decode_geom=True).columns)
    self.assertSetEqual(
        anscols, set(gdf.columns) - origcols - {'the_geom', 'cartodb_id'})

    meta = [
        {
            'numer_id': 'us.census.acs.B19013001',
            'geom_id': 'us.census.tiger.block_group',
            'numer_timespan': '2011 - 2015'
        },
    ]
    gdf = do.augment(self.test_data_table, meta,
                     persist_as=self.test_write_table)
    self.assertSetEqual(
        set(('median_income_2011_2015', )),
        set(gdf.columns) - origcols - {'the_geom', 'cartodb_id'})
    self.assertEqual(gdf.index.name, 'cartodb_id')
    self.assertEqual(gdf.index.dtype, 'int64')

    df = read_carto(self.test_write_table,
                    credentials=self.credentials,
                    decode_geom=False)
    self.assertEqual(df.index.name, 'cartodb_id')
    self.assertEqual(df.index.dtype, 'int64')

    # same number of rows
    self.assertEqual(len(df), len(gdf), msg='Expected number of rows')
    # same type of object
    self.assertIsInstance(df, pd.DataFrame, 'Should be a pandas DataFrame')
    # same column names
    self.assertSetEqual(set(gdf.columns.values),
                        set(df.columns.values),
                        msg='Should have the columns requested')
    # should have expected schema
    self.assertEqual(sorted(tuple(str(d) for d in df.dtypes)),
                     sorted(tuple(str(d) for d in gdf.dtypes)),
                     msg='Should have same schema/types')
def __init__(self, var, var_value):
    self.filename = '/tmp/UK_Police_street_crimes_2019_04.csv'
    self.data = read_carto('uk_police_street_crimes_2019_04')
    self.data = self.data[self.data[var] == var_value]
    self.data_lonlat = read_carto('''
        SELECT c.*
        FROM uk_police_street_crimes_2019_04 as c
        JOIN london_borough_excluding_mhw as g
        ON ST_Intersects(c.the_geom, g.the_geom)
    ''')
    self.data = self.data.to_crs({'init': 'epsg:32630'})
def test_isochrones_from_table_query_as_new_table(self):
    self.skip(if_no_credits=True, if_no_credentials=True)
    iso = Isolines(credentials=self.credentials)
    gdf = read_carto(self.points_query())
    result_table_name = self.get_test_table_name('isotbr')
    quota = self.used_quota(iso)

    # Preview
    result = iso.isochrones(gdf, [100, 1000], mode='car',
                            table_name=result_table_name,
                            dry_run=True, exclusive=True).metadata
    self.assertEqual(result.get('required_quota'), 6)
    self.assertEqual(self.used_quota(iso), quota)

    # Isochrones
    result = iso.isochrones(gdf, [100, 1000], mode='car',
                            table_name=result_table_name,
                            exclusive=True).data
    self.assertTrue(isinstance(result, GeoDataFrame))
    self.assertTrue(result.is_remote())
    quota += 6
    self.assertEqual(self.used_quota(iso), quota)

    result_columns = result.get_column_names()
    self.assertTrue('the_geom' in result_columns)
    self.assertTrue('data_range' in result_columns)
    self.assertEqual(result.get_num_rows(), 6)
    self.assertTrue('cartodb_id' in result_columns)
    self.assertFalse('source_id' in result_columns)
def __init__(self):
    self.data_carto = read_carto('boston_housing')
    ## Rename the geometry column from 'the_geom' to 'geometry'
    ## (PySAL expects the geometry column to be called 'geometry')
    self.data = self.data_carto.copy()
    self.data['geometry'] = self.data.geometry
    self.data.drop(['the_geom'], axis=1, inplace=True)
    self.data = gpd.GeoDataFrame(self.data, geometry='geometry')
    self.w = Queen.from_dataframe(self.data)
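Downstream, the Queen contiguity weights built here would typically feed a spatial autocorrelation statistic. A hedged usage sketch, assuming the `esda` package is installed; the class name `BostonData` and the `medv` column are hypothetical, not part of the original snippet:

from esda.moran import Moran

obj = BostonData()                   # hypothetical name for the class above
mi = Moran(obj.data['medv'], obj.w)  # 'medv' is an assumed column name;
                                     # Moran row-standardizes w by default
print(mi.I, mi.p_sim)                # global Moran's I and pseudo p-value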
def get_table(tablename):
    """Retrieve tablename as a GeoDataFrame ordered by database id

    Returns:
        geopandas.GeoDataFrame: GeoDataFrame representation of table
    """
    base_query = ("SELECT * FROM {tablename} ORDER BY cartodb_id ASC").format(
        tablename=tablename)
    data_carto = read_carto(base_query)
    ## Rename the geometry column from 'the_geom' to 'geometry'
    ## (PySAL expects the geometry column to be called 'geometry')
    data = data_carto.copy()
    data['geometry'] = data.geometry
    data.drop(['the_geom'], axis=1, inplace=True)
    data = gpd.GeoDataFrame(data, geometry='geometry')
    data.crs = {"init": "epsg:4326"}
    return data
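A minimal usage sketch for `get_table`; the table name is illustrative and would have to exist in the connected CARTO account:

gdf = get_table('boston_housing')  # hypothetical table name
print(type(gdf))                   # <class 'geopandas.geodataframe.GeoDataFrame'>
print(gdf.crs)                     # {'init': 'epsg:4326'}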
def get_carto_years(carto_table, data_col):
    '''
    Given a Carto table name and a column where we expect to have data, this
    function will return a list of all the years for which there is data in the table
    INPUT   carto_table: name of Carto table (string)
            data_col: name of column where we want to make sure we have data (string)
    RETURN  carto_years: years in table for which we have data (list of integers)
    '''
    # if there are multiple data columns to check
    if ';' in data_col:
        # turn columns into a list
        cols = data_col.split(';')
        # generate a WHERE statement to use in SQL query to remove rows where these columns are null
        where = ''
        for col in cols:
            where += col + ' IS NOT NULL AND '
        where = where[:-5]
    # if there is only one column to check
    else:
        # generate a WHERE statement to use in SQL query to remove rows where this column is null
        where = data_col + ' IS NOT NULL'
    # query Carto table to get rows where there is data
    carto_df = cartoframes.read_carto(f'SELECT * FROM {carto_table} WHERE {where}', credentials=auth)
    # pull out a list of years from the 'year' column
    carto_years = [int(year) for year in np.unique(carto_df['year'])]
    # get count of occurrences of each year
    vc = carto_df['year'].value_counts()
    # pull out list of years with fewer than 10 data points
    years_to_drop = vc[vc < 10].index
    # keep list of these years that are more than 10 years old
    years_to_drop = [year for year in years_to_drop if year < datetime.datetime.utcnow().year - 10]
    # remove years with fewer than 10 countries of data if they are more than 10 years old
    carto_years = [year for year in carto_years if year not in years_to_drop]
    # put these years in order from oldest to newest
    carto_years.sort()
    return carto_years
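A hedged usage sketch for `get_carto_years`; the table and column names are illustrative, and `auth` must already hold valid cartoframes credentials:

years = get_carto_years('soc_107_rw0_population_edit', 'population')  # hypothetical names
print(years)  # e.g. [2000, 2005, 2010, 2015]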
def test_augment(self):
    """DataObsClient.augment"""
    do = DataObsClient(self.credentials)
    meta = do.discovery(self.test_read_table,
                        keywords=('poverty', ),
                        time=('2010 - 2014', ))
    gdf = do.augment(self.test_data_table, meta)
    anscols = set(meta['suggested_name'])
    origcols = set(
        read_carto(self.test_data_table,
                   credentials=self.credentials,
                   limit=1,
                   decode_geom=True).columns)
    self.assertSetEqual(
        anscols, set(gdf.columns) - origcols - {'the_geom', 'cartodb_id'})

    meta = [
        {
            'numer_id': 'us.census.acs.B19013001',
            'geom_id': 'us.census.tiger.block_group',
            'numer_timespan': '2011 - 2015'
        },
    ]
    gdf = do.augment(self.test_data_table, meta)
    self.assertSetEqual(
        set(('median_income_2011_2015', )),
        set(gdf.columns) - origcols - {'the_geom', 'cartodb_id'})

    with self.assertRaises(ValueError, msg='no measures'):
        meta = do.discovery('United States', keywords='not a measure')
        do.augment(self.test_read_table, meta)

    with self.assertRaises(ValueError, msg='too many metadata measures'):
        # returns ~180 measures
        meta = do.discovery(region='united states', keywords='education')
        do.augment(self.test_read_table, meta)
def test_isochrones_from_dataframe_as_new_table(self):
    self.skip(if_no_credits=True, if_no_credentials=True)
    iso = Isolines(credentials=self.credentials)
    df = DataFrame(self.points, columns=['name', 'the_geom'])
    quota = self.used_quota(iso)
    table_name = self.get_test_table_name('isodfds')

    # Preview
    result = iso.isochrones(df, [100, 1000], mode='car',
                            table_name=table_name,
                            dry_run=True, exclusive=True).metadata
    self.assertEqual(result.get('required_quota'), 6)
    self.assertEqual(self.used_quota(iso), quota)

    # Isochrones
    result = iso.isochrones(df, [100, 1000], mode='car',
                            table_name=table_name, exclusive=True).data
    self.assertTrue(isinstance(result, GeoDataFrame))
    quota += 6
    self.assertEqual(self.used_quota(iso), quota)
    self.assertTrue('the_geom' in result)
    self.assertTrue('data_range' in result)
    self.assertEqual(len(result.index), 6)

    gdf = read_carto(table_name, credentials=self.credentials)
    result_columns = gdf.columns
    self.assertTrue('the_geom' in result_columns)
    self.assertTrue('data_range' in result_columns)
    self.assertEqual(len(gdf.index), 6)
    self.assertTrue('source_id' in result_columns)
    df_resp['pfaf_id_12'] = row['pfaf_id_12']
    df_resp['hybas_id_6'] = row['hybas_id_6']
    df_resp['hybas_id_3'] = row['hybas_id_3']
    df_resp['variable'] = variable
    df_resp['depth'] = depth
    cols_reorder = [
        'longitude', 'latitude', 'gridCentreLon', 'gridCentreLat', 'dt',
        'variable', 'depth', 'hyriv_id', 'pfaf_id_12', 'hybas_id_6',
        'hybas_id_3'
    ]
    df_resp = df_resp[cols_reorder]
    return df_resp

logger.debug('Pull river mouth data from Carto')
gdf_mouths = read_carto('ocn_calcs_010_target_river_mouths')

results = []
logger.info('Initiate multithreading for WMS requests')
# request all records for all permutations, combine into single dataframe
with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
    args_list = []
    for index, row in gdf_mouths.iterrows():
        args_list.append([index, row, variables[-1], depths[-1]])
    future_to_args = {
        executor.submit(pull_data, args[1], args[2], args[3]): args
        for args in args_list
    }
    for future in concurrent.futures.as_completed(future_to_args):
        args = future_to_args[future]
        try:
}
RETRIEVE_URL_HEADERS = {
    'Authorization': os.getenv('GFW_API_KEY')
}
WORKING_DIR = os.path.join(os.getenv('DOWNLOAD_DIR'), 'gfw-api-data')
Path(WORKING_DIR).mkdir(parents=True, exist_ok=True)

logger.debug('Retrieve polygons from Carto')
# retrieve polygons from carto
eez_table = 'com_011_rw1_maritime_boundaries_edit'
# dataset includes following types:
# '12NM', '24NM', '200NM', 'Overlapping claim', 'Joint regime'
# the final three are of potential relevance here
# collect the data for them all, but maintain the distinction for later ease
gdf_zones = read_carto("SELECT *, ST_AsGeoJSON(the_geom) AS the_geom_geojson "
                       "FROM com_011_rw1_maritime_boundaries_edit "
                       "WHERE pol_type IN ('Overlapping claim','200NM','Joint regime') "
                       "AND mrgid=8464",
                       index_col='cartodb_id')
gdf_zones = gdf_zones.astype({'mrgid': 'int', 'mrgid_ter1': 'int',
                              'mrgid_sov1': 'int', 'mrgid_eez': 'int'})
gdf_zones['json'] = gdf_zones.the_geom.to_json()

# create set of pairs of dates to loop through
date_pairs = [(date(year, 1, 1), date(year, 12, 31)) for year in range(2012, 2022)]

# create object to track api activity & results
# mrgid, geoname, year, id, url, zip, csv, value
# needed from original table: geoname, pol_type, iso_ter1, iso_sov1,
# iso_ter2, iso_sov2, iso_ter3, iso_sov3
col_type_dict = {
    'mrgid': 'int',
    'geoname': 'str',
    'pol_type': 'str',
    'iso_ter1': 'str',
    zip_file = zip_file_template.format(region_id)
    with zipfile.ZipFile(zip_file, 'r') as zip:
        zip.extractall(local_data_dir)

gdf_l12 = None
for region_id in region_ids:
    shp_file = shp_file_template.format(region_id)
    gdf_reg = gpd.read_file(shp_file, ignore_geometry=True)
    if gdf_l12 is None:
        gdf_l12 = gdf_reg
    else:
        gdf_l12 = gdf_l12.append(gdf_reg, ignore_index=True,
                                 verify_integrity=True, sort=False)
gdf_l12 = gdf_l12.astype({'HYBAS_ID': 'str', 'PFAF_ID': 'str'})

# load processed river mouth dataset
gdf_mouths = read_carto('ocn_calcs_010_target_river_mouths')
if 'pfaf_id_12' in gdf_mouths.columns:
    gdf_mouths.drop(columns=['pfaf_id_12', 'pfaf_id_5', 'pfaf_id_3',
                             'hybas_id_5', 'hybas_id_3'],
                    inplace=True, errors='ignore')

# join level12 river basin data to river mouth data
# allows us to get the pfaf_id for each terminal river mouth level 12 basin
gdf_mouths = gdf_mouths.merge(gdf_l12, how='left', left_on='hybas_l12',
                              right_on='HYBAS_ID', sort=False,
                              validate='many_to_one')
gdf_mouths.drop(columns=['HYBAS_ID', 'NEXT_DOWN', 'NEXT_SINK', 'MAIN_BAS',
                         'DIST_SINK', 'DIST_MAIN', 'SUB_AREA', 'UP_AREA',
                         'ENDO', 'COAST', 'ORDER', 'SORT'],
                inplace=True, errors='raise')
gdf_mouths.rename(columns={'PFAF_ID': 'pfaf_id_12'}, inplace=True, errors='raise')

# use level 12 basin pfaf_id to find the corresponding level 5 basin
gdf_mouths['pfaf_id_5'] = gdf_mouths['pfaf_id_12'].str.slice(stop=5)
gdf_l5 = read_carto("SELECT hybas_id::text, pfaf_id::text FROM wat_068_rw0_watersheds_edit WHERE level=5")
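The slice works because Pfafstetter basin codes nest by prefix: the first five digits of a level-12 pfaf_id identify its level-5 parent basin, so `gdf_l5` can be joined back on `pfaf_id_5`. One plausible continuation, shown only as a sketch (not the script's actual next lines):

# join the level-5 basin ids onto the river mouths via the shared prefix
gdf_mouths = gdf_mouths.merge(
    gdf_l5.rename(columns={'hybas_id': 'hybas_id_5', 'pfaf_id': 'pfaf_id_5'}),
    how='left', on='pfaf_id_5', validate='many_to_one')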