def calc_nn(uid, tree=ball_tree):
    """Find the nearest site for every position reported by one vessel.

    Pulls all (id, lat, lon) rows for the given uid from source_table,
    queries the prebuilt BallTree for the nearest candidate site, and
    inserts (distance_km, site_id, position_id) rows into target_table.

    :param uid: one-element sequence whose first item is the uid to
        process (matches the row tuples handed out by multiprocessing).
    :param tree: BallTree built over site lat/lon in radians
        (haversine metric); defaults to the module-level ball_tree.
    :return: None.  Results are written directly to the database.
    """
    print('Working on uid:', uid[0])
    iteration_start = datetime.datetime.now()
    loc_engine = gsta.connect_engine(db_config.colone_cargo_params,
                                     print_verbose=False)
    # Pass the uid as a bind parameter instead of interpolating it into the
    # SQL text: correct quoting and no SQL injection.  The table name cannot
    # be a bind parameter, so it stays in the f-string (it is a module-level
    # constant, not user input).
    read_sql = f"""SELECT id, lat, lon
                   FROM {source_table}
                   WHERE uid = %(uid)s;"""
    df = pd.read_sql(sql=read_sql, con=loc_engine, params={'uid': uid[0]})
    loc_engine.dispose()
    # Now we are going to use sklearn's BallTree to find the nearest neighbor
    # of each position for the nearest port.  The resulting port_id and dist
    # will be pushed back to the db with the id, uid, and time to be used in
    # the network building phase of analysis.  This takes up more memory, but
    # means we have fewer joins.  Add an index on uid though before running
    # network building.
    # transform to radians (BallTree's haversine metric requires radians)
    points_of_int = np.radians(df.loc[:, ['lat', 'lon']].values)
    # query the tree; k=1 -> single nearest neighbor per position
    dist, ind = tree.query(points_of_int, k=1, dualtree=True)
    # make the data list to pass to the sql query:
    # (dist_km, site_id, position_id) per row.  haversine returns the central
    # angle in radians, so multiply by the IUGG mean Earth radius (6371.0088
    # km) to get kilometers.
    data = np.column_stack(
        (np.round(((dist.reshape(1, -1)[0]) * 6371.0088), decimals=3),
         sites.iloc[ind.reshape(1, -1)[0], :].port_id.values.astype('int'),
         df['id'].values))
    # define the sql statement
    sql_insert = f"INSERT INTO {target_table} " \
                 "(nearest_site_dist_km, nearest_site_id, id) " \
                 "VALUES(%s, %s, %s);"
    # write to db; try/finally guarantees the connection is closed even if
    # the insert fails partway through, and the cursor context manager
    # closes the cursor.
    loc_conn = gsta.connect_psycopg2(db_config.colone_cargo_params,
                                     print_verbose=False)
    try:
        with loc_conn.cursor() as c:
            c.executemany(sql_insert, data.tolist())
        loc_conn.commit()
    finally:
        loc_conn.close()
    print(f'UID {uid[0]} complete in:',
          datetime.datetime.now() - iteration_start)
# time tracking
import datetime

from sklearn.neighbors import BallTree
from sklearn.metrics.pairwise import haversine_distances

#%% Make and test conn and cursor using psycopg,
# and create an engine using sql alchemy
# Geo-Spatial Temporal Analysis package
import gsta
import db_config

conn = gsta.connect_psycopg2(db_config.loc_cargo_params)
loc_engine = gsta.connect_engine(db_config.loc_cargo_params)


#%% center and purity calc functions
def get_ports_wpi(engine):
    """Read the World Port Index ('wpi') table as a DataFrame.

    Renames the columns to the short names used throughout this script.

    :param engine: SQLAlchemy engine/connection to read from.  (Fix: the
        function previously ignored this argument and always read from the
        module-level loc_engine; it now honors the engine passed in.)
    :return: DataFrame with columns port_id, port_name, lat, lon.
    """
    ports = pd.read_sql('wpi', engine,
                        columns=['index_no', 'port_name',
                                 'latitude', 'longitude'])
    ports = ports.rename(columns={'latitude': 'lat',
                                  'longitude': 'lon',
                                  'index_no': 'port_id'})
    return ports
import numpy as np import datetime from multiprocessing import Pool import os from sklearn.neighbors import BallTree cores = os.cpu_count() workers = cores - 1 print( f'This machine has {cores} cores. Will use {workers} for multiprocessing.' ) # set tables for processing source_table = 'ship_sample' target_table = 'nearest_sample' engine = gsta.connect_engine(db_config.colone_cargo_params) #%% get the sits as a df from the database sites = gsta.get_sites(engine) engine.dispose() # build the BallTree using the ports as the candidates candidates = np.radians(sites.loc[:, ['lat', 'lon']].values) ball_tree = BallTree(candidates, leaf_size=40, metric='haversine') def calc_nn(uid, tree=ball_tree): print('Working on uid:', uid[0]) iteration_start = datetime.datetime.now() loc_engine = gsta.connect_engine(db_config.colone_cargo_params, print_verbose=False) read_sql = f"""SELECT id, lat, lon
# reload modules when making edits from importlib import reload reload(gsta) #%% df = pd.read_csv('sample_ship_posits.csv') df['time'] = pd.to_datetime(df['time']) try: df.drop(['Unnamed: 0'], inplace=True, axis=1) except Exception as e: print (e) # %% # conn = gsta.connect_psycopg2(db_config.colone_cargo_params) loc_engine = gsta.connect_engine(db_config.colone_cargo_params, print_verbose=False) # %% create a df from the database df = pd.read_sql_query("SELECT id, time, lat, lon, cog, sog, status, anchored, moored, underway" " FROM ais_cargo.public.uid_positions " "WHERE uid = '636016432'" "ORDER BY time", loc_engine) #df.to_csv('sample_ship_posits.csv', index=False) #%% def calc_dist(df, unit='nm'): """ Takes a df with id, lat, and lon and returns the distance between the previous point to the current point as a series. :param df: :return: