def run(): """ TODO: write docstring """ # Set environment variables settings.load() # Get PostgreSQL database credentials psql_credentials = settings.get_psql() # Create SQLAlchemy engine from database credentials engine = create_connection_from_dict(psql_credentials, 'postgresql') # Get data to process from postgres quants_df = execute_sql('select * from features.quants;', engine, read_file=False, return_df=True) data_dir = settings.get_data_dir() labeled_fishing_dir = data_dir / 'labeled_data' / 'fishing' labeled_nonfishing_dir = data_dir / 'labeled_data' / 'nonfishing' cnn_split_dir = data_dir / 'cnn_split' if cnn_split_dir.exists(): shutil.rmtree(cnn_split_dir, ignore_errors=False, onerror=None) cnn_split_dir.mkdir(parents=True, exist_ok=True) # Create labeled data print('Creating labeled data.') fishy_stuff = fishing_prefilter(quants_df) nonfish = nonfishing_dataframe_creator(quants_df, fishy_stuff) dataset = sampler(fishy_stuff, nonfish) trajectory_separator(dataset, data_dir) # Create train / test split print("Creating train/test split") split_data(labeled_fishing_dir, labeled_nonfishing_dir, cnn_split_dir, binary_name='fishing', set_seed=223) # Train the cnn run_cnn(cnn_split_dir, batchsize=256, epochs=50, color_mode='rgb', start_filters=8, depth=2, dense_count = 2, dense_neurons = 256, bnorm = False)
def run(mode='replace', suffix='', chunksize=10000):
    """
    Runs the end-to-end pipeline (depends on ETL completion).

    Parameters
    ----------
    mode : str
        If 'replace', overwrite existing results; if 'append', append to existing results.
    suffix : str
        Suffix, if any, to append to table names (for testing purposes).
    chunksize : int
        Maximum number of rows to write per chunk, i.e. the most we expect a
        DataFrame to hold in memory without crashing.

    Returns
    -------
    None
    """
    ######## Prepare everything needed for the model ########

    # Load environment variables
    settings.load()

    # Get folders
    ROOT_FOLDER = settings.get_root_dir()
    SQL_FOLDER = os.path.join(ROOT_FOLDER, 'sql/')
    RESULTS_FOLDER = os.path.join(ROOT_FOLDER, 'results/')

    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()

    # Get OTP settings: port number of load balancer and number of OTP containers (i.e. splits)
    host, port, num_splits = settings.get_otp_settings()

    # Load model configuration
    model_config = os.path.join(ROOT_FOLDER, 'config/base/model_config.yaml')
    print('Configure models')
    params = load_yaml(model_config)
    population_dict = params.get('populations')
    poi_dict = params.get('points_of_interest')
    time_defs = params.get('time_defs')
    time_strata = params.get('time_strata')
    hyper_params = params.get('hyper_params')
    metrics = params.get('metrics')
    print('Model parameters loaded')

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')
    print('Database connected')

    # Sample timestamps and write to MODEL.timestamps
    model_functions.create_timestamps(time_defs, time_strata,
                                      n_timepoints=hyper_params.get('n_timepoint'),
                                      engine=engine, suffix=suffix)

    # Generate MODEL.k_poi (K-nearest POIs)
    model_functions.create_k_poi(SQL_FOLDER, k_poi=hyper_params.get('k_POI'),
                                 poi_dict=poi_dict, suffix=suffix, engine=engine)

    # Configure OTP query parameters and save to MODEL.trips
    model_functions.create_trips(SQL_FOLDER, suffix=suffix, engine=engine, mode=mode)

    ######## Run models and write results to database ########

    # Generate RESULTS.populations
    model_functions.compute_populations(SQL_FOLDER, population_dict, engine)

    # Generate RESULTS.trips
    model_functions.split_trips(host, port, num_splits, SQL_FOLDER, RESULTS_FOLDER,
                                engine, psql_credentials, suffix=suffix, mode=mode,
                                chunksize=chunksize)

    ######## Generate data for visualization and save to database ########

    # Generate VIS.map_attributes
    model_functions.compute_map_attributes(SQL_FOLDER, metrics, engine, suffix=suffix)

    # Generate three histogram tables:
    # VIS.histograms_oa_population, VIS.histograms_oa_access, VIS.histograms_individual_access
    model_functions.compute_histograms(engine, suffix=suffix)

    # Generate VIS.scoreboard
    model_functions.compute_scoreboard(engine, suffix=suffix)
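# For reference, the keys read from config/base/model_config.yaml above imply a
# structure roughly like the dict below (as returned by load_yaml). The top-level
# keys and the hyper_params keys 'n_timepoint' and 'k_POI' come from the code; all
# values and nested key names are illustrative assumptions, not the project's real
# configuration.
example_model_config = {
    'populations': {'adults': 'population.adults'},        # population name -> source table (assumed)
    'points_of_interest': {'hospitals': 'poi.hospitals'},  # POI type -> source table (assumed)
    'time_defs': {'weekday': ['Monday', 'Tuesday']},       # named day groupings (assumed)
    'time_strata': ['morning_peak', 'off_peak'],           # strata to sample timestamps from (assumed)
    'hyper_params': {'n_timepoint': 10, 'k_POI': 3},       # keys used via .get() in run()
    'metrics': ['travel_time', 'num_transfers'],           # metrics for VIS.map_attributes (assumed)
}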
from dashboard import individual_level
from app import app
from utils import *
from dash.dependencies import Input, Output, State, ClientsideFunction
import dash_core_components as dcc
import dash_html_components as html
import settings
import os

# Set up app
settings.load()

# Get folders
ROOT_FOLDER = settings.get_root_dir()

# Get PostgreSQL database credentials
psql_credentials = settings.get_psql()
mapbox_access_token = settings.get_mapbox_token()

# Create SQLAlchemy engine from database credentials
engine = create_connection_from_dict(psql_credentials, 'postgresql')
print('Database connected')

app.layout = html.Div([
    html.H1('Public Transport Access Across the West Midlands'),
    dcc.Tabs(id="tabs", value='tab-1', children=[
        dcc.Tab(label='Output Area-Level Analysis', value='tab-1'),
        dcc.Tab(label='Individual-Level Analysis', value='tab-2'),
    ]),
    html.Div(id='tabs-content'),
])
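# The layout above leaves html.Div(id='tabs-content') empty, so a callback to fill
# it on tab selection is implied but not shown here. A minimal sketch, assuming the
# individual_level module exposes a `layout` attribute (that attribute name, and the
# placeholder body for tab-1, are assumptions):

@app.callback(Output('tabs-content', 'children'),
              [Input('tabs', 'value')])
def render_tab(tab):
    # Swap the tab body depending on which dcc.Tab is selected
    if tab == 'tab-2':
        return individual_level.layout  # assumed attribute on the dashboard module
    return html.Div('Output Area-level view goes here')  # placeholder for tab-1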
def run(): """ Creates raw-cleaned-semantic schemas and populates the raw schema only. Parameters ---------- read_json: bool Whether or not the script should read original json files write_json: bool Whether or not the script should write csvs of ais files dirs: [str] List of names of the directories to import date_range: [int] List of two ints with the first and last day to collect files from Returns ------- None """ # Set environment variables settings.load() # Get root directory from environment base_dir = settings.get_base_dir() sql_dir = base_dir.joinpath('sql') data_dir = settings.get_data_dir() filtered_dir = data_dir.joinpath('ais_deduped') # Get PostgreSQL database credentials psql_credentials = settings.get_psql() # print('Running with credentials: ', psql_credentials) # Create SQLAlchemy engine from database credentials engine = create_connection_from_dict(psql_credentials, 'postgresql') ## ---- CREATE SCHEMAS ---- print("Creating schemas") execute_sql(os.path.join(sql_dir, 'create_schemas.sql'), engine, read_file=True) ## ---- CREATE TABLES ---- print("Creating tables") execute_sql(os.path.join(sql_dir, 'create_tables.sql'), engine, read_file=True) ## ---- UPLOAD TABLES ---- print("Processing scraped vessels table.") copy_csv_to_db(os.path.join(data_dir, 'updated_boats.csv'), 'raw.vessels', engine) print("Processing IUU list.") # load_iuu_list(os.path.join(data_dir, 'IUUList-20190902.txt'), engine) ## ---- UPLOAD SHAPEFILES ---- # print("Uploading shapefiles") # TODO: get this fully hooked up and working # load_shp(DATA_DIR, dir_dict, credentials_dict): ## ---- WRITE filtered CSVs to db ---- for path in filtered_dir.glob("*"): if path.is_dir(): filtered_subdir = path # this is where we upload csvs from the database # the intention is that we sometimes do this with previously parsed csvs print( f"Uploading csv files to database from {filtered_subdir.name}." ) try: load_csv(filtered_subdir, engine, 'raw.ais', sep='\t', quote='\b') except IsADirectoryError: #raise print('Found directory, not file') print(f"Finished converted json from {filtered_subdir.name}") ## ---- ClEAN DATA ---- print("Cleaning data") execute_sql(os.path.join(sql_dir, 'clean_data.sql'), engine, read_file=True) return
def run(): """ Execute Extract-Transform-Load (ETL) process. Note: This entire process will take several hours to complete. Parameters ---------- ROOT_FOLDER : str Directory where the project is stored locally. DATA_FOLDER : str Directory where the raw data are stored locally. Returns ------- None """ # Get environment folders ROOT_FOLDER = settings.get_root_dir() DATA_FOLDER = os.path.join(ROOT_FOLDER, 'data/') SQL_FOLDER = os.path.join(ROOT_FOLDER, 'sql/') # Data files to be loaded data_config = os.path.join(ROOT_FOLDER, 'config/base/data_files.yaml') # Get PostgreSQL database credentials psql_credentials = settings.get_psql() # Create SQLAlchemy engine from database credentials engine = create_connection_from_dict(psql_credentials, 'postgresql') ## ---- CREATE SCHEMAS ---- print("Creating schemas") execute_sql(os.path.join(SQL_FOLDER, 'create_schemas.sql'), engine, read_file=True) ## ---- CREATE TABLES WITHIN RAW SCHEMA ---- print("Creating tables") execute_sql(os.path.join(SQL_FOLDER, 'create_tables.sql'), engine, read_file=True) ## ---- LOAD RAW DATA TO DATABASE ---- text_dict, gis_dict, osm_file = load_data_dict(data_config) # Load CSV file to RAW schema print("Loading text files to RAW") load_text(DATA_FOLDER, text_dict, engine) # Load GIS data to GIS schema print("Loading shapefiles to GIS") load_gis(DATA_FOLDER, gis_dict, psql_credentials) # Load OSM data to RAW schema print("Loading OSM data to RAW") load_osm(DATA_FOLDER, osm_file, psql_credentials, os.path.join(SQL_FOLDER, 'update_osm_tables.sql'), engine) ## ---- CLEAN DATA TO CLEANED SCHEMA ---- print("Cleaning data") execute_sql(os.path.join(SQL_FOLDER, 'clean_data.sql'), engine, read_file=True) ## ---- ENTITIZE DATA TO SEMANTIC SCHEMA ---- print("Entitizing data") execute_sql(os.path.join(SQL_FOLDER, 'create_semantic.sql'), engine, read_file=True)
def run(min_pings_init=30, min_pings_split=20, min_dist=2.0):
    """
    Runs feature generation that allows the modeling stage to take place.

    Feature generation involves four main stages:
    - generating a sample to show the model
    - breaking the sample up into trajectories
    - computing quantitative features on each trajectory
    - writing an image of each trajectory to folders grouped by 'vessel_type'

    :param min_pings_init: int
        The minimum number of AIS data points that must appear in a trajectory
        for it to be included in the sample.
    :param min_pings_split: int
        Applied after splitting trajectories at the gap. Should be smaller than
        min_pings_init. Ensures that split trajectories also have more than a
        certain minimum number of pings.
    :param min_dist: float
        Minimum distance a trajectory must cover to be included in the sample
        (passed to create_cnn_sample).
    :returns: None
    """
    start = time.time()

    # Set environment variables
    settings.load()

    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    base_dir = settings.get_base_dir()
    sql_dir = base_dir.joinpath('sql')
    data_dir = settings.get_data_dir()

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')

    # Create a sql table with complete trajectories
    sample_switch = input("Create new sample for Convolutional Neural Net? (Y/N)")
    if sample_switch in ['Y', 'y', '1', 'Yes']:
        print("Creating CNN sample.")
        create_cnn_sample(sql_dir, engine, min_pings_init=min_pings_init, min_dist=min_dist)

    # Get data to process from postgres
    execute_sql('drop table if exists features.quants;', engine, read_file=False)

    if (data_dir / 'trajectories').is_dir():
        print("Removing old trajectories directory.")
        remove_dir(data_dir / 'trajectories')

    try:
        df = execute_sql("select * from features.cnn_sample", engine, read_file=False, return_df=True)
        print("Grabbing trajectory data")
    except db.exc.ProgrammingError:
        print("The table features.cnn_sample doesn't exist. Please create one.")
        raise SystemExit

    # Set data types of several key columns
    df = df.rename(columns={'time_stamp': 't'})
    df['t'] = pd.to_datetime(df['t'])
    df['longitude'] = pd.to_numeric(df['longitude'])
    df['latitude'] = pd.to_numeric(df['latitude'])

    # Set df index
    df.index = df['t']
    df_geo = df_to_geodf(df)

    # Group by date and mmsi
    df_group = df_geo.groupby([pd.Grouper(freq='D'), 'mmsi'])

    # Count of trajectories written to disk
    counter = 0

    # Load basemap shape file (c: coarse, l: low, i: intermediate, h: high, f: full)
    base_map = geopandas.read_file('/Akamai/ais_project_data/GSHHS_shp/c/GSHHS_c_L1.shp')

    # Set CRS WGS 84
    base_map = base_map.to_crs(epsg=4326)

    # Loop through the grouped dataframes
    for name, group in df_group:
        if len(group) < min_pings_init:
            continue

        trajectory = mp.Trajectory(name, group)

        # Split the trajectory at the gap
        split_trajectories = list(trajectory.split_by_observation_gap(timedelta(minutes=30)))

        ### CREATE TRAJECTORY IDs
        for split_index, trajectory in enumerate(split_trajectories):
            # Create a universal trajectory ID in the format mmsi-date-split_index
            trajectory.df['traj_id'] = str(name[1]) + '-' + str(name[0].date()) + '-' + str(split_index)

        ### CREATE QUANT FEATURES AND WRITE IMAGES TO DISK
        for split in split_trajectories:
            # Store the length of the split trajectory in km
            traj_length = split.get_length() / 1_000
            if (len(split.df) < min_pings_split) or (traj_length < .5):
                print(f"Dropping a trajectory with length: {str(traj_length)} km and {str(len(split.df))} pings.")
                continue
            else:
                try:
                    quants = compute_quants(split.df[['longitude', 'latitude']])
                    quants['traj_id'] = str(split.df['traj_id'].iloc[0])
                    quants['vessel_type'] = str(split.df['vessel_type'].iloc[0])
                    quants.to_sql('quants', engine, schema='features', if_exists='append', index=False)

                    ### WRITE IMAGES TO DISK
                    save_matplotlib_img(split, data_dir, base_map)
                    counter += 1
                except Exception:
                    print(f"An error occurred processing trajectory {split.df['traj_id'].iloc[0]}.")

    end = time.time()
    print(f"Generated features for {str(counter)} images in {str(round(end - start))} seconds.")
    return