Example #1
def run():
    """
    TODO: write docstring
    """
    # Set environment variables
    settings.load()
    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')
    # Get data to process from postgres
    quants_df = execute_sql('select * from features.quants;', engine, read_file=False, return_df=True)

    data_dir = settings.get_data_dir()
    labeled_fishing_dir = data_dir / 'labeled_data' / 'fishing'
    labeled_nonfishing_dir = data_dir / 'labeled_data' / 'nonfishing'
    cnn_split_dir = data_dir / 'cnn_split'
    if cnn_split_dir.exists():
        shutil.rmtree(cnn_split_dir, ignore_errors=False, onerror=None)
    
    cnn_split_dir.mkdir(parents=True, exist_ok=True)


    # Create labeled data
    print('Creating labeled data.')
    fishy_stuff = fishing_prefilter(quants_df)
    nonfish = nonfishing_dataframe_creator(quants_df, fishy_stuff)
    dataset = sampler(fishy_stuff, nonfish)
    trajectory_separator(dataset, data_dir)

    # Create train / test split
    print("Creating train/test split")
    split_data(labeled_fishing_dir, labeled_nonfishing_dir, cnn_split_dir, binary_name='fishing', set_seed=223)
    
    # Train the cnn
    run_cnn(cnn_split_dir,
            batchsize=256,
            epochs=50,
            color_mode='rgb',
            start_filters=8,
            depth=2,
            dense_count=2,
            dense_neurons=256,
            bnorm=False)
Example #2
def run(mode='replace', suffix='', chunksize=10000):
    """
    Runs the end-to-end pipeline (depends on ETL completion)

    Parameters
    ----------
    mode : str
        If 'replace', overwrite existing results; if 'append', append to existing results
    suffix : str
        Optional suffix appended to table names (used for testing)
    chunksize : int
        Maximum number of rows written per chunk when saving query results, chosen so a DataFrame fits in memory
    
    Returns
    -------
    None

    """

    ######## Prepare everything needed for the model ########

    # Load environment variables
    settings.load()

    # Get folders
    ROOT_FOLDER = settings.get_root_dir()
    SQL_FOLDER = os.path.join(ROOT_FOLDER, 'sql/')
    RESULTS_FOLDER = os.path.join(ROOT_FOLDER, 'results/')

    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()

    # Get OTP settings: port number of load balancer and number of OTP containers (i.e. splits)
    host, port, num_splits = settings.get_otp_settings()

    # Load model configuration
    model_config = os.path.join(ROOT_FOLDER, 'config/base/model_config.yaml')
    print('Configure models')
    params = load_yaml(model_config)
    population_dict = params.get('populations')
    poi_dict = params.get('points_of_interest')
    time_defs = params.get('time_defs')
    time_strata = params.get('time_strata')
    hyper_params = params.get('hyper_params')
    metrics = params.get('metrics')
    print('Model parameters loaded')

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')
    print('Database connected')

    # Sample timestamps and write to MODEL.timestamps
    model_functions.create_timestamps(
        time_defs,
        time_strata,
        n_timepoints=hyper_params.get('n_timepoint'),
        engine=engine,
        suffix=suffix)
    # Generate MODEL.k_poi (K-nearest POIs)
    model_functions.create_k_poi(SQL_FOLDER,
                                 k_poi=hyper_params.get('k_POI'),
                                 poi_dict=poi_dict,
                                 suffix=suffix,
                                 engine=engine)
    # Configure OTP query parameters and save to MODEL.trips
    model_functions.create_trips(SQL_FOLDER,
                                 suffix=suffix,
                                 engine=engine,
                                 mode=mode)

    ######## Run models and write results to database ########
    # Generate RESULTS.populations
    model_functions.compute_populations(SQL_FOLDER, population_dict, engine)
    # Generate RESULTS.trips
    model_functions.split_trips(host,
                                port,
                                num_splits,
                                SQL_FOLDER,
                                RESULTS_FOLDER,
                                engine,
                                psql_credentials,
                                suffix=suffix,
                                mode=mode,
                                chunksize=chunksize)

    ######## Generate data for visualization and save to database ########
    # Generate VIS.map_attributes
    model_functions.compute_map_attributes(SQL_FOLDER,
                                           metrics,
                                           engine,
                                           suffix=suffix)
    # Generate three histogram tables:
    # VIS.histograms_oa_population, VIS.histograms_oa_access, VIS.histograms_individual_access
    model_functions.compute_histograms(engine, suffix=suffix)
    # Generate VIS.scoreboard
    model_functions.compute_scoreboard(engine, suffix=suffix)
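
A minimal invocation sketch based on the docstring above; the module name pipeline is a placeholder, and the argument values are only illustrative:

# Illustrative usage only; "pipeline" is a placeholder module name, and this
# assumes the ETL stage has already completed.
import pipeline

# Overwrite results into test-suffixed tables, writing in smaller chunks.
pipeline.run(mode='replace', suffix='_test', chunksize=5000)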
Example #3
from dashboard import individual_level
from app import app
from utils import *
from dash.dependencies import Input, Output, State, ClientsideFunction
from dash import dcc, html  # dcc and html are referenced in app.layout below
import settings
import os

# Set up app

settings.load()

# Get folders
ROOT_FOLDER = settings.get_root_dir()

# Get PostgreSQL database credentials
psql_credentials = settings.get_psql()
mapbox_access_token = settings.get_mapbox_token()

# Create SQLAlchemy engine from database credentials
engine = create_connection_from_dict(psql_credentials, 'postgresql')
print('Database connected')

app.layout = html.Div([
    html.H1('Public Transport Access Across the West Midlands'),
    dcc.Tabs(id="tabs",
             value='tab-1',
             children=[
                 dcc.Tab(label='Output Area-Level Analysis', value='tab-1'),
                 dcc.Tab(label='Individual-Level Analysis', value='tab-2'),
             ]),
    html.Div(id='tabs-content')
])
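
The layout above wires the Tabs control to an empty tabs-content container, and the imports include Input and Output, but the snippet is truncated before any callback. A minimal sketch of the usual Dash pattern for filling tabs-content follows; the layout attribute on individual_level and the tab-1 placeholder are assumptions, not taken from the original code:

# Illustrative callback only; the original example is truncated before this point.
@app.callback(Output('tabs-content', 'children'),
              [Input('tabs', 'value')])
def render_tab_content(tab):
    if tab == 'tab-2':
        return individual_level.layout  # hypothetical attribute on the imported module
    return html.Div('Output Area-Level Analysis')  # placeholder content for tab-1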
Example #4
def run():
    """
    Creates the raw, cleaned, and semantic schemas and populates the raw schema only.

    Returns
    -------
    None

    """
    # Set environment variables
    settings.load()
    # Get root directory from environment
    base_dir = settings.get_base_dir()
    sql_dir = base_dir.joinpath('sql')
    data_dir = settings.get_data_dir()
    filtered_dir = data_dir.joinpath('ais_deduped')

    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    #  print('Running with credentials: ', psql_credentials)

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')

    ## ---- CREATE SCHEMAS ----

    print("Creating schemas")
    execute_sql(os.path.join(sql_dir, 'create_schemas.sql'),
                engine,
                read_file=True)

    ## ---- CREATE TABLES ----

    print("Creating tables")
    execute_sql(os.path.join(sql_dir, 'create_tables.sql'),
                engine,
                read_file=True)

    ## ---- UPLOAD TABLES ----

    print("Processing scraped vessels table.")
    copy_csv_to_db(os.path.join(data_dir, 'updated_boats.csv'), 'raw.vessels',
                   engine)
    print("Processing IUU list.")
    # load_iuu_list(os.path.join(data_dir, 'IUUList-20190902.txt'), engine)

    ## ---- UPLOAD SHAPEFILES ----

    # print("Uploading shapefiles")
    # TODO: get this fully hooked up and working
    # load_shp(DATA_DIR, dir_dict, credentials_dict):

    ## ---- WRITE filtered CSVs to db ----

    for path in filtered_dir.glob("*"):
        if path.is_dir():
            filtered_subdir = path
            #  this is where we upload csvs to the database
            #  the intention is that we sometimes do this with previously parsed csvs
            print(
                f"Uploading csv files to database from {filtered_subdir.name}."
            )
            try:
                load_csv(filtered_subdir,
                         engine,
                         'raw.ais',
                         sep='\t',
                         quote='\b')
            except IsADirectoryError:
                #raise
                print('Found directory, not file')
        print(f"Finished converted json from {filtered_subdir.name}")

    ## ---- CLEAN DATA ----
    print("Cleaning data")
    execute_sql(os.path.join(sql_dir, 'clean_data.sql'),
                engine,
                read_file=True)

    return
Example #5
def run():
    """
    Execute Extract-Transform-Load (ETL) process.
    Note: This entire process will take several hours to complete.

    Returns
    -------
    None
    """

    # Get environment folders
    ROOT_FOLDER = settings.get_root_dir()
    DATA_FOLDER = os.path.join(ROOT_FOLDER, 'data/')
    SQL_FOLDER = os.path.join(ROOT_FOLDER, 'sql/')

    # Data files to be loaded
    data_config = os.path.join(ROOT_FOLDER, 'config/base/data_files.yaml')

    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')

    ## ---- CREATE SCHEMAS ----

    print("Creating schemas")
    execute_sql(os.path.join(SQL_FOLDER, 'create_schemas.sql'),
                engine,
                read_file=True)

    ## ---- CREATE TABLES WITHIN RAW SCHEMA ----
    print("Creating tables")
    execute_sql(os.path.join(SQL_FOLDER, 'create_tables.sql'),
                engine,
                read_file=True)

    ## ---- LOAD RAW DATA TO DATABASE ----
    text_dict, gis_dict, osm_file = load_data_dict(data_config)

    # Load CSV file to RAW schema
    print("Loading text files to RAW")
    load_text(DATA_FOLDER, text_dict, engine)

    # Load GIS data to GIS schema
    print("Loading shapefiles to GIS")
    load_gis(DATA_FOLDER, gis_dict, psql_credentials)

    # Load OSM data to RAW schema
    print("Loading OSM data to RAW")
    load_osm(DATA_FOLDER, osm_file, psql_credentials,
             os.path.join(SQL_FOLDER, 'update_osm_tables.sql'), engine)

    ## ---- CLEAN DATA TO CLEANED SCHEMA ----
    print("Cleaning data")
    execute_sql(os.path.join(SQL_FOLDER, 'clean_data.sql'),
                engine,
                read_file=True)

    ## ---- ENTITIZE DATA TO SEMANTIC SCHEMA ----
    print("Entitizing data")
    execute_sql(os.path.join(SQL_FOLDER, 'create_semantic.sql'),
                engine,
                read_file=True)
Example #6
def run(min_pings_init=30, min_pings_split=20, min_dist=2.0):
    """
    Runs feature generation so that the modeling stage can take place.
    Feature generation involves four main stages:
        - generating a sample to show the model
        - breaking the sample up into trajectories
        - computing quantitative features on each trajectory
        - writing an image of each trajectory to folders grouped by 'vessel_type'

    :param min_pings_init: int
        The minimum number of AIS data points that must appear in a trajectory for it to be
        included in the sample.
    :param min_pings_split: int
        Applied after splitting trajectories at the gap. Should be smaller than min_pings_init.
        Ensures that split trajectories also have more than a certain minimum number of pings.
    :param min_dist: float
        Minimum distance threshold passed to create_cnn_sample when building the CNN sample.

    :returns:
        None
    """
    start = time.time()
    # Set environment variables
    settings.load()
    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    base_dir = settings.get_base_dir()
    sql_dir = base_dir.joinpath('sql')
    data_dir = settings.get_data_dir()

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')
    # Create a sql table with complete trajectories
    sample_switch = input("Create new sample for Convolutional Neural Net? (Y/N)")
    if sample_switch in ['Y', 'y', '1', 'Yes']:
        print("Creating CNN sample.")
        create_cnn_sample(sql_dir, engine, min_pings_init=min_pings_init, min_dist=min_dist)
    # Get data to process from postgres
    execute_sql('drop table if exists features.quants;', engine, read_file=False)
    if (data_dir / 'trajectories').is_dir():
        print("Removing old trajectories directory.")
        remove_dir(data_dir / 'trajectories')

    try:
        df = execute_sql("select * from features.cnn_sample", engine, read_file=False, return_df=True)
        print("Grabbing trajectory data")
    except db.exc.ProgrammingError:
        print("The table features.cnn_sample doesn't exist. Please create one.")
        raise SystemExit

    # Set data types of several key columns
    df = df.rename(columns={'time_stamp': 't'})
    df['t'] = pd.to_datetime(df['t'])
    df['longitude'] = pd.to_numeric(df['longitude'])
    df['latitude'] = pd.to_numeric(df['latitude'])
    # Set df index
    df.index = df['t']
    df_geo = df_to_geodf(df)
    # Group trajectories by calendar day and mmsi
    df_group = df_geo.groupby([pd.Grouper(freq='D'), 'mmsi'])
    # Loop through the grouped dataframes
    counter = 0

    # Load basemap shape file
    base_map = geopandas.read_file(
        '/Akamai/ais_project_data/GSHHS_shp/c/GSHHS_c_L1.shp')  # resolution codes: c=coarse, l=low, i=intermediate, h=high, f=full
    # Set CRS WGS 84
    base_map = base_map.to_crs(epsg=4326)

    for name, group in df_group:
        if len(group) < min_pings_init:
            continue
        trajectory = mp.Trajectory(name, group)

        # Split the trajectory at the gap
        split_trajectories = list(trajectory.split_by_observation_gap(timedelta(minutes=30)))

        ### CREATE TRAJECTORY IDs
        for split_index, trajectory in enumerate(split_trajectories):
            # create a universal trajectory ID
            # format is: mmsi-date-split_index
            trajectory.df['traj_id'] = str(name[1]) + '-' + str(name[0].date()) + '-' + str(split_index)

        ### CREATE QUANT FEATURES AND WRITE IMAGES TO DISK

        for split in split_trajectories:
            # store the length of the split trajectory in km
            traj_length = split.get_length() / 1_000
            if (len(split.df) < min_pings_split) or (traj_length < .5):
                print(f"Dropping a trajectory with length: {str(traj_length)} km and {str(len(split.df))} pings.")
                continue
            else:
                try:
                    quants = compute_quants(split.df[['longitude', 'latitude']])
                    quants['traj_id'] = str(split.df['traj_id'].iloc[0])
                    quants['vessel_type'] = str(split.df['vessel_type'].iloc[0])
                    quants.to_sql('quants', engine, schema='features',
                                  if_exists='append', index=False)
                    ### WRITE IMAGES TO DISK
                    save_matplotlib_img(split, data_dir, base_map)
                    counter += 1
                except Exception as e:
                    print(f"An error occurred processing trajectory {split.df['traj_id'].iloc[0]}: {e}")

    end = time.time()
    print(f"Generated features for {str(counter)} images in {str(round(end - start))} seconds.")
    return
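
A minimal invocation sketch; the values simply mirror the defaults above, and the function prompts interactively before rebuilding features.cnn_sample:

# Illustrative usage only; parameter values mirror the function's defaults.
if __name__ == '__main__':
    # Keep min_pings_split smaller than min_pings_init, as the docstring advises.
    run(min_pings_init=30, min_pings_split=20, min_dist=2.0)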