Example #1
0
 def __init__(self, img_size, range01=False, rgb_order=False, dummy=False):
     images_dir = settings.get_data_dir('office_webcam')
     super(OfficeWebcamDataset, self).__init__(img_size,
                                               range01,
                                               rgb_order,
                                               images_dir,
                                               dummy=dummy)
Example #2
0
 def __init__(self, img_size, range01=False, rgb_order=False, dummy=False):
     test_dir = settings.get_data_dir('visda17_clf_test')
     file_list_path = os.path.join(test_dir, 'image_list.txt')
     super(TestDataset, self).__init__(img_size,
                                       range01,
                                       rgb_order,
                                       file_list_path,
                                       test_dir,
                                       has_ground_truth=False,
                                       dummy=dummy)
Example #3
0
 def __init__(self, img_size, range01=False, rgb_order=False, dummy=False):
     val_dir = settings.get_data_dir('visda17_clf_validation')
     file_list_path = os.path.join(val_dir, 'image_list.txt')
     super(ValidationDataset, self).__init__(img_size,
                                             range01,
                                             rgb_order,
                                             file_list_path,
                                             val_dir,
                                             has_ground_truth=True,
                                             dummy=dummy)
Example #4
0
def run():
    """
    TODO: write docstring
    """
    # Set environment variables
    settings.load()
    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')
    # Get data to process from postgres
    quants_df = execute_sql('select * from features.quants;', engine, read_file=False, return_df=True)

    data_dir = settings.get_data_dir()
    labeled_fishing_dir = data_dir / 'labeled_data' / 'fishing'
    labeled_nonfishing_dir = data_dir / 'labeled_data' / 'nonfishing'
    cnn_split_dir = data_dir / 'cnn_split'
    if cnn_split_dir.exists():
        shutil.rmtree(cnn_split_dir)
    
    cnn_split_dir.mkdir(parents=True, exist_ok=True)


    # Create labeled data
    print('Creating labeled data.')
    fishy_stuff = fishing_prefilter(quants_df)
    nonfish = nonfishing_dataframe_creator(quants_df, fishy_stuff)
    dataset = sampler(fishy_stuff, nonfish)
    trajectory_separator(dataset, data_dir)

    # Create train / test split
    print("Creating train/test split")
    split_data(labeled_fishing_dir, labeled_nonfishing_dir, cnn_split_dir, binary_name='fishing', set_seed=223)
    
    # Train the cnn
    run_cnn(cnn_split_dir, batchsize=256, epochs=50, color_mode='rgb',
            start_filters=8, depth=2, dense_count=2, dense_neurons=256,
            bnorm=False)
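
Several of these examples build a SQLAlchemy engine with create_connection_from_dict(psql_credentials, 'postgresql'). A minimal sketch of what such a helper could look like is shown below; the credential field names (user, password, host, port, database) are assumptions for illustration, not taken from the original code.

from sqlalchemy import create_engine

def create_connection_from_dict(creds, dialect):
    """Build a SQLAlchemy engine from a credentials dict (sketch; field names assumed)."""
    url = (f"{dialect}://{creds['user']}:{creds['password']}"
           f"@{creds['host']}:{creds['port']}/{creds['database']}")
    return create_engine(url)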
Example #5
0
    def __init__(self, img_size, range01=False, rgb_order=False, dummy=False):
        train_dir = settings.get_data_dir('visda17_clf_train')
        file_list_path = os.path.join(train_dir, 'image_list.txt')
        super(TrainDataset, self).__init__(img_size,
                                           range01,
                                           rgb_order,
                                           file_list_path,
                                           train_dir,
                                           has_ground_truth=True,
                                           dummy=dummy)

        self.object_ids = []
        self.cam_yaw = []
        self.light_yaw = []
        self.cam_pitch = []

        self.obj_id_to_idx = {}
        self.cam_yaw_to_idx = {}
        self.light_yaw_to_idx = {}
        self.cam_pitch_to_idx = {}
        for sample_idx, name in enumerate(self.names):
            fn, _ = os.path.splitext(name)
            object_id, _, tail = fn.partition('__')
            c_yaw, l_yaw, c_pitch = tail.split('_')
            c_yaw = float(c_yaw)
            l_yaw = float(l_yaw)
            c_pitch = float(c_pitch)
            obj_id_idx = self.obj_id_to_idx.setdefault(object_id,
                                                       len(self.obj_id_to_idx))
            c_yaw_idx = self.cam_yaw_to_idx.setdefault(
                c_yaw, len(self.cam_yaw_to_idx))
            l_yaw_idx = self.light_yaw_to_idx.setdefault(
                l_yaw, len(self.light_yaw_to_idx))
            c_pitch_idx = self.cam_pitch_to_idx.setdefault(
                c_pitch, len(self.cam_pitch_to_idx))
            self.object_ids.append(obj_id_idx)
            self.cam_yaw.append(c_yaw_idx)
            self.light_yaw.append(l_yaw_idx)
            self.cam_pitch.append(c_pitch_idx)
        self.object_ids = np.array(self.object_ids, dtype=np.int32)
        self.cam_yaw = np.array(self.cam_yaw, dtype=np.int32)
        self.light_yaw = np.array(self.light_yaw, dtype=np.int32)
        self.cam_pitch = np.array(self.cam_pitch, dtype=np.int32)

        sample_ndxs = np.arange(len(self.object_ids))
        self.samples_by_obj_id = [
            sample_ndxs[self.object_ids == i]
            for i in range(len(self.obj_id_to_idx))
        ]
        self.samples_by_cam_yaw = [
            sample_ndxs[self.cam_yaw == i]
            for i in range(len(self.cam_yaw_to_idx))
        ]
        self.samples_by_light_yaw = [
            sample_ndxs[self.light_yaw == i]
            for i in range(len(self.light_yaw_to_idx))
        ]
        self.samples_by_cam_pitch = [
            sample_ndxs[self.cam_pitch == i]
            for i in range(len(self.cam_pitch_to_idx))
        ]

        self.obj_X = self.ObjectImageAccessor(self)
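
The constructor above maps each distinct object id, camera yaw, light yaw, and camera pitch to a dense integer index with dict.setdefault, then groups sample indices per value. The standalone snippet below illustrates that pattern in isolation; it is a minimal sketch for reference, not part of the dataset code.

import numpy as np

# Map each distinct value to a dense integer index in order of first appearance.
values = ['a', 'b', 'a', 'c', 'b', 'a']
value_to_idx = {}
indices = np.array([value_to_idx.setdefault(v, len(value_to_idx)) for v in values],
                   dtype=np.int32)

# Group sample positions by index, mirroring samples_by_obj_id above.
sample_ndxs = np.arange(len(indices))
samples_by_value = [sample_ndxs[indices == i] for i in range(len(value_to_idx))]
print(samples_by_value)  # [array([0, 2, 5]), array([1, 4]), array([3])]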
Example #6
0
def run():
    """
    Creates raw-cleaned-semantic schemas and populates the raw schema only.

    Parameters
    ----------
    read_json: bool
        Whether or not the script should read original json files
    write_json: bool
        Whether or not the script should write csvs of ais files
    dirs: [str]
        List of names of the directories to import
    date_range: [int]
        List of two ints with the first and last day to collect files from

    Returns
    -------
    None

    """
    # Set environment variables
    settings.load()
    # Get root directory from environment
    base_dir = settings.get_base_dir()
    sql_dir = base_dir.joinpath('sql')
    data_dir = settings.get_data_dir()
    filtered_dir = data_dir.joinpath('ais_deduped')

    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    #  print('Running with credentials: ', psql_credentials)

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')

    ## ---- CREATE SCHEMAS ----

    print("Creating schemas")
    execute_sql(os.path.join(sql_dir, 'create_schemas.sql'),
                engine,
                read_file=True)

    ## ---- CREATE TABLES ----

    print("Creating tables")
    execute_sql(os.path.join(sql_dir, 'create_tables.sql'),
                engine,
                read_file=True)

    ## ---- UPLOAD TABLES ----

    print("Processing scraped vessels table.")
    copy_csv_to_db(os.path.join(data_dir, 'updated_boats.csv'), 'raw.vessels',
                   engine)
    print("Processing IUU list.")
    # load_iuu_list(os.path.join(data_dir, 'IUUList-20190902.txt'), engine)

    ## ---- UPLOAD SHAPEFILES ----

    # print("Uploading shapefiles")
    # TODO: get this fully hooked up and working
    # load_shp(DATA_DIR, dir_dict, credentials_dict):

    ## ---- WRITE filtered CSVs to db ----

    for path in filtered_dir.glob("*"):
        if path.is_dir():
            filtered_subdir = path
            # Upload csvs to the database; these may be csvs parsed from json on a previous run.
            print(
                f"Uploading csv files to database from {filtered_subdir.name}."
            )
            try:
                load_csv(filtered_subdir,
                         engine,
                         'raw.ais',
                         sep='\t',
                         quote='\b')
            except IsADirectoryError:
                # raise
                print('Found directory, not file')
            print(f"Finished uploading csv files from {filtered_subdir.name}")

    ## ---- CLEAN DATA ----
    print("Cleaning data")
    execute_sql(os.path.join(sql_dir, 'clean_data.sql'),
                engine,
                read_file=True)

    return
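
execute_sql is called throughout these examples with read_file and return_df switches. A minimal sketch of such a helper is shown below, assuming it wraps pandas.read_sql for queries that return data and plain SQLAlchemy execution otherwise; this is an illustrative assumption, not the repository's actual implementation.

import pandas as pd
from sqlalchemy import text

def execute_sql(sql_or_path, engine, read_file=False, return_df=False):
    """Run SQL from a string or a .sql file (sketch of an assumed helper)."""
    if read_file:
        # Treat the first argument as a path to a .sql file.
        with open(sql_or_path) as f:
            sql = f.read()
    else:
        sql = sql_or_path
    if return_df:
        # Return query results as a DataFrame.
        return pd.read_sql(sql, engine)
    with engine.begin() as conn:
        conn.execute(text(sql))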
Example #7
0
def run(min_pings_init=30, min_pings_split=20, min_dist=2.0):
    """
    Runs feature generation that allows modeling stage to take place.
    Feature generation involves 3 main stages:
        - generating a sample to show the model
        - breaking sample up into trajectories
        - computing quantitative features on each trajectory
        - writing an image of each trajectory to folders grouped by 'vessel_type'

    :param min_pings_init: int
        The minimum number of AIS data points that must appear in a trajectory for it to be
        included in the sample.
    :param min_pings_split: int
        Applied after splitting trajectories at the gap. Should be smaller than min_pings_init.
        Ensures that split trajectories also have more than a certain minimum number of pings.

    :returns:
        None
    """
    start = time.time()
    # Set environment variables
    settings.load()
    # Get PostgreSQL database credentials
    psql_credentials = settings.get_psql()
    base_dir = settings.get_base_dir()
    sql_dir = base_dir.joinpath('sql')
    data_dir = settings.get_data_dir()

    # Create SQLAlchemy engine from database credentials
    engine = create_connection_from_dict(psql_credentials, 'postgresql')
    # Create a sql table with complete trajectories
    sample_switch = input("Create new sample for Convolutional Neural Net? (Y/N)")
    if sample_switch in ['Y', 'y', '1', 'Yes']:
        print("Creating CNN sample.")
        create_cnn_sample(sql_dir, engine, min_pings_init=min_pings_init, min_dist=min_dist)
    # Get data to process from postgres
    execute_sql('drop table if exists features.quants;', engine, read_file=False)
    if (data_dir / 'trajectories').is_dir():
        print("Removing old trajectories directory.")
        remove_dir(data_dir / 'trajectories')

    try:
        df = execute_sql("select * from features.cnn_sample", engine, read_file=False, return_df=True)
        print("Grabbing trajectory data")
    except db.exc.ProgrammingError:
        print("The table features.cnn_sample doesn't exist. Please create one.")
        raise SystemExit

    # Set data types of several key columns
    df = df.rename(columns={'time_stamp': 't'})
    df['t'] = pd.to_datetime(df['t'])
    df['longitude'] = pd.to_numeric(df['longitude'])
    df['latitude'] = pd.to_numeric(df['latitude'])
    # Set df index
    df.index = df['t']
    df_geo = df_to_geodf(df)
    # Filter by date and mmsi
    df_group = df_geo.groupby([pd.Grouper(freq='D'), 'mmsi'])
    # Loop through the grouped dataframes
    counter = 0

    # Load basemap shape file
    base_map = geopandas.read_file(
        '/Akamai/ais_project_data/GSHHS_shp/c/GSHHS_c_L1.shp')  # c: coarse, l: low, i: intermediate, h: high, f: full
    # Set CRS WGS 84
    base_map = base_map.to_crs(epsg=4326)

    for name, group in df_group:
        if len(group) < min_pings_init:
            continue
        trajectory = mp.Trajectory(name, group)

        # Split the trajectory at the gap
        split_trajectories = list(trajectory.split_by_observation_gap(timedelta(minutes=30)))

        ### CREATE TRAJECTORY IDs
        for split_index, trajectory in enumerate(split_trajectories):
            # create a universal trajectory ID
            # format is: mmsi-date-split_index
            trajectory.df['traj_id'] = str(name[1]) + '-' + str(name[0].date()) + '-' + str(split_index)

        ### CREATE QUANT FEATURES AND WRITE IMAGES TO DISK

        for split in split_trajectories:
            # store the length of the split trajectory in km
            traj_length = split.get_length() / 1_000
            if (len(split.df) < min_pings_split) or (traj_length < .5):
                print(f"Dropping a trajectory with length: {str(traj_length)} km and {str(len(split.df))} pings.")
                continue
            else:
                try:
                    quants = compute_quants(split.df[['longitude', 'latitude']])
                    quants['traj_id'] = str(split.df['traj_id'].iloc[0])
                    quants['vessel_type'] = str(split.df['vessel_type'].iloc[0])
                    quants.to_sql('quants', engine, schema='features',
                                  if_exists='append', index=False)
                    ### WRITE IMAGES TO DISK
                    save_matplotlib_img(split, data_dir, base_map)
                    counter += 1
                except Exception as e:
                    print(f"An error occurred processing trajectory {split.df['traj_id'].iloc[0]}: {e}")

    end = time.time()
    print(f"Generated features for {str(counter)} images in {str(round(end - start))} seconds.")
    return
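
df_to_geodf above converts the ping DataFrame to a GeoDataFrame before the daily/mmsi groupby. A minimal sketch of such a conversion with geopandas, assuming WGS 84 longitude/latitude columns, could look like the following; this is an illustrative assumption, not the repository's actual helper.

import geopandas
from shapely.geometry import Point

def df_to_geodf(df):
    """Convert a pings DataFrame with 'longitude'/'latitude' columns to a GeoDataFrame (sketch)."""
    geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
    # Assume coordinates are WGS 84 (EPSG:4326), matching the basemap CRS used above.
    return geopandas.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')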