def export_inventory_to_csv(inventory,
                            output_path,
                            first_cols=None,
                            return_df=False):
    """ Export Inventory to CSV
        inventory: dict mapping record ids to per-image attribute dicts
        output_path: path of the CSV file to create
        first_cols: columns to place first and to sort by (defaults below)
        return_df: if True, also return the exported DataFrame
    """
    # use None as default to avoid a mutable default argument
    if first_cols is None:
        first_cols = [
            'season', 'site', 'roll', 'image_rank_in_roll',
            'capture', 'image_rank_in_capture'
        ]
    df = pd.DataFrame.from_dict(inventory, orient='index')

    # re-arrange columns
    cols = df.columns.tolist()

    first_cols = [x for x in first_cols if x in cols]

    cols_rearranged = first_cols + [x for x in cols if x not in first_cols]
    df = df[cols_rearranged]

    # sort rows
    df.sort_values(by=first_cols, inplace=True)

    # export
    df.to_csv(output_path, index=False)

    # change permissions to read/write for group
    set_file_permission(output_path)

    if return_df:
        return df
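
# `set_file_permission` is called throughout these snippets but never defined
# in them. A minimal sketch of what it plausibly does, based on the
# "read/write for group" comments; the implementation below is an assumption,
# the real helper may differ.
import os
import stat

def set_file_permission(path):
    """Add group read/write permissions to an existing file (hypothetical)."""
    mode = stat.S_IMODE(os.stat(path).st_mode)
    os.chmod(path, mode | stat.S_IRGRP | stat.S_IWGRP)
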
def setup_logger(log_file=None):
    # log to file and console
    handlers = list()
    if log_file is not None:
        file_handler = logging.FileHandler(filename=log_file, mode='a')
        handlers.append(file_handler)
    stdout_handler = logging.StreamHandler(sys.stdout)
    handlers.append(stdout_handler)
    # logger configuration
    logging.basicConfig(
        level=os.environ.get("LOGLEVEL", "INFO"),
        format='%(asctime)s - %(funcName)s - %(levelname)s:%(message)s',
        handlers=handlers)
    logging.info("{} is running the script".format(getpass.getuser()))
    if log_file is not None:
        set_file_permission(log_file)
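
# Hedged usage sketch for setup_logger (paths and messages are illustrative):
#   setup_logger(log_file='/tmp/export.log')   # logs to file and stdout
#   setup_logger()                             # logs to stdout only
#   logging.info('export started')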
def plot_site_roll_timelines(df,
                             output_path,
                             date_col='datetime',
                             date_format='%Y-%m-%d %H:%M:%S'):
    """ Plot timelines for site_roll combination """
    df_copy = df.loc[df[date_col] != ''].copy()
    date_time_obj = \
        [datetime.strptime(x, date_format) for x in df_copy[date_col].values]
    df_copy['date_time'] = date_time_obj
    roll_site_group = \
        df_copy.groupby(by=['site', 'roll', 'date_time']).size().unstack(
                level=[0, 1], fill_value=0)
    # Aggregate per Day - count number of instances
    roll_site_group.index = roll_site_group.index.to_period('D')
    roll_site_group = roll_site_group.groupby(roll_site_group.index).sum()
    # Plot: one timeline panel per site/roll combination
    n_subplots = roll_site_group.shape[1]
    fig, axes = plt.subplots(n_subplots, 1,
                             figsize=(8, n_subplots * 2), sharex=True)
    roll_site_group.plot(subplots=True, ax=axes)
    plt.tight_layout()
    fig.savefig(output_path)
    set_file_permission(output_path)
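
# Hedged usage sketch for plot_site_roll_timelines; the column names match
# what the function expects, the data values are made up.
#   df = pd.DataFrame({
#       'site': ['A', 'A', 'B'],
#       'roll': ['1', '1', '1'],
#       'datetime': ['2020-01-01 10:00:00', '2020-01-01 11:00:00',
#                    '2020-01-02 09:00:00']})
#   plot_site_roll_timelines(df, '/tmp/timelines.png')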
Example #4
    # Join and Export Data
    ######################################

    # Join season captures with preds
    season_df = pd.DataFrame.from_dict(season_dict, orient='index')
    season_df.index.name = 'capture_id'

    if args['export_only_with_predictions']:
        df_merged = pd.merge(season_df,
                             df_preds,
                             how='inner',
                             left_index=True,
                             right_index=True)
    else:
        df_merged = pd.merge(season_df,
                             df_preds,
                             how='left',
                             left_index=True,
                             right_index=True)
        df_merged.fillna('', inplace=True)

    # export
    sort_df_by_capture_id(df_merged)
    df_merged.to_csv(args['output_csv'], index=False)

    logger.info("Wrote {} records to {}".format(df_merged.shape[0],
                                                args['output_csv']))

    # change permissions to read/write for group
    set_file_permission(args['output_csv'])
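
    # Toy sketch of the inner vs. left merge semantics above (ids and
    # columns are illustrative only):
    #   season_df = pd.DataFrame({'site': ['A', 'B']}, index=['c1', 'c2'])
    #   df_preds = pd.DataFrame({'pred': ['lion']}, index=['c1'])
    #   pd.merge(season_df, df_preds, how='inner',
    #            left_index=True, right_index=True)   # keeps only 'c1'
    #   pd.merge(season_df, df_preds, how='left',
    #            left_index=True, right_index=True)   # keeps 'c1' and 'c2',
    #                                                 # 'c2' preds filled ''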
Example #5
    parser.add_argument("--captures", type=str, required=True)
    parser.add_argument("--log_dir", type=str, default=None)
    parser.add_argument("--log_filename", type=str, default='generate_actions')
    args = vars(parser.parse_args())

    # check existence of root dir
    if not os.path.isfile(args['action_list']):
        raise FileNotFoundError(
            "action_list {} does not exist -- must be a file".format(
                args['action_list']))

    if not os.path.isfile(args['captures']):
        raise FileNotFoundError(
            "captures {} does not exist -- must be a file".format(
                args['captures']))

    # logging
    set_logging(args['log_dir'], args['log_filename'])
    logger = logging.getLogger(__name__)

    # read files
    captures = read_image_inventory(args['captures'], unique_id='image_name')
    action_list = read_image_inventory(args['action_list'], unique_id=None)

    actions_inventory = generate_actions(action_list, captures)

    # Export actions list
    df = pd.DataFrame.from_records(actions_inventory, columns=Action._fields)
    df.to_csv(args['actions_to_perform_csv'], index=False)
    set_file_permission(args['actions_to_perform_csv'])
    if args['split_order'] == 'random':
        random.seed(123)
        random.shuffle(capture_ids)

    logger.info("Creating %s splits" % (n_batches))

    for batch_no, (i_start, i_end) in enumerate(slices):
        batch_manifest = OrderedDict()

        batch_path = file_path_generator(
            dir=os.path.dirname(args['manifest']),
            id=file_name_parts['id'],
            batch="batch_%s" % (batch_no + 1),
            name=file_name_parts['name'],
            file_delim=file_name_parts['file_delim'],
            file_ext=file_name_parts['file_ext'])

        for batch_id in capture_ids[i_start:i_end]:
            batch_manifest[batch_id] = manifest[batch_id]

        logger.info("Writing batch %s to %s with %s records" %
                    (batch_no + 1, batch_path, len(batch_manifest.keys())))

        export_dict_to_json_with_newlines(batch_manifest, batch_path)

        logger.info("Finished writing to {}".format(batch_path))

        # change permissions to read/write for group
        set_file_permission(batch_path)
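
# `slices` is not constructed in this excerpt; a plausible construction that
# yields the (i_start, i_end) pairs consumed above (an assumption, not the
# original code):
import math

def build_slices(n_items, n_batches):
    """Split range(n_items) into contiguous (start, end) index pairs."""
    batch_size = math.ceil(n_items / n_batches)  # hypothetical sizing rule
    return [(i, min(i + batch_size, n_items))
            for i in range(0, n_items, batch_size)]

# e.g. build_slices(10, 3) -> [(0, 4), (4, 8), (8, 10)]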
output_csv = '/home/packerc/shared/machine_learning/data/meta_data/label_overview_all.csv'
root_path = '/home/packerc/shared/zooniverse/SpeciesReports/'

counts_questions = plurality_aggregation_flags['QUESTION_COUNTS']
non_binary_questions = counts_questions + [
    plurality_aggregation_flags['QUESTION_MAIN']
]

reports = find_all_reports(root_path)

stats_list = []
for season_id, path_report in reports.items():
    location, season = season_id.split('_')
    path_cleaned = '/home/packerc/shared/season_captures/{}/cleaned/{}_cleaned.csv'.format(
        location, season_id)
    df_report = pd.read_csv(path_report, dtype='str', index_col=None)
    df_report.fillna('', inplace=True)
    question_stats = get_question_stats(df_report, non_binary_questions)
    for question, question_answers in question_stats.items():
        for question_answer in question_answers.keys():
            stats_list.append([location, season, question, question_answer])

df_stats = pd.DataFrame(stats_list)
df_stats.columns = ['location', 'season', 'question', 'answer']

# export df
df_stats.to_csv(output_csv, index=False)

set_file_permission(output_csv)
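
# `get_question_stats` is defined elsewhere; a hedged sketch consistent with
# its usage above (a mapping of question -> answer -> count is assumed):
def get_question_stats_sketch(df_report, questions):
    """Count the distinct answers per question column (hypothetical)."""
    stats = {}
    for question in questions:
        if question in df_report.columns:
            stats[question] = df_report[question].value_counts().to_dict()
    return stats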
    # add remaining cols
    remaining_cols = [x for x in all_cols if x not in output_cols]
    output_cols += remaining_cols

    # remove cols
    output_cols = [x for x in output_cols if x not in exclude_columns]
    if exclude_exif_data:
        output_cols = [x for x in output_cols if not x.startswith('exif__')]
    if exclude_image_check_flags:
        output_cols = [
            x for x in output_cols if not x.startswith('image_check__')
        ]

    # select cols
    df = df[output_cols]
    logger.info("Found the following columns for export: {}".format(
        df.columns))
    sort_df(df)

    ######################################
    # Export
    ######################################

    # export
    df.to_csv(args['captures_cleaned'], index=False)

    logger.info("Exported {} records to {}".format(df.shape[0],
                                                   args['captures_cleaned']))

    set_file_permission(args['captures_cleaned'])
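
    # The column selection above in a toy example (names are illustrative):
    #   all_cols = ['season', 'site', 'exif__iso', 'image_check__dark']
    #   output_cols = ['season', 'site']
    #   output_cols += [x for x in all_cols if x not in output_cols]
    #   # with exclude_exif_data=True:
    #   [x for x in output_cols if not x.startswith('exif__')]
    #   # -> ['season', 'site', 'image_check__dark']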
    # define tracker file path
    tracker_file_path = file_path_generator(
        dir=os.path.dirname(args['manifest']),
        id=file_name_parts['id'],
        name='upload_tracker_file',
        batch=file_name_parts['batch'],
        file_delim=file_name_parts['file_delim'],
        file_ext='txt')

    logger.info(
        "Defining upload tracker file at: {}".format(tracker_file_path))

    # read upload tracker file
    if not os.path.exists(tracker_file_path):
        uploader.create_tracker_file(tracker_file_path)
        set_file_permission(tracker_file_path)
    tracker_data = uploader.read_tracker_file(tracker_file_path)

    n_in_tracker_file = len(tracker_data.keys())
    logger.info("Found {} already uploaded subjects in tracker file".format(
        n_in_tracker_file))
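
    # uploader.create_tracker_file / read_tracker_file are not shown here; a
    # plausible minimal behavior, assuming one uploaded subject id per line
    # (hypothetical, the real format may differ):
    #   def create_tracker_file(path):
    #       open(path, 'w').close()
    #   def read_tracker_file(path):
    #       with open(path) as f:
    #           return {line.strip(): True for line in f if line.strip()}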

    ###################################
    # Load Files
    ###################################

    # import manifest
    with open(args['manifest'], 'r') as f:
        mani = json.load(f)

    logger.info("Imported Manifest file {} with {} records".format(
예제 #10
0
            logger.info('{:15} - {:15} - {:10} / {} ({:.2f} %)'.format(
                question, answer, n, n_tot, percent))

    # Print examples
    logger.info("Show some example classifications")
    for i, (_id, data) in enumerate(classifications.items()):
        if i > 10:
            break
        logger.info("ID: {}, Data: {}".format(_id, data))

    ######################################
    # Export Annotations to a File
    ######################################

    # access a random classification and get all the keys of the first
    # annotation -- this is consistent for all other annotations
    first_annotations = next(iter(classifications.values()))
    header = list(first_annotations[0].keys())

    output_paths = {
        k: os.path.join(args['output_path'],
                        'SER_{}_annotations.csv'.format(k))
        for k in all_seasons.keys()
    }

    legacy_extractor.export_cleaned_annotations(output_paths[s_id],
                                                classifications, header, flags,
                                                flags_global)

    # change permissions to read/write for group
    set_file_permission(output_paths[s_id])
                'attribution': args['attribution'],
                'license': args['license']
            }
            # store additional information
            info = {'uploaded': False}
            manifest[capture_id] = {
                'upload_metadata': upload_metadata,
                'info': info,
                'images': []
            }
        # Add image information
        manifest[capture_id]['images'].append(image_path)

        if (row_no % 10000) == 0:
            logger.info("Processed {}/{} records".format(
                row_no, n_records_total))

    logger.info("Omitted %s images due to invalid/no_upload flags" %
                n_omitted_images)
    logger.info("Number of images not found in images folder %s" %
                images_not_found_counter)
    logger.info("Writing %s captures to %s" %
                (len(manifest.keys()), manifest_path))

    export_dict_to_json_with_newlines(manifest, manifest_path)

    logger.info("Finished writing to {}".format(manifest_path))

    # change permissions to read/write for group
    set_file_permission(manifest_path)
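
# `export_dict_to_json_with_newlines` is used for manifests and batches above
# but not defined in this excerpt. A plausible minimal implementation that
# writes one top-level JSON entry per line so large files stay grep-friendly
# (an assumption, not the original helper):
import json

def export_dict_to_json_with_newlines_sketch(data, path):
    """Write a dict as JSON, one top-level entry per line (hypothetical)."""
    with open(path, 'w') as f:
        f.write('{\n')
        for i, (key, value) in enumerate(data.items()):
            sep = ',' if i < len(data) - 1 else ''
            f.write('{}: {}{}\n'.format(
                json.dumps(key), json.dumps(value), sep))
        f.write('}\n')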