def export_inventory_to_csv(inventory,
                            output_path,
                            first_cols=None,
                            return_df=False):
    """Export an inventory dict to a CSV file.

    Parameters:
        inventory: dict mapping ids to per-image records; each value
            becomes one row of the exported CSV
        output_path: path to the CSV file that is being created
        first_cols: list of column names to place first and to sort the
            rows by; only columns actually present in the data are used.
            Defaults to the standard season/site/roll/capture ordering.
        return_df: if True, return the exported DataFrame

    Returns:
        pandas.DataFrame if return_df is True, otherwise None
    """
    # Fix: the original used a mutable list as default argument; use a
    # None sentinel instead (same effective default, no shared state).
    if first_cols is None:
        first_cols = [
            'season', 'site', 'roll', 'image_rank_in_roll',
            'capture', 'image_rank_in_capture']
    df = pd.DataFrame.from_dict(inventory, orient='index')
    # re-arrange columns: requested first-columns (if present), then the rest
    cols = df.columns.tolist()
    first_cols = [x for x in first_cols if x in cols]
    cols_rearranged = first_cols + [x for x in cols if x not in first_cols]
    df = df[cols_rearranged]
    # sort rows by the leading columns
    df.sort_values(by=first_cols, inplace=True)
    # export without the dict-key index
    df.to_csv(output_path, index=False)
    # change permissions to read/write for group
    set_file_permission(output_path)
    if return_df:
        return df
def setup_logger(log_file=None):
    """Configure root logging to stdout and, optionally, to a file.

    log_file: optional path of a log file to append to; group
        read/write permissions are set on it after configuration
    """
    log_handlers = []
    # optionally log to a file (append mode)
    if log_file is not None:
        log_handlers.append(logging.FileHandler(filename=log_file, mode='a'))
    # always log to the console
    log_handlers.append(logging.StreamHandler(sys.stdout))
    # logger configuration; level can be overridden via the LOGLEVEL env var
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"),
                        format='%(asctime)s - %(funcName)s - %(levelname)s:' +
                        '%(message)s',
                        handlers=log_handlers)
    logging.info("{} is running the script".format(getpass.getuser()))
    if log_file is not None:
        set_file_permission(log_file)
def plot_site_roll_timelines(df, output_path,
                             date_col='datetime',
                             date_format='%Y-%m-%d %H:%M:%S'):
    """ Plot timelines for site_roll combination

    df: DataFrame with 'site', 'roll' and a string date column
    output_path: path of the image file to save the figure to
    date_col: name of the column holding date strings
    date_format: strptime format of the date strings
    """
    # keep only rows that actually have a date string
    df_copy = df.loc[df[date_col] != ''].copy()
    date_time_obj = \
        [datetime.strptime(x, date_format) for x in df_copy[date_col].values]
    df_copy['date_time'] = date_time_obj
    # one column per (site, roll) pair, counting rows per exact timestamp
    roll_site_group = \
        df_copy.groupby(by=['site', 'roll', 'date_time']).size().unstack(
            level=[0, 1], fill_value=0)
    # Aggregate per Day - count number of instances
    roll_site_group.index = roll_site_group.index.to_period('D')
    roll_site_group = roll_site_group.groupby(roll_site_group.index).sum()
    # Plot -- one panel per (site, roll) column
    # NOTE(review): a single axes is passed with subplots=True; pandas
    # handles expanding this into panels -- confirm against the pandas
    # version in use
    n_subplots = roll_site_group.shape[1]
    fig, ax = plt.subplots(figsize=(8, n_subplots * 2), sharex=True)
    plt.subplots_adjust(bottom=0.5, top=1.5, hspace=1)
    plt.tight_layout()
    roll_site_group.plot(subplots=True, ax=ax)
    fig.savefig(output_path)
    # change permissions to read/write for group
    set_file_permission(output_path)
# Join and Export Data ###################################### # Join season captures with preds season_df = pd.DataFrame.from_dict(season_dict, orient='index') season_df.index.name = 'capture_id' if args['export_only_with_predictions']: df_merged = pd.merge(season_df, df_preds, how='inner', left_index=True, right_index=True) else: df_merged = pd.merge(season_df, df_preds, how='left', left_index=True, right_index=True) df_merged.fillna('', inplace=True) # export sort_df_by_capture_id(df_merged) df_merged.to_csv(args['output_csv'], index=False) logger.info("Wrote {} records to {}".format(df_merged.shape[0], args['output_csv'])) # change permmissions to read/write for group set_file_permission(args['output_csv'])
parser.add_argument("--captures", type=str, required=True)
parser.add_argument("--log_dir", type=str, default=None)
parser.add_argument("--log_filename", type=str, default='generate_actions')

args = vars(parser.parse_args())

# verify that both input files exist before doing any work
action_list_path = args['action_list']
if not os.path.isfile(action_list_path):
    raise FileNotFoundError(
        "action_list {} does not exist -- must be a file".format(
            action_list_path))

captures_path = args['captures']
if not os.path.isfile(captures_path):
    raise FileNotFoundError(
        "captures {} does not exist -- must be a file".format(
            captures_path))

# configure logging
set_logging(args['log_dir'], args['log_filename'])
logger = logging.getLogger(__name__)

# read the captures inventory and the action list
captures = read_image_inventory(captures_path, unique_id='image_name')
action_list = read_image_inventory(action_list_path, unique_id=None)

# build the list of actions to perform
actions_inventory = generate_actions(action_list, captures)

# Export actions list as CSV with group read/write permissions
df = pd.DataFrame.from_records(actions_inventory, columns=Action._fields)
df.to_csv(args['actions_to_perform_csv'], index=False)
set_file_permission(args['actions_to_perform_csv'])
# Fix: the original compared with `is`, which tests object identity and
# is not guaranteed to be True for equal strings -- use `==` instead.
if args['split_order'] == 'random':
    # fixed seed so the random split is reproducible
    random.seed(123)
    random.shuffle(capture_ids)

logger.info("Creating %s splits" % (n_batches))

# write one manifest file per batch slice
for batch_no, (i_start, i_end) in enumerate(slices):
    batch_manifest = OrderedDict()
    # derive the batch file path from the main manifest's name parts
    batch_path = file_path_generator(
        dir=os.path.dirname(args['manifest']),
        id=file_name_parts['id'],
        batch="batch_%s" % (batch_no + 1),
        name=file_name_parts['name'],
        file_delim=file_name_parts['file_delim'],
        file_ext=file_name_parts['file_ext'])
    # copy this slice's captures into the batch manifest
    for batch_id in capture_ids[i_start:i_end]:
        batch_manifest[batch_id] = manifest[batch_id]
    logger.info("Writing batch %s to %s with %s records" %
                (batch_no + 1, batch_path, len(batch_manifest.keys())))
    export_dict_to_json_with_newlines(batch_manifest, batch_path)
    logger.info("Finished writing to {}".format(batch_path))
    # change permissions to read/write for group
    set_file_permission(batch_path)
output_csv = '/home/packerc/shared/machine_learning/data/meta_data/label_overview_all.csv'
root_path = '/home/packerc/shared/zooniverse/SpeciesReports/'

# questions whose answers are not simple yes/no flags
counts_questions = plurality_aggregation_flags['QUESTION_COUNTS']
non_binary_questions = counts_questions + [
    plurality_aggregation_flags['QUESTION_MAIN']
]

# collect one row per (location, season, question, answer) over all reports
reports = find_all_reports(root_path)
stats_list = []
for season_id, path_report in reports.items():
    location, season = season_id.split('_')
    path_cleaned = \
        '/home/packerc/shared/season_captures/{}/cleaned/{}_cleaned.csv'.format(
            location, season_id)
    df_report = pd.read_csv(path_report, dtype='str', index_col=None)
    df_report.fillna('', inplace=True)
    question_stats = get_question_stats(df_report, non_binary_questions)
    for question, question_answers in question_stats.items():
        stats_list.extend(
            [location, season, question, question_answer]
            for question_answer in question_answers)

df_stats = pd.DataFrame(stats_list)
df_stats.columns = ['location', 'season', 'question', 'answer']

# export df with group read/write permissions
df_stats.to_csv(output_csv, index=False)
set_file_permission(output_csv)
# add remaining cols: append every column not already selected,
# preserving the original overall order
output_cols = output_cols + [x for x in all_cols if x not in output_cols]

# drop explicitly excluded columns and, optionally, columns by prefix
excluded_prefixes = []
if exclude_exif_data:
    excluded_prefixes.append('exif__')
if exclude_image_check_flags:
    excluded_prefixes.append('image_check__')
# str.startswith accepts a tuple of prefixes; an empty tuple matches nothing
output_cols = [
    x for x in output_cols
    if x not in exclude_columns
    and not x.startswith(tuple(excluded_prefixes))
]

# select cols
df = df[output_cols]
logger.info("Found the following columns for export: {}".format(
    df.columns))

sort_df(df)

######################################
# Export
######################################

# export with group read/write permissions
df.to_csv(args['captures_cleaned'], index=False)
logger.info("Exported {} records to {}".format(df.shape[0],
                                               args['captures_cleaned']))
set_file_permission(args['captures_cleaned'])
# define tracker file path tracker_file_path = file_path_generator( dir=os.path.dirname(args['manifest']), id=file_name_parts['id'], name='upload_tracker_file', batch=file_name_parts['batch'], file_delim=file_name_parts['file_delim'], file_ext='txt') logger.info( "Defining upload tracker file at: {}".format(tracker_file_path)) # read upload tracker file if not os.path.exists(tracker_file_path): uploader.create_tracker_file(tracker_file_path) set_file_permission(tracker_file_path) tracker_data = uploader.read_tracker_file(tracker_file_path) n_in_tracker_file = len(tracker_data.keys()) logger.info("Found {} already uploaded subjects in tracker file".format( n_in_tracker_file)) ################################### # Load Files ################################### # import manifest with open(args['manifest'], 'r') as f: mani = json.load(f) logger.info("Imported Manifest file {} with {} records".format(
logger.info('{:15} - {:15} - {:10} / {} ({:.2f} %)'.format( question, answer, n, n_tot, percent)) # Print examples logger.info("Show some example classifications") for i, (_id, data) in enumerate(classifications.items()): if i > 10: break logger.info("ID: {}, Data: {}".format(_id, data)) ###################################### # Export Annotations to a File ###################################### # access a random classification and get all the keys of the first # annotation -- this is consistent for all other annotations header = list(classifications[list(classifications.keys())[0]][0].keys()) output_paths = { k: os.path.join(args['output_path'], 'SER_{}_annotations.csv'.format(k)) for k in all_seasons.keys() } legacy_extractor.export_cleaned_annotations(output_paths[s_id], classifications, header, flags, flags_global) # change permmissions to read/write for group set_file_permission(output_paths[s_id])
    'attribution': args['attribution'],
    'license': args['license']
}

# NOTE(review): the statements above/below up to the "Processed" log
# appear to be the body of a per-row loop whose header (and the start of
# the upload_metadata dict) lie outside this view

# store additional information
info = {'uploaded': False}
manifest[capture_id] = {
    'upload_metadata': upload_metadata,
    'info': info,
    'images': []
}

# Add image information
manifest[capture_id]['images'].append(image_path)

# progress log every 10k records
if (row_no % 10000) == 0:
    logger.info("Processed {}/{} records".format(
        row_no, n_records_total))

# summary statistics after processing all rows
logger.info("Omitted %s images due to invalid/no_upload flags" %
            n_omitted_images)
logger.info("Number of images not found in images folder %s" %
            images_not_found_counter)
logger.info("Writing %s captures to %s" %
            (len(manifest.keys()), manifest_path))
export_dict_to_json_with_newlines(manifest, manifest_path)
logger.info("Finished writing to {}".format(manifest_path))

# change permissions to read/write for group
set_file_permission(manifest_path)