def why_galaxies_not_included(galaxies, nsa, bricks):
    """Trace galaxies through each pipeline step to diagnose why they were excluded.

    Args:
        galaxies (astropy.Table): galaxies to trace through the selection pipeline
        nsa (astropy.Table): NSA catalog
        bricks (astropy.Table): DECALS bricks catalog

    Returns:
        (astropy.Table): galaxies surviving every step, with download status columns
    """
    logging.info('Original galaxies: {}'.format(len(galaxies)))

    in_nsa_maybe_duplicate, not_in_nsa = matching_utils.match_galaxies_to_catalog_table(
        galaxies=galaxies,
        catalog=nsa,
        galaxy_suffix='',
        catalog_suffix='_nsa')
    not_in_nsa_save_loc = 'galaxies_not_in_nsa.csv'
    not_in_nsa.to_pandas().to_csv(not_in_nsa_save_loc)
    logging.info('{} galaxies not in NSA listed in {}'.format(
        len(not_in_nsa), not_in_nsa_save_loc))

    # Are they duplicates?
    in_nsa = table.unique(in_nsa_maybe_duplicate, keep='first', keys='sdss_id')
    logging.info(
        'Duplicate NSA cross-matches, selecting {} first matches only'.format(
            len(in_nsa_maybe_duplicate) - len(in_nsa)))

    # Are they in the NSA?
    logging.info('In NSA 1_0_0: {}'.format(len(in_nsa)))

    # Do they pass the selection cuts?
    good_petrotheta = selection_cuts.apply_selection_cuts(in_nsa)
    logging.info('Good petrotheta: {}'.format(len(good_petrotheta)))

    # Are they in decals? Don't apply selection cuts.
    joint_catalog = get_joint_nsa_decals_catalog.create_joint_catalog(
        in_nsa, bricks, '5')
    logging.info('In DECALS bricks: {}'.format(len(joint_catalog)))

    # Are they successfully downloaded?
    fits_dir = download_decals_settings.fits_dir
    png_dir = download_decals_settings.png_dir
    set_download_directory(joint_catalog, fits_dir, png_dir)
    joint_catalog = download_images_threaded.check_images_are_downloaded(
        joint_catalog, n_processes=1)
    image_download_stats(joint_catalog)

    return joint_catalog
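
# A minimal usage sketch for why_galaxies_not_included. The file paths and
# Table.read loaders below are assumptions for illustration, not real pipeline
# settings values.
def example_why_galaxies_not_included():
    from astropy.table import Table
    galaxies = Table.read('galaxies_to_check.fits')  # hypothetical input list
    nsa = Table.read('nsa_v1_0_0.fits')  # hypothetical NSA catalog location
    bricks = Table.read('survey-bricks-dr5.fits')  # hypothetical bricks catalog
    return why_galaxies_not_included(galaxies, nsa, bricks)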
def subjects_not_yet_classified(catalog, subject_extract,
                                classification_extract, workflow_id,
                                start_date):
    """ Filter for galaxies in catalog that are not classified.
    Will return uploaded galaxies with 0 classifications. Do not run with a fresh subject batch.
    Args:
        catalog (astropy.Table): all galaxies, with metadata for upload
        subject_extract (pd.DataFrame): Panoptes subject extract
        classification_extract (pd.DataFrame): Panoptes classification extract
        workflow_id (str): filter classifications and subjects to be from this workflow id
        start_date (datetime.datetime): filter classifications to be made after start_date
    Returns:
        (astropy.Table) galaxies in catalog which have 0 classifications or are not yet uploaded anywhere
    """
    # TODO temporary until I track down the logging level switch
    logging.basicConfig(level=logging.INFO)

    relevant_classifications = classification_extract[
        (pd.to_datetime(classification_extract['created_at']) >= start_date) &
        (classification_extract['workflow_id'].astype(str) == workflow_id)]
    uploaded_subjects = set(relevant_classifications['subject_ids'])
    logging.info('Subjects uploaded since launch: {}'.format(
        len(uploaded_subjects)))

    # subjects must have at least 1 classification since upload. Don't rapidly re-run this.
    # 'subjects_already_added' includes any subject id duplicates: each workflow will have a row for that subject_id
    subjects_already_added = subject_extract[
        (subject_extract['subject_id'].isin(uploaded_subjects)) &
        (subject_extract['workflow_id'].astype(str) == workflow_id)]  # was hardcoded to '6122'; use the workflow_id arg
    logging.info(
        'Unique subjects identified as classified since launch: {}'.format(
            len(subjects_already_added['subject_id'].unique())))

    if not subjects_already_added.empty:
        # get ra and dec from subject metadata
        subjects_already_added = panoptes_utils.load_current_subjects(
            subjects_already_added, workflow=workflow_id, save_loc='temp.csv')
        _, subjects_not_yet_added = matching_utils.match_galaxies_to_catalog_table(
            galaxies=catalog,
            catalog=Table.from_pandas(
                subjects_already_added),  # duplicates don't matter here
            galaxy_suffix='',
            catalog_suffix='_from_extract',
            matching_radius=10. * u.arcsec)
    else:
        logging.warning(
            'Found no previously uploaded subjects with relevant classifications - not filtering!'
        )
        subjects_not_yet_added = catalog.copy()  # was 'joint_catalog', an undefined name here

    return subjects_not_yet_added
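
# A minimal usage sketch for subjects_not_yet_classified. The extract csv paths
# are placeholders; the workflow id and launch date are those used by
# upload_decals_to_panoptes below.
def example_subjects_not_yet_classified(catalog):
    subject_extract = pd.read_csv(
        'panoptes-subjects.csv', dtype={'workflow_id': str})  # hypothetical path
    classification_extract = pd.read_csv(
        'panoptes-classifications.csv', dtype={'workflow_id': str})  # hypothetical path
    return subjects_not_yet_classified(
        catalog=catalog,
        subject_extract=subject_extract,
        classification_extract=classification_extract,
        workflow_id='6122',  # dr5 workflow id
        start_date=datetime.datetime(year=2018, month=3, day=15))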
def download_galaxies_without_cuts(galaxies, nsa, bricks, joint_loc, png_dir,
                                   fits_dir):
    """Download DECALS images for galaxies without applying selection cuts.

    Args:
        galaxies (astropy.Table): galaxies to download
        nsa (astropy.Table): NSA catalog
        bricks (astropy.Table): DECALS bricks catalog (added to signature; was previously an undefined name)
        joint_loc (str): save location for new joint catalog
        png_dir (str): directory to save png images
        fits_dir (str): directory to save fits images
    """
    # let's redownload all, without filtering - make temp joint catalog
    galaxies_with_nsa_maybe_duplicate, _ = matching_utils.match_galaxies_to_catalog_table(
        galaxies=galaxies,
        catalog=nsa,
        galaxy_suffix='',
        catalog_suffix='_nsa')

    # if duplicate match to NSA catalog, pick the first
    galaxies_with_nsa = table.unique(
        galaxies_with_nsa_maybe_duplicate, keys='iauname', keep='first')
    logging.warning(
        'Dropped {} galaxies that matched to the same NSA entry'.format(
            len(galaxies_with_nsa_maybe_duplicate) - len(galaxies_with_nsa)))
    assert len(table.unique(galaxies_with_nsa,
                            keys='iauname')) == len(galaxies_with_nsa)
    logging.info('In NSA: {}'.format(len(galaxies_with_nsa)))

    # dont apply selection cuts
    in_decals_bricks = get_joint_nsa_decals_catalog.create_joint_catalog(
        nsa=galaxies_with_nsa, bricks=bricks, data_release='5')
    assert len(table.unique(in_decals_bricks,
                            keys='iauname')) == len(in_decals_bricks)

    for directory in [png_dir, fits_dir]:  # renamed from 'dir' to avoid shadowing the builtin
        if not os.path.isdir(directory):
            os.mkdir(directory)

    joint_catalog = download_images_threaded.download_images_multithreaded(
        in_decals_bricks,
        '5',
        fits_dir,
        png_dir,
        overwrite_fits=False,
        overwrite_png=False)
    logging.info('Downloaded {} galaxies without cuts'.format(
        len(joint_catalog)))  # was passing the whole table to format
    image_download_stats(joint_catalog)

    joint_catalog['iauname'] = list(
        map(str, joint_catalog['iauname']))  # avoid dtype problems
    joint_catalog.write(
        joint_loc, overwrite=False)  # not allowed to overwrite, for safety
    logging.info('Written new joint catalog to {} for uploader'.format(
        joint_loc))  # was 'joint_catalog_loc', an undefined name
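
# A minimal usage sketch for download_galaxies_without_cuts; the save locations
# are placeholders, not real settings values.
def example_download_galaxies_without_cuts(galaxies, nsa, bricks):
    download_galaxies_without_cuts(
        galaxies=galaxies,
        nsa=nsa,
        bricks=bricks,
        joint_loc='temp_joint_catalog.fits',  # hypothetical save location
        png_dir='temp_png',
        fits_dir='temp_fits')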
def test_match_galaxies_to_catalog_table(galaxies, catalog):
    matched, unmatched = matching_utils.match_galaxies_to_catalog_table(
        galaxies, catalog)
    assert matched['name'] == ['a']
    assert unmatched['name'] == ['b']
    assert set(matched.colnames) == {
        'dec_subject', 'galaxy_data', 'name_subject', 'ra_subject',
        'z_subject', 'best_match', 'sky_separation', 'dec', 'name', 'ra',
        'table_data', 'z'
    }
    assert set(unmatched.colnames) == {
        'dec', 'name', 'ra', 'z', 'best_match', 'sky_separation',
        'galaxy_data'
    }
def test_match_galaxies_to_catalog_table_right_join(galaxies, catalog):
    matched, unmatched = matching_utils.match_galaxies_to_catalog_table(
        galaxies, catalog, join_type='right')
    # should include both (right) catalog galaxies, but not the unmatched (left) galaxy
    assert set(matched['name']) == {'a', 'c'}
    assert unmatched['name'] == ['b']
    assert set(matched.colnames) == {
        'dec_subject', 'galaxy_data', 'name_subject', 'ra_subject',
        'z_subject', 'best_match', 'sky_separation', 'dec', 'name', 'ra',
        'table_data', 'z'
    }
    assert set(unmatched.colnames) == {
        'dec', 'name', 'ra', 'z', 'best_match', 'sky_separation',
        'galaxy_data'
    }
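
# A sketch of the pytest fixtures these two tests assume, inferred from the
# asserted column names: galaxy 'a' lies within the matching radius of catalog
# entry 'a', 'b' has no catalog counterpart, and 'c' appears only in the
# catalog. The coordinates and data values are illustrative only.
import pytest
from astropy.table import Table


@pytest.fixture()
def galaxies():
    return Table({
        'name': ['a', 'b'],
        'ra': [10.0, 20.0],
        'dec': [1.0, 2.0],
        'z': [0.05, 0.07],
        'galaxy_data': ['x', 'y']})


@pytest.fixture()
def catalog():
    return Table({
        'name': ['a', 'c'],
        'ra': [10.0, 30.0],  # 'a' coincides with galaxy 'a'; 'c' matches nothing
        'dec': [1.0, 3.0],
        'z': [0.05, 0.09],
        'table_data': ['p', 'q']})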
def get_expert_catalog_joined_with_decals(decals_catalog,
                                          expert_catalog,
                                          plot=False):
    """
    Match Nair 2010 to the joint nsa-decals catalog.
    Decode Nair's binary type encoding. Add convenience columns to indicate bar or ring.
    Optionally, plot bar/ring statistics.
    Args:
        decals_catalog (astropy.Table): catalog of nsa galaxies in decals
        expert_catalog (astropy.Table): Nair 2010 expert classifications
        plot (bool): if True, make bar charts of T-Type, Bar and Ring counts in output catalog
    Returns:
        (astropy.Table): matched catalog, with extra bar/ring columns
    """
    output_catalog, _ = matching_utils.match_galaxies_to_catalog_table(
        galaxies=expert_catalog,
        catalog=decals_catalog,
        matching_radius=5 * u.arcsec,
        galaxy_suffix='_expert')
    if plot:
        save_output_catalog_statistics(output_catalog)
    return output_catalog
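
# A minimal usage sketch for get_expert_catalog_joined_with_decals; the Nair
# 2010 table path is a placeholder.
def example_expert_catalog_join(decals_catalog):
    expert_catalog = Table.read('nair_2010.fits')  # hypothetical location
    return get_expert_catalog_joined_with_decals(
        decals_catalog, expert_catalog, plot=True)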
def upload_decals_to_panoptes(joint_catalog_all, previous_subjects,
                              expert_catalog, calibration_dir):
    """
    Using the DECALS joint catalog created by a_download_decals, upload DECALS sets to Panoptes.
    Only upload new galaxies, checked against previously used subjects.
    Create calibration images with different rgb conversions to check if classifications are affected.
    Upload the calibration images.
    Args:
        joint_catalog_all (astropy.Table): NSA subjects imaged by DECALS. Includes png_loc, png_ready columns.
        previous_subjects (astropy.Table): subjects already uploaded in DR1/DR2
        expert_catalog (astropy.Table): Nair 2010 (human expert) catalog of rings, bars, etc.
        calibration_dir (str): directory to save calibration images
    Returns:
        None
    """
    logging.info('Galaxies in joint catalog: {}'.format(
        len(joint_catalog_all)))
    logging.info('fits in joint catalog: {}'.format(
        joint_catalog_all['fits_ready'].sum()))

    joint_catalog = joint_catalog_all.copy()
    joint_catalog = joint_catalog[joint_catalog['png_ready'] == True]
    joint_catalog = joint_catalog[joint_catalog['fits_filled'] == True]

    # unmatched galaxies are new
    dr2_galaxies, dr5_only_galaxies = matching_utils.match_galaxies_to_catalog_table(
        galaxies=joint_catalog,
        catalog=previous_subjects,
        galaxy_suffix='',
        catalog_suffix='_dr1_2')  # if field exists in both catalogs
    logging.info('Previously classified galaxies: {}'.format(
        len(dr2_galaxies)))
    logging.info('New galaxies: {}'.format(len(dr5_only_galaxies)))
    # TODO something after here is resetting the log value

    # use Nair galaxies previously classified in DR2
    calibration_catalog = get_expert_catalog_joined_with_decals(
        dr2_galaxies, expert_catalog)
    # print(len(calibration_catalog))

    # calibration_set_name = 'decals_dr2_nair_calibration_dr2_style_all'
    # calibration_catalog_dr2_style = make_catalog_png_images(
    #     calibration_catalog[:20],
    #     image_utils.get_dr2_style_image,
    #     '{}/{}'.format(calibration_dir, calibration_set_name),
    #     size=424,
    #     overwrite=True)

    """ upload standard calibration set of Nair/DR2 galaxies, coloured by DR1/2 rules """
    # _ = upload_subject_set.upload_nair_calibration_subject_set(
    #     calibration_catalog, calibration_set_name)

    """ upload all Nair/DR2 galaxies, coloured by Lupton rules """
    # calibration_set_name = 'decals_dr2_nair_lupton_style_all'
    # calibration_catalog_lupton_style = make_catalog_png_images(
    #     calibration_catalog,
    #     image_utils.get_colour_style_image,
    #     '{}/{}'.format(calibration_dir, calibration_set_name),
    #     size=424,
    #     overwrite=True)  # places new png in calibration folder under this name
    # upload_subject_set.upload_galaxy_subject_set(
    #     calibration_catalog_lupton_style, calibration_set_name)

    """ upload all Nair/DR2 galaxies, coloured by DR2 rules """
    # calibration_set_name = 'decals_dr2_nair_dr2_style_all'
    # calibration_catalog_dr2_style = make_catalog_png_images(
    #     calibration_catalog,
    #     image_utils.get_dr2_style_image,
    #     '{}/{}'.format(calibration_dir, calibration_set_name),
    #     size=424,
    #     overwrite=False)
    # _ = upload_subject_set.upload_galaxy_subject_set(calibration_catalog_dr2_style, calibration_set_name)

    """ upload first n DR2-only galaxies """
    # dr2_only_name = 'first_1k_decals_dr2'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr2_galaxies[:1000], dr2_only_name)

    """ upload first n DR5-only galaxies """
    # dr5_only_name = 'first_3k_decals_dr5_only'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr5_only_galaxies[:3000], dr5_only_name)
    # dr5_only_name = '3k_to_5k_decals_dr5_only'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr5_only_galaxies[3000:5000], dr5_only_name)
    # dr5_only_name = '10k_to_30k_decals_dr5_only'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr5_only_galaxies[10000:30000], dr5_only_name)

    """ Upload first n DR5-only galaxies NOT already uploaded
    Must redo exports before uploading new galaxies. Alternative: use endpoint API
    """
    latest_export_date_str = '2018-11-05'
    logging.info(
        'Uploading first n DR5 galaxies NOT already uploaded as of {}'.format(
            latest_export_date_str))

    latest_workflow_classification_export_loc = '/data/galaxy_zoo/decals/panoptes/reduction/raw/classifications/extracts/{}_panoptes-classifications.csv'.format(
        latest_export_date_str)
    previous_classifications = pd.read_csv(
        latest_workflow_classification_export_loc,
        dtype={'workflow_id': str},
        parse_dates=['created_at'],
        usecols=['created_at', 'workflow_id', 'subject_ids'])

    latest_subject_extract_loc = '/data/galaxy_zoo/decals/panoptes/reduction/raw/subjects/{}_panoptes-subjects.csv'.format(
        latest_export_date_str)
    uploaded_subjects = pd.read_csv(
        latest_subject_extract_loc,
        dtype={'workflow_id': str},
        usecols=['workflow_id', 'subject_id', 'metadata', 'locations'])

    subjects_not_yet_added = subjects_not_yet_classified(
        catalog=dr5_only_galaxies,
        subject_extract=uploaded_subjects,
        classification_extract=previous_classifications,
        workflow_id='6122',  # dr5 workflow id
        start_date=datetime.datetime(
            year=2018, month=3, day=15))  # public launch date of DR5
    logging.info(
        'Galaxies in catalog not yet classified (to upload): {}'.format(
            len(subjects_not_yet_added)))

    subjects_not_yet_added_name = '5k_subjects_not_yet_classified'
    max_new_subjects = 5000
    _ = upload_subject_set.upload_galaxy_subject_set(
        subjects_not_yet_added[:max_new_subjects], subjects_not_yet_added_name)
    logging.info('Subject set {} successfully uploaded'.format(
        subjects_not_yet_added_name))
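
# A minimal sketch of driving upload_decals_to_panoptes; all catalog paths and
# the calibration directory are placeholders for the real settings values.
def example_upload_decals_to_panoptes():
    joint_catalog_all = Table.read('joint_catalog.fits')  # hypothetical
    previous_subjects = Table.read('previous_subjects.fits')  # hypothetical
    expert_catalog = Table.read('nair_2010.fits')  # hypothetical
    upload_decals_to_panoptes(
        joint_catalog_all,
        previous_subjects,
        expert_catalog,
        calibration_dir='calibration_images')  # hypothetical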
def enforce_joint_catalog_columns(joint_catalog, overwrite_cache=False):
    """
    Make sure joint catalog has the required columns for Panoptes upload.
    If not, load the cached NSA catalog and add them.
    If no cached NSA catalog, make one.
    Args:
        joint_catalog (astropy.Table): NSA and DECALS galaxies. Potentially only including a column subset.
        overwrite_cache (bool): if True, always make a new NSA cache. If False, only make cache if none exists.
    Returns:
        (astropy.Table): joint catalog with all missing catalog columns added
    """
    # png_loc and fits_loc must have been added to joint catalog by this point by a downloader - raise error if not
    required_user_cols = ['png_loc', 'fits_loc']
    for user_col in required_user_cols:
        assert user_col in set(joint_catalog.colnames)

    required_data_cols = [
        'nsa_id',
        'iauname',
        'ra',
        'dec',
        'petroth50',
        'petrotheta',
        'petroflux',
        'nsa_version',
        'z',
        'mag',
        'absmag',
        'nmgy',
    ]
    # was 'set(...) not in set(...)', which raises TypeError (sets are unhashable);
    # a subset check is what's intended
    if not set(required_data_cols).issubset(joint_catalog.colnames):
        print('Warning: joint catalog is missing columns: {}'.format(
            set(required_data_cols) - set(joint_catalog.colnames)))

    if not os.path.exists(settings.nsa_cached_loc) or overwrite_cache:
        print('No cache found - creating new cache at {}'.format(
            settings.nsa_cached_loc))
        kwargs = {'nsa_version': settings.nsa_version}
        astropy_utils.cache_table(settings.nsa_catalog_loc,
                                  settings.nsa_cached_loc, required_data_cols,
                                  get_nsa_catalog, kwargs)

    # cached nsa table has already been through get_nsa_catalog
    cached_nsa = Table.read(settings.nsa_cached_loc)

    # keep only the NSA columns not already in the joint catalog, plus ra/dec for matching
    catalog_cols = joint_catalog.colnames
    nsa_cols = cached_nsa.colnames
    cached_nsa = cached_nsa[list(set(nsa_cols) - set(catalog_cols)) +
                            ['ra', 'dec']]

    # add the missing data columns
    expanded_joint_catalog, _ = matching_utils.match_galaxies_to_catalog_table(
        joint_catalog, cached_nsa)
    assert len(expanded_joint_catalog) == len(joint_catalog)

    return expanded_joint_catalog
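
# A minimal usage sketch for enforce_joint_catalog_columns, assuming a joint
# catalog saved by the downloader at a placeholder path. The catalog must
# already carry png_loc and fits_loc columns.
def example_enforce_joint_catalog_columns():
    joint_catalog = Table.read('joint_catalog.fits')  # hypothetical location
    expanded = enforce_joint_catalog_columns(joint_catalog,
                                             overwrite_cache=False)
    assert len(expanded) == len(joint_catalog)  # matching should not drop rows
    return expanded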