Example #1
def why_galaxies_not_included(galaxies, nsa, bricks):
    logging.info('Original galaxies: {}'.format(len(galaxies)))
    in_nsa_maybe_duplicate, not_in_nsa = matching_utils.match_galaxies_to_catalog_table(
        galaxies=galaxies,
        catalog=nsa,
        galaxy_suffix='',
        catalog_suffix='_nsa')
    not_in_nsa_save_loc = 'galaxies_not_in_nsa.csv'
    not_in_nsa.to_pandas().to_csv(not_in_nsa_save_loc)
    logging.info('{} galaxies not in NSA listed in {}'.format(
        len(not_in_nsa), not_in_nsa_save_loc))
    # Are they duplicates?
    in_nsa = table.unique(in_nsa_maybe_duplicate, keep='first', keys='sdss_id')
    logging.info(
        'Duplicate NSA cross-matches, selecting {} first matches only'.format(
            len(in_nsa_maybe_duplicate) - len(in_nsa)))
    # Are they in the NSA?
    logging.info('In NSA 1_0_0: {}'.format(len(in_nsa)))
    # Do they pass the selection cuts?
    good_petrotheta = selection_cuts.apply_selection_cuts(in_nsa)
    logging.info('Good petrotheta: {}'.format(len(good_petrotheta)))
    # Are they in decals?
    joint_catalog = get_joint_nsa_decals_catalog.create_joint_catalog(
        in_nsa, bricks, '5')  # don't apply selection cuts
    logging.info('In DECALS bricks: {}'.format(len(joint_catalog)))
    # Are they successfully downloaded?
    fits_dir = download_decals_settings.fits_dir
    png_dir = download_decals_settings.png_dir
    set_download_directory(joint_catalog, fits_dir, png_dir)
    joint_catalog = download_images_threaded.check_images_are_downloaded(
        joint_catalog, n_processes=1)
    image_download_stats(joint_catalog)
    return joint_catalog
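
Most of the counting above leans on matching_utils.match_galaxies_to_catalog_table, which is not shown on this page. Below is a minimal sketch of the kind of positional cross-match it presumably performs, using astropy's SkyCoord.match_to_catalog_sky; the function body, default radius, and return signature here are assumptions, not the project's actual implementation.

import astropy.units as u
from astropy.coordinates import SkyCoord
from astropy.table import hstack

def match_galaxies_to_catalog_sketch(galaxies, catalog, matching_radius=1. * u.arcsec):
    # Illustrative sky cross-match only; not the project's matching_utils.
    galaxy_coords = SkyCoord(ra=galaxies['ra'] * u.deg, dec=galaxies['dec'] * u.deg)
    catalog_coords = SkyCoord(ra=catalog['ra'] * u.deg, dec=catalog['dec'] * u.deg)
    # for each galaxy, find the nearest catalog entry on the sky
    best_match, sky_separation, _ = galaxy_coords.match_to_catalog_sky(catalog_coords)
    matched_mask = sky_separation < matching_radius
    # the real implementation also applies galaxy/catalog column suffixes
    matched = hstack([galaxies[matched_mask], catalog[best_match[matched_mask]]])
    unmatched = galaxies[~matched_mask]
    return matched, unmatched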
Example #2
def subjects_not_yet_classified(catalog, subject_extract,
                                classification_extract, workflow_id,
                                start_date):
    """
    Filter for galaxies in catalog that are not yet classified.
    Will return uploaded galaxies with 0 classifications. Do not run with a fresh subject batch.
    Args:
        catalog (astropy.Table): all galaxies, with metadata for upload
        subject_extract (pd.DataFrame): Panoptes subject extract
        classification_extract (pd.DataFrame): Panoptes classification extract
        workflow_id (str): filter classifications and subjects to be from this workflow id
        start_date (datetime.datetime): filter classifications to be made after start_date

    Returns:
        (astropy.Table): galaxies in catalog which have 0 classifications or are not yet uploaded anywhere
    """
    # TODO temporary until I track down the logging level switch
    logging.basicConfig(level=logging.INFO)

    relevant_classifications = classification_extract[
        (pd.to_datetime(classification_extract['created_at']) >= start_date)
        & (classification_extract['workflow_id'].astype(str) == workflow_id)]

    uploaded_subjects = set(relevant_classifications['subject_ids'])
    logging.info('Subjects uploaded since launch: {}'.format(
        len(uploaded_subjects)))
    # subjects must have at least 1 classification since upload. Don't rapidly re-run this.

    # 'subjects_already_added' includes any subject id duplicates: each workflow will have a row for that subject_id
    subjects_already_added = subject_extract[
        (subject_extract['subject_id'].isin(uploaded_subjects))
        & (subject_extract['workflow_id'].astype(str) == workflow_id)]
    logging.info(
        'Unique subjects identified as classified since launch: {}'.format(
            len(subjects_already_added['subject_id'].unique())))

    if not subjects_already_added.empty:
        # get ra and dec from subject metadata
        subjects_already_added = panoptes_utils.load_current_subjects(
            subjects_already_added, workflow=workflow_id, save_loc='temp.csv')
        _, subjects_not_yet_added = matching_utils.match_galaxies_to_catalog_table(
            galaxies=catalog,
            catalog=Table.from_pandas(
                subjects_already_added),  # duplicates don't matter here
            galaxy_suffix='',
            catalog_suffix='_from_extract',
            matching_radius=10. * u.arcsec)
    else:
        logging.warning(
            'Found no previously uploaded subjects with relevant classifications - not filtering!'
        )
        subjects_not_yet_added = catalog.copy()

    return subjects_not_yet_added
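
A usage sketch, mirroring how Example #7 below calls this function; the extract file paths are hypothetical placeholders.

import datetime

import pandas as pd
from astropy.table import Table

catalog = Table.read('joint_catalog.fits')  # hypothetical location
subject_extract = pd.read_csv(
    'panoptes-subjects.csv',  # hypothetical location
    dtype={'workflow_id': str},
    usecols=['workflow_id', 'subject_id', 'metadata', 'locations'])
classification_extract = pd.read_csv(
    'panoptes-classifications.csv',  # hypothetical location
    dtype={'workflow_id': str},
    parse_dates=['created_at'],
    usecols=['created_at', 'workflow_id', 'subject_ids'])

to_upload = subjects_not_yet_classified(
    catalog=catalog,
    subject_extract=subject_extract,
    classification_extract=classification_extract,
    workflow_id='6122',  # DR5 workflow id
    start_date=datetime.datetime(year=2018, month=3, day=15))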
Example #3
def download_galaxies_without_cuts(galaxies, nsa, bricks, joint_loc, png_dir,
                                   fits_dir):
    """Download FITS and PNG images for every galaxy, with no selection cuts applied.

    Args:
        galaxies (astropy.Table): galaxies to download
        nsa (astropy.Table): NSA catalog to cross-match against
        bricks (astropy.Table): DECALS brick coordinates (needed by create_joint_catalog)
        joint_loc (str): save location for new joint catalog
        png_dir (str): directory in which to save PNG images
        fits_dir (str): directory in which to save FITS images
    """

    # let's redownload all, without filtering - make temp joint catalog
    galaxies_with_nsa_maybe_duplicate, _ = matching_utils.match_galaxies_to_catalog_table(
        galaxies=galaxies,
        catalog=nsa,
        galaxy_suffix='',
        catalog_suffix='_nsa')
    # if duplicate match to NSA catalog, pick the first
    galaxies_with_nsa = table.unique(galaxies_with_nsa_maybe_duplicate,
                                     keys='iauname',
                                     keep='first')
    logging.warning(
        'Dropped {} galaxies that matched to the same NSA entry'.format(
            len(galaxies_with_nsa_maybe_duplicate) - len(galaxies_with_nsa)))
    assert len(table.unique(galaxies_with_nsa,
                            keys='iauname')) == len(galaxies_with_nsa)

    logging.info('In NSA: {}'.format(len(galaxies_with_nsa)))
    in_decals_bricks = get_joint_nsa_decals_catalog.create_joint_catalog(
        nsa=galaxies_with_nsa, bricks=bricks,
        data_release='5')  # don't apply selection cuts
    assert len(table.unique(in_decals_bricks,
                            keys='iauname')) == len(in_decals_bricks)

    for directory in [png_dir, fits_dir]:
        if not os.path.isdir(directory):
            os.mkdir(directory)

    joint_catalog = download_images_threaded.download_images_multithreaded(
        in_decals_bricks,
        '5',
        fits_dir,
        png_dir,
        overwrite_fits=False,
        overwrite_png=False)
    logging.info('Downloaded {} galaxies without cuts'.format(len(joint_catalog)))
    image_download_stats(joint_catalog)
    joint_catalog['iauname'] = list(
        map(str, joint_catalog['iauname']))  # avoid dtype problems
    joint_catalog.write(
        joint_loc, overwrite=False)  # not allowed to overwrite, for safety
    logging.info('Written new joint catalog to {} for uploader'.format(
        joint_loc))
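
A usage sketch for the function above (with bricks passed explicitly, per the fixed signature); all paths are hypothetical placeholders.

from astropy.table import Table

galaxies = Table.read('galaxies.fits')  # hypothetical location
nsa = Table.read('nsa_v1_0_0.fits')  # hypothetical location
bricks = Table.read('survey-bricks-dr5.fits')  # hypothetical location

download_galaxies_without_cuts(
    galaxies=galaxies,
    nsa=nsa,
    bricks=bricks,
    joint_loc='joint_catalog_no_cuts.fits',
    png_dir='png_no_cuts',
    fits_dir='fits_no_cuts')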
Example #4

def test_match_galaxies_to_catalog_table(galaxies, catalog):

    matched, unmatched = matching_utils.match_galaxies_to_catalog_table(
        galaxies, catalog)

    assert matched['name'] == ['a']
    assert unmatched['name'] == ['b']

    assert set(matched.colnames) == {
        'dec_subject', 'galaxy_data', 'name_subject', 'ra_subject',
        'z_subject', 'best_match', 'sky_separation', 'dec', 'name', 'ra',
        'table_data', 'z'
    }
    assert set(unmatched.colnames) == {
        'dec', 'name', 'ra', 'z', 'best_match', 'sky_separation', 'galaxy_data'
    }
Example #5

def test_match_galaxies_to_catalog_table_right_join(galaxies, catalog):

    matched, unmatched = matching_utils.match_galaxies_to_catalog_table(
        galaxies, catalog, join_type='right')

    assert set(matched['name']) == {
        'a', 'c'
    }  # should include both (right) catalog galaxies, but not the unmatched (left) galaxy
    assert unmatched['name'] == ['b']

    assert set(matched.colnames) == {
        'dec_subject', 'galaxy_data', 'name_subject', 'ra_subject',
        'z_subject', 'best_match', 'sky_separation', 'dec', 'name', 'ra',
        'table_data', 'z'
    }
    assert set(unmatched.colnames) == {
        'dec', 'name', 'ra', 'z', 'best_match', 'sky_separation', 'galaxy_data'
    }
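
The galaxies and catalog pytest fixtures used by these two tests are not shown on this page. Below is a minimal sketch of fixtures consistent with the asserted column names; the coordinates and values are illustrative assumptions.

import pytest
from astropy.table import Table

@pytest.fixture()
def galaxies():
    # galaxy 'a' lies on top of catalog entry 'a'; galaxy 'b' is far from everything
    return Table({
        'name': ['a', 'b'],
        'ra': [10., 100.],
        'dec': [10., 80.],
        'z': [0.05, 0.05],
        'galaxy_data': ['a_data', 'b_data']})

@pytest.fixture()
def catalog():
    # catalog entry 'c' has no nearby galaxy, so it only appears in right joins
    return Table({
        'name': ['a', 'c'],
        'ra': [10., 200.],
        'dec': [10., -80.],
        'z': [0.05, 0.05],
        'table_data': ['a_table', 'c_table']})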
Example #6
def get_expert_catalog_joined_with_decals(decals_catalog,
                                          expert_catalog,
                                          plot=False):
    """
    Match Nair 2010 to the joint nsa-decals catalog. Decode Nair's binary type encoding.
    Add convenience columns to indicate bar or ring. Optionally, plot bar/ring statistics.
    Args:
        decals_catalog (astropy.Table): catalog of nsa galaxies in decals
        expert_catalog (astropy.Table): Nair 2010 expert classifications
        plot (bool): if True, make bar charts of T-Type, Bar and Ring counts in output catalog

    Returns:
        (astropy.Table): matched catalog, with extra bar/ring columns
    """
    output_catalog, _ = matching_utils.match_galaxies_to_catalog_table(
        galaxies=expert_catalog,
        catalog=decals_catalog,
        matching_radius=5 * u.arcsec,
        galaxy_suffix='_expert')

    if plot:
        save_output_catalog_statistics(output_catalog)

    return output_catalog
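
The docstring mentions convenience bar/ring columns; the decoding itself happens elsewhere. An illustrative sketch of the pattern, assuming Nair's 'bar' and 'ring' columns are integer codes where any non-zero value marks the feature; both the column names and the encoding are assumptions.

def add_bar_ring_flags(output_catalog):
    # assumption: a non-zero Nair code means the feature is present
    output_catalog['has_bar'] = output_catalog['bar'] != 0
    output_catalog['has_ring'] = output_catalog['ring'] != 0
    return output_catalog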
Example #7
def upload_decals_to_panoptes(joint_catalog_all, previous_subjects,
                              expert_catalog, calibration_dir):
    """
    Using the DECALS joint catalog created by a_download_decals, upload DECALS subject sets to Panoptes.
    Only upload new galaxies, by checking against previously-used subjects.
    Create calibration images with different RGB conversions to check if classifications are affected.
    Upload the calibration images.

    Args:
        joint_catalog_all (astropy.Table): NSA subjects imaged by DECALS. Includes png_loc, png_ready columns.
        previous_subjects (astropy.Table): galaxy subjects previously uploaded to Panoptes (e.g. from DR1/DR2)
        expert_catalog (astropy.Table): Nair 2010 (human expert) catalog of rings, bars, etc.
        calibration_dir (str): directory to save calibration images

    Returns:
        None
    """

    logging.info('Galaxies in joint catalog: {}'.format(
        len(joint_catalog_all)))
    logging.info('fits in joint catalog: {}'.format(
        joint_catalog_all['fits_ready'].sum()))

    joint_catalog = joint_catalog_all.copy()
    joint_catalog = joint_catalog[joint_catalog['png_ready'] == True]
    joint_catalog = joint_catalog[joint_catalog['fits_filled'] == True]

    dr2_galaxies, dr5_only_galaxies = matching_utils.match_galaxies_to_catalog_table(  # unmatched galaxies are new
        galaxies=joint_catalog,
        catalog=previous_subjects,
        galaxy_suffix='',
        catalog_suffix='_dr1_2')  # if field exists in both catalogs

    logging.info('Previously classified galaxies: {}'.format(
        len(dr2_galaxies)))
    logging.info('New galaxies: {}'.format(len(dr5_only_galaxies)))
    # TODO something after here is resetting the log value

    # use Nair galaxies previously classified in DR2
    calibration_catalog = get_expert_catalog_joined_with_decals(
        dr2_galaxies, expert_catalog)
    # print(len(calibration_catalog))

    # calibration_set_name = 'decals_dr2_nair_calibration_dr2_style_all'

    # calibration_catalog_dr2_style = make_catalog_png_images(
    #     calibration_catalog[:20],
    #     image_utils.get_dr2_style_image,
    #     '{}/{}'.format(calibration_dir, calibration_set_name),
    #     size=424,
    #     overwrite=True)
    """
    upload standard calibration set of Nair/DR2 galaxies, coloured by DR1/2 rules
    """
    # _ = upload_subject_set.upload_nair_calibration_subject_set(
    #     calibration_catalog, calibration_set_name)
    """
    upload all Nair/DR2 galaxies, coloured by Lupton rules
    """
    # calibration_set_name = 'decals_dr2_nair_lupton_style_all'
    # calibration_catalog_lupton_style = make_catalog_png_images(
    #     calibration_catalog,
    #     image_utils.get_colour_style_image,
    #     '{}/{}'.format(calibration_dir, calibration_set_name),
    #     size=424,
    #     overwrite=True)  # places new png in calibration folder under this name
    # upload_subject_set.upload_galaxy_subject_set(
    #     calibration_catalog_lupton_style, calibration_set_name)
    """
    upload all Nair/DR2 galaxies, coloured by DR2 rules
    """
    # calibration_set_name = 'decals_dr2_nair_dr2_style_all'
    # calibration_catalog_dr2_style = make_catalog_png_images(
    #     calibration_catalog,
    #     image_utils.get_dr2_style_image,
    #     '{}/{}'.format(calibration_dir, calibration_set_name),
    #     size=424,
    #     overwrite=False)
    # _ = upload_subject_set.upload_galaxy_subject_set(calibration_catalog_dr2_style, calibration_set_name)
    """
    upload first n DR2-only galaxies
    """
    # dr2_only_name = 'first_1k_decals_dr2'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr2_galaxies[:1000], dr2_only_name)
    """
    upload first n DR5-only galaxies
    """
    # dr5_only_name = 'first_3k_decals_dr5_only'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr5_only_galaxies[:3000], dr5_only_name)
    # dr5_only_name = '3k_to_5k_decals_dr5_only'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr5_only_galaxies[3000:5000], dr5_only_name)
    # dr5_only_name = '10k_to_30k_decals_dr5_only'
    # _ = upload_subject_set.upload_galaxy_subject_set(dr5_only_galaxies[10000:30000], dr5_only_name)
    """
    Upload first n DR5-only galaxies NOT already uploaded
    Must redo exports before uploading new galaxies. 
    Alternative: use endpoint API
    """
    latest_export_date_str = '2018-11-05'
    logging.info(
        'Uploading first n DR5 galaxies NOT already uploaded as of {}'.format(
            latest_export_date_str))
    latest_workflow_classification_export_loc = '/data/galaxy_zoo/decals/panoptes/reduction/raw/classifications/extracts/{}_panoptes-classifications.csv'.format(
        latest_export_date_str)
    previous_classifications = pd.read_csv(
        latest_workflow_classification_export_loc,
        dtype={'workflow_id': str},
        parse_dates=['created_at'],
        usecols=['created_at', 'workflow_id', 'subject_ids'])

    latest_subject_extract_loc = '/data/galaxy_zoo/decals/panoptes/reduction/raw/subjects/{}_panoptes-subjects.csv'.format(
        latest_export_date_str)
    uploaded_subjects = pd.read_csv(
        latest_subject_extract_loc,
        dtype={'workflow_id': str},
        usecols=['workflow_id', 'subject_id', 'metadata', 'locations'])

    subjects_not_yet_added = subjects_not_yet_classified(
        catalog=dr5_only_galaxies,
        subject_extract=uploaded_subjects,
        classification_extract=previous_classifications,
        workflow_id='6122',  # dr5 workflow id
        start_date=datetime.datetime(year=2018, month=3,
                                     day=15))  # public launch date of DR5

    logging.info(
        'Galaxies in catalog not yet classified (to upload): {}'.format(
            len(subjects_not_yet_added)))
    subjects_not_yet_added_name = '5k_subjects_not_yet_classified'
    max_new_subjects = 5000
    _ = upload_subject_set.upload_galaxy_subject_set(
        subjects_not_yet_added[:max_new_subjects], subjects_not_yet_added_name)
    logging.info('Subject set {} successfully uploaded'.format(
        subjects_not_yet_added_name))
    """
Example #8
def enforce_joint_catalog_columns(joint_catalog, overwrite_cache=False):
    """
    Make sure joint catalog has the required columns for Panoptes upload.
    If not, load the cached NSA catalog and add them.
    If no cached NSA catalog, make one.
    Args:
        joint_catalog (astropy.Table): NSA and DECALS galaxies. Potentially only including a column subset.
        overwrite_cache (bool): if True, always make a new NSA cache. If False, only make cache if none exists.

    Returns:
        (astropy.Table): joint catalog with all missing catalog columns added
    """

    # png_loc and fits_loc must have been added to joint catalog by this point by a downloader - raise error if not
    required_user_cols = ['png_loc', 'fits_loc']
    for user_col in required_user_cols:
        assert user_col in set(joint_catalog.colnames)

    required_data_cols = [
        'nsa_id',
        'iauname',
        'ra',
        'dec',
        'petroth50',
        'petrotheta',
        'petroflux',
        'nsa_version',
        'z',
        'mag',
        'absmag',
        'nmgy',
    ]

    if not set(required_data_cols).issubset(set(joint_catalog.colnames)):
        print('Warning: joint catalog is missing columns: {}'.format(
            set(required_data_cols) - set(joint_catalog.colnames)))

        if not os.path.exists(settings.nsa_cached_loc) or overwrite_cache:
            print('No cache found (or overwrite requested) - creating new cache at {}'.format(
                settings.nsa_cached_loc))
            kwargs = {'nsa_version': settings.nsa_version}
            astropy_utils.cache_table(settings.nsa_catalog_loc,
                                      settings.nsa_cached_loc,
                                      required_data_cols, get_nsa_catalog,
                                      kwargs)

        cached_nsa = Table.read(
            settings.nsa_cached_loc
        )  # cached nsa table has already been through get_nsa_catalog

        # exclude columns not already included
        catalog_cols = joint_catalog.colnames
        nsa_cols = cached_nsa.colnames
        cached_nsa = cached_nsa[list(set(nsa_cols) - set(catalog_cols)) +
                                ['ra', 'dec']]

        # add the missing data columns
        expanded_joint_catalog, _ = matching_utils.match_galaxies_to_catalog_table(
            joint_catalog, cached_nsa)

        assert len(expanded_joint_catalog) == len(joint_catalog)

        return expanded_joint_catalog

    return joint_catalog  # all required columns already present
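
A usage sketch; the catalog location is a hypothetical placeholder. The input must already include the 'png_loc' and 'fits_loc' columns added by a downloader.

from astropy.table import Table

joint_catalog = Table.read('joint_catalog.fits')  # hypothetical location
joint_catalog = enforce_joint_catalog_columns(joint_catalog, overwrite_cache=False)
assert {'nsa_id', 'iauname', 'ra', 'dec'}.issubset(joint_catalog.colnames)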