示例#1
0
    return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
                          taxonomy_string, match)

# ...def get_preferred_taxonomic_match()


#%% Initialization

# Runs once at module level, before any of the lookups below.
# NOTE(review): presumably this loads the taxonomy tables that
# get_taxonomic_info() queries; its definition is not in this section - confirm.
initialize_taxonomy_lookup()


#%% Test single-query lookup

# Scratch cell for interactive use: the `if False:` guard keeps it from
# executing when the file is run top to bottom; the inner `#%%` marker lets
# an IDE run just these lines by hand.
if False:
    #%%
    matches = get_taxonomic_info('lion')
    print_taxonomy_matches(matches)


#%% Read the input data

# Load the spreadsheet named by species_by_dataset_file (defined outside this
# section) into a DataFrame.
df = pd.read_excel(species_by_dataset_file)


#%% Run all our taxonomic lookups

# i_row = 0; row = df.iloc[i_row]
# query = 'lion'

output_rows = []
for i_row, row in df.iterrows():
示例#2
0
def _is_chordate(taxonomy) -> bool:
    """
    Return True if any level of [taxonomy] is the phylum 'chordata'.

    Each level is indexed as (taxonID, taxonLevel, scientific, [list of common]).
    """

    for taxonomy_level in taxonomy:
        taxon_rank = taxonomy_level[1]
        taxon_name = taxonomy_level[2]
        if taxon_rank == 'phylum' and taxon_name == 'chordata':
            return True
    return False


def _choose_taxonomy(source_matches):
    """
    Pick the taxonomy to use from a non-empty list of matches that all came
    from the same source.

    Prefer chordates... most of the names that aren't what we want are
    esoteric insects, like a moth called "cheetah".  If we can't find a
    chordate (or there is only one match), just take the first match.
    """

    if len(source_matches) > 1:
        for candidate in source_matches:
            if _is_chordate(candidate['taxonomy']):
                return candidate['taxonomy']
    return source_matches[0]['taxonomy']


def _choose_common_name(common_names, query: str) -> str:
    """
    Pick a single common name: '' if there are none, the query itself if it
    is one of the common names, otherwise the first common name.
    """

    if len(common_names) == 0:
        return ''
    if query in common_names:
        # Default to returning the query
        return query
    return common_names[0]


def get_preferred_taxonomic_match(query: str) -> TaxonomicMatch:
    """
    Wrapper for species_lookup.py, but expressing a variety of heuristics and
    preferences that are specific to our scenario.

    Looks up [query] with get_taxonomic_info() and reduces the results to a
    single match:

    * iNat matches are preferred over GBIF matches.
    * Within a source, the first match whose taxonomy includes the phylum
      'chordata' is preferred; otherwise the first match is used.
    * The common name is the query itself when the query is one of the
      taxon's common names, otherwise the first common name, otherwise ''.

    Returns a TaxonomicMatch built from (scientific_name, common_name,
    taxonomic_level, source, taxonomy_string, match), where [match] is the
    selected taxonomy and [taxonomy_string] is str(match).  If neither source
    produced a match, every field is the empty string.

    Raises AssertionError if the selected taxon has an empty scientific name.
    """

    # query = 'person'
    matches = get_taxonomic_info(query)

    inat_matches = [m for m in matches if m['source'] == 'inat']
    gbif_matches = [m for m in matches if m['source'] == 'gbif']

    # print_taxonomy_matches(inat_matches, verbose=True)
    # print_taxonomy_matches(gbif_matches, verbose=True)

    # Prefer iNat matches; they're considerably less quirky.  If we didn't
    # match to iNat, try GBIF.
    #
    # Both sources currently go through the same selection logic below; if
    # subtleties emerge in how we handle GBIF vs. iNat, branch on [source].
    if len(inat_matches) > 0:
        source = 'inat'
        source_matches = inat_matches
    elif len(gbif_matches) > 0:
        source = 'gbif'
        source_matches = gbif_matches
    else:
        # No match in either taxonomy: every field is empty
        return TaxonomicMatch('', '', '', '', '', '')

    taxonomy = _choose_taxonomy(source_matches)

    # This is (taxonID, taxonLevel, scientific, [list of common])
    lowest_level = taxonomy[0]
    taxonomic_level = lowest_level[1]
    scientific_name = lowest_level[2]
    assert len(scientific_name) > 0

    common_name = _choose_common_name(lowest_level[3], query)

    # print(f'Matched {source} {query} to {scientific_name},{common_name}')

    taxonomy_string = str(taxonomy)

    return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
                          taxonomy_string, taxonomy)
示例#3
0
    return TaxonomicMatch(scientific_name, common_name, taxonomic_level,
                          source, taxonomy_string, match)


# ...def get_preferred_taxonomic_match()

#%% Initialization

# Runs once at module level, before any of the lookups below.
# NOTE(review): presumably this loads the taxonomy tables that
# get_taxonomic_info() queries; its definition is not in this section - confirm.
initialize_taxonomy_lookup()

#%% Test single-query lookup

# Scratch cells for interactive use: the `if False:` guard keeps them from
# executing when the file is run top to bottom; the inner `#%%` markers let
# an IDE run each cell by hand.
if False:
    #%%
    # Print every raw match for one query
    matches = get_taxonomic_info('equus quagga')
    print_taxonomy_matches(matches)
    #%%
    # Resolve one query to a single preferred match and inspect it
    q = 'equus quagga'
    # q = "grevy's zebra"
    # NOTE(review): presumably read as a global by
    # get_preferred_taxonomic_match(), whose body is not visible in this
    # section - confirm.
    taxonomy_preference = 'gbif'
    m = get_preferred_taxonomic_match(q)
    print(m.source)
    print(m.taxonomy_string)
    # Imported here rather than at file level, so the clipboard package is
    # only needed when this cell is run by hand.
    import clipboard
    clipboard.copy(m.taxonomy_string)

#%% Read the input data

# Load the spreadsheet named by species_by_dataset_file (defined outside this
# section) into a DataFrame.
df = pd.read_excel(species_by_dataset_file)