def main():
    """Update DATA_FILE (distribution.csv) from cached GBIF occurrence dumps.

    Existing rows are kept; only missing ecoregion (column 1) and country
    (column 2) cells are filled by matching the cached occurrence records
    against the ecoregion shapes.
    """
    res = {}
    # Read the existing distribution data: species id -> remaining columns.
    with open(DATA_FILE, encoding='utf8') as fp:
        for line in fp.read().split('\n'):
            if line:
                cols = line.split(',')
                res[cols[0]] = cols[1:]
    # Build (eco_code, shape) pairs once, outside the per-species loop;
    # skip features without geometry and known-bad eco codes.
    ecoregions = [
        (er['properties']['eco_code'], shape(er['geometry']))
        for er in jsonload(data_file('ecoregions.json'))['features']
        if er['geometry']
        and er['properties']['eco_code'] not in INVALID_ECO_CODES]
    for fname in os.listdir(data_file('external', 'gbif')):
        sid = fname.split('.')[0]
        v = res.get(sid, ['', ''])
        # Robustness fix: a malformed row may have fewer than two value
        # columns (the original only handled len(v) == 1, so len(v) == 0
        # raised IndexError below). Pad up to two entries.
        while len(v) < 2:
            v.append('')
        if not v[0] or not v[1]:
            occurrences = jsonload(
                data_file('external', 'gbif', fname)).get('results', [])
            if not v[0]:
                v[0] = format_ids(match(occurrences, ecoregions))
            if not v[1]:
                v[1] = format_ids(r.get('countryCode') for r in occurrences)
        res[sid] = v
    # Rewrite the whole file, sorted by species id, with CRLF line endings
    # (matching the format the reader above tolerates).
    with open(DATA_FILE, 'w', encoding='utf8') as fp:
        for key in sorted(res.keys()):
            fp.write('%s,%s\r\n' % (key, ','.join(res[key])))
def check(p):
    """Delete local images already registered with an Edmond source URL.

    Scans cn/<p> for items whose source_url mentions 'edmond', removes the
    matching files from cn/images, and prints how many were removed.
    """
    count = 0
    # Set instead of list: membership is tested once per file below.
    existing = {
        item['id'] for item in csv_items('cn/' + p)
        if 'edmond' in item['source_url']}
    # Avoid shadowing the builtin `id`.
    for img_id, fname in [(n.split('.')[0], n)
                          for n in os.listdir(data_file('cn/images'))]:
        if img_id in existing:
            count += 1
            os.remove(data_file('cn', 'images', fname))
    print(count)
def check(p):
    """Remove images from cn/images that cn/<p> already lists with an
    Edmond source_url; print the number of files removed."""
    removed = 0
    edmond_ids = [row['id'] for row in csv_items('cn/' + p)
                  if 'edmond' in row['source_url']]
    for name in os.listdir(data_file('cn/images')):
        stem = name.split('.')[0]
        if stem in edmond_ids:
            removed += 1
            os.remove(data_file('cn', 'images', name))
    print(removed)
def test():
    """Run referential-integrity checks across the project's CSV files.

    Collects every known id (per CSV file, ecoregions, bibtex sources,
    ISO country codes) and verifies that every ``<ref>__<card>`` column
    only references existing ids.

    Raises:
        ValueError: if any check reported an error (module flag SUCCESS).
    """
    data = {n: read_csv(n) for n in CSV}
    ids = {n: {r[1]['id'] for r in rows} for n, rows in data.items()}
    ids['ecoregions'] = set()
    for ecoregion in jsonload(data_file('ecoregions.json'))['features']:
        ids['ecoregions'].add(ecoregion['properties']['eco_code'])
    ids['sources'] = set()
    with io.open(data_file('sources.bib'), encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                ids['sources'].add(match.group('id'))
    ids['countries'] = set([country.alpha2 for country in countries])

    def check_ref(name, line, item):
        # A reference is "<source_id>" optionally followed by "[<pages>]".
        for ref in item['refs__ids'].split(';'):
            if ref:
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):
                        error('invalid reference %s' % (ref,), name, line)
                else:
                    source_id = ref
                if source_id not in ids['sources']:
                    error('invalid sources id referenced: %s' % (source_id,),
                          name, line)

    for name in ['names', 'taxa']:
        for line, item in data[name]:
            check_ref(name, line, item)
    for name, items in data.items():
        for line, item in items:
            for col in item.keys():
                # Consistency fix: guard against None/empty column names
                # (csv.DictReader can yield None for overlong rows), as the
                # sibling version of this check already does.
                if col and '__' in col:
                    ref, card = col.split('__', 1)
                    if ref not in ids:
                        continue
                    for v in split_ids(item[col]):
                        if v not in ids[ref]:
                            error('invalid %s id referenced: %s' % (ref, v),
                                  name, line)
    if not SUCCESS:
        raise ValueError('integrity checks failed!')
def update_taxa():
    """CLI entry point: refresh supplemental taxa data, then distribution.

    Unless --distribution-only is given, taxa.json is synchronized with
    taxa.csv and enriched via Catalogue Of Life, GBIF and EOL. In all
    cases the distribution data is rebuilt by calling main().
    """
    parser = argparse.ArgumentParser(description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional
information at GBIF, EOL and Catalogue Of Life.""")
    parser.add_argument("--distribution-only", action="store_true")
    args = parser.parse_args()
    if not args.distribution_only:
        fname = data_file('taxa.json')
        taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
        ids = set(spec['id'] for spec in taxa)
        # add stubs for new entries in taxa.csv:
        for i, item in enumerate(csv_items('taxa.csv')):
            if item['id'] not in ids:
                taxa.insert(i, item2spec(item))
        for cls in [CatalogueOfLife, GBIF, EOL]:
            with cls() as provider:
                for i, spec in enumerate(taxa):
                    if i % 500 == 0:
                        print(i)  # coarse progress indicator
                    provider.update_taxon(spec)
        jsondump(taxa, fname, indent=4)
    main()
def update_taxa():
    """Enrich taxa.json from external providers, then rebuild distribution.

    With --distribution-only, the enrichment step is skipped entirely and
    only main() runs.
    """
    parser = argparse.ArgumentParser(description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional
information at GBIF, EOL and Catalogue Of Life.""")
    parser.add_argument("--distribution-only", action="store_true")
    args = parser.parse_args()
    if not args.distribution_only:
        fname = data_file('taxa.json')
        taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
        known = set(spec['id'] for spec in taxa)
        # Insert stubs for taxa.csv rows not yet present in taxa.json,
        # keeping the CSV ordering.
        for pos, item in enumerate(csv_items('taxa.csv')):
            if item['id'] not in known:
                taxa.insert(pos, item2spec(item))
        for provider_cls in [CatalogueOfLife, GBIF, EOL]:
            with provider_cls() as provider:
                for pos, spec in enumerate(taxa):
                    if pos % 500 == 0:
                        print(pos)
                    provider.update_taxon(spec)
        jsondump(taxa, fname, indent=4)
    main()
def test():
    """Validate all cross-file id references; raise ValueError on failure."""
    data = {n: read_csv(n) for n in CSV}
    ids = {n: {r[1]['id'] for r in rows} for n, rows in data.items()}
    ids['ecoregions'] = set()
    for ecoregion in jsonload(data_file('ecoregions.json'))['features']:
        ids['ecoregions'].add(ecoregion['properties']['eco_code'])
    ids['sources'] = set()
    with io.open(data_file('sources.bib'), encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                ids['sources'].add(match.group('id'))
    ids['countries'] = set([country.alpha2 for country in countries])

    def check_ref(name, line, item):
        # Each entry is "<source_id>" optionally suffixed with "[<pages>]".
        for ref in item['refs__ids'].split(';'):
            if not ref:
                continue
            if '[' in ref:
                source_id, pages = ref.split('[', 1)
                if not pages.endswith(']'):
                    error('invalid reference %s' % (ref,), name, line)
            else:
                source_id = ref
            if source_id not in ids['sources']:
                error('invalid sources id referenced: %s' % (source_id,),
                      name, line)

    for name in ['names', 'taxa']:
        for line, item in data[name]:
            check_ref(name, line, item)
    for name, items in data.items():
        for line, item in items:
            for col in item.keys():
                # Skip None/empty column names and columns without the
                # "<ref>__<card>" naming scheme.
                if not (col and '__' in col):
                    continue
                ref, card = col.split('__', 1)
                if ref not in ids:
                    continue
                for v in split_ids(item[col]):
                    if v not in ids[ref]:
                        error('invalid %s id referenced: %s' % (ref, v),
                              name, line)
    if not SUCCESS:
        raise ValueError('integrity checks failed!')
def update(p):
    """Download images listed in cn/<p> and record them in cn/images.json.

    Progress is persisted to images.json even when an exception aborts the
    run, so a later invocation can resume where this one stopped.
    """
    data = jsonload(data_file('cn', 'images.json'), default={})
    # Bind both names before the try block: the original left `img` (and,
    # on the pre-loop path, `info`) unbound, so a failure inside csv_items
    # raised NameError in the handler and masked the real error.
    img = None
    info = None
    try:
        for img in csv_items('cn/' + p):
            key = '%s-%s' % (img['taxa__id'], img['tags'])
            if key in data:
                print('+++', img['id'] or img['source'], data[key]['source'])
                continue
            info = get_image_info(img)
            if info:
                data[key] = get_image(info, data_file('cn', 'images'))
    except BaseException:
        # Intentionally broad (explicit instead of a bare `except:`): dump
        # debugging state, persist partial progress, then re-raise.
        print('----->')
        print(img)
        if info:
            print(info)
        jsondump(data, data_file('cn', 'images.json'), indent=4)
        raise
    jsondump(data, data_file('cn', 'images.json'), indent=4)
def save_occurrences(sid, sname):
    """Fetch (or load cached) GBIF occurrence data for one species.

    Results are cached as external/gbif/<sid>.json; a corrupt cache file
    is deleted so it gets re-fetched next time.

    Returns:
        The occurrence data, or None when unavailable.
    """
    api = GBIF()
    out = data_file('external', 'gbif', '%s.json' % sid)
    if not os.path.exists(out):
        try:
            res = api.get_info(api.get_id(sname))
            jsondump(res, out)
            print('%s: %s occurrences'
                  % (sname, min([res['count'], res['limit']])))
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C/SystemExit still
            # propagate. API/network failure: we'll try again next time!
            res = None
    else:
        try:
            res = jsonload(out)
        except Exception:
            # Unreadable/corrupt cache file: remove it and report nothing.
            os.remove(out)
            res = None
    return res
def rewrite(p):
    """Rewrite the cn/<p> CSV using the metadata from cn/images.json."""
    visitor = JSON2CSV(data_file('cn', 'images.json'))
    visit('cn/' + p, visitor)
def select(p):
    """Stage cn/<p> as staged_images.csv, run the Selector over it, and
    print the resulting number of data lines."""
    shutil.copy(data_file('cn', p), data_file('cn', 'staged_images.csv'))
    visit('cn/staged_images.csv', Selector())
    # Fix: the original `open(...).read()` leaked the file handle and used
    # the locale-default encoding; use a context manager and explicit utf8
    # (consistent with the rest of the module).
    with open(data_file('cn', 'staged_images.csv'), encoding='utf8') as fp:
        # -1: presumably drops the empty element after the trailing
        # newline produced by split('\n') — TODO confirm.
        print(len(fp.read().split('\n')) - 1)
def __init__(self):
    """Index all records from taxa.json by their id for direct lookup."""
    records = jsonload(data_file('taxa.json'))
    self._data = {rec['id']: rec for rec in records}
def select(p):
    """Copy cn/<p> to the staging CSV, apply the Selector, and print how
    many lines remain."""
    shutil.copy(data_file('cn', p), data_file('cn', 'staged_images.csv'))
    visit('cn/staged_images.csv', Selector())
    # Fix: close the file deterministically and read it with an explicit
    # encoding instead of the leaked, locale-dependent `open(...).read()`.
    with open(data_file('cn', 'staged_images.csv'), encoding='utf8') as fp:
        content = fp.read()
    # -1: presumably excludes the empty split element after the trailing
    # newline — TODO confirm against the file's writer.
    print(len(content.split('\n')) - 1)
('order', item['order'].capitalize() or None), ('family', item['family'].capitalize() or None), ('genus', item['genus'].capitalize() or None), ('ecoregions', split_ids(item.get('ecoregions__ids', ''))), ('countries', split_ids(item.get('countries__ids', ''))), ('wikipedia_url', wikipedia_url(item.get('wikipedia_url', ''))), ('eol_id', None), ('gbif_id', None), ('catalogueoflife_id', None), ]: spec[k] = v return spec if __name__ == '__main__': fname = data_file('taxa.json') taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict) ids = set(spec['id'] for spec in taxa) # add stubs for new entries in taxa.csv: for i, item in enumerate(csv_items('taxa.csv')): if item['id'] not in ids: taxa.insert(i, item2spec(item)) for cls in [CatalogueOfLife, GBIF, EOL]: with cls() as provider: for i, spec in enumerate(taxa): if i % 500 == 0: print(i) provider.update_taxon(spec)
def __init__(self):
    """Load the image metadata and reset the per-run visitor state."""
    self.cols = {}
    # Opened in binary mode; json.load accepts byte input here.
    with open(data_file('images_md.json'), 'rb') as fp:
        self.md = json.load(fp)
    self.count = 0
from __future__ import print_function, unicode_literals import os from io import open from shapely.geometry import shape, Point from shapely.geos import PredicateError, TopologicalError from tsammalexdata.util import data_file, jsonload, unique INVALID_ECO_CODES = {'AA0803', 'Lake', 'AT1202', 'IM1303', 'AA0803'} DATA_FILE = data_file('distribution.csv') def format_ids(iterable): return ';'.join(unique(iterable)) def main(): res = {} with open(DATA_FILE, encoding='utf8') as fp: for line in fp.read().split('\n'): if line: cols = line.split(',') res[cols[0]] = cols[1:] ecoregions = [(er['properties']['eco_code'], shape(er['geometry'])) for er in jsonload(data_file('ecoregions.json'))['features'] if er['geometry'] and er['properties']['eco_code'] not in INVALID_ECO_CODES] for fname in os.listdir(data_file('external', 'gbif')):
def __init__(self):
    """Cache the Edmond URL mapping and initialize the visitor state."""
    # id -> URL info extracted from Edmond.xml via file_urls().
    self.edmond_urls = file_urls(data_file('Edmond.xml'))
    self.cols = {}
    self.count = 0
self.edmond_urls = file_urls(data_file('Edmond.xml')) self.cols = {} self.count = 0 def __call__(self, index, row): if index == 0: self.cols = {col: i for i, col in enumerate(row)} return row _id = row[self.cols['id']] if _id in self.edmond_urls: row[self.cols['source_url']] = self.edmond_urls[_id]['full'] self.count += 1 else: # # FIXME: check whether source_url is an Edmond image URL, if not, upload the # image to Edmond, insert the URL here! Depends on the imeji API being # available on Edmond. # print(_id, row) return row if __name__ == '__main__': with open(data_file('Edmond.xml'), 'w', encoding='utf8') as fp: fp.write(requests.get(URL).text) v = Visitor() visit(sys.argv[1] if len(sys.argv) > 1 else 'images.csv', v) print(v.count)