Example No. 1
def update(repos, verbose=True):
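    # Fill in missing countries__ids and ecoregions__ids on each Distribution
    # record, using the GBIF occurrence dumps stored under external/gbif/
    # (one JSON file per record id).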
    ecoregions = [(er['properties']['eco_code'], shape(er['geometry']))
                  for er in jsonlib.load(
                      data_file('ecoregions.json', repos=repos))['features']
                  if er['geometry']
                  and er['properties']['eco_code'] not in INVALID_ECO_CODES]

    with CsvData('distribution', repos=repos) as data:
        res = {i.id: i for i in data.items}

        occurrence_data = list(
            data_file('external', 'gbif', repos=repos).glob('*.json'))
        if verbose:  # pragma: no cover
            occurrence_data = tqdm(occurrence_data)
        for fname in occurrence_data:
            sid = fname.stem
            d = res.get(sid, Distribution(sid, '', ''))
            if not d.countries__ids or not d.ecoregions__ids:
                occurrences = jsonlib.load(fname).get('results', [])
                if not d.ecoregions__ids:
                    d.ecoregions__ids = list(match(occurrences, ecoregions))
                if not d.countries__ids:
                    d.countries__ids = list(
                        r.get('countryCode') for r in occurrences)
            res[sid] = d
            data.items = [res[key] for key in sorted(res.keys())]
Example No. 2
def upload_images(args):
    """
    tsammalex upload_images path/to/cdstar/catalog
    """

    images_path = data_file('images.csv', repos=args.tsammalex_data)
    staged_images_path = data_file('staged_images.csv',
                                   repos=args.tsammalex_data)
    checksums = set(
        d.id for d in models.CsvData('images', repos=args.tsammalex_data))
    providers = [prov(args.tsammalex_data) for prov in PROVIDERS]
    with MediaCatalog('cdstar.json',
                      repos=args.tsammalex_data,
                      json_opts=dict(indent=4)) as mcat:
        with Catalog(args.args[0],
                     cdstar_url=os.environ['CDSTAR_URL'],
                     cdstar_user=os.environ['CDSTAR_USER'],
                     cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
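            # Hand each staged image to the first provider that recognizes it;
            # successfully retrieved images are appended to images.csv and the
            # corresponding row is removed from staged_images.csv.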
            for item in models.CsvData('staged_images',
                                       repos=args.tsammalex_data):
                for provider in providers:
                    if item in provider:
                        img = provider.retrieve(item, cat, checksums, mcat)
                        if img:
                            try:
                                add_rows(images_path, img.csv_row())
                            except:
                                print(img)
                                raise
                            filter_rows(staged_images_path,
                                        lambda d: d['id'] != item.id)
                        break
Example No. 3
def upload_images(args):
    """
    tsammalex upload_images path/to/cdstar/catalog
    """
    images_path = data_file('images.csv', repos=args.tsammalex_data)
    staged_images_path = data_file('staged_images.csv', repos=args.tsammalex_data)
    checksums = set(d.id for d in models.CsvData('images', repos=args.tsammalex_data))
    providers = [prov(args.tsammalex_data) for prov in PROVIDERS]
    with MediaCatalog(
            'cdstar.json', repos=args.tsammalex_data, json_opts=dict(indent=4)) as mcat:
        with Catalog(
                args.args[0],
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for item in models.CsvData('staged_images', repos=args.tsammalex_data):
                for provider in providers:
                    if item in provider:
                        img = provider.retrieve(item, cat, checksums, mcat)
                        if img:
                            try:
                                add_rows(images_path, img.csv_row())
                            except:
                                print(img)
                                raise
                            filter_rows(staged_images_path, lambda d: d['id'] != item.id)
                        break
Example No. 4
def test():
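    # Integrity checks: every id referenced from the CSV data must resolve to
    # an existing row, ecoregion, country, or bibliography entry.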
    if not REPOS.exists():
        return
    data = {
        n: OrderedDict([(item.id, item)
                        for item in models.CsvData(n, on_error=error)])
        for n in CSV
    }
    data['ecoregions'] = {}
    for ecoregion in jsonlib.load(data_file('ecoregions.json'))['features']:
        data['ecoregions'][ecoregion['properties']['eco_code']] = ecoregion

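    # Collect the citation keys from sources.bib; refs__ids values are
    # validated against this set below.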
    data['refs'] = {}
    with data_file('sources.bib').open(encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                data['refs'][match.group('id')] = 1

    data['countries'] = {country.alpha2: country for country in countries}

    for name in ['names', 'taxa']:
        for line, item in enumerate(data[name].values()):
            for ref in item.refs__ids:
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):  # pragma: no cover
                        error('invalid reference %s' % (ref, ), name, line + 2)
                else:
                    source_id = ref
                if source_id not in data['refs']:  # pragma: no cover
                    error('invalid id referenced: %s' % (source_id, ), name,
                          line + 2)

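    # Generic check: every *__id/*__ids column must point to an existing row
    # in the corresponding table.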
    for name, model in [(n, getattr(models, n.capitalize())) for n in CSV]:
        for line, item in enumerate(data[name].values()):
            for col in [f.name for f in attr.fields(model)]:
                if '__' in col:
                    ref, cardinality = col.split('__', 1)
                    #if ref not in data:
                    #    continue
                    ids = getattr(item, col)
                    if cardinality == 'id':
                        assert not isinstance(ids, list)
                        ids = [ids]
                    for v in ids:
                        if ref not in data:
                            raise ValueError(ref)  # pragma: no cover
                        if ref == 'refs' and '[' in v:
                            v = v.split('[')[0]
                        if v not in data[ref]:  # pragma: no cover
                            error('invalid %s id referenced: %s' % (ref, v),
                                  name, line + 2)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
Example No. 5
def test():
    if not REPOS.exists():
        return
    data = {
        n: OrderedDict([(item.id, item) for item in models.CsvData(n, on_error=error)])
        for n in CSV}
    data['ecoregions'] = {}
    for ecoregion in jsonlib.load(data_file('ecoregions.json'))['features']:
        data['ecoregions'][ecoregion['properties']['eco_code']] = ecoregion

    data['refs'] = {}
    with data_file('sources.bib').open(encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                data['refs'][match.group('id')] = 1

    data['countries'] = {country.alpha2: country for country in countries}

    for name in ['names', 'taxa']:
        for line, item in enumerate(data[name].values()):
            for ref in item.refs__ids:
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):  # pragma: no cover
                        error('invalid reference %s' % (ref,), name, line + 2)
                else:
                    source_id = ref
                if source_id not in data['refs']:  # pragma: no cover
                    error('invalid id referenced: %s' % (source_id,), name, line + 2)

    for name, model in [(n, getattr(models, n.capitalize())) for n in CSV]:
        for line, item in enumerate(data[name].values()):
            for col in [f.name for f in attr.fields(model)]:
                if '__' in col:
                    ref, cardinality = col.split('__', 1)
                    #if ref not in data:
                    #    continue
                    ids = getattr(item, col)
                    if cardinality == 'id':
                        assert not isinstance(ids, list)
                        ids = [ids]
                    for v in ids:
                        if ref not in data:
                            raise ValueError(ref)  # pragma: no cover
                        if ref == 'refs' and '[' in v:
                            v = v.split('[')[0]
                        if v not in data[ref]:  # pragma: no cover
                            error(
                                'invalid %s id referenced: %s' % (ref, v), name, line + 2)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
Example No. 6
    def test_JsonData(self):
        from pytsammalex.util import JsonData, data_file

        tmpdir = create_repos(self.tmp_path())
        with JsonData('test.json', repos=tmpdir) as jdat:
            jdat['a'] = 1
        self.assertTrue(data_file('test.json', repos=tmpdir).exists())
        with JsonData('test.json', repos=tmpdir) as jdat:
            self.assertEqual(len(jdat), 1)
            self.assertEqual(jdat['a'], 1)
Example No. 7
def test_json_data(tmpdir):
    tmp_ = create_repos(tmpdir)
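    # Round-trip: writing through the JsonData context manager should create
    # test.json in the repos data directory and persist the stored value.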

    with JsonData('test.json', repos=Path(tmp_)) as jdat:
        jdat['a'] = 1

    assert (data_file('test.json', repos=Path(tmp_)).exists() is True)

    with JsonData('test.json', repos=Path(tmp_)) as jdat:
        assert (len(jdat) == 1)
        assert (jdat['a'] == 1)
Example No. 8
    def cached_metadata(self, sid, id=None, name=None, refresh=False):
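        # If a per-provider cache directory exists under external/, metadata
        # is cached as one JSON file per item; otherwise fall back to the
        # in-memory self.items cache.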
        if data_file('external', self.name, repos=self.repos).is_dir():
            fname = data_file('external', self.name, sid + '.json', repos=self.repos)
            if not fname.exists() or refresh:
                try:
                    data = self.metadata(id or self.identify(name))
                except:  # pragma: no cover
                    data = None
                if not data:
                    return  # pragma: no cover
                jsonlib.dump(data, fname)
                return data
            return jsonlib.load(fname)

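        # No cache directory: keep (and refresh) the metadata in memory only.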
        if sid not in self.items or refresh:
            try:
                self.items[sid] = self.metadata(id or self.identify(name))
            except:
                return
        return self.items[sid]
Example No. 9
def update(repos, log):
    ecoregions = [
        (er['properties']['eco_code'], shape(er['geometry']))
        for er in jsonlib.load(data_file('ecoregions.json', repos=repos))['features']
        if er['geometry'] and er['properties']['eco_code'] not in INVALID_ECO_CODES]

    with CsvData('distribution', repos=repos) as data:
        res = {i.id: i for i in data.items}

        occurrence_data = list(data_file('external', 'gbif', repos=repos).glob('*.json'))
        for fname in tqdm(occurrence_data):
            sid = fname.stem
            d = res.get(sid, Distribution(sid, '', ''))
            if not d.countries__ids or not d.ecoregions__ids:
                occurrences = jsonlib.load(fname).get('results', [])
                if not d.ecoregions__ids:
                    d.ecoregions__ids = list(match(occurrences, ecoregions, log))
                if not d.countries__ids:
                    d.countries__ids = list(r.get('countryCode') for r in occurrences)
            res[sid] = d
            data.items = [res[key] for key in sorted(res.keys())]
Example No. 10
    def cached_metadata(self, sid, id=None, name=None, refresh=False):
        if data_file('external', self.name, repos=self.repos).is_dir():
            fname = data_file('external',
                              self.name,
                              sid + '.json',
                              repos=self.repos)
            if not fname.exists() or refresh:
                try:
                    data = self.metadata(id or self.identify(name))
                except:  # pragma: no cover
                    data = None
                if not data:
                    return  # pragma: no cover
                jsonlib.dump(data, fname)
                return data
            return jsonlib.load(fname)

        if sid not in self.items or refresh:
            try:
                self.items[sid] = self.metadata(id or self.identify(name))
            except:
                return
        return self.items[sid]
Example No. 11
    def identify(self, item):
        p = data_file('staged_images', item.id, repos=self.repos)
        if p.exists():
            return p