Example no. 1
0
def rewrite_records(infname, outfname):
    """Rewrite a BISON CSV file, dropping the 'taxonKey' column and any
    expected BISON output fields that are absent from the input header.

    Args:
        infname: input CSV filename (BISON-delimited).
        outfname: output CSV filename to write the filtered records to.
    """
    self = BisonFiller(infname)

    drdr, inf = get_csv_dict_reader(infname, BISON_DELIMITER, ENCODING)
    self._files.append(inf)

    # Keep only the ordered output fields actually present in the input.
    absent_flds = [fld for fld in self._bison_ordered_flds
                   if fld not in drdr.fieldnames]
    for fld in absent_flds:
        self._bison_ordered_flds.remove(fld)

    dwtr, outf = get_csv_dict_writer(outfname, BISON_DELIMITER, ENCODING,
                                     self._bison_ordered_flds)

    dwtr.writeheader()
    self._files.append(outf)

    recno = 0
    for rec in drdr:
        # Drop 'taxonKey' if present; a record without it is not an error
        # (the original bare pop raised KeyError on such records).
        rec.pop('taxonKey', None)
        dwtr.writerow(rec)
        recno += 1
        print(recno)
Example no. 2
0
    def _pull_resource_from_db_or_records(self, fname, legacy_ident,
                                          db_resource_name, db_resource_url):
        """Resolve resource id/name/url for a dataset, preferring a record
        with a standardized URL over the database values.

        Args:
            fname: CSV filename containing the dataset records.
            legacy_ident: legacy identifier, the default resource id.
            db_resource_name: resource name stored in the database.
            db_resource_url: resource URL stored in the database.

        Returns:
            tuple of (new_resource_id, res_name, res_url).
        """
        # Default - standardize db values
        new_resource_id = legacy_ident
        res_name = db_resource_name
        res_url = db_resource_url
        db_std_res_url, db_new_resource_id = self._parse_bison_url(
            db_resource_url)
        if db_std_res_url is not None:
            self.loginfo('Dataset {} has good URL in db {} '.format(
                legacy_ident, db_resource_url))
            new_resource_id = db_new_resource_id
            res_url = db_resource_url
        else:
            self.loginfo('Dataset {} has non-standard URL in db {} '.format(
                legacy_ident, db_resource_url))

        # Pre-bind so the comparisons after the loop are safe even when the
        # file yields no data records (original code raised NameError there).
        rec_std_res_url = None
        rec_new_resource_id = None
        rec_resource_url = None

        # Check standardize record values, override db with record values
        inf = None
        try:
            dict_reader, inf = get_csv_dict_reader(fname, BISON_DELIMITER,
                                                   ENCODING)
            # Skip the first record (presumably an embedded header row --
            # TODO confirm).  Default avoids StopIteration on empty files.
            # NOTE(review): the original consumed a SECOND record here and
            # immediately overwrote it in the loop, so one record was never
            # examined; that extra next() is removed.
            _ = next(dict_reader, None)
            for rec in dict_reader:
                rec_resource_url = rec['resource_url']
                rec_std_res_url, rec_new_resource_id = self._parse_bison_url(
                    rec_resource_url)
                if rec_std_res_url is not None:
                    self.loginfo('Dataset {} has good URL in rec {} '.format(
                        legacy_ident, rec_resource_url))
                    new_resource_id = rec_new_resource_id
                    res_url = rec_std_res_url
                    res_name = rec['resource']
                    break
                else:
                    self.loginfo(
                        'Dataset {} has non-standard URL in rec {} '.format(
                            legacy_ident, rec_resource_url))
        finally:
            # Close only a successfully-opened handle; the original raised
            # NameError here when get_csv_dict_reader itself failed.
            if inf is not None:
                inf.close()

        if db_std_res_url is not None and rec_std_res_url is not None:
            if db_std_res_url != rec_std_res_url:
                self.loginfo(
                    'Dataset {} has mismatched URL in db {} and rec {}'.format(
                        legacy_ident, db_resource_url, rec_resource_url))

        if db_new_resource_id != rec_new_resource_id:
            self.loginfo(
                'Dataset {} has mismatched id in db {} and rec {}'.format(
                    legacy_ident, db_new_resource_id, rec_new_resource_id))

        return new_resource_id, res_name, res_url
Example no. 3
0
    def read_lookup(self, fname, prioritized_keyfld_lst, delimiter, ignore_quotes=True):
        '''
        @summary: Read and populate dictionary with key = uuid and
                  val = dictionary of record values
        @param fname: lookup CSV filename; silently does nothing if absent
        @param prioritized_keyfld_lst: fieldnames to try, in priority order,
               as the lookup key for each record (DICT valtype only)
        @param delimiter: CSV field delimiter
        @param ignore_quotes: passed through to the CSV reader
        '''
        no_old_legacy = 0
        no_new_legacy = 0
        if not os.path.exists(fname):
            # Original behavior: missing file is a silent no-op.
            return

        if self.valtype == VAL_TYPE.DICT:
            inf = None
            try:
                rdr, inf = get_csv_dict_reader(
                    fname, delimiter, self.encoding,
                    ignore_quotes=ignore_quotes)
            except Exception as e:
                print('Failed reading data in {}: {}'
                                .format(fname, e))
            else:
                for data in rdr:
                    # Pre-bind so the fallback print is safe even for an
                    # empty key list (original raised NameError).
                    datakey = None
                    keyfld = None
                    for keyfld in prioritized_keyfld_lst:
                        datakey = data[keyfld]
                        if datakey:
                            self.lut[datakey] = data
                            break
                    if not datakey:
                        print('No {} for record {}'.format(keyfld, data))
            finally:
                # Close only a successfully-opened handle; the original
                # raised NameError here when the open failed.
                if inf is not None:
                    inf.close()
            print('no_old_legacy {}  no_new_legacy (default -9999) {}'
                  .format(no_old_legacy, no_new_legacy))

        elif self.valtype == VAL_TYPE.SET:
            recno = 0
            inf = None
            try:
                rdr, inf = get_csv_reader(fname, delimiter, self.encoding)
                # get header
                line, recno = getLine(rdr, recno)
                # read lookup vals into dictionary
                while (line is not None):
                    line, recno = getLine(rdr, recno)
                    if line and len(line) > 0:
                        try:
                            # First item is scientificName, rest are taxonKeys
                            self.lut[line[0]] = set(line[1:])
                        except Exception:
                            print('Failed to parse line {} {}'
                                           .format(recno, line))
            except Exception as e:
                print('Failed reading data in {}: {}'
                                .format(fname, e))
            finally:
                # Same guard as above: open may have failed before binding inf.
                if inf is not None:
                    inf.close()
Example no. 4
0
def read_some_points(infname, count):
    """Read up to `count` (lon, lat) points from a BISON CSV file.

    Args:
        infname: input CSV filename (BISON-delimited).
        count: maximum number of records to examine.

    Returns:
        list of (lon, lat) tuples for records with a non-None longitude.
    """
    recno = 0
    points = []
    inf = None
    try:
        drdr, inf = get_csv_dict_reader(infname, BISON_DELIMITER, ENCODING)
        for rec in drdr:
            # `>=` stops after exactly `count` records; the original `>`
            # off-by-one processed count + 1 records.
            if recno >= count:
                break
            recno += 1
            lon, lat = get_coords(rec)
            if lon is not None:
                points.append((lon, lat))
    except Exception as e:
        print('Failed reading data from record {}: {}'.format(recno, e))
    finally:
        # Close only a successfully-opened handle; the original raised
        # NameError here when get_csv_dict_reader failed.
        if inf is not None:
            inf.close()
    return points
Example no. 5
0
    def fix_bison_data(self, infile, outfile, resource_key, resource_pvals):
        """Rewrite a BISON CSV file with updated resource/provider values.

        Args:
            infile: existing BISON CSV input filename.
            outfile: output filename for the rewritten records.
            resource_key: ticket/resource identifier, used in messages.
            resource_pvals: dict with 'action', 'resource_id',
                'resource_name' and 'resource_url' for this resource.

        Raises:
            Exception: if infile does not exist or resource_name is empty.
        """
        if not os.path.exists(infile):
            raise Exception('File {} does not exist'.format(infile))

        action = resource_pvals['action']
        new_res_id = resource_pvals['resource_id']
        const_res_name = resource_pvals['resource_name']
        const_res_url = resource_pvals['resource_url']
        if not const_res_name:
            # Original format string had 2 placeholders for 3 arguments and
            # silently dropped const_res_name.
            raise Exception('{} with id {} must have resource_name, got {!r}'.format(
                resource_key, new_res_id, const_res_name))

        if action in PROVIDER_ACTIONS:
            # Step 1: rewrite with updated resource/provider values
            self.loginfo("""{} for ticket {},
                infile {} to outfile {}
                with name {}, id {}""".format(action, resource_key, infile,
                                              outfile, const_res_name,
                                              new_res_id))

            dl_fields = list(BISON2020_FIELD_DEF.keys())
            inf = None
            outf = None
            try:
                # Open incomplete BISON CSV file as input
                dict_reader, inf = get_csv_dict_reader(infile, BISON_DELIMITER,
                                                       ENCODING)
                # NOTE(review): DictReader has already consumed the header
                # line, so this is the FIRST DATA RECORD (a dict), and
                # csv.writer.writerow(dict) emits its keys -- preserved
                # as-is, but confirm this is the intended header handling.
                header = next(dict_reader)
                csv_writer, outf = get_csv_writer(outfile, BISON_DELIMITER,
                                                  ENCODING)
                csv_writer.writerow(header)
                recno = 0
                for rec in dict_reader:
                    recno += 1
                    self._remove_internal_delimiters(rec)

                    row = makerow(rec, dl_fields)
                    csv_writer.writerow(row)
            finally:
                # Close only successfully-opened handles; the original raised
                # NameError on outf when get_csv_writer failed.
                if inf is not None:
                    inf.close()
                if outf is not None:
                    outf.close()
        else:
            self.loginfo('Unknown action {} for input {}, ({})'.format(
                action, const_res_name, resource_key))
Example no. 6
0
 def get_organization_uuids(self, dset_lut_fname):
     """
     @summary: Get organization UUIDs from dataset metadata pulled from GBIF
               and written to the dset_lut_fname file
     @param dset_lut_fname: dataset lookup table filename with absolute path
     @return: set of organization UUID strings (may be empty on read failure)
     """
     org_uuids = set()
     inf = None
     try:
         rdr, inf = get_csv_dict_reader(dset_lut_fname, BISON_DELIMITER,
                                        ENCODING)
         for dset_data in rdr:
             orgUUID = dset_data['gbif_publishingOrganizationKey']
             org_uuids.add(orgUUID)
     except Exception as e:
         print('Failed read {} ({})'.format(dset_lut_fname, e))
     finally:
         # Close only a successfully-opened handle; the original raised
         # NameError here when get_csv_dict_reader failed, masking the
         # real error printed above.
         if inf is not None:
             inf.close()
     self._log.info(
         'Read {} unique organiziation UUIDs from datasets in {}'.format(
             len(org_uuids), dset_lut_fname))
     return org_uuids
Example no. 7
0
    def test_bison_data(self, infile, resource_key, resource_pvals):
        """Scan a BISON CSV file for problem records without rewriting it.

        Args:
            infile: existing BISON CSV input filename.
            resource_key: ticket/resource identifier, used in messages.
            resource_pvals: dict with 'action', 'resource_id' and
                'resource_name' for this resource.

        Raises:
            Exception: if infile does not exist or resource_name is empty.
        """
        if not os.path.exists(infile):
            raise Exception('File {} does not exist'.format(infile))

        action = resource_pvals['action']
        new_res_id = resource_pvals['resource_id']
        const_res_name = resource_pvals['resource_name']
        if not const_res_name:
            # Original format string had 2 placeholders for 3 arguments and
            # silently dropped const_res_name.
            raise Exception('{} with id {} must have resource_name, got {!r}'.format(
                resource_key, new_res_id, const_res_name))

        if action in PROVIDER_ACTIONS:
            # Original message had 4 placeholders for 5 arguments, so the
            # 'ticket' slot displayed the action and new_res_id was dropped.
            self.loginfo(
                '{} ticket {}, infile {} with name {}, id {}'.format(
                    action, resource_key, infile, const_res_name, new_res_id))

            # Bind counters before the try so the summary print in `finally`
            # cannot raise NameError when the open fails.
            recno = 0
            probrecs = 0
            inf = None
            try:
                # Open incomplete BISON CSV file as input
                dict_reader, inf = get_csv_dict_reader(infile, BISON_DELIMITER,
                                                       ENCODING)
                # First data record is skipped, as in the original.
                header = next(dict_reader)
                # NOTE(review): probrecs is reported below but never
                # incremented here -- confirm what qualifies as a problem
                # record.
                for rec in dict_reader:
                    recno += 1
                    self._remove_internal_delimiters(rec)
            finally:
                # Close only a successfully-opened handle.
                if inf is not None:
                    inf.close()
                print('Found {} problem records out of {} total'.format(
                    probrecs, recno))
        else:
            self.loginfo('Unknown action {} for input {}, ({})'.format(
                action, const_res_name, resource_key))