def normalize(self):
    dd = self.unpacked_filepath
    subdirs = ['{}_families'.format(i) for i in 'AGPB']
    dn = self.normalized_filedir
    assure_dir_exists(dn)
    stem = ''
    for s in subdirs:
        nsd = os.path.join(dn, s)
        assure_dir_exists(nsd)
        in_sd = os.path.join(dd, s)
        csvf = [i for i in os.listdir(in_sd) if i.endswith('.csv')]
        csvf.sort()
        if not csvf:
            continue
        for i in csvf:
            inp_fp = os.path.join(in_sd, i)
            stem = i[:-4]
            out_dir = os.path.join(nsd, stem)
            normalize_plantlist_file(inp_fp, out_dir, stem, maj_group_id=s[0])
            _LOG.info('csvf = {}'.format(stem))
    _LOG.info("dd = {} subdirs = {}".format(dd, subdirs))
def scrape_families_from_higher_group(out_dir, top_file):
    global _num_downloads_this_session
    dirname = os.path.split(top_file)[1] + '_families'
    fam_dir = os.path.join(out_dir, dirname)
    assure_dir_exists(fam_dir)
    top_content = io.open(top_file, 'rU', encoding='utf-8').read()
    soup = Soup(top_content, 'html.parser')
    nametree_list = soup.select("#nametree > li")
    _LOG.debug("will write to {}".format(dirname))
    for list_item in nametree_list:
        if _num_downloads_this_session != 0:
            m = "Sleeping for {} seconds to be polite to the server..."
            _LOG.debug(m.format(THROTTLE_BREAK))
            time.sleep(THROTTLE_BREAK)
        fam_link = list_item.select('a')
        assert len(fam_link) == 1
        fam_link = fam_link[0]
        fam_rel_url = fam_link['href']
        fam_name = fam_link.string.strip()
        fam_dest = os.path.join(fam_dir, fam_name + '.html')
        template = u'{}{}' if fam_rel_url.startswith('/') else u'{}/{}'
        fam_url = template.format(DOMAIN, fam_rel_url)
        if not os.path.exists(fam_dest):
            _LOG.debug(u"Starting download from url = {} to {}".format(fam_url, fam_dest))
            download_large_file(fam_url, fam_dest)
            _num_downloads_this_session += 1
            _LOG.debug(u"Download from {} to {} completed.".format(fam_url, fam_dest))
        download_csv_for_family(fam_dir, fam_dest, fam_url)
def write_taxon_json(obj, filepath):
    out_dir = os.path.split(filepath)[0]
    if out_dir:
        assure_dir_exists(out_dir)
    dtw = {}
    for k, v in obj.items():
        if isinstance(v, Taxon):
            dtw[k] = v.to_serializable_dict()
        else:
            dtw[k] = v
    write_as_json(dtw, filepath, separators=(',', ': '), indent=1)
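# A minimal usage sketch for write_taxon_json. The dict contents and the output
# path below are hypothetical; only the call signature comes from the function
# above. Values that are Taxon instances are serialized via
# to_serializable_dict(); everything else is passed through unchanged:
#
#     by_id = {'12345': some_taxon_obj, 'source': 'example-taxonomy'}
#     write_taxon_json(by_id, '/tmp/taxa/taxa.json')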
def copy_file_list_by_linking(unpacked_dirp, normalized_dirp, file_list):
    assure_dir_exists(normalized_dirp)
    for fn in file_list:
        ufp = os.path.join(unpacked_dirp, fn)
        if os.path.exists(ufp):
            dfp = os.path.join(normalized_dirp, fn)
            if os.path.exists(dfp):
                _LOG.info('File already exists at "{}". Skipping link creation.'.format(dfp))
            else:
                os.symlink(ufp, dfp)
def normalize_darwin_core_taxonomy(source, destination, res_wrapper):
    assure_dir_exists(destination)
    manifest_fp = os.path.join(source, 'meta.xml')
    manifest_root = ET.parse(manifest_fp).getroot()
    core_paths = []
    field2index = {}
    for el in manifest_root.findall('{http://rs.tdwg.org/dwc/text/}core'):
        for sub in el:
            if sub.tag.endswith('}id'):
                field2index['id'] = int(sub.attrib['index'])
            elif sub.tag.endswith('}field'):
                nns = os.path.split(sub.attrib['term'])[-1]
                field2index[nns] = int(sub.attrib['index'])
        for f in el.findall('{http://rs.tdwg.org/dwc/text/}files'):
            for loc in f.findall('{http://rs.tdwg.org/dwc/text/}location'):
                core_paths.append(loc.text.strip())
    if len(core_paths) != 1:
        raise ValueError('Expected exactly one core path in DwC meta file ("{}"); found: {}'.format(
            manifest_fp, core_paths))
    taxon_fn = core_paths[0]
    proj_out = os.path.join(destination, 'projection.tsv')
    if not os.path.exists(proj_out):
        proj_in = os.path.join(source, taxon_fn)
        write_gbif_projection_file(proj_in, proj_out, field2index)
    homemade = {
        'id': 0,
        'parentNameUsageID': 1,
        'acceptedNameUsageID': 2,
        'canonicalName': 3,
        'taxonRank': 4,
        'taxonomicStatus': 5,
        'nameAccordingTo': 6,
    }
    itd = InterimTaxonomyData()
    to_remove, to_ignore, paleos = read_gbif_projection(
        proj_out, itd, homemade,
        do_gbif_checks=isinstance(res_wrapper, GBIFWrapper))
    add_fake_root(itd)
    remove_if_tips(itd, to_remove)
    o_to_ignore = find_orphaned(itd)
    to_ignore.update(o_to_ignore)
    prune_ignored(itd, to_ignore)
    _LOG.info('writing {} paleodb ids'.format(len(paleos)))
    with open(os.path.join(destination, 'paleo.tsv'), 'w') as paleofile:
        for taxon_id in paleos:
            paleofile.write('{}\n'.format(taxon_id))
    res_wrapper.post_process_interim_tax_data(itd)
    itd.write_to_dir(destination)
def normalize_silva_taxonomy(source, destination, res_wrapper):
    assure_dir_exists(destination)
    depends_on = res_wrapper.depends_on
    taxalotl_config = res_wrapper.config
    expect_id_fp, ncbi_mapping_res = None, None
    for dep_id in depends_on:
        dep_res = taxalotl_config.get_terminalized_res_by_id(dep_id, 'normalize silva')
        if not dep_res.has_been_unpacked():
            unpack_resources(taxalotl_config, [dep_id])
        if dep_res.schema.lower() == 'id list':
            dep_fp = os.path.join(dep_res.unpacked_filepath, dep_res.local_filename)
            expect_id_fp = dep_fp
        elif dep_res.schema.lower() in {'silva taxmap', 'fasta silva taxmap'}:
            dep_fp = dep_res.normalized_filepath
            ncbi_mapping_res = dep_res
        else:
            raise ValueError('unrecognized dependency schema {}'.format(dep_res.schema))
        if not os.path.isfile(dep_fp):
            raise ValueError("Silva processing dependency not found at: {}".format(dep_fp))
    if expect_id_fp is None:
        raise ValueError('ID list dependency not found.')
    if ncbi_mapping_res is None:
        raise ValueError('NCBI mapping dependency not found.')
    expect_tax_fp = os.path.join(res_wrapper.unpacked_filepath, res_wrapper.local_filename)
    if not os.path.isfile(expect_tax_fp):
        raise ValueError("Silva taxon file not found at: {}".format(expect_tax_fp))
    acc_to_trim = ncbi_mapping_res.parse_acc_to_trim_from_ncbi()
    preferred = parse_silva_ids(expect_id_fp)
    itd = InterimTaxonomyData()
    part_name_to_silva_id = parse_silva_taxon_file(expect_tax_fp, preferred, acc_to_trim, itd)
    _LOG.info('{} taxonomy IDs read'.format(len(itd.to_par)))
    res_wrapper.post_process_interim_tax_data(itd)
    itd.write_to_dir(destination)
    mapping_file = os.path.join(destination, GEN_MAPPING_FILENAME)
    write_as_json(part_name_to_silva_id, mapping_file, indent=2, separators=(',', ': '))
def download(self):
    dd = self.unpacked_filepath
    assure_dir_exists(dd)
    _LOG.info("uf = {}".format(dd))
    top_files = []
    for u in self.url_list:
        pref, suff = os.path.split(u)
        if not suff:
            pref, suff = os.path.split(pref)
        _LOG.info("p = {} s = {}".format(pref, suff))
        assert suff
        dfp = os.path.join(dd, suff)
        top_files.append(dfp)
        if not os.path.exists(dfp):
            _LOG.debug("Starting download from {} to {}".format(u, dfp))
            download_large_file(u, dfp)
            _LOG.debug("Download from {} to {} completed.".format(u, dfp))
    for dfp in top_files:
        scrape_families_from_higher_group(dd, dfp)
    # Touch the download marker file to record that this step finished.
    open(self.download_filepath, 'w').close()
def unpack_archive(archive_fp, unpack_fp, archive_format, wrapper):
    afl = archive_format.lower()
    if afl in ['tar+gzip']:
        _LOG.debug("gunzip_and_untar from {} to {} ...".format(archive_fp, unpack_fp))
        gunzip_and_untar(archive_fp, unpack_fp)
        _LOG.debug("gunzip_and_untar from {} to {} done.".format(archive_fp, unpack_fp))
    elif afl == 'zip':
        _LOG.debug("unzip from {} to {} ...".format(archive_fp, unpack_fp))
        unzip(archive_fp, unpack_fp)
        _LOG.debug("unzip from {} to {} done.".format(archive_fp, unpack_fp))
    elif afl == 'gzip':
        afn = os.path.split(archive_fp)[-1]
        if archive_fp.endswith(".gz"):
            fn = afn[:-3]
        elif archive_fp.endswith(".gzip"):
            fn = afn[:-5]
        else:
            raise RuntimeError("Expecting gzipped archive to end with .gz or .gzip")
        assure_dir_exists(unpack_fp)
        if wrapper.local_filename:
            dest = os.path.join(unpack_fp, wrapper.local_filename)
        else:
            dest = os.path.join(unpack_fp, fn)
        _LOG.debug("gunzip from {} to {} ...".format(archive_fp, dest))
        gunzip(archive_fp, dest)
        _LOG.debug("gunzip from {} to {} done.".format(archive_fp, dest))
    elif afl == 'text':
        assure_dir_exists(unpack_fp)
        lfn = getattr(wrapper, 'local_filename', None)
        if lfn is None:
            raise RuntimeError("Resource must have a local_filename if its format is text")
        shutil.copyfile(archive_fp, os.path.join(unpack_fp, lfn))
    else:
        m = "Unpacking from {} format is not currently supported"
        raise NotImplementedError(m.format(archive_format))
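# Illustrative calls to unpack_archive. The file paths and the res_wrapper
# object below are hypothetical; the supported format strings come from the
# branches above. The wrapper only needs a local_filename attribute, which is
# required for 'text' and optional for 'gzip':
#
#     unpack_archive('raw/ncbi.tar.gz', 'unpacked/ncbi', 'tar+gzip', res_wrapper)
#     unpack_archive('raw/ids.txt.gz', 'unpacked/ids', 'gzip', res_wrapper)
#     unpack_archive('raw/notes.txt', 'unpacked/notes', 'text', res_wrapper)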
def write_to_dir(self, destination):
    # Write out in OTT form
    d = tempfile.mkdtemp()
    fn = ['taxonomy.tsv', 'synonyms.tsv', 'forwards.tsv', 'about.json', 'details.json']
    try:
        syn_order = self.write_ott_taxonomy_tsv(os.path.join(d, 'taxonomy.tsv'))
        write_ott_synonyms_tsv(os.path.join(d, 'synonyms.tsv'),
                               self.synonyms, syn_order, self.details_log)
        if self.forwards:
            write_ott_forwards(os.path.join(d, 'forwards.tsv'), self.forwards)
        about_fp = os.path.join(d, 'about.json')
        write_as_json(self.about, about_fp, indent=2)
        self.finalize()
        write_ncbi_details_json(os.path.join(d, 'details.json'), self.details_log)
    except:
        # On failure, remove any partially written files and the temp dir, then re-raise.
        for f in fn:
            tf = os.path.join(d, f)
            if os.path.exists(tf):
                try:
                    os.remove(tf)
                except:
                    pass
        try:
            os.rmdir(d)
        except:
            pass
        raise
    # Move the completed files into the destination directory and remove the temp dir.
    assure_dir_exists(destination)
    for f in fn:
        sfp = os.path.join(d, f)
        if os.path.exists(sfp):
            dfp = os.path.join(destination, f)
            os.rename(sfp, dfp)
    os.rmdir(d)
def _write_d_as_tsv(header, dict_to_write, id_order, dest_path):
    if not dict_to_write:
        return
    ret = []
    pd = os.path.split(dest_path)[0]
    assure_dir_exists(pd)
    _LOG.info('Writing {} records to "{}"'.format(len(dict_to_write), dest_path))
    with io.open(dest_path, 'w', encoding='utf-8') as outp:
        outp.write(header)
        for i in id_order:
            el = dict_to_write.get(i)
            if el is not None:
                ret.append(i)
                outp.write(el)
        oset = frozenset(ret)
        for key, line in dict_to_write.items():
            if key not in oset:
                ret.append(key)
                outp.write(line)
    return ret
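# Sketch of the contract assumed by _write_d_as_tsv: the header and every value
# in dict_to_write are preformatted lines (newlines included), id_order sets the
# primary ordering, and the return value lists the keys in the order actually
# written (ids from id_order first, then any remaining keys). The literals below
# are invented for illustration:
#
#     order = _write_d_as_tsv(u'uid\t|\tname\n',
#                             {1: u'1\t|\tAcacia\n', 2: u'2\t|\tBetula\n'},
#                             [2, 1],
#                             'out/taxonomy.tsv')
#     # order == [2, 1]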
def _write_syn_d_as_tsv(header, dict_to_write, id_order, dest_path):
    ltw = []
    for i in id_order:
        synlist = dict_to_write.get(i)
        if synlist is not None:
            for p in synlist:
                ltw.append(p[1])
    oset = frozenset(id_order)
    if dict_to_write:
        for key, synlist in dict_to_write.items():
            if key not in oset:
                for syn_pair in synlist:
                    ltw.append(syn_pair[1])
    if not ltw:
        return
    x = len(ltw)
    pd = os.path.split(dest_path)[0]
    assure_dir_exists(pd)
    _LOG.info('Writing {} records to "{}"'.format(x, dest_path))
    with io.open(dest_path, 'w', encoding='utf-8') as outp:
        outp.write(header)
        for line in ltw:
            outp.write(line)
def normalize_plantlist_file(inp_fp, out_dir, family, maj_group_id):
    _LOG.info(u'{} to {}'.format(inp_fp, out_dir))
    fam_name = unidecode(family)
    id_to_line = {fam_name: [fam_name, maj_group_id, fam_name, 'family', AGI]}
    legit_ids = {fam_name, }
    illegit_ids = set()
    name_to_id = {}
    # First pass: read the CSV rows and build id_to_line records of the form
    # [uid, parent_uid, name, rank, flags].
    with io.open(inp_fp, 'rU', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)
        _LOG.info(u'header = {}'.format(header))
        for n, raw_row in enumerate(csvreader):
            # noinspection PyCompatibility
            row = [i for i in raw_row]
            taxon_id = row[0]
            fam = row[2]
            if fam != family:
                raise RuntimeError("Unexpected family in taxon {} of {}: {}".format(n, family, row))
            genus = row[4]
            assert genus
            is_hybrid = bool(row[5])
            flags = 'hybrid' if is_hybrid else ''
            sp_epithet = row[6]
            infr_rank = row[7]
            infr_epi = row[8]
            par_id = None
            if infr_rank:
                rank = pl_rank_to_ott_rank[infr_rank]
                assert infr_epi
                name = ' '.join([genus, sp_epithet, infr_epi])
            else:
                if infr_epi:
                    rank = 'infraspecificname'
                    name = ' '.join([genus, sp_epithet, infr_epi])
                elif sp_epithet:
                    rank = 'species'
                    name = ' '.join([genus, sp_epithet])
                else:
                    rank = 'genus'
                    name = genus
                    par_id = fam_name
            tax_stat = row[10]
            id_to_line[taxon_id] = [taxon_id, par_id, name, rank, flags]
            if tax_stat.lower() == 'accepted':
                if name in name_to_id:
                    m = 'Name "{}" repeated in {}. IDs {} and {}. Ignoring the second...'
                    _LOG.warn(m.format(name, family, name_to_id[name], taxon_id))
                    continue
                if rank == 'species' or rank == 'genus':
                    name_to_id[name] = taxon_id
                legit_ids.add(taxon_id)
            else:
                illegit_ids.add(taxon_id)
            _LOG.info(u'taxon_id={} "{}" "{}" "{}" rank={} tax_stat={}'.format(
                taxon_id, genus, sp_epithet, infr_epi, rank, tax_stat))
    # Second pass over accepted IDs: assign parent IDs and autogenerate any
    # missing genus/species records.
    # uid | parent_uid | name | rank | flags |
    legit_gen, legit_sp, legit_infr = [], [], []
    for vid in legit_ids:
        line_el = id_to_line[vid]
        rank = line_el[3]
        if rank in ['genus', 'family']:
            if rank != 'family':
                legit_gen.append(vid)
            par_id = line_el[1]
        elif rank == 'species':
            name = line_el[2]
            gen_name = name.split(' ')[0]
            par_id = name_to_id.get(gen_name)
            if par_id is None:
                gen_gen_id = gen_name
                assert gen_gen_id not in id_to_line
                id_to_line[gen_gen_id] = [gen_gen_id, fam_name, gen_name, 'genus', AGI]
                name_to_id[gen_name] = gen_gen_id
                legit_gen.append(gen_gen_id)
                _LOG.info("autogenerating genus record for {}".format(gen_name))
                par_id = gen_gen_id
            legit_sp.append(vid)
        else:
            name = line_el[2]
            sp_name = ' '.join(name.split(' ')[:2])
            par_id = name_to_id.get(sp_name)
            if par_id is None:
                gen_sp_id = sp_name
                assert gen_sp_id not in id_to_line
                id_to_line[gen_sp_id] = [gen_sp_id, sp_name.split()[0], sp_name, 'species', AGI]
                name_to_id[sp_name] = gen_sp_id
                _LOG.info("autogenerating species record for {}".format(sp_name))
                par_id = sp_name
                legit_sp.append(gen_sp_id)
            legit_infr.append(vid)
        line_el[1] = par_id
    id_order = legit_gen + legit_sp + legit_infr
    j = '\t|\t'
    taxon_fp = os.path.join(out_dir, 'taxonomy.tsv')
    assure_dir_exists(out_dir)
    with io.open(taxon_fp, 'w', encoding='utf-8') as outp:
        outp.write('{}\n'.format(j.join(['uid', 'parent_uid', 'name', 'rank', 'flags'])))
        outp.write('{}\n'.format(j.join(id_to_line[fam_name])))
        for i in id_order:
            outp.write(_gen_line(id_to_line[i]))
    not_accepted_fp = os.path.join(out_dir, 'not-accepted.tsv')
    with io.open(not_accepted_fp, 'w', encoding='utf-8') as outp:
        outp.write('{}\n'.format(j.join(['uid', 'name', 'rank', 'flags'])))
        for i in illegit_ids:
            line_el = id_to_line[i]
            tout = [line_el[0]] + line_el[2:]
            outp.write(_gen_line(tout))
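# For reference, a sketch of rows written to taxonomy.tsv above, with the
# separator j == '\t|\t' shown literally as "\t|\t". The IDs and names are
# invented; the column order comes from the header written by the function:
#
#     uid \t|\t parent_uid \t|\t name \t|\t rank \t|\t flags
#     Fabaceae \t|\t A \t|\t Fabaceae \t|\t family \t|\t <AGI>
#     12345 \t|\t Fabaceae \t|\t Acacia \t|\t genus \t|\t
#     67890 \t|\t 12345 \t|\t Acacia dealbata \t|\t species \t|\t hybrid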