# Only the stdlib import used below is shown here; in the full source these
# functions live in separate modules and pull their shared helpers (InputStream,
# OutputStream, select_from_tsv, tsprint, command, multithreading_map,
# TimedSection, sorted_dict, download_reference, find_files, ...) from the
# package's common utilities, and the `inputs` / `outputs` bundles from the
# package's params module.
from collections import defaultdict


def scan_markers(genes, marker_genes_map_file):
    """Return (gene_id, marker_id) pairs for the subset of genes that are marker genes."""
    markers = []
    with InputStream(marker_genes_map_file) as mg_map:
        for gene_id, marker_id in select_from_tsv(mg_map, ["gene_id", "marker_id"], {"species_id": str, "genome_id": str, "gene_id": str, "gene_len": int, "marker_id": str}):
            if gene_id in genes:
                markers.append((gene_id, marker_id))
    return markers
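# Hypothetical usage sketch (not part of the original source): the gene IDs and
# the local map path below are made up for illustration.
def _example_scan_markers():
    sample_genes = {"GUT_GENOME000001_00042", "GUT_GENOME000001_00137"}
    # Returns [(gene_id, marker_id), ...] for genes found in the marker map,
    # e.g. [("GUT_GENOME000001_00042", "B000032")].
    return scan_markers(sample_genes, "phyeco.map")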
def midas_run_species(args):

    tempdir = f"{args.outdir}/species/temp/"

    command(f"rm -rf {tempdir}")
    command(f"mkdir -p {tempdir}")

    # Fetch the marker-genes database: the phyeco fasta with its hs-blastn index
    # files, plus the marker map (last element).
    markers_db_files = multithreading_map(
        download_reference,
        [f"s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.fa{ext}.lz4" for ext in ["", ".bwt", ".header", ".sa", ".sequence"]] +
        ["s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"])

    db = UHGG()
    species_info = db.species
    marker_info = read_marker_info_repgenomes(markers_db_files[-1])

    with TimedSection("aligning reads to marker-genes database"):
        m8_file = map_reads_hsblast(tempdir, args.r1, args.r2, args.word_size, markers_db_files[0], args.max_reads)

    with InputStream(inputs.marker_genes_hmm_cutoffs) as cutoff_params:
        marker_cutoffs = dict(select_from_tsv(cutoff_params, selected_columns={"marker_id": str, "marker_cutoff": float}))

    with TimedSection("classifying reads"):
        best_hits = find_best_hits(args, marker_info, m8_file, marker_cutoffs)
        unique_alns = assign_unique(best_hits, species_info, marker_info)
        species_alns = assign_non_unique(best_hits, unique_alns, marker_info)

    with TimedSection("estimating species abundance"):
        total_gene_length = sum_marker_gene_lengths(marker_info)
        species_abundance = normalize_counts(species_alns, total_gene_length)

    write_abundance(args.outdir, species_abundance)
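# Hypothetical invocation sketch (not in the original source): midas_run_species
# expects an argparse-style namespace; the attribute names match those referenced
# above, but every value here is made up.
def _example_midas_run_species():
    from argparse import Namespace
    args = Namespace(
        outdir="sample_out",       # per-sample output directory
        r1="reads_1.fastq.gz",     # forward reads
        r2="reads_2.fastq.gz",     # reverse reads
        word_size=28,              # hs-blastn seed word size
        aln_mapid=None,            # None -> fall back to per-marker cutoffs
        aln_cov=0.75,              # minimum query coverage for find_best_hits
        max_reads=None)            # None -> align all reads
    midas_run_species(args)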
def init(args):
    """
    Input spec: https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3
    """
    msg = f"Building {outputs.genomes}."
    if find_files(outputs.genomes):
        if not args.force:
            tsprint(f"Destination {outputs.genomes} already exists. Specify --force to overwrite.")
            return
        msg = f"Rebuilding {outputs.genomes}."
    tsprint(msg)

    # Map original species IDs to their alternate (renamed) IDs.
    id_remap = {}
    with InputStream(inputs.alt_species_ids) as ids:
        for row in select_from_tsv(ids, selected_columns=["alt_species_id", "species_id"]):
            new_id, old_id = row
            id_remap[old_id] = new_id

    seen_genomes, seen_species = set(), set()
    with OutputStream(outputs.genomes) as out:

        target_columns = ["genome", "species", "representative", "genome_is_representative"]
        out.write("\t".join(target_columns) + "\n")

        with InputStream(inputs.genomes2species) as g2s:
            for row in select_from_tsv(g2s, selected_columns=["MAG_code", "Species_id"]):
                genome, representative = row
                species = id_remap[representative]
                genome_is_representative = str(int(genome == representative))
                target_row = [genome, species, representative, genome_is_representative]
                out.write("\t".join(target_row) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}.")
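# For reference, a hypothetical row of the four-column TSV emitted above
# (genome and species IDs are made up):
#
#   genome            species  representative    genome_is_representative
#   GUT_GENOME000001  100001   GUT_GENOME000001  1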
def parse_species_profile(outdir):
    "Return map of species_id to coverage for the species present in the sample."
    with InputStream(f"{outdir}/species/species_profile.txt") as stream:
        return dict(select_from_tsv(stream, {"species_id": str, "coverage": float}))
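# Hypothetical downstream use (not in the original source): keep only species
# whose marker-gene coverage clears a caller-chosen threshold.
def _example_species_above_coverage(outdir, min_coverage=1.0):
    profile = parse_species_profile(outdir)  # {species_id: coverage}
    return {species_id for species_id, coverage in profile.items() if coverage >= min_coverage}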
def parse_uclust(uclust_file, select_columns):
    # The uclust TSV file does not contain a header line, so we have to hardcode
    # the schema here, then select the specified columns.
    all_uclust_columns = ['type', 'cluster_id', 'size', 'pid', 'strand', 'skip1', 'skip2', 'skip3', 'gene_id', 'centroid_id']
    with InputStream(uclust_file) as ucf:
        for r in select_from_tsv(ucf, select_columns, all_uclust_columns):
            yield r
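# Hypothetical usage sketch (not in the original source): build a gene -> centroid
# map from a uclust file. In the uclust format, 'S' records are cluster seeds
# (each its own centroid) and 'H' records are hits assigned to a seed.
def _example_gene_to_centroid(uclust_file):
    gene_to_centroid = {}
    for rtype, gene_id, centroid_id in parse_uclust(uclust_file, ['type', 'gene_id', 'centroid_id']):
        if rtype == 'S':
            gene_to_centroid[gene_id] = gene_id
        elif rtype == 'H':
            gene_to_centroid[gene_id] = centroid_id
    return gene_to_centroid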
def _UHGG_load(toc_tsv, deep_sort=False):
    species = defaultdict(dict)
    representatives = {}
    genomes = {}
    with InputStream(toc_tsv) as table_of_contents:
        for row in select_from_tsv(table_of_contents, selected_columns=["genome", "species", "representative", "genome_is_representative"]):
            genome_id, species_id, representative_id, _ = row
            species[species_id][genome_id] = row
            representatives[species_id] = representative_id
            genomes[genome_id] = species_id
    if deep_sort:
        for sid in species.keys():
            species[sid] = sorted_dict(species[sid])
        species = sorted_dict(species)
    return species, representatives, genomes
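# Shape of the triple returned above; the TOC written by init() is exactly what
# this loader consumes (e.g., hypothetically: _UHGG_load(outputs.genomes)):
#
#   species         = {species_id: {genome_id: row_tuple, ...}, ...}
#   representatives = {species_id: representative_genome_id, ...}
#   genomes         = {genome_id: species_id, ...}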
def find_best_hits(args, marker_info, m8_file, marker_cutoffs):
    """Find the top-scoring alignment(s) for each read."""
    best_hits = {}
    i = 0
    with InputStream(m8_file) as m8_stream:
        for aln in select_from_tsv(m8_stream, schema=BLAST_M8_SCHEMA, result_structure=dict):
            i += 1
            cutoff = args.aln_mapid
            if cutoff is None:
                marker_id = marker_info[aln['target']]['marker_id']  # get gene family from marker_info
                cutoff = marker_cutoffs[marker_id]
            if aln['pid'] < cutoff:  # does not meet marker cutoff
                continue
            if query_coverage(aln) < args.aln_cov:  # filter local alignments
                continue
            if aln['query'] not in best_hits:  # record first aln for this read
                best_hits[aln['query']] = [aln]
            elif best_hits[aln['query']][0]['score'] == aln['score']:  # keep tied aln
                best_hits[aln['query']] += [aln]
            elif best_hits[aln['query']][0]['score'] < aln['score']:  # replace with better aln
                best_hits[aln['query']] = [aln]
    tsprint(f"  total alignments: {i}")
    return list(best_hits.values())
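# query_coverage is called above but not defined in this excerpt. A minimal
# sketch, assuming BLAST_M8_SCHEMA names the alignment length 'aln' and the
# query length 'qlen' (both field names are assumptions):
def query_coverage(aln):
    """Fraction of the query sequence covered by the alignment."""
    return aln['aln'] / aln['qlen']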
def read_marker_info_repgenomes(map_file):
    columns = ["species_id", "genome_id", "gene_id", "gene_length", "marker_id"]
    with InputStream(map_file) as map_file_stream:
        return {r['gene_id']: r for r in select_from_tsv(map_file_stream, schema=columns, result_structure=dict)}
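# Shape of the mapping returned above (IDs and values hypothetical); callers can
# do marker_info[gene_id]['marker_id'] as in find_best_hits. Because the schema
# is a bare column list with no types, each field presumably comes back as a
# string:
#
#   {"GUT_GENOME000001_00042": {"species_id": "100001",
#                               "genome_id": "GUT_GENOME000001",
#                               "gene_id": "GUT_GENOME000001_00042",
#                               "gene_length": "1374",
#                               "marker_id": "B000032"},
#    ...}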