def aws_batch_submit(args):
    """Submit given command to AWS Batch and log timestamped event under s3://operations/... folder in json format."""
    assert_have_aegea()
    # Replace anything that's not alphanumeric in batch_command with '_'
    name = str.join('', (c if c.isalnum() else '_' for c in args.batch_command))
    cmd = f"""aegea batch submit --name {name} --ecr-image {args.batch_ecr_image} --memory {args.batch_memory} --vcpus {args.batch_vcpus} --queue {args.batch_queue} --privileged --command="pip3 install 'git+https://github.com/czbiohub/iggtools.git@{args.batch_branch}' --upgrade ; iggtools --version ; aws s3 cp s3://microbiome-igg/2.0/README.TXT - ; iggtools aws_batch_init ; cd /mnt/nvme ; {args.batch_command} ; echo DONE" """
    tsprint(f"Submitting to AWS Batch queue {args.batch_queue}: {args.batch_command}")
    aegea_output_json = backtick(cmd)
    ao = json.loads(aegea_output_json)
    job_id = ao['jobId']
    t_submit = int(time.time())
    datestamp, timestamp = datecode(t_submit).split("__")
    # timestamp is a string, and that's good, because JSON can lose resolution for large integers
    event = {
        "unix_timestamp": timestamp,
        "utc_date": datestamp,
        "type": "aws_batch_submit",
        "job_id": job_id,
        "job_target": args.batch_command,
        "aegea_command": cmd,
    }
    eventpath = f"{opsdir}/events/{datestamp}/{timestamp}__aws_batch_submit__{job_id}.json"
    with OutputStream(eventpath) as e:
        e.write(json.dumps(event))
    tsprint("You may watch the job with the command\n" + f"aegea batch watch {job_id}")

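# A minimal sketch of the `backtick` helper used above: run a shell command and
# return its stdout as a string.  Hypothetical -- the real iggtools utility may
# add logging, timing, and richer error handling.
import subprocess

def backtick_sketch(command):
    # shell=True because `command` is a full shell command string, as built above
    return subprocess.check_output(command, shell=True, text=True).strip()
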
def map_reads_hsblast(tempdir, r1, r2, word_size, markers_db, max_reads):
    """Align reads from r1 and r2 against the marker genes database with hs-blastn, streaming FASTA to its stdin; return the path to the tabular (outfmt 6) alignment file."""
    m8_file = f"{tempdir}/alignments.m8"
    blast_command = f"hs-blastn align -word_size {word_size} -query /dev/stdin -db {markers_db} -outfmt 6 -num_threads {num_physical_cores} -evalue 1e-3"
    with OutputStream(m8_file, through=blast_command) as blast_input:
        for qid, seq in chain(parse_reads(r1, max_reads), parse_reads(r2, max_reads)):
            blast_input.write(">" + qid + "\n" + seq + "\n")
    return m8_file

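# Sketch of the `parse_reads` generator assumed above: yield up to `max_reads`
# (query_id, sequence) pairs from a FASTQ file.  A simplified illustration using
# plain open(); the real helper streams (possibly compressed) input via
# InputStream and may also accept FASTA.
def parse_reads_sketch(path, max_reads):
    with open(path) as fastq:
        yielded = 0
        while max_reads is None or yielded < max_reads:
            header = fastq.readline()
            if not header:
                break  # end of file
            seq = fastq.readline().rstrip('\n')
            fastq.readline()  # '+' separator line
            fastq.readline()  # base qualities, unused here
            yield header[1:].split()[0], seq
            yielded += 1
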
def xref(cluster_files, gene_info_file):
    """
    Produce the gene_info.txt file as documented in https://github.com/czbiohub/iggtools/wiki#pan-genomes
    """
    # Let centroid_info[gene][percent_id] be the centroid of the percent_id cluster containing gene.
    # The max_percent_id centroids are computed directly for all genes.  Only these centroids are
    # then reclustered to lower percent_ids.
    #
    # The centroids are themselves genes, and their ids, as all gene_ids, are strings
    # generated by the annotation tool prodigal.
    centroid_info = defaultdict(dict)
    for percent_id, (_, uclust_file) in cluster_files.items():
        for r_type, r_gene, r_centroid in parse_uclust(uclust_file, ['type', 'gene_id', 'centroid_id']):
            if r_type == 'S':
                # r itself is the centroid of its cluster
                centroid_info[r_gene][percent_id] = r_gene
            elif r_type == 'H':
                # r is not itself a centroid
                centroid_info[r_gene][percent_id] = r_centroid
            else:
                # ignore all other r types
                pass

    # Check for a problem that occurs with improper import of genomes (when contig names clash).
    percents = cluster_files.keys()
    max_percent_id = max(percents)
    for g in centroid_info:
        cg = centroid_info[g][max_percent_id]
        ccg = centroid_info[cg][max_percent_id]
        assert cg == ccg, f"The {max_percent_id}-centroid relation should be idempotent, however, {cg} != {ccg}.  See https://github.com/czbiohub/iggtools/issues/16"

    # At this point we have the max_percent_id centroid for any gene gc, but we lack
    # coarser clustering assignments for many genes -- we only have those for genes
    # that are themselves centroids of max_percent_id clusters.
    #
    # We can infer the remaining cluster assignments for all genes by transitivity.
    # For any gene gc, look up the clusters containing gc's innermost centroid,
    # gc[max_percent_id].  Those clusters also contain gc.
    for gc in centroid_info.values():
        gc_recluster = centroid_info[gc[max_percent_id]]
        for percent_id in percents:
            gc[percent_id] = gc_recluster[percent_id]

    with OutputStream(gene_info_file) as gene_info:
        header = ['gene_id'] + [f"centroid_{pid}" for pid in percents]
        gene_info.write('\t'.join(header) + '\n')
        genes = centroid_info.keys()
        for gene_id in sorted(genes):
            gene_info.write(gene_id)
            for centroid in centroid_info[gene_id].values():
                gene_info.write('\t')
                gene_info.write(centroid)
            gene_info.write('\n')

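# Sketch of the `parse_uclust` helper assumed above.  UCLUST .uc output is
# tab-separated, with the record type in column 1, the query (gene) label in
# column 9, and the target (centroid) label in column 10.  The column mapping
# below is an assumption based on the documented .uc format, not the verified
# iggtools parser.
def parse_uclust_sketch(uclust_file, columns):
    field_index = {'type': 0, 'gene_id': 8, 'centroid_id': 9}
    with open(uclust_file) as uc:
        for line in uc:
            fields = line.rstrip('\n').split('\t')
            yield tuple(fields[field_index[c]] for c in columns)
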
def write_abundance(outdir, species_abundance):
    """ Write species abundance results to {outdir}/species/species_profile.txt """
    outpath = f"{outdir}/species/species_profile.txt"  # TODO: Share this path constant across midas_run_ steps
    with OutputStream(outpath) as outfile:
        fields = ['species_id', 'count_reads', 'coverage', 'relative_abundance']
        outfile.write('\t'.join(fields) + '\n')
        output_order = sorted(species_abundance.keys(), key=lambda sid: species_abundance[sid]['count'], reverse=True)
        for species_id in output_order:
            values = species_abundance[species_id]
            if values['count'] > 0:
                record = [species_id, values['count'], values['cov'], values['rel_abun']]
                outfile.write('\t'.join(str(x) for x in record) + '\n')

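# For illustration only: the input shape `write_abundance` expects (hypothetical
# species ids and values).  Species with a zero read count are omitted from the
# output.
example_species_abundance = {
    "102478": {"count": 150, "cov": 3.21, "rel_abun": 0.42},
    "100039": {"count": 0, "cov": 0.0, "rel_abun": 0.0},  # filtered out
}
# write_abundance("midas_output", example_species_abundance)
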
def write_snps_summary(species_pileup_stats, outfile):
    """ Write per-species summary of mapping statistics """
    header = ['species_id', 'genome_length', 'covered_bases', 'total_depth', 'aligned_reads', 'mapped_reads', 'fraction_covered', 'mean_coverage']
    with OutputStream(outfile) as file:
        file.write('\t'.join(header) + '\n')
        for species_id, species_aln in species_pileup_stats.items():
            values = list(species_aln.values())
            values.insert(0, species_id)
            file.write('\t'.join(map(str, values)) + '\n')

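# For illustration only (hypothetical values): each per-species dict must list
# its stats in the same order as the header columns after species_id, since the
# writer relies on dict insertion order rather than column names.
example_pileup_stats = {
    "102478": {
        "genome_length": "2500000",
        "covered_bases": "1200000",
        "total_depth": "9800000",
        "aligned_reads": "510000",
        "mapped_reads": "480000",
        "fraction_covered": "0.48",
        "mean_coverage": "8.17",
    },
}
# write_snps_summary(example_pileup_stats, "midas_output/snps/summary.txt")
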
def write_results(outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage):
    """ Write per-species gene coverage files and the overall genes summary """
    if not os.path.exists(f"{outdir}/genes/output"):
        command(f"mkdir -p {outdir}/genes/output")

    # Open one outfile per species_id
    header = ['gene_id', 'count_reads', 'coverage', 'copy_number']
    for species_id, species_genes in species.items():
        path = f"{outdir}/genes/output/{species_id}.genes.lz4"
        with OutputStream(path) as sp_out:
            sp_out.write('\t'.join(header) + '\n')
            for gene_id, gene in species_genes.items():
                if gene["depth"] == 0:
                    # Sparse by default here.  You can get the pangenome_size from the summary file, emitted below.
                    continue
                values = [gene_id, str(gene["mapped_reads"]), format(gene["depth"], DECIMALS), format(gene["copies"], DECIMALS)]
                sp_out.write('\t'.join(values) + '\n')

    # Summary stats
    header = ['species_id', 'pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage', 'aligned_reads', 'mapped_reads']
    path = f"{outdir}/genes/summary.txt"
    with OutputStream(path) as file:
        file.write('\t'.join(header) + '\n')
        for species_id, species_genes in species.items():
            # No sparsity here -- it should be extremely rare for a species row to be all 0.
            aligned_reads = sum(g["aligned_reads"] for g in species_genes.values())
            mapped_reads = sum(g["mapped_reads"] for g in species_genes.values())
            pangenome_size = len(species_genes)
            values = [
                species_id,
                str(pangenome_size),
                str(num_covered_genes[species_id]),
                format(num_covered_genes[species_id] / pangenome_size, DECIMALS),
                format(species_mean_coverage[species_id], DECIMALS),
                format(species_markers_coverage[species_id], DECIMALS),
                str(aligned_reads),
                str(mapped_reads),
            ]
            file.write('\t'.join(values) + '\n')

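# For illustration only: the per-gene records `write_results` consumes
# (hypothetical gene ids and values).  Genes with zero depth are skipped in the
# per-species file but still count toward pangenome_size in the summary.
example_species = {
    "102478": {
        "gene_00001": {"depth": 4.5, "copies": 1.8, "mapped_reads": 21, "aligned_reads": 25},
        "gene_00002": {"depth": 0, "copies": 0.0, "mapped_reads": 0, "aligned_reads": 0},
    },
}
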
def init(args):
    """
    Input spec: https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3
    """
    msg = f"Building {outputs.genomes}."
    if find_files(outputs.genomes):
        if not args.force:
            tsprint(f"Destination {outputs.genomes} already exists.  Specify --force to overwrite.")
            return
        msg = f"Rebuilding {outputs.genomes}."
    tsprint(msg)

    id_remap = {}
    with InputStream(inputs.alt_species_ids) as ids:
        for row in select_from_tsv(ids, selected_columns=["alt_species_id", "species_id"]):
            new_id, old_id = row
            id_remap[old_id] = new_id

    seen_genomes, seen_species = set(), set()
    with OutputStream(outputs.genomes) as out:
        target_columns = ["genome", "species", "representative", "genome_is_representative"]
        out.write("\t".join(target_columns) + "\n")
        with InputStream(inputs.genomes2species) as g2s:
            for row in select_from_tsv(g2s, selected_columns=["MAG_code", "Species_id"]):
                genome, representative = row
                species = id_remap[representative]
                genome_is_representative = str(int(genome == representative))
                target_row = [genome, species, representative, genome_is_representative]
                out.write("\t".join(target_row) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}.")

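# Sketch of the `select_from_tsv` helper assumed above: read a TSV with a header
# row and yield the requested columns, in the requested order.  Hypothetical --
# the real iggtools utility may also validate schemas and coerce types.
def select_from_tsv_sketch(stream, selected_columns):
    header = stream.readline().rstrip('\n').split('\t')
    indexes = [header.index(col) for col in selected_columns]
    for line in stream:
        fields = line.rstrip('\n').split('\t')
        yield tuple(fields[i] for i in indexes)
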
def species_pileup(species_id, args, tempdir, outputdir, contig_file, contigs_db_stats):
    # Read in contigs information for current species_id
    contigs = {}
    contigs_db_stats['species_counts'] += 1  # NOTE: not being updated and passed back as expected
    with InputStream(contig_file) as file:
        for rec in Bio.SeqIO.parse(file, 'fasta'):
            contigs[rec.id] = {
                "species_id": species_id,
                "contig_len": int(len(rec.seq)),
                "contig_seq": str(rec.seq),
            }
            contigs_db_stats['total_length'] += contigs[rec.id]["contig_len"]
            contigs_db_stats['total_seqs'] += 1

    # Summary statistics
    aln_stats = {
        "genome_length": 0,
        "total_depth": 0,
        "covered_bases": 0,
        "aligned_reads": 0,
        "mapped_reads": 0,
    }

    def keep_read(x):
        return keep_read_worker(x, args, aln_stats)

    header = ['ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c', 'count_g', 'count_t']
    path = f"{outputdir}/{species_id}.snps.lz4"
    with OutputStream(path) as file:
        file.write('\t'.join(header) + '\n')
        zero_rows_allowed = not args.sparse
        # Loop over alignments for the current species's contigs
        with AlignmentFile(f"{tempdir}/repgenomes.bam") as bamfile:
            for contig_id in sorted(contigs.keys()):  # sorted for deterministic output order
                contig = contigs[contig_id]
                counts = bamfile.count_coverage(
                    contig_id,
                    start=0,
                    end=contig["contig_len"],
                    quality_threshold=args.aln_baseq,
                    read_callback=keep_read)
                for ref_pos in range(0, contig["contig_len"]):
                    ref_allele = contig["contig_seq"][ref_pos]
                    depth = sum(counts[nt][ref_pos] for nt in range(4))
                    count_a = counts[0][ref_pos]
                    count_c = counts[1][ref_pos]
                    count_g = counts[2][ref_pos]
                    count_t = counts[3][ref_pos]
                    values = [contig_id, ref_pos + 1, ref_allele, depth, count_a, count_c, count_g, count_t]
                    if depth > 0 or zero_rows_allowed:
                        file.write('\t'.join(str(val) for val in values) + '\n')
                    aln_stats['genome_length'] += 1
                    aln_stats['total_depth'] += depth
                    if depth > 0:
                        aln_stats['covered_bases'] += 1

    tsprint(json.dumps({species_id: aln_stats}, indent=4))
    return (species_id, {k: str(v) for k, v in aln_stats.items()})

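# Sketch of the `keep_read_worker` filter assumed above, modeled on MIDAS-style
# post-alignment filters.  The attribute names come from pysam's AlignedSegment;
# the specific thresholds (args.aln_mapid, args.aln_cov) and counter updates are
# assumptions, not the verified iggtools implementation.
def keep_read_worker_sketch(aln, args, aln_stats):
    aln_stats['aligned_reads'] += 1
    align_len = len(aln.query_alignment_sequence)
    query_len = aln.query_length
    # Minimum percent identity, computed from the edit distance in the NM tag.
    if 100 * (align_len - aln.get_tag('NM')) / float(align_len) < args.aln_mapid:
        return False
    # Minimum fraction of the read that must be aligned.
    if align_len / float(query_len) < args.aln_cov:
        return False
    aln_stats['mapped_reads'] += 1
    return True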