Example #1
def aws_batch_submit(args):
    """Submit given command to AWS Batch and log timestamped event under s3://operations/... folder in json format."""
    assert_have_aegea()
    # Replace anything that's not alphanumeric in batch_command with '_'
    name = ''.join(c if c.isalnum() else '_' for c in args.batch_command)
    cmd = f"""aegea batch submit --name {name} --ecr-image {args.batch_ecr_image} --memory {args.batch_memory} --vcpus {args.batch_vcpus} --queue {args.batch_queue} --privileged --command="pip3 install 'git+https://github.com/czbiohub/iggtools.git@{args.batch_branch}' --upgrade ; iggtools --version ; aws s3 cp s3://microbiome-igg/2.0/README.TXT - ; iggtools aws_batch_init ; cd /mnt/nvme ; {args.batch_command} ; echo DONE" """
    tsprint(
        f"Submitting to AWS Batch queue {args.batch_queue}:  {args.batch_command}"
    )
    aegea_output_json = backtick(cmd)
    ao = json.loads(aegea_output_json)
    job_id = ao['jobId']
    t_submit = int(time.time())
    datestamp, timestamp = datecode(t_submit).split("__")
    # timestamp is a string, and that's good, because JSON can lose resolution for large integers
    event = {
        "unix_timestamp": timestamp,
        "utc_date": datestamp,
        "type": "aws_batch_submit",
        "job_id": job_id,
        "job_target": args.batch_command,
        "aegea_command": cmd,
    }
    eventpath = f"{opsdir}/events/{datestamp}/{timestamp}__aws_batch_submit__{job_id}.json"
    with OutputStream(eventpath) as e:
        e.write(json.dumps(event))
    tsprint("You may watch the job with the command\n" +
            f"aegea batch watch {job_id}")
Example #2
def map_reads_hsblast(tempdir, r1, r2, word_size, markers_db, max_reads):
    """ Stream up to max_reads reads from each of r1 and r2 through hs-blastn against markers_db, writing tabular (outfmt 6) alignments to alignments.m8 """
    m8_file = f"{tempdir}/alignments.m8"
    blast_command = f"hs-blastn align -word_size {word_size} -query /dev/stdin -db {markers_db} -outfmt 6 -num_threads {num_physical_cores} -evalue 1e-3"
    with OutputStream(m8_file, through=blast_command) as blast_input:
        for qid, seq in chain(parse_reads(r1, max_reads), parse_reads(r2, max_reads)):
            blast_input.write(">" + qid + "\n" + seq + "\n")
    return m8_file
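This example relies on OutputStream's through= option from the project's utilities: text written to the stream is piped into the given command's stdin, and that command's stdout is captured in the target file, so reads are aligned by hs-blastn as they are written. A hypothetical stand-in built on plain subprocess (an assumption about the semantics, not the project's implementation) could look like:

import subprocess
from contextlib import contextmanager


@contextmanager
def output_through(path, command):
    # Hypothetical sketch of OutputStream(path, through=command):
    # text written to the yielded handle feeds `command`, whose stdout lands in `path`.
    with open(path, "w") as out:
        proc = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE,
                                stdout=out, text=True)
        try:
            yield proc.stdin
        finally:
            proc.stdin.close()
            proc.wait()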
Example #3
def xref(cluster_files, gene_info_file):
    """
    Produce the gene_info.txt file as documented in https://github.com/czbiohub/iggtools/wiki#pan-genomes
    """
    # Let centroid_info[gene][percent_id] be the centroid of the percent_id cluster containing gene.
    # The max_percent_id centroids are computed directly for all genes.  Only these centroids are
    # then reclustered to lower percent_id's.
    #
    # The centroids are themselves genes, and their ids, as all gene_ids, are strings
    # generated by the annotation tool prodigal.
    centroid_info = defaultdict(dict)
    for percent_id, (_, uclust_file) in cluster_files.items():
        for r_type, r_gene, r_centroid in parse_uclust(
                uclust_file, ['type', 'gene_id', 'centroid_id']):
            if r_type == 'S':
                # r itself is the centroid of its cluster
                centroid_info[r_gene][percent_id] = r_gene
            elif r_type == 'H':
                # r is not itself a centroid
                centroid_info[r_gene][percent_id] = r_centroid
            else:
                # ignore all other r types
                pass

    # Check for a problem that occurs with improper import of genomes (when contig names clash).
    percents = cluster_files.keys()
    max_percent_id = max(percents)
    for g in centroid_info:
        cg = centroid_info[g][max_percent_id]
        ccg = centroid_info[cg][max_percent_id]
        assert cg == ccg, f"The {max_percent_id}-centroid relation should be idempotent, however, {cg} != {ccg}.  See https://github.com/czbiohub/iggtools/issues/16"

    # At this point we have the max_percent_id centroid for any gene gc, but we lack
    # coarser clustering assignments for many genes -- we only have those for genes
    # that are themselves centroids of max_percent_id clusters.
    #
    # We can infer the remaining cluster assignments for all genes by transitivity.
    # For any gene gc, look up the clusters containing gc's innermost centroid,
    # gc[max_percent_id].  Those clusters also contain gc.
    for gc in centroid_info.values():
        gc_recluster = centroid_info[gc[max_percent_id]]
        for percent_id in percents:
            gc[percent_id] = gc_recluster[percent_id]

    with OutputStream(gene_info_file) as gene_info:
        header = ['gene_id'] + [f"centroid_{pid}" for pid in percents]
        gene_info.write('\t'.join(header) + '\n')
        genes = centroid_info.keys()
        for gene_id in sorted(genes):
            gene_info.write(gene_id)
            for centroid in centroid_info[gene_id].values():
                gene_info.write('\t')
                gene_info.write(centroid)
            gene_info.write('\n')
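The transitivity step above can be checked on toy data: a gene inherits its coarser cluster assignments from its max_percent_id centroid. The gene ids and percent identities below are made up purely for illustration.

from collections import defaultdict

# Toy clustering: geneB falls in geneA's 99% cluster; geneA, a 99% centroid,
# was then reclustered at 95% under centroid geneC.
centroid_info = defaultdict(dict)
centroid_info["geneA"] = {99: "geneA", 95: "geneC"}
centroid_info["geneB"] = {99: "geneA"}            # no 95% assignment yet
centroid_info["geneC"] = {99: "geneC", 95: "geneC"}

max_percent_id = 99
for gc in centroid_info.values():
    gc_recluster = centroid_info[gc[max_percent_id]]
    for percent_id in (99, 95):
        gc[percent_id] = gc_recluster[percent_id]

assert centroid_info["geneB"][95] == "geneC"      # inherited via geneA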
Example #4
def write_abundance(outdir, species_abundance):
    """ Write species results to specified output file """
    outpath = f"{outdir}/species/species_profile.txt"  # TODO:  Share this across midas_run_ steps
    with OutputStream(outpath) as outfile:
        fields = ['species_id', 'count_reads', 'coverage', 'relative_abundance']
        outfile.write('\t'.join(fields) + '\n')
        output_order = sorted(species_abundance.keys(), key=lambda sid: species_abundance[sid]['count'], reverse=True)
        for species_id in output_order:
            values = species_abundance[species_id]
            if values['count'] > 0:
                record = [species_id, values['count'], values['cov'], values['rel_abun']]
                outfile.write('\t'.join(str(x) for x in record) + '\n')
Example #5
def write_snps_summary(species_pileup_stats, outfile):
    """ Get summary of mapping statistics """
    header = [
        'species_id', 'genome_length', 'covered_bases', 'total_depth',
        'aligned_reads', 'mapped_reads', 'fraction_covered', 'mean_coverage'
    ]
    with OutputStream(outfile) as file:
        file.write('\t'.join(header) + '\n')
        for species_id, species_aln in species_pileup_stats.items():
            values = list(species_aln.values())
            values.insert(0, species_id)
            file.write('\t'.join(map(str, values)) + '\n')
Example #6
def write_results(outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage):
    if not os.path.exists(f"{outdir}/genes/output"):
        command(f"mkdir -p {outdir}/genes/output")
    # open outfiles for each species_id
    header = ['gene_id', 'count_reads', 'coverage', 'copy_number']
    for species_id, species_genes in species.items():
        path = f"{outdir}/genes/output/{species_id}.genes.lz4"
        with OutputStream(path) as sp_out:
            sp_out.write('\t'.join(header) + '\n')
            for gene_id, gene in species_genes.items():
                if gene["depth"] == 0:
                    # Sparse by default here.  You can get the pangenome_size from the summary file, emitted below.
                    continue
                values = [gene_id, str(gene["mapped_reads"]), format(gene["depth"], DECIMALS), format(gene["copies"], DECIMALS)]
                sp_out.write('\t'.join(values) + '\n')
    # summary stats
    header = ['species_id', 'pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage', 'aligned_reads', 'mapped_reads']
    path = f"{outdir}/genes/summary.txt"
    with OutputStream(path) as file:
        file.write('\t'.join(header) + '\n')
        for species_id, species_genes in species.items():
            # No sparsity here -- should be extremely rare for a species row to be all 0.
            aligned_reads = sum(g["aligned_reads"] for g in species_genes.values())
            mapped_reads = sum(g["mapped_reads"] for g in species_genes.values())
            pangenome_size = len(species_genes)
            values = [
                species_id,
                str(pangenome_size),
                str(num_covered_genes[species_id]),
                format(num_covered_genes[species_id] / pangenome_size, DECIMALS),
                format(species_mean_coverage[species_id], DECIMALS),
                format(species_markers_coverage[species_id], DECIMALS),
                str(aligned_reads),
                str(mapped_reads)
            ]
            file.write('\t'.join(values) + '\n')
Example #7
File: init.py  Project: bsmith89/iggtools
def init(args):
    """
    Input spec: https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3
    """

    msg = f"Building {outputs.genomes}."
    if find_files(outputs.genomes):
        if not args.force:
            tsprint(
                f"Destination {outputs.genomes} already exists.  Specify --force to overwrite."
            )
            return
        msg = f"Rebuilding {outputs.genomes}."
    tsprint(msg)

    id_remap = {}
    with InputStream(inputs.alt_species_ids) as ids:
        for row in select_from_tsv(
                ids, selected_columns=["alt_species_id", "species_id"]):
            new_id, old_id = row
            id_remap[old_id] = new_id

    seen_genomes, seen_species = set(), set()
    with OutputStream(outputs.genomes) as out:

        target_columns = [
            "genome", "species", "representative", "genome_is_representative"
        ]
        out.write("\t".join(target_columns) + "\n")

        with InputStream(inputs.genomes2species) as g2s:
            for row in select_from_tsv(
                    g2s, selected_columns=["MAG_code", "Species_id"]):
                genome, representative = row
                species = id_remap[representative]
                genome_is_representative = str(int(genome == representative))
                target_row = [
                    genome, species, representative, genome_is_representative
                ]
                out.write("\t".join(target_row) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(
        f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}."
    )
Example #8
def species_pileup(species_id, args, tempdir, outputdir, contig_file,
                   contigs_db_stats):
    # Read in contigs information for current species_id

    contigs = {}
    contigs_db_stats['species_counts'] += 1  # not being updated and passed as expected

    with InputStream(contig_file) as file:
        for rec in Bio.SeqIO.parse(file, 'fasta'):
            contigs[rec.id] = {
                "species_id": species_id,
                "contig_len": int(len(rec.seq)),
                "contig_seq": str(rec.seq),
            }
            contigs_db_stats['total_length'] += contigs[rec.id]["contig_len"]
            contigs_db_stats['total_seqs'] += 1

    # Summary statistics
    aln_stats = {
        "genome_length": 0,
        "total_depth": 0,
        "covered_bases": 0,
        "aligned_reads": 0,
        "mapped_reads": 0,
    }

    def keep_read(x):
        return keep_read_worker(x, args, aln_stats)

    header = [
        'ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c',
        'count_g', 'count_t'
    ]
    path = f"{outputdir}/{species_id}.snps.lz4"

    with OutputStream(path) as file:

        file.write('\t'.join(header) + '\n')
        zero_rows_allowed = not args.sparse

        # Loop over alignment for current species's contigs
        with AlignmentFile(f"{tempdir}/repgenomes.bam") as bamfile:
            for contig_id in sorted(contigs.keys()):  # sort for deterministic output order
                contig = contigs[contig_id]
                counts = bamfile.count_coverage(
                    contig_id,
                    start=0,
                    end=contig["contig_len"],
                    quality_threshold=args.aln_baseq,
                    read_callback=keep_read)

                for ref_pos in range(0, contig["contig_len"]):
                    ref_allele = contig["contig_seq"][ref_pos]
                    depth = sum([counts[nt][ref_pos] for nt in range(4)])
                    count_a = counts[0][ref_pos]
                    count_c = counts[1][ref_pos]
                    count_g = counts[2][ref_pos]
                    count_t = counts[3][ref_pos]
                    values = [
                        contig_id, ref_pos + 1, ref_allele, depth, count_a,
                        count_c, count_g, count_t
                    ]

                    if depth > 0 or zero_rows_allowed:
                        file.write('\t'.join(str(val)
                                             for val in values) + '\n')

                    aln_stats['genome_length'] += 1
                    aln_stats['total_depth'] += depth
                    if depth > 0:
                        aln_stats['covered_bases'] += 1

    tsprint(json.dumps({species_id: aln_stats}, indent=4))
    return (species_id, {k: str(v) for k, v in aln_stats.items()})
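keep_read_worker is defined elsewhere in the project; pysam invokes the read_callback on every alignment overlapping the region, and only reads for which it returns True are counted. A hypothetical filter in that spirit (the field choices and the aln_mapq threshold are assumptions for illustration, not the project's actual criteria):

def keep_read_worker(aln, args, aln_stats):
    # Hypothetical sketch: tally every alignment seen, keep only primary,
    # mapped, non-duplicate alignments above an assumed mapping-quality cutoff.
    aln_stats['aligned_reads'] += 1
    if aln.is_unmapped or aln.is_secondary or aln.is_duplicate:
        return False
    if aln.mapping_quality < args.aln_mapq:   # assumed argument name
        return False
    aln_stats['mapped_reads'] += 1
    return True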