def main( ):
    """Format and execute a blastn search of the query against the database."""
    args = get_args( )
    # default output path derives from the query file's name
    if args.out is None:
        name = wu.path2name( args.query )
        args.out = wu.name2path( name, ".", ".blastout" )
    params = {
        "QUERY": args.query,
        "OUTFILE": args.out,
        "BLASTN": args.blastn,
        "DB": args.db,
        "MAXTAR": wu.c_max_target_seqs,
        "THREADS": args.threads,
        "FORMAT": wu.c_blast_format_string,
    }
    # one shell command string, filled from the params mapping
    template = (
        "{BLASTN} -query {QUERY} -db {DB} -out {OUTFILE}"
        " -max_target_seqs {MAXTAR} -num_threads {THREADS}"
        " -outfmt '{FORMAT}'"
    )
    command = template.format( **params )
    wu.say( "Executing command:", command )
    os.system( command )
    wu.say( "Finished successfully." )
def write_detailed_output(
        basename=None,
        outdir=None,
        contig_coverage=None,
        contig_hits=None, ):
    """Dump per-site coverage depths and per-gene-pair hit counts to disk."""
    # first file can get pretty big, hence gzip
    p_site_hits = wu.name2path(basename, outdir, ".site_hits.tsv.gz")
    p_gene_hits = wu.name2path(basename, outdir, ".gene_hits.tsv")
    # write: site_hits
    wu.say("Writing site hits.")
    with wu.try_open(p_site_hits, "w") as fh:
        # header row first
        wu.write_rowdict(format=c_formats["site_hits"], file=fh)
        for contig in sorted(contig_coverage):
            depths = contig_coverage[contig]
            wu.write_rowdict(
                rowdict={
                    "contig": contig,
                    "mean": np.mean(depths),
                    "stdev": np.std(depths),
                    "depths": " ".join(["{:.0f}".format(d) for d in depths]),
                },
                format=c_formats["site_hits"],
                file=fh,
            )
    # write: gene_hits
    wu.say("Writing gene-pair hits.")
    with wu.try_open(p_gene_hits, "w") as fh:
        # header row first
        wu.write_rowdict(format=c_formats["gene_hits"], file=fh)
        for contig in sorted(contig_hits):
            for code1, code2 in sorted(contig_hits[contig]):
                # each unordered pair is stored symmetrically; report one orientation
                if code2 > code1:
                    continue
                wu.write_rowdict(
                    rowdict={
                        "contig": contig,
                        "gene1": code1,
                        "gene2": code2,
                        "hits": contig_hits[contig][(code1, code2)],
                    },
                    format=c_formats["gene_hits"],
                    file=fh,
                )
def bowtie2_build(
        p_bowtie2_build=None,
        p_contigs=None,
        p_index=None,
        args=None, ):
    """Index the contigs with bowtie2-build; skip if resuming over an existing index."""
    alias = {
        "PROG": p_bowtie2_build,
        "CONTIGS": p_contigs,
        "INDEX": p_index,
    }
    # a ".1.bt2" file is taken as evidence the index was already built
    if args.resume and os.path.exists(p_index + ".1.bt2"):
        wu.say("RESUMING: The index <{INDEX}> already exists.".format(**alias))
        return None
    wu.say("Indexing <{CONTIGS}> to <{INDEX}>.".format(**alias))
    command = "{PROG} {CONTIGS} {INDEX}".format(**alias)
    os.system(command)
    wu.say("Build complete.")
    return None
def main( ):
    """Call genes from a BLAST report and write them as GFF rows."""
    args = get_args( )
    # default GFF path derives from the blast output's name
    if args.gff is None:
        name = wu.path2name( args.blastout )
        args.gff = wu.name2path( name, ".", ".gff" )
    fh_gff = wu.try_open( args.gff, "w" )
    writer = csv.writer( fh_gff, csv.excel_tab )
    for contig, hits in wu.iter_contig_hits( args.blastout ):
        # collapse hits to candidate intervals, then merge overlapping ones
        intervals = hits2ints( hits, args.min_scov )
        intervals = overlap_intervals(
            intervals,
            args.min_overlap,
            args.stranded == "on",
        )
        for start, stop, strand in intervals:
            # drop genes shorter than the configured minimum (1-based inclusive)
            if stop - start + 1 < args.min_gene_length:
                continue
            row = [contig, "waafle_genecaller", "gene",
                   start, stop, ".", strand, 0, "."]
            writer.writerow( [str( field ) for field in row] )
    fh_gff.close( )
    wu.say( "Finished successfully." )
def concordant_hits(p_sam=None, ):
    """Yield [mate1, mate2] pairs of adjacent SAM hits from the same read pair
    aligned to the same contig; report progress every 100k alignments."""
    n_seen = 0
    older = None
    newer = None
    for hit in wu.iter_sam_hits(p_sam):
        # progress
        n_seen += 1
        if n_seen % int(1e5) == 0:
            wu.say(" SAM alignments processed: {:.1f}M".format(n_seen / 1e6))
        # slide the two-hit window forward
        older, newer = newer, hit
        # window not yet full on the very first alignment
        if older is None:
            continue
        # a good pair shares the read id and maps to the same contig
        if older.qseqid == newer.qseqid and older.sseqid == newer.sseqid:
            yield [older, newer]
def bowtie2_align(
        p_bowtie2=None,
        p_reads1=None,
        p_reads2=None,
        p_index=None,
        p_sam=None,
        args=None, ):
    """Align paired reads against the contig index; skip if resuming over an old SAM."""
    alias = {
        "PROG": p_bowtie2,
        "READS1": p_reads1,
        "READS2": p_reads2,
        "INDEX": p_index,
        "SAM": p_sam,
        "THREADS": args.threads,
    }
    if args.resume and os.path.exists(p_sam):
        wu.say(
            "RESUMING: A sam mapping <{SAM}> already exists.".format(**alias))
        return None
    wu.say("Performing bowtie2 alignment.")
    # --no-mixed / --no-discordant restrict output to proper pairs
    template = (
        "{PROG} -x {INDEX} -1 {READS1} -2 {READS2} -S {SAM}"
        " --threads {THREADS} --no-mixed --no-discordant"
    )
    os.system(template.format(**alias))
    wu.say("Alignment complete.")
    return None
def main():
    """Junction workflow: map reads to contigs, accumulate per-site coverage and
    per-gene-pair hit counts, then write the junction report (plus optional
    detailed output)."""
    # begin
    args = get_args()
    p_contigs = args.contigs
    p_gff = args.gff
    # define files
    p_outdir = args.outdir
    p_tmpdir = args.tmpdir
    basename = args.basename
    if basename is None:
        basename = wu.path2name(p_contigs)
    p_index = wu.name2path(basename, p_tmpdir, ".index")
    p_sam = wu.name2path(basename, p_tmpdir, ".sam")
    p_junctions = wu.name2path(basename, p_outdir, ".junctions.tsv")
    # alignment workflow: either reuse a provided SAM, or build+align with bowtie2
    if args.sam is not None:
        p_sam = args.sam
        wu.say("Using specified SAM file:", p_sam)
    elif args.reads1 is not None and args.reads2 is not None:
        # build process
        bowtie2_build(
            p_bowtie2_build=args.bowtie2_build,
            p_contigs=args.contigs,
            p_index=p_index,
            args=args,
        )
        # alignment process
        bowtie2_align(
            p_bowtie2=args.bowtie2,
            p_reads1=args.reads1,
            p_reads2=args.reads2,
            p_index=p_index,
            p_sam=p_sam,
            args=args,
        )
    else:
        wu.die("Must provide READS or SAM file.")
    # load contig data
    wu.say("Loading contig lengths.")
    contig_lengths = wu.read_contig_lengths(p_contigs)
    # one zero-initialized depth vector per contig position
    contig_coverage = {}
    for name, length in contig_lengths.items():
        contig_coverage[name] = np.zeros(length)
    wu.say("Loading contig gene coordinates.")
    contig_loci = {}
    for name, loci in wu.iter_contig_loci(p_gff):
        contig_loci[name] = loci
    # contig -> Counter keyed by (gene_code, gene_code) pairs
    contig_hits = {}
    # post-processing workflow
    wu.say("Processing SAM file.")
    for mate1, mate2 in concordant_hits(p_sam):
        contig = mate1.sseqid
        inner = contig_hits.setdefault(contig, Counter())
        # update per-site coverage (note: base-0 start and pythonic end)
        coords = [mate1.sstart, mate1.send, mate2.sstart, mate2.send]
        L = min(coords) - 1
        R = max(coords) - 1
        contig_coverage[contig][L:R + 1] += 1
        # find hit loci
        hits = find_hit_loci(
            mate1=mate1,
            mate2=mate2,
            loci=contig_loci.get(contig, []),
            args=args,
        )
        # attach self counts
        for code in hits:
            inner[(code, code)] += 1
        # attach pair counts (note: symmetric storage for safer lookup)
        for code1 in hits:
            for code2 in hits:
                if code1 != code2:
                    inner[(code1, code2)] += 1
    # detailed output?
    if args.write_detailed_output:
        write_detailed_output(
            basename=basename,
            outdir=p_outdir,
            contig_coverage=contig_coverage,
            contig_hits=contig_hits,
        )
    # write junction report
    wu.say("Writing junction report.")
    with wu.try_open(p_junctions, "w") as fh:
        # header row
        wu.write_rowdict(
            format=c_formats["junctions"],
            file=fh,
        )
        for c in sorted(contig_lengths):
            # contigs with no loci / no hits still get evaluated with defaults
            rowdicts = evaluate_contig(
                loci=contig_loci.get(c, []),
                coverage=contig_coverage[c],
                gene_hits=contig_hits.get(c, {}),
                args=args,
            )
            for rowdict in rowdicts:
                rowdict["contig"] = c
                wu.write_rowdict(
                    rowdict=rowdict,
                    format=c_formats["junctions"],
                    file=fh,
                )
    # end
    wu.say("Finished successfully.")
def main():
    """Filter a contig profile, keeping only contigs whose A<->B junctions
    pass the hit-count or coverage-ratio QC thresholds.

    Reads the junctions report and the contig profile named in the parsed
    arguments; writes passing rows to ``args.outfile`` (default:
    ``args.contig_profile + ".qc_pass"``) and reports the failure rate.
    """
    args = get_args()
    # load junctions data
    hits = {}
    covs = {}
    wu.say("Loading junctions report.")
    F = wu.Frame(args.junctions)
    # loop over junctions
    for R in F.iter_rowdicts():
        contig = R["CONTIG"]
        gene1 = R["GENE1"]
        gene2 = R["GENE2"]
        hits.setdefault(contig, {})[(gene1, gene2)] = int(R["JUNCTION_HITS"])
        covs.setdefault(contig, {})[(gene1, gene2)] = float(R["RATIO"])
    # filter contigs
    total = 0
    failed = 0
    outfile = args.outfile
    if outfile is None:
        outfile = args.contig_profile + ".qc_pass"
    # load results, open new file, write headers
    F = wu.Frame(args.contig_profile)
    # FIX: context manager guarantees the output handle is closed on all paths
    with wu.try_open(outfile, "w") as fh:
        wu.write_rowdict(None, F.headers, file=fh)
        # loop over contigs
        for R in F.iter_rowdicts():
            total += 1
            contig = R["CONTIG_NAME"]
            # contig-level filters: no junction data means automatic failure
            if contig not in hits or contig not in covs:
                failed += 1
                wu.say("Missing junction data for contig:", contig)
                continue
            loci = R["LOCI"].split("|")
            synteny = R["SYNTENY"]
            qc_pass = True
            for i in range(len(loci) - 1):
                spair = synteny[i] + synteny[i + 1]
                # only A<->B junctions are subject to QC
                if spair not in ["AB", "BA"]:
                    continue
                gpair = (loci[i], loci[i + 1])
                # -1 sentinel for missing pairs fails both thresholds
                my_hits = hits[contig].get(gpair, -1) >= args.min_junction_hits
                my_covs = covs[contig].get(gpair, -1) >= args.min_junction_ratio
                # a junction passes if supported by read hits OR coverage ratio
                qc_pass = qc_pass and (my_hits or my_covs)
            if not qc_pass:
                failed += 1
                wu.say("Failed QC:", contig)
            else:
                wu.write_rowdict(R, F.headers, file=fh)
    # wrap-up; FIX: guard against ZeroDivisionError on an empty profile
    rate = 100 * failed / float(total) if total else 0.0
    wu.say("Failure rate: {} of {} ({:.1f}%)".format(failed, total, rate))
    wu.say("Finished successfully.")
def main():
    """Org-scoring workflow: build Contig objects from lengths and GFF loci,
    attach BLAST hits, score genes, and write the main output tables."""
    args = get_args()
    wu.say("Loading taxonomy.")
    taxonomy = wu.Taxonomy(args.taxonomy)
    # initialize contigs
    wu.say("Initializing contigs.")
    contigs = {}
    contig_lengths = wu.read_contig_lengths(args.contigs)
    # 1-based running index, used only for progress reporting below
    index = 0
    for contig_name, length in contig_lengths.items():
        C = Contig(contig_name, args)
        C.length = length
        index += 1
        C.index = index
        contigs[contig_name] = C
    # process gff
    wu.say("Adding gene coordinates.")
    for contig_name, loci in wu.iter_contig_loci(args.gff, attach_annotations=False):
        # GFF rows for contigs absent from the FASTA are reported and skipped
        if contig_name not in contigs:
            wu.say(" Unknown contig in <gff> file", contig_name)
            continue
        C = contigs[contig_name]
        C.attach_loci(loci)
    # check basename in preparation for writing output
    if args.basename is None:
        # fall back to the contigs file name up to its first dot
        args.basename = os.path.split(args.contigs)[1].split(".")[0]
    # prepare details file (gzipped; only when requested)
    details = None
    if args.write_details:
        details = wu.try_open(
            os.path.join(args.outdir, args.basename + ".details.tsv.gz"), "w")
        # headers
        wu.write_rowdict(None, c_formats["details"], file=details)
    # parse hits, process contigs
    wu.say("Analyzing contigs.")
    # major contig loop
    for contig_name, hits in wu.iter_contig_hits(args.blastout):
        if contig_name not in contigs:
            wu.say(" Unknown contig in <blastout> file", contig_name)
            continue
        # this is a good contig
        C = contigs[contig_name]
        if not args.quiet:
            wu.say(" #{:>7,} of {:>7,}".format(C.index, len(contigs)))
        # attach hits to genes
        C.attach_hits(hits)
        C.update_gene_scores()
        # initial jumps? (optionally coarsen taxonomy before evaluation)
        if args.jump_taxonomy is not None:
            for j in range(args.jump_taxonomy):
                C.raise_taxonomy(taxonomy)
        # evaluate; note: the 'ignore' option can result in "empty" contigs
        if not all([L.ignore for L in C.loci]):
            evaluate_contig(C, taxonomy, details, args)
    # wrap up
    write_main_output_files(contigs, taxonomy, args)
    wu.say("Finished successfully.")
    if details is not None:
        details.close()
def write_main_output_files(contigs, taxonomy, args):
    """Write the lgt / no_lgt / unclassified result tables for all contigs.

    NOTE(review): this appends annotation columns to the module-level
    c_formats lists in place — calling it twice in one run would duplicate
    those columns; confirm it is only invoked once.
    """
    # open output file handles
    wu.say("Initializing outputs.")
    handles = {}
    for option in ["lgt", "no_lgt", "unclassified"]:
        file_name = ".".join([args.basename, option, "tsv"])
        handles[option] = open(os.path.join(args.outdir, file_name), "w")
    # determine possible function annotation systems
    systems = set()
    for contig in contigs.values():
        for locus in contig.loci:
            for system in locus.annotations:
                systems.add(system)
    # extend each output format with one column per annotation system
    for option in c_main_formats:
        for s in sorted(systems):
            c_formats[option].append(c_annotation_prefix + s)
    # print headers
    # NOTE(review): headers come from c_main_formats while rows use c_formats;
    # presumably c_main_formats[name] aliases c_formats[name] — confirm.
    for name in handles:
        wu.write_rowdict(None, c_main_formats[name], file=handles[name])
    # write results (sorted loop over contigs)
    for contig_name in sorted(contigs):
        contig = contigs[contig_name]
        best_one = contig.best_one
        best_two = contig.best_two
        # unclassified: neither the one-clade nor the two-clade model is OK
        if not_ok(best_one) and not_ok(best_two):
            rowdict = {
                "contig_name": contig_name,
                "call": "unclassified",
                "contig_length": contig.length,
                "loci": make_loci_field(contig.loci),
            }
            attach_rowdict_functions(rowdict, contig, systems)
            wu.write_rowdict(rowdict, c_formats["unclassified"],
                             handles["unclassified"])
        # no_lgt: a single clade explains the contig
        elif is_ok(best_one):
            clade = best_one.clade1
            rowdict = {
                "contig_name": contig_name,
                "call": "no_lgt",
                "contig_length": contig.length,
                "min_score": best_one.crit,
                "avg_score": best_one.rank,
                "synteny": best_one.synteny,
                "clade": clade,
                "taxonomy": c_delim2.join(taxonomy.get_lineage(clade)),
                "melded": make_tails_field(best_one.tails1),
                "loci": make_loci_field(contig.loci),
            }
            attach_rowdict_functions(rowdict, contig, systems)
            wu.write_rowdict(rowdict, c_formats["no_lgt"], handles["no_lgt"])
        # lgt: best explanation requires two clades
        elif is_ok(best_two):
            clade1, clade2 = best_two.clade1, best_two.clade2
            rowdict = {
                "contig_name": contig_name,
                "call": "lgt",
                "contig_length": contig.length,
                "min_max_score": best_two.crit,
                "avg_max_score": best_two.rank,
                "synteny": best_two.synteny,
                "direction": best_two.direction,
                "clade_A": clade1,
                "clade_B": clade2,
                "lca": taxonomy.get_lca(clade1, clade2),
                "taxonomy_A": c_delim2.join(taxonomy.get_lineage(clade1)),
                "taxonomy_B": c_delim2.join(taxonomy.get_lineage(clade2)),
                "melded_A": make_tails_field(best_two.tails1),
                "melded_B": make_tails_field(best_two.tails2),
                "loci": make_loci_field(contig.loci),
            }
            attach_rowdict_functions(rowdict, contig, systems)
            wu.write_rowdict(rowdict, c_formats["lgt"], handles["lgt"])
    # wrap up
    for h in handles.values():
        h.close()