def dump_results(outfile, fastadir, contigs, line_width=50):
    """Write assembled contigs to per-region FASTA files plus an index table.

    For every entry of ``contigs`` a file ``<key>.fa`` is created (or
    truncated) inside ``fastadir``.  For every sequence stored under that
    entry's ``"seqs"`` mapping:

    * one tab-separated line is written to ``outfile``: the first five
      whitespace-separated fields of ``str(entry["region"])`` joined by tabs,
      immediately followed by ``":<sequence name>"``, then the sequence
      length and a literal ``"*"``;
    * a ``">name"`` header and the sequence body are written to the FASTA
      file, the body being emitted one chunk per line.

    Parameters:
        outfile: object with a ``write(str)`` method receiving the index lines.
        fastadir: directory in which the ``.fa`` files are created.
        contigs: mapping of key -> dict holding ``"region"`` and ``"seqs"``
            (itself a mapping of sequence name -> sequence).
        line_width: value handed to ``kmers.kmerize`` as both its second and
            third argument (presumably chunk size and step, i.e. the FASTA
            line length -- confirm against ``kmers.kmerize``).  Defaults to
            50, the value that used to be hard-coded.
    """
    # ``items()`` instead of ``iteritems()``: works on Python 2 and Python 3.
    for contig_id, contig in contigs.items():
        fasta_path = os.path.join(fastadir, "{}.fa".format(contig_id))
        with open(fasta_path, "w") as outfasta:
            for name, seq in contig["seqs"].items():
                region_fields = str(contig["region"]).strip().split()[0:5]
                outfile.write(
                    "{}:{}\t{}\t{}\n".format(
                        "\t".join(region_fields), name, len(seq), "*"
                    )
                )
                outfasta.write(">{}\n".format(name))
                for chunk in kmers.kmerize(seq, line_width, line_width):
                    outfasta.write("{}\n".format(chunk))
ref += str(kmers[e].name)[kstep:] ## step through BWTs trying to assemble between flanking k-mers for bb, b in enumerate(bwts): alt, n = assembly.build_bridge(b, str(kmers[s-1].name), str(kmers[e].name)) # check for successful assembly if not len(alt): # unsuccessful assembly logger.info("Couldn't bridge range {}:{}-{}".format(kmers[s-1].chrom, kmers[s-1].start+1, kmers[e].end)) else: # assembly successful; align to reference ref_aln, alt_aln, ops = swalign.align(ref, alt[0]) # check coverage of alt alleles alt_counts = [] for k in ktools.kmerize(alt[0], ksize, kstep): if len(k) == ksize: alt_counts.append( util.count_reads(b, k) ) support = np.median(alt_counts) #print(ref_aln) #print(ops) #print(alt_aln) # iterate on variant sites in alignment for offset, site_ref, site_alt in swalign.reconcile(ref_aln, alt_aln): # generate VCF entry CHROM = kmers[s-1].chrom POS = kmers[s-1].start+offset+1 #NB: must convert to 1-based ID = "." REF = site_ref ALT = site_alt QUAL = "."
parser.add_argument( "-f","--fasta", type = argparse.FileType("rU"), help = "fasta file containing sequences to search with" ) parser.add_argument( "-M", "--msbwt", type = io.readable_dir, nargs = "+", help = "one or more msBWTs in which to count k-mers" ) parser.add_argument( "-k", "--kmer", type = int, default = 0, help = "k-mer size (set to 0 to use all of each sequence) [default: %(default)d]") parser.add_argument( "--normalize", action = "store_true", help = "normalize counts against total size of msBWTs [default: %(default)d]") args = parser.parse_args() bwts = util.BwtSet(args.msbwt) fa = SeqIO.parse(args.fasta, format = "fasta") sys.stderr.write( "Using the following msBWTs:\n{}".format(str(bwts)) ) sys.stderr.write( "Reporting counts as parts per billion.\n" ) if args.kmer > 0: sys.stderr.write( "Breaking sequences into k-mers of length {} for searches.\n".format(args.kmer) ) else: sys.stderr.write( "Searching with provided sequences as-is.\n" ) for seq in fa: if args.kmer > 0: k = args.kmer else: k = len(seq.seq) for subseq in kmers.kmerize(seq.seq, k): for bwtname,count in bwts.count(subseq, args.normalize).iteritems(): print seq.name, subseq, bwtname, count
all_found = True these_contigs = collections.defaultdict(str) for i in range(0, len(bwtnames)): sys.stderr.write("\t{} ...\n".format(bwtnames[i])) ## first check that seed will work in this BWT ## allow step-down of seed size to get started, but hold k-mer size for assembly constant seed_found = False k_start = args.kmer + 1 while not seed_found: k_start -= 1 ## if seed is less than 21nt long, it's probably hopeless if k_start < 21: break for seed_k in kmers.kmerize(seed, k_start, 1): x = util.count_reads(msbwts[i], seed_k) if x >= args.minweight: seed_found = True seed = seed_k sys.stderr.write("\t\tseed accepted ({} bp): {}\n".format(len(seed_k), seed_k)) break ## seed not found in this sample: break loop if not seed_found: all_found = False break ## do assembly seq = assembly.greedy_assemble( args.msbwt[i], seed = seed, k = args.kmer, count_k = args.count_kmer, direction = True, min_weight = args.minweight, max_weight = args.maxweight,
break_kmers.append( (kmer_stash[i_start].name, kmer_stash[i_end].name) ) break_counts.append( (kmer_stash[i_start].score, kmer_stash[i_end].score) ) for i in range(0, len(break_coords)): hap = assemble_inward(msbwt[0], break_kmers[i][0], break_kmers[i][1]) if args.verbose: outline = [ break_coords[i][0], break_coords[i][1], break_counts[i][0], break_coords[i][2], break_counts[i][1] ] sys.stderr.write("\t".join([ str(x) for x in outline ]) + "\n") sys.stderr.write("\t" + str(hap) + "\n\n") for h in hap: hap_avg = {} for (seq, count) in h.iteritems(): if count > args.maf: ungapped_seq = dna.ungap(seq) k_counts = [] for k in kmers.kmerize(dna.ungap(seq), kmer_size): if dna.complexity(k) > args.complexity: k_counts.append( util.count_reads(msbwt[0], k) ) if len(k_counts): hap_avg.update({ ungapped_seq: max(k_counts) }) flag = "*" if len(hap_avg.keys()) > 1: sys.stderr.write("Warning: apparently there is >1 haplotype at this variant site.\n") flag = "+" for (hh, hc) in hap_avg.iteritems(): if hc > args.maxhits: print break_coords[i][0], break_coords[i][1], break_coords[i][2], hh, hc, flag sys.stdout.flush() # force write; this lets me peek at output in almost-real time if args.verbose:
for i in range(0, len(break_coords)): hap = assemble_inward(msbwt[0], break_kmers[i][0], break_kmers[i][1]) if args.verbose: outline = [ break_coords[i][0], break_coords[i][1], break_counts[i][0], break_coords[i][2], break_counts[i][1] ] sys.stderr.write("\t".join([str(x) for x in outline]) + "\n") sys.stderr.write("\t" + str(hap) + "\n\n") for h in hap: hap_avg = {} for (seq, count) in h.iteritems(): if count > args.maf: ungapped_seq = dna.ungap(seq) k_counts = [] for k in kmers.kmerize(dna.ungap(seq), kmer_size): if dna.complexity(k) > args.complexity: k_counts.append(util.count_reads(msbwt[0], k)) if len(k_counts): hap_avg.update({ungapped_seq: max(k_counts)}) flag = "*" if len(hap_avg.keys()) > 1: sys.stderr.write( "Warning: apparently there is >1 haplotype at this variant site.\n" ) flag = "+" for (hh, hc) in hap_avg.iteritems(): if hc > args.maxhits: print break_coords[i][0], break_coords[i][1], break_coords[i][ 2], hh, hc, flag
save_graph = True bwtname = re.sub(r"/+$", "", args.msbwt).split("/").pop() ## make sure seed sequences have proper form seed = dna.ungap(args.seed.upper()) sys.stderr.write("Seed sequence is: {}\n".format(seed)) end_seeds = [] if args.end_seeds: for s in args.end_seeds: end_seeds.append(dna.ungap(s)) ## examine the seed, taking the first k-mer which returns a result msbwts = util.load_bwts([args.msbwt]) seed_found = False for seed_k in kmers.kmerize(seed, args.kmer, 1): x = util.count_reads(msbwts[0], seed_k) if x >= args.minweight: seed_found = True seed = seed_k print seed_k break if not seed_found: sys.exit( "Seed sequence doesn't contain any k-mers which meet the specified abundance threshold." ) ## do assembly seq = assembly.greedy_assemble(args.msbwt, seed=seed,
save_graph = True
# last path component of the msBWT location, trailing slashes ignored
bwtname = re.sub(r"/+$", "", args.msbwt).split("/")[-1]

## normalise the seed sequences
seed = dna.ungap(args.seed.upper())
sys.stderr.write("Seed sequence is: {}\n".format(seed))
# NOTE(review): unlike the main seed, end seeds are ungapped but not
# upper-cased -- confirm whether that difference is intended.
end_seeds = [dna.ungap(s) for s in args.end_seeds] if args.end_seeds else []

## slide along the seed (step 1) and keep the first k-mer that is abundant enough
msbwts = util.load_bwts([args.msbwt])
seed_found = False
for seed_k in kmers.kmerize(seed, args.kmer, 1):
    if util.count_reads(msbwts[0], seed_k) >= args.minweight:
        seed, seed_found = seed_k, True
        print(seed_k)
        break

if not seed_found:
    sys.exit(
        "Seed sequence doesn't contain any k-mers which meet the specified abundance threshold."
    )

## run the assembly
assembly_options = dict(
    seed=seed,
    end_seeds=end_seeds,
    k=args.kmer,
    count_k=args.count_kmer,
    direction=args.backward,
    min_weight=args.minweight,
    max_weight=args.maxweight,
    max_nodes=args.maxnodes,
    max_length=args.maxlength,
    save=save_graph,
    outprefix=args.prefix,
    memmap=args.memmap,
)
seq = assembly.greedy_assemble(args.msbwt, **assembly_options)
for bb, b in enumerate(bwts): alt, n = assembly.build_bridge(b, str(kmers[s - 1].name), str(kmers[e].name)) # check for successful assembly if not len(alt): # unsuccessful assembly logger.info("Couldn't bridge range {}:{}-{}".format( kmers[s - 1].chrom, kmers[s - 1].start + 1, kmers[e].end)) else: # assembly successful; align to reference ref_aln, alt_aln, ops = swalign.align(ref, alt[0]) # check coverage of alt alleles alt_counts = [] for k in ktools.kmerize(alt[0], ksize, kstep): if len(k) == ksize: alt_counts.append(util.count_reads(b, k)) support = np.median(alt_counts) #print(ref_aln) #print(ops) #print(alt_aln) # iterate on variant sites in alignment for offset, site_ref, site_alt in swalign.reconcile( ref_aln, alt_aln): # generate VCF entry CHROM = kmers[s - 1].chrom POS = kmers[ s - 1].start + offset + 1 #NB: must convert to 1-based ID = "."