import os
import sys
import glob

from Bio import SeqIO

# Project-local dependencies -- the exact module paths are assumptions; point
# them at wherever these helpers live in this repo, e.g.:
#   from <...> import LazyFastaReader      # lazy, indexed access into a fasta
#   import <...> as ar                     # provides run_minimap()
#   import <...> as sp                     # provides process_align_to_pCS(), etc.
#   import <...> as FileIO                 # provides write_seqids_to_fasta(), etc.
#   from <...> import MiniReader, create_seed_n_batch_files, \
#       cleanup_precluster_intermediate_files, detect_PCR_chimeras


def add_batch(batch_index, pCS, orphans, fasta_d, cpus, dun_use_partial):
    """
    1. align batch<i>.fasta against seed<i>.S.fasta, process -> write remains to batch<i>.remains.fasta
    2. align batch<i>.remains.fasta against seed<i>.orphans.fasta -> write remains to batch<i>.remains2.fasta
    3. self align batch<i>.remains2.fasta -> combine remains+orphans to new orphans
    4. write out seed<i+1>.S.fasta and seed<i+1>.orphans.fasta
    """
    cur_file = "batch{0}.fasta".format(batch_index)
    seqids = set(r.id for r in SeqIO.parse(open(cur_file), 'fasta'))

    # step 1: align the batch against the current seed preclusters
    o = ar.run_minimap(cur_file, "seed{0}.S.fasta".format(batch_index), cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, remains = sp.process_align_to_pCS(o, seqids, pCS, MiniReader,
                                           dun_use_partial=dun_use_partial)
    print("pCS: {0}, tucked: {1}, orphans: {2}, remains: {3}".format(
        len(pCS.S), sum(v == 'T' for v in pCS.seq_stat.values()),
        len(orphans), len(remains)), file=sys.stderr)

    # step 2: align what remains against the current orphans
    cur_file = "batch{0}.remains.fasta".format(batch_index)
    FileIO.write_seqids_to_fasta(remains, cur_file, fasta_d)
    o = ar.run_minimap(cur_file, "seed{0}.orphans.fasta".format(batch_index), cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, orphans, remains = sp.process_align_to_orphan(
        o, remains, orphans, pCS, MiniReader, dun_use_partial=dun_use_partial)
    print("pCS: {0}, tucked: {1}, orphans: {2}, remains: {3}".format(
        len(pCS.S), sum(v == 'T' for v in pCS.seq_stat.values()),
        len(orphans), len(remains)), file=sys.stderr)

    # step 3: self-align the final remains to seed new preclusters
    cur_file = "batch{0}.remains2.fasta".format(batch_index)
    FileIO.write_seqids_to_fasta(remains, cur_file, fasta_d)
    o = ar.run_minimap(cur_file, cur_file, cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, remains = sp.process_self_align_into_seed(
        o, remains, MiniReader, pCS, dun_use_partial=dun_use_partial)
    print("pCS: {0}, tucked: {1}, orphans: {2}, remains: {3}".format(
        len(pCS.S), sum(v == 'T' for v in pCS.seq_stat.values()),
        len(orphans), len(remains)), file=sys.stderr)

    # step 4: anything still unplaced joins the orphans; write seeds for round i+1
    orphans = orphans.union(remains)
    FileIO.write_preClusterSet_to_fasta(
        pCS, "seed{0}.S.fasta".format(batch_index + 1), fasta_d)
    FileIO.write_seqids_to_fasta(
        orphans, "seed{0}.orphans.fasta".format(batch_index + 1), fasta_d)
    return pCS, orphans
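# Usage sketch (values are hypothetical; in this script add_batch is driven by
# the loop in main() below). One call folds batch<i> into the running
# precluster set and writes the seed files consumed by round i+1:
#
#   pCS, orphans = add_batch(1, pCS, orphans, fasta_d=d, cpus=8,
#                            dun_use_partial=False)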
def main(cpus, dun_make_bins=False, dun_use_partial=False,
         num_seqs_per_batch=100000, dun_cleanup_files=False):
    print("Indexing isoseq_flnc.fasta using LazyFastaReader...")
    d = LazyFastaReader('isoseq_flnc.fasta')

    print("Splitting input isoseq_flnc.fasta into seed/batches...")
    num_batches = create_seed_n_batch_files(input='isoseq_flnc.fasta', fasta_d=d,
                                            seed_filename='seed0.fasta',
                                            batch_pre='batch',
                                            num_seqs_per_batch=num_seqs_per_batch)

    # step 1: run minimap of seed0 against itself and process
    o = ar.run_minimap('seed0.fasta', 'seed0.fasta', cpus=cpus)
    seqids = set(r.id for r in SeqIO.parse(open('seed0.fasta'), 'fasta'))
    pCS, orphans = sp.process_self_align_into_seed(o, seqids, MiniReader,
                                                   dun_use_partial=dun_use_partial)
    # keep stats
    size_S = len(pCS.S)
    size_tucked = sum(v == 'T' for v in pCS.seq_stat.values())
    size_orphans = len(orphans)
    print("seed 0 initial: S {0}, tucked {1}, orphans {2}".format(
        size_S, size_tucked, size_orphans))

    # write out seed1.S.fasta and seed1.orphans.fasta
    FileIO.write_preClusterSet_to_fasta(pCS, 'seed1.S.fasta', d)
    FileIO.write_seqids_to_fasta(orphans, 'seed1.orphans.fasta', d)

    # step 2: minimap each batch<i> against seed<i>.S and process
    for i in range(1, num_batches):
        pCS, orphans = add_batch(i, pCS, orphans, d, cpus=cpus,
                                 dun_use_partial=dun_use_partial)
        cleanup_precluster_intermediate_files(i)

    # detect PCR chimeras from orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)

    tucked_seqids = []

    # dump pCS, orphans, chimeras to a pickle
    # can't dump yet --- since pCS is an object
    #with open('preCluster.output.pickle', 'w') as f:
    #    dump({'pCS': pCS, 'chimeras': chimeras, 'orphans': orphans}, f)

    # write CSV file
    with open('preCluster.output.csv', 'w') as f:
        f.write("seqid,stat\n")
        for x, stat in pCS.seq_stat.items():
            if stat == 'T':
                f.write("{0},tucked\n".format(x))
                tucked_seqids.append(x)
            elif stat == 'M':
                f.write("{0},{1}\n".format(x, pCS.seq_map[x]))
        for x in orphans:
            f.write("{0},orphan\n".format(x))
        for x in chimeras:
            f.write("{0},chimera\n".format(x))

    # Liz: currently not using tucked...
    #FileIO.write_seqids_to_fasta(tucked_seqids, "preCluster_out.tucked.fasta", d)

    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    #singlef = open("preCluster_out.singles.fasta", 'w')
    for cid in pCS.S:
        #if pCS.S[cid].size == 1:
        #    r = d[pCS.S[cid].members[0]]
        #    singlef.write(">{0}\n{1}\n".format(r.id, r.seq))
        #else:
        if not dun_make_bins:
            dirname = os.path.join("preCluster_out", str(cid))
            os.makedirs(dirname)
            fasta_file = os.path.join(dirname, 'isoseq_flnc.fasta')
            FileIO.write_seqids_to_fasta(pCS.S[cid].members, fasta_file, d)
        infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
    #singlef.close()
    infof.close()

    if not dun_cleanup_files:
        # clean up all seed* and batch* intermediate files
        for fasta_file in glob.glob('batch*fasta*'):
            os.remove(fasta_file)
        for fasta_file in glob.glob('seed*fasta*'):
            os.remove(fasta_file)
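# Minimal CLI entry point -- a sketch, not part of the original module; the
# flag names and defaults below are assumptions chosen to mirror main()'s
# keyword arguments.
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Precluster isoseq_flnc.fasta via iterative minimap rounds")
    parser.add_argument("--cpus", type=int, default=8,
                        help="number of CPUs for minimap (default: 8)")
    parser.add_argument("--dun_make_bins", action="store_true",
                        help="only write CSV reports, skip preCluster_out/<cid> bins")
    parser.add_argument("--dun_use_partial", action="store_true",
                        help="ignore partial hits when assigning sequences")
    parser.add_argument("--num_seqs_per_batch", type=int, default=100000,
                        help="sequences per batch file (default: 100000)")
    parser.add_argument("--dun_cleanup_files", action="store_true",
                        help="keep intermediate seed*/batch* files for debugging")
    args = parser.parse_args()

    main(args.cpus, args.dun_make_bins, args.dun_use_partial,
         args.num_seqs_per_batch, args.dun_cleanup_files)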