def blast6filter_main(cmdline_args = None):
    """Filter blast6 alignments and print the surviving records.

    Tasks:
      q       -- group by query name, keep best non-overlapping chain
      r_cons  -- group by subject, drop self-hits/repeats, LIS by consensus score
      r_noover-- group by subject, LIS with overlap-penalized score
      r_experimental -- group by subject, repeat filter + self-hit removal only

    cmdline_args: argv-style list ["prog", task, infile]; defaults to sys.argv.
    Returns a usage string on bad arguments, otherwise None (output is
    printed to stdout).  Input must be pre-sorted by the grouping column.
    """
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv
    if not len(cmdline_args) == 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"
    task, infile = cmdline_args[1:3]
    fileit = iterator_over_file(infile)
    alignment_getter = blast_record_iterator(fileit)

    if task.startswith("r"):
        # Reference-based tasks: group by subject name, use subject coords.
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
        score_func = aln_funcs.score_getter_matching_consensus_estimated
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))

        def remove_self(alns):
            # Drop alignments of a sequence against itself.
            a = list(alns)
            # BUG FIX: was len(alns) -- raises TypeError once list() has
            # consumed an iterator, and is redundant anyway.
            log("Remove Self: Working on %d" % len(a))
            filtered = filter(lambda y: not y.qname == y.sname, a)
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered

        # Default "r_cons" pipeline (imap is lazy, so overwriting below is free).
        lis = compose(partial(aln_funcs.LIS, score_func),
                      aln_funcs.remove_contained,
                      greedy_repeat_filt,
                      remove_self)
        best = imap(lis, grouped_alns)
        if task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)
    else:
        # Query-based task: group by query name, use query coords.
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))
        lis = compose(partial(aln_funcs.LIS,
                              aln_funcs.score_getter_penalize_overlap_estimated),
                      aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
    # filter(print, ...) both prints every record and drains the iterators.
    filter(print, imap(record_to_string, chain.from_iterable(best)))
def coverage_from_blast6():
    # Print per-base coverage of every reference sequence in a blast6 file
    # (usage: coverage_from_blast6 in.blast6).  Python 2 only: relies on
    # tuple-unpacking lambda parameters and itertools.imap/izip.
    if not len(sys.argv) == 2:
        sys.exit("coverage_from_blast6 in.blast6")
    raw_alignment_it = blast_record_iterator(iterator_over_file(sys.argv[1]))
    # Per query, keep only the best-scoring non-overlapping chain of HSPs
    # (keyed on query coordinates, scored by bitscore).
    lno = partial(best_scoring_non_overlapping, attrgetter("qstart"), attrgetter("qend"), attrgetter("bitscore"))
    q_filt_alignment_it = chain.from_iterable( imap(compose(lno, itemgetter(1)), groupby(raw_alignment_it, attrgetter("qname"))))
    #read all alignments into memory
    ref_sorted_alignments = sorted(q_filt_alignment_it, key=attrgetter("sname"))
    for reference,alignments in groupby(ref_sorted_alignments, attrgetter("sname")):
        alignments = list(alignments)
        ref_len = alignments[0].slen
        # blast6 subject coordinates are 1-based; shift to 0-based indexes.
        blast_start_getter = lambda a: a.sstart-1
        blast_end_getter = lambda a: a.send-1
        cov_arr = coverage_array_from_ranges(alignments, ref_len, blast_start_getter, blast_end_getter)
        # Emit "position<TAB-like tuple>coverage"; filter() only drives print.
        filter(print, izip(count(1),cov_arr))
        #mark the regions with 0 coverage
        zerocov = map(lambda x: 1 if x==0 else 0, cov_arr)
        zerocov_regions = get_marked_ranges(zerocov)
        # (reference, (start, end)) -> tab-separated line.
        region_printer = compose(print,lambda (x,(y,z)) : "\t".join(map(str,[x,y,z])))
        #filter(region_printer, izip(repeat(reference), zerocov_regions))
        ##Get Low ID regions
        # NOTE(review): ranges_w_id is built but never consumed below.
        ranges_w_id = imap(compose(lambda (x,y,i) : (x-1,y-1,i), attrgetter("sstart","send","pctid")), alignments)
        # Accumulate (sum of pctid, alignment count) per reference base.
        pct_arr = coverage_array_from_ranges(alignments, ref_len, blast_start_getter, blast_end_getter, lambda r, (o_pid,o_cnt): (r.pctid+o_pid, o_cnt+1), (0,0))
        # Flag bases whose mean percent identity falls below 95%.
        lowid = map(lambda (c_pid,cnt): 1 if cnt != 0 and c_pid/cnt < 95.0 else 0, pct_arr)
        lowid_regions = get_marked_ranges(lowid)
        # NOTE(review): zerocov_regions / lowid_regions are computed but not
        # printed here -- the reporting code appears to be commented out or
        # cut off; confirm against the original script.
def correct_oxford(reads_fn=None, alignments_fn=None): '''Corrects oxford reads''' log = logger(sys.stderr) if not reads_fn or not alignments_blast6_fn: if not len(sys.argv) == 3: sys.exit("correct.py raw_reads.fa alignments.blast6") (reads_fn,alignments_fn) = sys.argv[1:3] log("Reading raw reads into memory") #just put all reads in memory fastas = compose(fasta_iterator, iterator_over_file)(reads_fn) raw_reads = dict(map(attrgetter("name","seq"), fastas)) log("Reading raw reads DONE :)") #The alignments need to be sorted by the long read name (second column) alignment_it = line_record_iterator(Blast6SeqRecord, Blast6SeqTypes, iterator_over_file(alignments_fn)) important_field_getter = attrgetter("qname","sname","qstart","qend", "sstart","send", "qseq", "sseq") for readname, alignments in groupby(alignment_it, attrgetter("sname")): log("Working on %s" % readname) raw_read_seq = raw_reads.get(readname) if not raw_read_seq: log("Can not find sequence for %s" % readname) continue log("Raw Read Length: %d" % len(raw_read_seq)) g = AlnGraph(raw_read_seq) alignments = imap(important_field_getter, alignments) num_alignments = 0 for qname,sname,qstart,qend,sstart,send,qseq,sseq in alignments: #blast alignments are one based, convert to 0 based (qstart, qend) = (qstart-1, qend-1) (sstart, send) = (sstart-1, send-1) #reverse complement, must switch the alignment strings if send < sstart: (qseq, sseq) = tuple(map(reverse_complement, [qseq,sseq])) send, sstart = sstart,send (qseq, sseq) = convert_mismatches(qseq,sseq) try: alignment_tuple =((qstart, qend, qseq), (sstart, send, sseq), qname) g.add_alignment( alignment_tuple) except Exception as e: log("Add Alignmented Error: %s" % e) continue if num_alignments > TOO_MANY_ALIGNMENTS: break num_alignments += 1 log("Processed Alignments: %d" % num_alignments) if num_alignments > TOO_MANY_ALIGNMENTS: log("Too Many Alignments, Skipping") continue log("Generating Consensus") consensus = g.generate_all_consensus(min_cov=0)[0] 
log("Consensus Length %d" % len(consensus[0])) log("%s Done\n\n" % readname) #log("Output dag info") #output_dag_info(g, "g.info") print ">"+readname+"_consensus" print consensus[0]
#!/usr/bin/env python #Strips off the range of an alignment for a reads in blasr m4 format #for use with cmd line filtering tools import sys from jbio.io.file import iterator_over_file if len(sys.argv) == 1: infh = sys.stdin else: infh = iterator_over_file(sys.argv[1]) for line in infh: arr = line.strip().split() slash_split = arr[0].split("/") if "_" in slash_split[-2]: print "\t".join(["/".join(slash_split[:-1]) , slash_split[-1]] + arr[1:]) else: print "\t".join(["/".join(slash_split), slash_split[-1]] + arr[1:])
# Extract gene records from a GFF and emit fasta headers annotated with
# selected attribute fields (usage: gene_fasta.py input.fa input.gff).
if not len(sys.argv) == 3:
    print "gene_fasta.py input.fa input.gff"
    sys.exit(1)
#FIELDS = ["ID","Alias","orf_classification","gene","Note"]
FIELDS = ["ID","Note"]
fa_fn,gff_fn = sys.argv[1:3]
#read fasta records into memory
def fasta_clean_getter(fasta_entry):
    # Map a fasta entry to (first header token, sequence).
    name = fasta_entry.name.split()[0]
    return (name, fasta_entry.seq)
fasta_records = dict(imap(fasta_clean_getter,fasta_iterator(iterator_over_file(fa_fn))))
gene_entries = ifilter(lambda x: x.feature == "gene", gff_iterator(iterator_over_file(gff_fn)))
for gene_record in gene_entries:
    # GFF attribute column "k1=v1;k2=v2;..." -> dict.
    attrs = dict(map(lambda x: x.split("="), gene_record.attribute.split(";")))
    header = ">" + attrs["Name"]
    fields = FIELDS
    # "Note" values appear URL-quoted in the GFF, so only they are unquoted.
    field_getter_func = lambda x : unquote(attrs.get(x,"None")) if x =="Note" else attrs.get(x,"None")
    field_getter = imap(field_getter_func, fields)
    header += " " + " ".join(imap(lambda fv: "[%s=%s]" % fv, izip(fields, field_getter)))
    # GFF coordinates are 1-based inclusive; convert to 0-based slice bounds.
    start, end = gene_record.start-1, gene_record.end-1
    seq = fasta_records[gene_record.seqname][start:end+1]
    print header
    # NOTE(review): seq is computed but never printed in the visible text --
    # likely a "print seq" was lost; confirm against the original script.
def blast6filter_main(cmdline_args=None):
    """Filter blast6 alignments and print the surviving records.

    Tasks: "q" (group by query, overlap-penalized LIS), "r_cons",
    "r_noover", "r_experimental" (group by subject, various filters).
    cmdline_args defaults to sys.argv; the input file must be pre-sorted
    by the grouping column.  Returns a usage string on bad arguments.
    """
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv
    if not len(cmdline_args) == 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"
    task, infile = cmdline_args[1:3]
    fileit = iterator_over_file(infile)
    alignment_getter = blast_record_iterator(fileit)

    if task.startswith("r"):
        # Reference-based tasks: group by subject name, use subject coords.
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
        score_func = aln_funcs.score_getter_matching_consensus_estimated
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))

        def remove_self(alns):
            # Drop alignments of a sequence against itself.
            a = list(alns)
            # BUG FIX: was len(alns) -- raises TypeError once list() has
            # consumed an iterator, and is redundant anyway.
            log("Remove Self: Working on %d" % len(a))
            filtered = filter(lambda y: not y.qname == y.sname, a)
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered

        # Default "r_cons" pipeline; lazily overridden by the tasks below.
        lis = compose(partial(aln_funcs.LIS, score_func),
                      aln_funcs.remove_contained,
                      greedy_repeat_filt,
                      remove_self)
        best = imap(lis, grouped_alns)
        if task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)
    else:
        # Query-based task: group by query name, use query coords.
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))
        lis = compose(
            partial(aln_funcs.LIS, aln_funcs.score_getter_penalize_overlap_estimated),
            aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
    # filter(print, ...) prints every record while draining the iterators.
    filter(print, imap(record_to_string, chain.from_iterable(best)))
#
# Per-query coverage summary for a query-sorted blast6 file: prints total
# query coverage, number of distinct contigs hit, and per-HSP details.
import sys
from itertools import groupby, imap
from operator import attrgetter, add
import functools

from jbio.io.blast import record_iterator as blast_record_iterator
from jbio.io.file import iterator_over_file
from jbio.alignment import alignment_functions

if not len(sys.argv) == 2:
    sys.exit("gene_coverage_stats.py input.blast6.q")

infile = sys.argv[1]
blast_records = blast_record_iterator(iterator_over_file(infile))
afuncs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))

for qname, hsps in groupby(blast_records, attrgetter("qname")):
    hsps = list(hsps)
    first_hsp = hsps[0]
    hsp_lens = map(afuncs.len_aln, hsps)
    hsp_pct_of_query = map(lambda l: l / float(first_hsp.qlen) * 100.0, hsp_lens)
    total_query_coverage = sum(hsp_lens) / float(first_hsp.qlen) * 100.0
    hsp_contig_names = map(attrgetter("sname"), hsps)
    number_different_contigs = len(set(sorted(hsp_contig_names)))
    # BUG FIX: the original text was cut off mid-statement at "p_items = [";
    # completed from the intact duplicate of this script later in the file.
    p_items = [qname, total_query_coverage, number_different_contigs,
               zip(hsp_pct_of_query, hsp_contig_names)]
    print("\t".join(imap(str, p_items)))
#!/usr/bin/env python import sys from jbio.io.file import iterator_over_file from jbio.fasta import record_iterator as fasta_iterator if not len(sys.argv) == 2: sys.exit("fasta_to_line.py in.fa") for record in fasta_iterator(iterator_over_file(sys.argv[1])): print "\t".join([record.name, record.seq])
#!/usr/bin/env python import sys from jbio.io.celera import unitig_layout_iterator from jbio.io.file import iterator_over_file if not len(sys.argv) == 2: sys.exit("getsingeltonfrags.py unitigs.layout") for unitig in unitig_layout_iterator(iterator_over_file(sys.argv[1])): if len(unitig.frags) == 1: print "frg iid %d isdeleted t" % unitig.frags[0].ident
#
# Per-query coverage summary for a query-sorted blast6 file: prints total
# query coverage, number of distinct contigs hit, and per-HSP details.
import sys
from itertools import groupby, imap
from operator import attrgetter, add
import functools

from jbio.io.blast import record_iterator as blast_record_iterator
from jbio.io.file import iterator_over_file
from jbio.alignment import alignment_functions

if len(sys.argv) != 2:
    sys.exit("gene_coverage_stats.py input.blast6.q")

infile = sys.argv[1]
blast_records = blast_record_iterator(iterator_over_file(infile))
afuncs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))

for qname, group in groupby(blast_records, attrgetter("qname")):
    group = list(group)
    # All HSPs of a query share the same qlen; take it from the first.
    qlen = float(group[0].qlen)
    lengths = [afuncs.len_aln(h) for h in group]
    pct_of_query = [l / qlen * 100.0 for l in lengths]
    total_coverage = sum(lengths) / qlen * 100.0
    contig_names = [h.sname for h in group]
    n_contigs = len(set(sorted(contig_names)))
    row = [qname, total_coverage, n_contigs, zip(pct_of_query, contig_names)]
    print("\t".join(str(item) for item in row))
sys.exit(1) #FIELDS = ["ID","Alias","orf_classification","gene","Note"] FIELDS = ["ID", "Note"] fa_fn, gff_fn = sys.argv[1:3] #read fasta records into memory def fasta_clean_getter(fasta_entry): name = fasta_entry.name.split()[0] return (name, fasta_entry.seq) fasta_records = dict( imap(fasta_clean_getter, fasta_iterator(iterator_over_file(fa_fn)))) gene_entries = ifilter(lambda x: x.feature == "gene", gff_iterator(iterator_over_file(gff_fn))) for gene_record in gene_entries: attrs = dict(map(lambda x: x.split("="), gene_record.attribute.split(";"))) header = ">" + attrs["Name"] fields = FIELDS field_getter_func = lambda x: unquote(attrs.get( x, "None")) if x == "Note" else attrs.get(x, "None") field_getter = imap(field_getter_func, fields) header += " " + " ".join( imap(lambda fv: "[%s=%s]" % fv, izip(fields, field_getter))) start, end = gene_record.start - 1, gene_record.end - 1
#!/usr/bin/env python import sys from itertools import imap from jbio.io.file import iterator_over_file from jbio.fasta import record_iterator as fasta_iterator ##Create Kmers if not len(sys.argv) == 3: sys.exit("Usage: kmer.py k-size in.fa\n") fn = sys.argv[2] ksize = int(sys.argv[1]) for record in fasta_iterator(iterator_over_file(fn)): seq = record.seq starts = range(len(seq)-ksize+1) kmers = imap(lambda start: seq[start:start+ksize], starts) for kmer in kmers: print kmer