Пример #1
0
def blast6filter_main(cmdline_args=None):
    """Filter grouped blast6 alignments and print the records that survive.

    ``cmdline_args`` is an argv-style sequence ``[prog, task, infile]``;
    ``sys.argv`` is used when it is empty or ``None``.

    A ``task`` starting with ``"r"`` groups records by ``sname`` and works on
    ``sstart``/``send``; any other task groups by ``qname`` and works on
    ``qstart``/``qend``.  Within the ``"r"`` family, ``"r_noover"`` and
    ``"r_experimental"`` select alternative pipelines.

    Returns the usage string when the argument count is wrong; otherwise
    prints each selected record (via ``record_to_string``) and returns
    ``None``.
    """
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv

    if len(cmdline_args) != 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"

    task, infile = cmdline_args[1:3]
    alignment_getter = blast_record_iterator(iterator_over_file(infile))

    # NOTE(review): compose() presumably applies its arguments right-to-left
    # (the last function listed runs first) -- confirm against its definition.
    if task.startswith("r"):
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))

        def remove_self(alns):
            """Return the alignments whose query and subject names differ."""
            a = list(alns)
            # Measure the materialised copy: ``alns`` itself may be a
            # one-shot iterator, which has no len().
            log("Remove Self: Working on %d" % len(a))
            filtered = [y for y in a if not y.qname == y.sname]
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered

        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)
        elif task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            # Each LIS result is reduced to its element at index 2.
            best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)
        else:
            score_func = aln_funcs.score_getter_matching_consensus_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained,
                          greedy_repeat_filt,
                          remove_self)
            best = imap(lis, grouped_alns)
    else:
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))
        lis = compose(partial(aln_funcs.LIS,
                              aln_funcs.score_getter_penalize_overlap_estimated),
                      aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)

    # An explicit loop: the printing is a side effect, so it should not
    # depend on filter() eagerly building a list.
    for record_line in imap(record_to_string, chain.from_iterable(best)):
        print(record_line)
Пример #2
0
def coverage_from_blast6():
    """Print the coverage array of every reference found in a blast6 file.

    The file name is read from ``sys.argv[1]``; the process exits with a
    usage message unless exactly one argument is given.

    Records are grouped by ``qname`` with ``groupby`` (so the input must
    already be ordered by query) and each group is reduced by
    ``best_scoring_non_overlapping`` using ``qstart``/``qend``/``bitscore``.
    The survivors are sorted by ``sname`` in memory.  For each reference,
    every element of the array returned by ``coverage_array_from_ranges`` is
    printed paired with a counter starting at 1.

    Zero-coverage and low-identity regions are also computed per reference,
    but nothing in the visible part of this function prints them.
    """
    if not len(sys.argv) == 2:
        sys.exit("coverage_from_blast6 in.blast6")

    raw_alignment_it = blast_record_iterator(iterator_over_file(sys.argv[1]))
    lno = partial(best_scoring_non_overlapping,
                  attrgetter("qstart"),
                  attrgetter("qend"),
                  attrgetter("bitscore"))

    # groupby yields (key, group) pairs; itemgetter(1) picks the group
    # before lno filters it.
    q_filt_alignment_it = chain.from_iterable(
        imap(compose(lno, itemgetter(1)),
             groupby(raw_alignment_it,
                     attrgetter("qname"))))

    #read all alignments into memory
    ref_sorted_alignments = sorted(q_filt_alignment_it,
                                   key=attrgetter("sname"))

    # blast coordinates are shifted down by one before being handed on.
    def blast_start_getter(a):
        return a.sstart - 1

    def blast_end_getter(a):
        return a.send - 1

    def region_to_line(ref_and_range):
        """Format ``(name, (start, end))`` as one tab-separated line."""
        ref_name, (range_start, range_end) = ref_and_range
        return "\t".join(map(str, [ref_name, range_start, range_end]))

    def shift_range_down(range_and_id):
        """Shift the two coordinates of ``(start, end, pctid)`` down by one."""
        range_start, range_end, identity = range_and_id
        return (range_start - 1, range_end - 1, identity)

    def add_pctid(r, running):
        """Fold one record into a ``(pctid sum, record count)`` pair."""
        o_pid, o_cnt = running
        return (r.pctid + o_pid, o_cnt + 1)

    for reference, alignments in groupby(ref_sorted_alignments,
                                         attrgetter("sname")):
        alignments = list(alignments)
        ref_len = alignments[0].slen

        cov_arr = coverage_array_from_ranges(alignments, ref_len,
                                             blast_start_getter,
                                             blast_end_getter)
        # Explicit loop: the printing is a side effect, so it should not
        # rely on filter() eagerly building a list.
        for position_and_cov in izip(count(1), cov_arr):
            print(position_and_cov)

        #mark the regions with 0 coverage
        zerocov = [1 if depth == 0 else 0 for depth in cov_arr]
        zerocov_regions = get_marked_ranges(zerocov)

        region_printer = compose(print, region_to_line)

        # NOTE(review): printing of the zero-coverage regions is disabled.
        #filter(region_printer, izip(repeat(reference), zerocov_regions))

        ##Get Low ID regions
        ranges_w_id = imap(compose(shift_range_down,
                                   attrgetter("sstart", "send", "pctid")),
                           alignments)

        # NOTE(review): presumably add_pctid is applied per covered position
        # starting from (0, 0), giving a (pctid sum, count) pair for each
        # position -- confirm against coverage_array_from_ranges.
        pct_arr = coverage_array_from_ranges(alignments, ref_len,
                                             blast_start_getter,
                                             blast_end_getter,
                                             add_pctid,
                                             (0, 0))
        # Flag entries whose pctid sum divided by count is below 95.0.
        lowid = [1 if cnt != 0 and c_pid / cnt < 95.0 else 0
                 for (c_pid, cnt) in pct_arr]

        lowid_regions = get_marked_ranges(lowid)
Пример #3
0
def correct_oxford(reads_fn=None, alignments_fn=None):
    '''Print a consensus sequence for every raw read that has alignments.

    reads_fn      -- FASTA file with the raw reads (all loaded into memory)
    alignments_fn -- blast6 alignments; they need to be sorted by the long
                     read name (second column) because they are grouped
                     with groupby on "sname"

    When either argument is missing, both are taken from sys.argv and the
    process exits with a usage message unless exactly two were given.
    Progress is reported through logger(sys.stderr).  Reads with no known
    sequence, or with more than TOO_MANY_ALIGNMENTS alignments, are skipped.
    Each consensus is printed as a FASTA record named
    "<readname>_consensus".
    '''

    log = logger(sys.stderr)

    # Fall back to the command line only when the caller gave no file names.
    if not reads_fn or not alignments_fn:
        if not len(sys.argv) == 3:
            sys.exit("correct.py raw_reads.fa alignments.blast6")
        (reads_fn, alignments_fn) = sys.argv[1:3]

    # The pipeline below runs for explicit arguments and argv alike.
    log("Reading raw reads into memory")
    #just put all reads in memory
    fastas = compose(fasta_iterator, iterator_over_file)(reads_fn)
    raw_reads = dict(map(attrgetter("name", "seq"), fastas))

    log("Reading raw reads DONE :)")

    #The alignments need to be sorted by the long read name (second column)
    alignment_it = line_record_iterator(Blast6SeqRecord, Blast6SeqTypes,
                                        iterator_over_file(alignments_fn))

    important_field_getter = attrgetter("qname", "sname", "qstart", "qend",
                                        "sstart", "send", "qseq", "sseq")

    for readname, alignments in groupby(alignment_it, attrgetter("sname")):
        log("Working on %s" % readname)

        raw_read_seq = raw_reads.get(readname)
        if not raw_read_seq:
            log("Can not find sequence for %s" % readname)
            continue

        log("Raw Read Length: %d" % len(raw_read_seq))
        g = AlnGraph(raw_read_seq)

        num_alignments = 0
        for alignment in alignments:
            (qname, sname, qstart, qend,
             sstart, send, qseq, sseq) = important_field_getter(alignment)

            #blast alignments are one based, convert to 0 based
            (qstart, qend) = (qstart - 1, qend - 1)
            (sstart, send) = (sstart - 1, send - 1)

            #reverse complement, must switch the alignment strings
            if send < sstart:
                qseq, sseq = reverse_complement(qseq), reverse_complement(sseq)
                send, sstart = sstart, send

            (qseq, sseq) = convert_mismatches(qseq, sseq)
            try:
                alignment_tuple = ((qstart, qend, qseq),
                                   (sstart, send, sseq), qname)
                g.add_alignment(alignment_tuple)
            except Exception as e:
                # Best effort: a rejected alignment is logged and not counted.
                log("Add Alignmented Error: %s" % e)
                continue
            if num_alignments > TOO_MANY_ALIGNMENTS:
                break

            num_alignments += 1

        log("Processed Alignments: %d" % num_alignments)
        if num_alignments > TOO_MANY_ALIGNMENTS:
            log("Too Many Alignments, Skipping")
            continue

        log("Generating Consensus")
        consensus = g.generate_all_consensus(min_cov=0)[0]
        log("Consensus Length %d" % len(consensus[0]))
        log("%s Done\n\n" % readname)

        #log("Output dag info")
        #output_dag_info(g, "g.info")

        print(">" + readname + "_consensus")
        print(consensus[0])
Пример #4
0
#!/usr/bin/env python

#Strips off the range of an alignment for reads in blasr m4 format
#for use with cmd line filtering tools

import sys

from jbio.io.file import iterator_over_file

# With no file argument the records are read from standard input.
if len(sys.argv) == 1:
    infh = sys.stdin
else:
    infh = iterator_over_file(sys.argv[1])


for line in infh:
    arr = line.strip().split()
    if not arr:
        # Blank line: there is no name column to rewrite.
        continue

    # The first column is a "/"-separated name.  Its last component is
    # always written out as a column of its own; it is also removed from
    # the name when the component before it contains "_".
    # NOTE: a name with no "/" still raises IndexError on slash_split[-2].
    slash_split = arr[0].split("/")
    if "_" in slash_split[-2]:
        name = "/".join(slash_split[:-1])
    else:
        name = arr[0]
    print("\t".join([name, slash_split[-1]] + arr[1:]))
Пример #5
0
# Exactly two positional arguments are required; otherwise show the usage
# line and exit with status 1.
if len(sys.argv) != 3:
    print("gene_fasta.py input.fa input.gff")
    sys.exit(1)

# Attribute keys copied into each header, in this order.
#FIELDS = ["ID","Alias","orf_classification","gene","Note"]
FIELDS = ["ID", "Note"]

# First argument is the FASTA file name, second the GFF file name.
[fa_fn, gff_fn] = sys.argv[1:3]

#read fasta records into memory
def fasta_clean_getter(fasta_entry):
    """Return ``(short_name, sequence)`` for one FASTA record.

    ``short_name`` is the first whitespace-delimited token of
    ``fasta_entry.name``; ``fasta_entry.seq`` is passed through unchanged.
    """
    short_name = fasta_entry.name.split(None, 1)[0]
    return short_name, fasta_entry.seq

# Every FASTA record is held in memory, keyed by its cleaned name.
fasta_records = dict(imap(fasta_clean_getter,
                          fasta_iterator(iterator_over_file(fa_fn))))

# Only GFF rows whose feature is "gene" are processed.
gene_entries = ifilter(lambda x: x.feature == "gene",
                       gff_iterator(iterator_over_file(gff_fn)))

for gene_record in gene_entries:
    # Split each "key=value" token on the first "=" only, so a value that
    # itself contains "=" survives.  Tokens without "=" (for example the
    # empty one left by a trailing ";") are skipped instead of breaking
    # dict().
    attrs = dict(token.split("=", 1)
                 for token in gene_record.attribute.split(";")
                 if "=" in token)
    header = ">" + attrs["Name"]
    fields = FIELDS

    # Append "[key=value]" for every configured field; a missing key shows
    # up as "None", and only the Note value goes through unquote().
    tagged_fields = []
    for field in fields:
        value = attrs.get(field, "None")
        if field == "Note":
            value = unquote(value)
        tagged_fields.append("[%s=%s]" % (field, value))
    header += " " + " ".join(tagged_fields)

    # Shift start/end down by one and slice with the end position included.
    start, end = gene_record.start-1, gene_record.end-1
    seq = fasta_records[gene_record.seqname][start:end+1]
    print(header)
Пример #6
0
def blast6filter_main(cmdline_args=None):
    """Filter grouped blast6 alignments and print the records that survive.

    ``cmdline_args`` is an argv-style sequence ``[prog, task, infile]``;
    ``sys.argv`` is used when it is empty or ``None``.

    A ``task`` starting with ``"r"`` groups records by ``sname`` and works on
    ``sstart``/``send``; any other task groups by ``qname`` and works on
    ``qstart``/``qend``.  Within the ``"r"`` family, ``"r_noover"`` and
    ``"r_experimental"`` select alternative pipelines.

    Returns the usage string when the argument count is wrong; otherwise
    prints each selected record (via ``record_to_string``) and returns
    ``None``.
    """
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv

    if len(cmdline_args) != 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"

    task, infile = cmdline_args[1:3]
    alignment_getter = blast_record_iterator(iterator_over_file(infile))

    # NOTE(review): compose() presumably applies its arguments right-to-left
    # (the last function listed runs first) -- confirm against its definition.
    if task.startswith("r"):
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"),
                                        attrgetter("send"))
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))

        def remove_self(alns):
            """Return the alignments whose query and subject names differ."""
            a = list(alns)
            # Measure the materialised copy: ``alns`` itself may be a
            # one-shot iterator, which has no len().
            log("Remove Self: Working on %d" % len(a))
            filtered = [y for y in a if not y.qname == y.sname]
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered

        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)
        elif task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            # Each LIS result is reduced to its element at index 2.
            best = imap(compose(partial(map, itemgetter(2)), lis),
                        grouped_alns)
        else:
            score_func = aln_funcs.score_getter_matching_consensus_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained,
                          greedy_repeat_filt,
                          remove_self)
            best = imap(lis, grouped_alns)

    else:
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"),
                                        attrgetter("qend"))
        lis = compose(
            partial(aln_funcs.LIS,
                    aln_funcs.score_getter_penalize_overlap_estimated),
            aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)

    # An explicit loop: the printing is a side effect, so it should not
    # depend on filter() eagerly building a list.
    for record_line in imap(record_to_string, chain.from_iterable(best)):
        print(record_line)
Пример #7
0
#
import sys
from itertools import groupby, imap
from operator import attrgetter, add
import functools

from jbio.io.blast import record_iterator as blast_record_iterator
from jbio.io.file import iterator_over_file
from jbio.alignment import alignment_functions

# The script takes exactly one argument: the blast6 file to summarise.
if len(sys.argv) != 2:
    sys.exit("gene_coverage_stats.py input.blast6.q")

infile = sys.argv[1]

# Records are streamed from the file as they are needed.
blast_records = blast_record_iterator(iterator_over_file(infile))

# Alignment helpers keyed on the query-side start and end attributes.
afuncs = alignment_functions(attrgetter("qstart"),
                             attrgetter("qend"))

for qname, hsps in groupby(blast_records, attrgetter("qname")):
    hsps = list(hsps)
    first_hsp = hsps[0]

    hsp_lens = map(afuncs.len_aln, hsps)
    hsp_pct_of_query = map(lambda l: l / float(first_hsp.qlen) * 100.0,
                           hsp_lens)
    total_query_coverage = sum(hsp_lens) / float(first_hsp.qlen) * 100.0
    hsp_contig_names = map(attrgetter("sname"), hsps)
    number_different_contigs = len(set(sorted(hsp_contig_names)))

    p_items = [
Пример #8
0
#!/usr/bin/env python

import sys

from jbio.io.file import iterator_over_file
from jbio.fasta import record_iterator as fasta_iterator

# Exactly one argument (the FASTA file) is expected.
if len(sys.argv) != 2:
    sys.exit("fasta_to_line.py in.fa")


# Emit one line per record: the name, a tab, then the sequence.
for record in fasta_iterator(iterator_over_file(sys.argv[1])):
    columns = (record.name, record.seq)
    print("\t".join(columns))
Пример #9
0
#!/usr/bin/env python

import sys

from jbio.io.celera import unitig_layout_iterator
from jbio.io.file import iterator_over_file

# Exactly one argument (the unitig layout file) is expected.
if len(sys.argv) != 2:
    sys.exit("getsingeltonfrags.py unitigs.layout")

# Print one line for every unitig that holds exactly one fragment.
for unitig in unitig_layout_iterator(iterator_over_file(sys.argv[1])):
    if len(unitig.frags) != 1:
        continue
    print("frg iid %d isdeleted t" % unitig.frags[0].ident)






Пример #10
0
#
import sys
from itertools import groupby, imap
from operator import attrgetter, add
import functools

from jbio.io.blast import record_iterator as blast_record_iterator
from jbio.io.file import iterator_over_file
from jbio.alignment import alignment_functions

# The script takes exactly one argument: the blast6 file to summarise.
if len(sys.argv) != 2:
    sys.exit("gene_coverage_stats.py input.blast6.q")

infile = sys.argv[1]

# Records are streamed from the file as they are needed.
blast_records = blast_record_iterator(iterator_over_file(infile))

# Alignment helpers keyed on the query-side start and end attributes.
afuncs = alignment_functions(attrgetter("qstart"),
                             attrgetter("qend"))

# One output row per query.  groupby only merges adjacent records, so the
# input has to arrive ordered by qname.
for qname, hsps in groupby(blast_records, attrgetter("qname")):
    hsps = list(hsps)
    first_hsp = hsps[0]
    # The query length is taken from the group's first HSP; convert it once.
    query_len = float(first_hsp.qlen)

    hsp_lens = [afuncs.len_aln(hsp) for hsp in hsps]
    hsp_pct_of_query = [l / query_len * 100.0 for l in hsp_lens]
    # Plain sum of the HSP lengths as a percentage of the query length.
    total_query_coverage = sum(hsp_lens) / query_len * 100.0
    hsp_contig_names = [hsp.sname for hsp in hsps]
    # set() alone yields the distinct names; sorting first adds nothing.
    number_different_contigs = len(set(hsp_contig_names))

    # Columns: query name, total coverage percentage, distinct contig
    # count, then the (percent of query, contig name) pair of each HSP.
    p_items = [qname, total_query_coverage, number_different_contigs,
               zip(hsp_pct_of_query, hsp_contig_names)]
    print("\t".join(imap(str, p_items)))
Пример #11
0
    sys.exit(1)

#FIELDS = ["ID","Alias","orf_classification","gene","Note"]
# Attribute keys copied into each header, in this order.
FIELDS = ["ID", "Note"]

# First argument is the FASTA file name, second the GFF file name.
[fa_fn, gff_fn] = sys.argv[1:3]


#read fasta records into memory
def fasta_clean_getter(fasta_entry):
    """Pair the first word of a FASTA record's name with its sequence.

    Returns ``(first_word, fasta_entry.seq)``, where ``first_word`` is
    ``fasta_entry.name`` cut at the first run of whitespace.
    """
    words = fasta_entry.name.split()
    first_word = words[0]
    return (first_word, fasta_entry.seq)


# Every FASTA record is held in memory, keyed by its cleaned name.
fasta_records = dict(
    imap(fasta_clean_getter, fasta_iterator(iterator_over_file(fa_fn))))

# Only GFF rows whose feature is "gene" are processed.
gene_entries = ifilter(lambda x: x.feature == "gene",
                       gff_iterator(iterator_over_file(gff_fn)))

for gene_record in gene_entries:
    # Split each "key=value" token on the first "=" only, so a value that
    # itself contains "=" survives.  Tokens without "=" (for example the
    # empty one left by a trailing ";") are skipped instead of breaking
    # dict().
    attrs = dict(token.split("=", 1)
                 for token in gene_record.attribute.split(";")
                 if "=" in token)
    header = ">" + attrs["Name"]
    fields = FIELDS

    # Append "[key=value]" for every configured field; a missing key shows
    # up as "None", and only the Note value goes through unquote().
    tagged_fields = []
    for field in fields:
        value = attrs.get(field, "None")
        if field == "Note":
            value = unquote(value)
        tagged_fields.append("[%s=%s]" % (field, value))
    header += " " + " ".join(tagged_fields)

    # Shift both coordinates down by one.
    start, end = gene_record.start - 1, gene_record.end - 1
Пример #12
0
#!/usr/bin/env python

import sys

from itertools import imap

from jbio.io.file import iterator_over_file
from jbio.fasta import record_iterator as fasta_iterator

##Create Kmers

# Two arguments are required: the k-mer size, then the FASTA file.
if len(sys.argv) != 3:
    sys.exit("Usage: kmer.py k-size in.fa\n")

fn = sys.argv[2]
ksize = int(sys.argv[1])

# Print every window of ksize characters, one per line, sliding one
# position at a time along each record's sequence.
for record in fasta_iterator(iterator_over_file(fn)):
    seq = record.seq
    window_count = len(seq) - ksize + 1
    for start in range(window_count):
        print(seq[start:start + ksize])