Example #1
    def remove_contained(alignments):
        '''Removes alignments contained within other alignments
        @param alignments list of alignments
        @return list of alignments with contained alignments removed
        '''
        logf = compose(log,partial(add, "remove_contained: "))

        alignments = make_list(alignments)
        
        logf("Starting Alignments: %d" % len(alignments))
        
        is_contained = lambda a,b : sg(b) >= sg(a) and eg(b) <= eg(a)
        
        logf("Sorting")
        end_sorted = sorted(alignments, key=eg, reverse=True)
        alignments = sorted(end_sorted, key=sg)

        logf("Searching")
        #remove contained
        contained = [False] * len(alignments)
        for i in xrange(len(alignments)):
            for j in xrange(i):
                if is_contained(alignments[j],alignments[i]):
                    contained[i] = True
                    break
                
        filtered = map(itemgetter(1), 
                       ifilter(compose(lambda x : not x,itemgetter(0)), 
                               izip(contained,alignments)))

        logf("Filtered Alignments: %d" % len(filtered))
        
        return filtered
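
remove_contained leans on helpers defined elsewhere in the module (sg/eg return an alignment's start/end coordinate; compose, make_list and log are utilities). The following is a minimal, self-contained sketch of the same containment filter on plain (start, end) tuples; the name remove_contained_sketch and the tuple representation are stand-ins for illustration only.

def remove_contained_sketch(alignments):
    '''Drop any (start, end) interval that lies completely inside another.'''
    # sort by start ascending, ties broken by end descending, so every
    # potential container comes before the intervals it contains
    alns = sorted(alignments, key=lambda a: (a[0], -a[1]))
    kept = []
    for aln in alns:
        # contained if some already-kept interval spans it completely
        if not any(k[0] <= aln[0] and aln[1] <= k[1] for k in kept):
            kept.append(aln)
    return kept

print(remove_contained_sketch([(1, 100), (10, 20), (50, 150)]))
# -> [(1, 100), (50, 150)]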
Example #2
    def greedy_repeat_filter(alignment_iterable, final_sort_key=None):
        '''Takes a list of alignments; if two alignments share the same
        start or end position, keep the longest.

        final_sort_key gives the final value to sort by when breaking ties.
        Larger values are better.
        '''
        logf = compose(log, partial(add,"greedy_repeat_filter: "))

        s_sorted = make_list(alignment_iterable)
        logf("Staring Alignments: %d" % len(s_sorted))

        if final_sort_key:
            s_sorted = sorted(s_sorted, key=final_sort_key, reverse=True)

        s_sorted = sorted(s_sorted, key=eg, reverse=True)
        s_sorted = sorted(s_sorted, key=sg)
        
        filtered_alignments = imap(itemgetter(0),group(sg, s_sorted))

        e_sorted = sorted(filtered_alignments, key=eg, reverse=True)
        
        filtered = map(itemgetter(0), group(eg, e_sorted))
        logf("Filtered Alignments %d " % len(filtered) )

        return filtered
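
greedy_repeat_filter depends on the group helper shown in Example #3 and on the same sg/eg getters. Below is a hedged, self-contained sketch of the core idea on (start, end) tuples; the final_sort_key tie-breaker is omitted and all names are hypothetical.

from itertools import groupby

def greedy_repeat_filter_sketch(alignments):
    '''Of alignments sharing a start or an end coordinate, keep the longest.'''
    # per start coordinate, keep the alignment with the largest end
    by_start = sorted(alignments, key=lambda a: (a[0], -a[1]))
    firsts = [next(g) for _, g in groupby(by_start, key=lambda a: a[0])]
    # per end coordinate, keep the alignment with the smallest start
    by_end = sorted(firsts, key=lambda a: (a[1], a[0]))
    return [next(g) for _, g in groupby(by_end, key=lambda a: a[1])]

print(greedy_repeat_filter_sketch([(1, 50), (1, 80), (20, 80)]))
# -> [(1, 80)]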
Example #3
def group(key_func, alignment_iterable):
    '''
    Groups alignments by key_func, only returns the groups
    as an iterable
    '''
    return imap(compose(list,itemgetter(1)), 
                groupby(alignment_iterable, 
                        key=key_func))
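
One caveat worth remembering: itertools.groupby only groups consecutive items, so the iterable passed to group must already be sorted by the same key. A small usage sketch without the compose/imap wrappers (the records here are made-up tuples):

from itertools import groupby
from operator import itemgetter

# stand-in records: (query_name, start) pairs, already sorted by query_name
records = [("read1", 5), ("read1", 40), ("read2", 7)]

groups = [list(g) for _, g in groupby(records, key=itemgetter(0))]
print(groups)
# -> [[('read1', 5), ('read1', 40)], [('read2', 7)]]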
Example #4
def disabled_test_LIS(config):
    
    test_data_path = config.get("test_data_path")
    alignment_file = os.path.join(test_data_path,
                                  "channel_286_read_45_1406145606_2D.blast6.gz")
    blast_alignment_getter = compose(blast_record_iterator, ioffe)
    
    aln_funcs = alignment_functions(attrgetter("sstart"),
                                   attrgetter("send"))
    return lambda : aln_funcs.LIS(aln_funcs.score_getter_matching_consensus_estimated,
                                  blast_alignment_getter(alignment_file))
Example #5
def coverage_from_blast6():

    if not len(sys.argv) == 2:
        sys.exit("coverage_from_blast6 in.blast6")

    raw_alignment_it = blast_record_iterator(iterator_over_file(sys.argv[1]))
    lno = partial(best_scoring_non_overlapping,
                  attrgetter("qstart"), 
                  attrgetter("qend"),
                  attrgetter("bitscore"))

    q_filt_alignment_it = chain.from_iterable(
        imap(compose(lno, itemgetter(1)), 
             groupby(raw_alignment_it, 
                     attrgetter("qname"))))
    
    #read all alignments into memory
    ref_sorted_alignments = sorted(q_filt_alignment_it, 
                                   key=attrgetter("sname"))
    
    for reference,alignments in groupby(ref_sorted_alignments,
                                        attrgetter("sname")):
        alignments = list(alignments)
        ref_len = alignments[0].slen
        
        blast_start_getter = lambda a: a.sstart-1
        blast_end_getter = lambda a: a.send-1
        cov_arr = coverage_array_from_ranges(alignments, ref_len,
                                             blast_start_getter,
                                             blast_end_getter)
        filter(print, izip(count(1),cov_arr))
        #mark the regions with 0 coverage
        zerocov = map(lambda x: 1 if x==0 else 0, cov_arr)
        zerocov_regions = get_marked_ranges(zerocov)
        
        region_printer = compose(print,lambda (x,(y,z)) : "\t".join(map(str,[x,y,z])))
        
        #filter(region_printer, izip(repeat(reference), zerocov_regions))
        
        ##Get Low ID regions
        ranges_w_id = imap(compose(lambda (x,y,i) : (x-1,y-1,i),
                                   attrgetter("sstart","send","pctid")), alignments)

        pct_arr = coverage_array_from_ranges(alignments, ref_len,
                                             blast_start_getter,
                                             blast_end_getter,
                                             lambda r, (o_pid,o_cnt): (r.pctid+o_pid, o_cnt+1), 
                                             (0,0))
        lowid = map(lambda (c_pid,cnt): 1 if cnt != 0 and c_pid/cnt < 95.0 else 0,
                    pct_arr)
        
        lowid_regions = get_marked_ranges(lowid)
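
coverage_array_from_ranges and get_marked_ranges are module helpers whose implementations are not shown here. As a rough sketch of what the count-only variant could look like, assuming 0-based, end-inclusive ranges as produced by the blast_start_getter/blast_end_getter lambdas above (the accumulator/initial-value form used for percent identity is omitted):

def coverage_array_sketch(ranges, ref_len, start_getter, end_getter):
    '''Count, for every reference position, how many ranges cover it.'''
    cov = [0] * ref_len
    for r in ranges:
        for pos in range(start_getter(r), end_getter(r) + 1):
            cov[pos] += 1
    return cov

print(coverage_array_sketch([(0, 3), (2, 4)], 6,
                            lambda r: r[0], lambda r: r[1]))
# -> [1, 1, 2, 2, 1, 0]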
Example #6
def blast6filter_main(cmdline_args = None):
    
    if not cmdline_args:
        import sys
        cmdline_args = sys.argv

    if not len(cmdline_args) == 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"
    
    task,infile = cmdline_args[1:3]

    fileit = iterator_over_file(infile)

    alignment_getter = blast_record_iterator(fileit)

    #
    #grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
    #aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
    #filter(compose(print,record_to_string), chain.from_iterable(imap(partial(aln_funcs.greedy_repeat_filter,final_sort_key=attrgetter("pctid")), grouped_alns)))
    #
    #sys.exit(1)
    
    if task.startswith("r"):
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
        score_func = aln_funcs.score_getter_matching_consensus_estimated
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter, final_sort_key=attrgetter("pctid"))
        def remove_self(alns):
            a = list(alns)
            log("Remove Self: Working on %d" % len(alns))
            filtered = filter(lambda y: not y.qname == y.sname, a)
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered
            
        lis = compose(partial(aln_funcs.LIS, score_func), aln_funcs.remove_contained, greedy_repeat_filt, remove_self)
        best = imap(lis, grouped_alns)
        if task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func), aln_funcs.remove_contained)
            best = imap(compose(partial(map,itemgetter(2)),lis), grouped_alns)        
        if task =="r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)

    else:
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"), attrgetter("qend"))
        lis = compose(partial(aln_funcs.LIS,aln_funcs.score_getter_penalize_overlap_estimated), aln_funcs.remove_contained)
        best = imap(compose(partial(map,itemgetter(2)),lis), grouped_alns)

    filter(print,imap(record_to_string,
                      chain.from_iterable(best)))
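
compose is another helper that is not defined in this snippet. Judging by pipelines such as compose(partial(aln_funcs.LIS, score_func), aln_funcs.remove_contained, greedy_repeat_filt, remove_self), it appears to apply its arguments right to left (remove_self first, the LIS step last). A plausible minimal definition under that assumption:

from functools import reduce

def compose(*funcs):
    '''compose(f, g, h)(x) == f(g(h(x))) -- the rightmost function runs first.'''
    return reduce(lambda f, g: lambda x: f(g(x)), funcs)

print(compose(str.upper, str.strip)("  acgt "))
# -> ACGT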
Example #7
    def LIS(score_getter, alignments):
        '''Score getter takes two alignments and returns a score,
           Should probably choose from the above scoring functions'''
        logf = compose(log, partial(add, "LIS:"))
        alignments = make_list(alignments)

        logf("Starting Alignments: %d" % len(alignments))
        if len(alignments) == 0:
            return []
        logf("Sorting")
        end_sorted = sorted(alignments, key=eg, reverse=True)
        alns = sorted(end_sorted, key=sg)

        logf("Starting DP")

        #initialize lis array
        lis = map(LIS_t._make, izip(imap(partial(score_getter, None), alns),
                                    repeat(-1)))
        #DP
        for i in xrange(len(alns)):
            for j in xrange(i):
                ##Score getter needs to know about how it's being used (Oh well)
                score = lis[j].score + score_getter(alns[j], alns[i])
                if score > lis[i].score:
                    lis[i] = LIS_t(score, j)
                
        #traceback
        max_pos, _ = max(enumerate(lis), key=itemgetter(1))

        tb = [False] * len(alns)
        cur_max = max_pos
        while True:
            tb[cur_max] = True
            cur_max = lis[cur_max].prev
            if cur_max == -1:
                break
        filtered = filter(itemgetter(0), izip(tb,lis,alns))
        logf("Filtered Alignments: %d" % len(filtered))

        return filtered
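
LIS_t appears to be a namedtuple with score and prev fields, used as the DP table plus traceback pointers, and score_getter(prev, aln) scores appending aln after prev (prev is None for a chain start). The sketch below keeps the same DP/traceback shape but swaps the pluggable score_getter for a simpler rule (an alignment's own score, chained only onto non-overlapping predecessors); all names and the tuple representation are hypothetical.

from collections import namedtuple

LIS_t = namedtuple("LIS_t", ["score", "prev"])

def best_chain_sketch(alns):
    '''Highest-scoring chain of non-overlapping (start, end, score) tuples,
    assumed to be sorted by start.'''
    if not alns:
        return []
    lis = [LIS_t(a[2], -1) for a in alns]
    for i in range(len(alns)):
        for j in range(i):
            # only chain onto alignments that end before this one starts
            if alns[j][1] < alns[i][0]:
                score = lis[j].score + alns[i][2]
                if score > lis[i].score:
                    lis[i] = LIS_t(score, j)
    # trace back from the best-scoring cell
    best = max(range(len(lis)), key=lambda i: lis[i].score)
    chain, cur = [], best
    while cur != -1:
        chain.append(alns[cur])
        cur = lis[cur].prev
    return list(reversed(chain))

print(best_chain_sketch([(0, 10, 5), (5, 20, 8), (12, 30, 4)]))
# -> [(0, 10, 5), (12, 30, 4)]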
Example #8
def correct_oxford(reads_fn=None, alignments_fn=None):
    '''Corrects oxford reads'''
    
    log = logger(sys.stderr)
    
    if not reads_fn or not alignments_fn:
        if not len(sys.argv) == 3:
            sys.exit("correct.py raw_reads.fa alignments.blast6")
        (reads_fn,alignments_fn) = sys.argv[1:3]

        log("Reading raw reads into memory")
        #just put all reads in memory
        fastas = compose(fasta_iterator, iterator_over_file)(reads_fn)
        raw_reads = dict(map(attrgetter("name","seq"), fastas))

        log("Reading raw reads DONE :)")

        #The alignments need to be sorted by the long read name (second column)
        alignment_it = line_record_iterator(Blast6SeqRecord, Blast6SeqTypes,
                                            iterator_over_file(alignments_fn))
        
        important_field_getter = attrgetter("qname","sname","qstart","qend",
                                            "sstart","send", "qseq", "sseq")
                                            
        for readname, alignments in groupby(alignment_it, attrgetter("sname")):
            log("Working on %s" % readname)
            
            raw_read_seq = raw_reads.get(readname)
            if not raw_read_seq:
                log("Can not find sequence for %s" % readname)
                continue

            log("Raw Read Length: %d" % len(raw_read_seq))    
            g = AlnGraph(raw_read_seq)

            alignments = imap(important_field_getter, alignments)
            num_alignments = 0
            for qname,sname,qstart,qend,sstart,send,qseq,sseq in alignments:

                #blast alignments are one based, convert to 0 based
                (qstart, qend) = (qstart-1, qend-1)
                (sstart, send) = (sstart-1, send-1)

                #reverse complement, must switch the alignment strings
                if send < sstart:
                    (qseq, sseq) = tuple(map(reverse_complement, [qseq,sseq]))
                    send, sstart = sstart,send
                    
                (qseq, sseq) = convert_mismatches(qseq,sseq)
                try:
                    alignment_tuple =((qstart, qend, qseq),
                                      (sstart, send, sseq), qname) 
                    g.add_alignment( alignment_tuple)
                except Exception as e:
                    log("Add Alignmented Error: %s" % e)
                    continue
                if num_alignments > TOO_MANY_ALIGNMENTS:
                    break
                
                num_alignments += 1

            log("Processed Alignments: %d" % num_alignments)
            if num_alignments > TOO_MANY_ALIGNMENTS:
                log("Too Many Alignments, Skipping")
                continue
            
            log("Generating Consensus")
            consensus = g.generate_all_consensus(min_cov=0)[0]
            log("Consensus Length %d" % len(consensus[0]))
            log("%s Done\n\n" % readname)

            #log("Output dag info")
            #output_dag_info(g, "g.info")

            print ">"+readname+"_consensus"
            print consensus[0]
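
The loop above assumes that a reverse-strand BLAST hit reports send < sstart and that reverse_complement and convert_mismatches are helpers defined elsewhere. Below is a small sketch of just the coordinate and strand normalization step, with a hypothetical reverse_complement stand-in:

def reverse_complement(seq):
    # hypothetical stand-in for the module's helper
    comp = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N", "-": "-"}
    return "".join(comp[b] for b in reversed(seq.upper()))

def normalize_hit(sstart, send, qseq, sseq):
    '''Convert 1-based inclusive subject coordinates to 0-based and put
    reverse-strand hits back on the forward strand.'''
    sstart, send = sstart - 1, send - 1
    if send < sstart:
        qseq, sseq = reverse_complement(qseq), reverse_complement(sseq)
        sstart, send = send, sstart
    return sstart, send, qseq, sseq

print(normalize_hit(10, 1, "ACGT", "ACGA"))
# -> (0, 9, 'ACGT', 'TCGT')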
Example #9
def blast6filter_main(cmdline_args=None):

    if not cmdline_args:
        import sys
        cmdline_args = sys.argv

    if not len(cmdline_args) == 3:
        return "blast6filter q/r_cons/r_noover input.blast6 -- Make sure input is sorted by q/r first"

    task, infile = cmdline_args[1:3]

    fileit = iterator_over_file(infile)

    alignment_getter = blast_record_iterator(fileit)

    #
    #grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
    #aln_funcs = alignment_functions(attrgetter("sstart"), attrgetter("send"))
    #filter(compose(print,record_to_string), chain.from_iterable(imap(partial(aln_funcs.greedy_repeat_filter,final_sort_key=attrgetter("pctid")), grouped_alns)))
    #
    #sys.exit(1)

    if task.startswith("r"):
        grouped_alns = alignment_grouper(attrgetter("sname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("sstart"),
                                        attrgetter("send"))
        score_func = aln_funcs.score_getter_matching_consensus_estimated
        greedy_repeat_filt = partial(aln_funcs.greedy_repeat_filter,
                                     final_sort_key=attrgetter("pctid"))

        def remove_self(alns):
            a = list(alns)
            log("Remove Self: Working on %d" % len(alns))
            filtered = filter(lambda y: not y.qname == y.sname, a)
            log("Remove Self: Filtered alignments: %d" % len(filtered))
            return filtered

        lis = compose(partial(aln_funcs.LIS,
                              score_func), aln_funcs.remove_contained,
                      greedy_repeat_filt, remove_self)
        best = imap(lis, grouped_alns)
        if task == "r_noover":
            score_func = aln_funcs.score_getter_penalize_overlap_estimated
            lis = compose(partial(aln_funcs.LIS, score_func),
                          aln_funcs.remove_contained)
            best = imap(compose(partial(map, itemgetter(2)), lis),
                        grouped_alns)
        if task == "r_experimental":
            lis = compose(greedy_repeat_filt, remove_self)
            best = imap(lis, grouped_alns)

    else:
        grouped_alns = alignment_grouper(attrgetter("qname"), alignment_getter)
        aln_funcs = alignment_functions(attrgetter("qstart"),
                                        attrgetter("qend"))
        lis = compose(
            partial(aln_funcs.LIS,
                    aln_funcs.score_getter_penalize_overlap_estimated),
            aln_funcs.remove_contained)
        best = imap(compose(partial(map, itemgetter(2)), lis), grouped_alns)

    filter(print, imap(record_to_string, chain.from_iterable(best)))
Example #10
def record_to_string(record, delim="\t"):
    fields = record._fields
    val_getter = compose(str, partial(getattr, record))
    return delim.join(imap(val_getter, fields))
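
record_to_string works on any namedtuple-like record, i.e. anything exposing _fields and matching attributes. A quick usage sketch with a made-up record type and without the compose/imap helpers:

from collections import namedtuple

Hit = namedtuple("Hit", ["qname", "sname", "bitscore"])  # hypothetical record

def record_to_string(record, delim="\t"):
    # plain equivalent of the version above
    return delim.join(str(getattr(record, f)) for f in record._fields)

print(record_to_string(Hit("read1", "ref1", 42.0)))
# -> "read1\tref1\t42.0" (tab-separated)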
Example #11
def line_record_iterator(record, types, iterable):
    '''Converts an iterable (of lines) to records with given type'''
    record_maker = compose(record._make, 
                           partial(zipmap,types) ,
                           getattr(str, "split"))
    return imap(record_maker, iterable)
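
zipmap is not shown in this snippet; from the way it is used, it presumably applies each converter in types to the corresponding whitespace-split field. A self-contained sketch under that assumption, with a made-up record type:

from collections import namedtuple

Rec = namedtuple("Rec", ["qname", "qstart", "pctid"])   # hypothetical record
RecTypes = (str, int, float)                            # one converter per field

def line_record_iterator(record, types, iterable):
    # assumed behaviour: split each line, convert each field, build the record
    for line in iterable:
        yield record._make(t(field) for t, field in zip(types, line.split()))

for rec in line_record_iterator(Rec, RecTypes, ["read1 5 98.7", "read2 11 92.3"]):
    print(rec)
# -> Rec(qname='read1', qstart=5, pctid=98.7)
#    Rec(qname='read2', qstart=11, pctid=92.3)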