# Module-level imports assumed by the functions below (Python 2); the helper
# functions they call (get_pair, get_cmd, get_masked_fastas, parse_blast,
# group_cds, protein_parse, ...) are defined elsewhere in this codebase.
import sys
import collections
import commands
from multiprocessing import Pool


def make_pair_maps(pair_file, fmt, qbed, sbed):
    """make a flat list of (query, subject) and (subject, query) name pairs"""
    qmap_tuple = []
    for pair in get_pair(pair_file, fmt, qbed, sbed):
        if pair is None:
            break
        (sname, qname) = pair
        qmap_tuple.append((qname, sname))
        qmap_tuple.append((sname, qname))
    return qmap_tuple
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt,
         blast_path, mask='F', ncpu=8):
    """main runner for finding cnss"""
    pool = Pool(ncpu)
    bl2seq = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)

    # need this for parallelization stuff.
    def get_pair_gen():
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    while any(pairs):
        # pull up to ncpu pairs at a time; this helps in parallelizing.
        pairs = [get_pair_gen() for i in range(ncpu)]

        spad_map = [spad] * len(pairs)
        qpad_map = [qpad] * len(pairs)
        sfastas_map = [sfastas] * len(pairs)
        qfastas_map = [qfastas] * len(pairs)
        bl2seq_map = [bl2seq] * len(pairs)

        cmds = [c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                               qfastas_map, sfastas_map, qpad_map, spad_map) if c]
        results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            print >>sys.stderr, "%s %s" % (qfeat['accn'], sfeat['accn']),
            orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad,
                               spad, unmasked_fasta)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue
            qname, sname = qfeat['accn'], sfeat['accn']
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname,
                sfeat['seqid'], sname,
                ",".join(map(lambda l: ",".join(map(str, l)), cnss)))

    return None
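# Hedged usage sketch (not from the original pipeline): one way the main()
# runner above can be invoked.  The file names, pad sizes, and blast path are
# hypothetical placeholders; qbed/sbed are assumed to be parsed BED annotation
# objects exposing .filename and dict-like feature access, as main() expects.
def _example_find_cns_run(qbed, sbed):
    main(qbed, sbed, "query_subject.pairs",
         qpad=12000, spad=12000,
         unmasked_fasta="subject.unmasked.fasta",
         pair_fmt="pair",
         blast_path="/usr/bin/bl2seq",
         mask="F", ncpu=8)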
def make_pair_maps(pair_file, fmt, qbed, sbed):
    """make dicts of q => s and s => q"""
    qmap = collections.defaultdict(list)  # key is query, value is a list of subject hits
    smap = collections.defaultdict(list)
    print >>sys.stderr, "pair file:", pair_file
    for pair in get_pair(pair_file, fmt, qbed, sbed):
        if pair is None:
            break
        (qname, sname) = pair
        qmap[qname].append(sname)
        smap[sname].append(qname)
    return qmap, smap
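# Hedged usage sketch (not from the original code): the two defaultdicts
# returned by the dict-returning make_pair_maps() above can be queried in
# either direction, and unseen accessions simply yield an empty list.  The
# pair file name and accession below are hypothetical placeholders.
def _example_pair_map_lookup(qbed, sbed):
    qmap, smap = make_pair_maps("query_subject.pairs", "pair", qbed, sbed)
    subject_hits = qmap["example_query_accn"]   # all subjects paired with this query
    query_hits = smap[subject_hits[0]] if subject_hits else []
    return subject_hits, query_hits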
def get_homeolog(qfeat, pairsfile, sbed, qbed):
    for region, sregion in get_pair(pairsfile, 'pck', sbed, qbed):
        if region['sfeat'] == qfeat[3]:
            return region['ORG2_qfeat']
def main(qbed, sbed, missed_pairs, ncpu):
    """run tblastx on missed pairs..."""
    #print >>sys.stderr, ncpu
    ncpu = int(ncpu)
    pool = Pool(ncpu)
    pairs_file = get_pairs_file(missed_pairs)
    print >>sys.stdout, "#hit,ref_gene,blastn_introns,blastx_hits,blastx_gene_hits,blastx_frame,blastn_gaps,blastx_gaps,orf_prediction,orf_blastx,frame_shift"
    blastn = "/Users/gturco/blast-2.2.25/bin/bl2seq -p blastn -G 5 -E 2 -W 7 -q -2 -e 0.001 -D 1 -i {0} -j {1} -I {2},{3} -J {4},{5} | grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR' "
    qfastas = split_fastas(qbed)        # mask coding sequence
    sfastas = get_mask_non_cds(sbed)    # mask non-coding sequence

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file, "pair", qbed, sbed)

    def get_pair_gen():
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    while any(pairs):
        pairs = [get_pair_gen() for i in range(ncpu)]

        def get_blastn_cmd(pair):
            """creates the dictionary values used to fill in blast cmd"""
            if pair is None:
                return None
            hit, gene = pair
            # double check fasta to make sure i dont need to add or remove one
            hstart, hstop = abs(3000 - hit['start']), (3000 + hit['end'])
            # checks the entire gene...
            gstart, gstop = gene['start'], gene['end']
            query_file = qfastas[hit['seqid']]
            subject_file = sfastas[gene['seqid']]
            blastn_cmd = blastn.format(query_file, subject_file, hstart, hstop, gstart, gstop)
            #print >>sys.stderr, '{0},{1},{2}'.format(hit['accn'], gene['accn'], blastn_cmd)
            return blastn_cmd, hit, gene

        cmds = [c for c in map(get_blastn_cmd, [l for l in pairs if l]) if c]
        #print >>sys.stderr, "results: {0}".format(cmds[0][0])
        results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))

        for res, (cmd, hit, gene) in zip(results, cmds):
            print >>sys.stderr, "CMD: {0},{1}".format(gene['accn'], hit['accn'])
            d, no_res = group_cds(res, gene)
            gap_list = []
            intron_list = []
            hit['locs'] = []
            if no_res:
                continue
            for group_key in d.keys():
                exon_hits = d[group_key]
                non_crossing = remove_crossing_hits(exon_hits, hit, gene)
                if len(non_crossing) > 1:
                    gaps, hstart, hend = bites(non_crossing)
                    gap_list.append(sum(gaps))
                elif len(non_crossing) == 1:
                    #print >>sys.stderr, non_crossing
                    [(hstart, hend, sstart, send, evalue)] = non_crossing
                if len(non_crossing) >= 1:
                    intron_list.append(group_key[0])
                    hit['locs'].append((hstart, hend))
            hit['locs'].sort()
            #print >>sys.stderr, "hit_loc : {0}".format(hit['locs'])
            if len(hit['locs']) < 1:
                continue
            orf_prediction = find_orf(qbed, hit)
            introns = "{0}/{1}".format(len(intron_list), len(gene['locs']))
            gap_totaln = sum(gap_list)
            # new hit locs made from blastn res
            hit_percent, gene_percent, frame_percent, frame_shift, best_frame, gap_total, orf_start = protein_parse(hit, gene, sbed, qbed)
            orf_start = abs(min(hit['locs'][0]) + int(orf_start))
            w = "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}".format(
                hit['accn'], gene['accn'], introns, hit_percent, gene_percent,
                frame_percent, gap_totaln, gap_total, orf_prediction, orf_start,
                frame_shift)
            print >>sys.stdout, w
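# Hedged usage sketch (not from the original code) for the missed-pair
# re-check main() above; "missed_pairs.txt" is a hypothetical placeholder and
# is assumed to be in whatever format get_pairs_file() expects.
def _example_missed_pair_run(qbed, sbed):
    main(qbed, sbed, "missed_pairs.txt", ncpu=8)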