def test_parse_balse(self):
    """Run ``parse_blast`` on the fixture BLAST output and print the result.

    This is a smoke test: nothing is asserted, the parsed value is only
    printed for inspection.  The orientation is fixed at -1 and the two
    integer arguments (12000, 26000) occupy the positions where ``main``
    passes ``qpad`` and ``spad``.
    """
    orientation = -1
    query_pad, subject_pad = 12000, 26000
    parsed = parse_blast(
        self.blast_str,
        orientation,
        self.qfeat,
        self.sfeat,
        self.qbed,
        self.sbed,
        query_pad,
        subject_pad,
        self.unmasked_fasta,
    )
    # Single-argument print: same output as the Python 2 print statement.
    print(parsed)
def main(cns_file, qdups_path, sdups_path, pair_file, fmt, qbed, sbed, qpad,
         spad, blast_path, unmasked_fasta, mask='F', ncpu=8):
    """Re-evaluate local-duplicate pairs by BLAST and write the updated files.

    For every ``(qparent, sparent)`` pair returned by ``get_pairs`` that is
    not rejected by ``skip_pair``, all candidate duplicate combinations from
    ``get_dups`` are BLASTed in batches of ``ncpu`` commands.  Each result is
    parsed with ``parse_blast`` and recorded as a tuple
    ``(-n_cnss, qstart, sstart, qaccn, saccn, cnss_csv)``; the negated count
    makes an ascending sort put the combination with the most hits first.

    Pairs whose query or subject parent is in ``rdups`` are collected in a
    dict and resolved afterwards through ``best_repeats``; all other pairs
    are written immediately through ``write_new_dups``.

    Parameters
    ----------
    cns_file, pair_file : str
        Paths handed to ``make_copy_of_file``; the copies are what
        ``write_new_dups`` receives.  ``pair_file`` is also read by
        ``get_pairs`` and used to name the final ``.raw.filtered.local``
        output.
    qdups_path, sdups_path : str
        Paths parsed by ``parse_dups``.
    fmt
        Passed through to ``get_pairs``.
    qbed, sbed
        Bed-like objects; this function uses their ``path`` and ``filename``
        attributes and their ``accn()`` lookup.
    qpad, spad
        Passed through to ``get_cmd`` and ``parse_blast``.
    blast_path : str
        Executable placed at the start of the bl2seq command line.
    unmasked_fasta
        Passed through to ``parse_blast``.
    mask : str
        Value interpolated after the ``-F`` flag of the command line.
    ncpu : int
        Worker count for the pool and number of commands per batch.

    Raises
    ------
    IndexError
        If a non-repeat pair produced no BLAST results at all
        (``cnss_size`` is empty when its first element is read).
    """
    # NOTE(review): the pool is never closed or joined here; presumably this
    # is multiprocessing.Pool and the workers end with the process - confirm.
    pool = Pool(ncpu)

    # ``%`` binds tighter than ``+``: only the first two pieces are formatted
    # here.  The last piece keeps its ``%(name)s`` placeholders as a template
    # (passed to ``get_cmd`` below).  The original used in-string backslash
    # line continuations; adjacent literals give the same shell command.
    bl2seq = (
        "%s " % blast_path
        + "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask
        + " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s "
          "-I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' "
          "| grep -v 'WARNING' | grep -v 'ERROR' "
    )

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both sides come from the same bed file.
    if qbed.filename != sbed.filename:
        sfastas = get_masked_fastas(sbed)
    else:
        sfastas = qfastas

    # ---- file paths ----
    qstem = qbed.path.split(".")[0]
    sstem = sbed.path.split(".")[0]
    qlocaldups_path = qstem + ".localdups"
    slocaldups_path = sstem + ".localdups"
    npair_file, nqlocaldups, nslocaldups, ncns_file = map(
        make_copy_of_file,
        [pair_file, qlocaldups_path, slocaldups_path, cns_file])

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
    # Same bytes as the Python 2 statement ``print len(dups), len(rdups)``.
    print("%d %d" % (len(dups), len(rdups)))
    ldups = get_large_dups(dups, qdups, sdups)

    rdups_dic = defaultdict(dict)
    rdups_both = [(q, s) for q, s in dups if q in rdups and s in rdups]

    for qparent, sparent in dups:
        if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
            continue

        cnss_size = []
        qfeat_dups = get_all_dups(qdups, qparent)
        sfeat_dups = get_all_dups(sdups, sparent)
        dup_pairs = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

        pairs = [True]
        while any(pairs):
            # Pull one batch; exhausted slots are filled with None.
            pairs = [next(dup_pairs, None) for _ in range(ncpu)]

            # Build commands for the live pairs only.  The original passed
            # parallel lists to a multi-iterable ``map`` and relied on
            # Python 2 padding the shorter list with None, then filtered the
            # falsy results; iterating the live pairs directly yields the
            # same command list without that dependency.
            cmds = []
            for pair in pairs:
                if not pair:
                    continue
                cmd = get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                if cmd:
                    cmds.append(cmd)

            outputs = pool.map(commands.getoutput, [c[0] for c in cmds])
            for res, (_cmd, qfeat, sfeat) in zip(outputs, cmds):
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                if res.strip():
                    cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                       qpad, spad, unmasked_fasta)
                else:
                    cnss = []
                # NOTE(review): placed after the if/else (printed for empty
                # results too); the collapsed source does not show whether
                # it belonged to the else branch only - confirm.
                sys.stderr.write("(%i)\n" % len(cnss))
                cnss_fmt = ",".join(",".join(map(str, cns)) for cns in cnss)
                cnss_size.append((-len(cnss), qfeat["start"], sfeat["start"],
                                  qfeat["accn"], sfeat["accn"], cnss_fmt))

            # Keep looping only if the last slot of this batch was filled,
            # i.e. the generator may still have items.
            pairs = [pairs[-1]]

        # Repeat parents are deferred and keyed by whichever parent is in
        # rdups (query side checked first).
        if qparent in rdups:
            repeat_key = qparent
        elif sparent in rdups:
            repeat_key = sparent
        else:
            repeat_key = None

        if repeat_key is not None:
            seen = rdups_dic[repeat_key]
            if (qparent, sparent) in seen:
                logging.info((qparent, sparent))
            seen[(qparent, sparent)] = cnss_size
            continue

        # Ascending sort on the negated hit count: best combination first.
        cnss_size.sort()
        cns_number, _, _, qaccn, saccn, _ = cnss_size[0]
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        sys.stderr.write(
            "FINAL: {0},{1},{2}".format(qaccn, saccn, cns_number) + "\n")
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       cnss_size, qparent, sparent, qfeat, sfeat, qdups,
                       sdups)

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps:
        qparent, sparent = dparents
        best = best_reps[dparents]
        # Open question kept from the original author: "one or list?
        # cnss[0]?" - the single best tuple is wrapped in a list here.
        _, _, _, qaccn, saccn, _ = best
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best], qparent, sparent, qfeat, sfeat, qdups, sdups)

    q_local_bed = "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0])
    s_local_bed = "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0])
    write_nolocaldups(qbed.path, nqlocaldups, q_local_bed)
    write_nolocaldups(sbed.path, nslocaldups, s_local_bed)
    # Use the ``pair_file`` argument rather than the module-level
    # ``options.pairs`` so the function also works when ``options`` is not
    # defined (e.g. when main() is imported and called directly).
    # NOTE(review): assumes the caller passes options.pairs as pair_file.
    pairs_to_qa(npair_file, 'pair', q_local_bed, s_local_bed,
                "{0}.raw.filtered.local".format(pair_file.split(".")[0]))
def test_parse_balse(self):
    """Smoke-run ``parse_blast`` against the fixture attributes.

    Calls ``parse_blast`` with orientation -1 and the fixed integers
    12000 and 26000, then prints whatever it returns.  No assertion is
    made on the result.
    """
    orientation = -1
    blast_args = (
        self.blast_str,
        orientation,
        self.qfeat,
        self.sfeat,
        self.qbed,
        self.sbed,
        12000,
        26000,
        self.unmasked_fasta,
    )
    cns = parse_blast(*blast_args)
    # Parenthesised single value prints identically under Python 2.
    print(cns)