def _main(args):
    """Write fixed-width sequence windows around the peaks listed in an xls file.

    ``args`` is ``[fasta_path, xls_path, window]``.  For each whitespace-
    separated row of the xls file the peak position is column 1 plus column 4
    (0-based), and the window is ``[peak - window, peak + window)`` on the
    sequence named in column 0.  The last column selects what happens to the
    window: ``'k'`` keeps it, ``'r'`` keeps its reverse complement, anything
    else discards the row.  The user is then prompted for an output file
    name and the kept windows are written there as fasta records.

    Exits with status 1 (after printing usage) when the argument count is
    wrong.
    """
    if len(args) != 3:
        print("usage: xls_get_region_from_fasta.py <fasta> <xls> <window>")
        sys.exit(1)

    win = int(args[2])
    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[0])

    seqs = []
    with open(args[1]) as xls:
        for ln in xls:
            # split() already discards the trailing newline; slicing off the
            # last character first would eat real data on an unterminated
            # final line.
            sp = ln.split()
            if not sp:
                continue  # blank line: nothing to parse
            print(sp)
            pk = int(sp[1]) + int(sp[4])
            # NOTE(review): a peak closer than `win` to the sequence start
            # gives a negative slice start here -- confirm how the sequence
            # object treats that.
            seq = fasta[sp[0]]['sequence'][(pk - win):(pk + win)]
            action = sp[-1]
            if action == 'r':
                seq = fasta_subseq_2.revcomp(seq)
            elif action != 'k':
                continue
            seqs.append(">%s:%d..%d\n%s" % (sp[0], pk - win, pk + win, seq))

    outfile = raw_input("name of output file: ")
    # Context manager guarantees the records are flushed and the handle closed.
    with open(outfile, "w") as outfh:
        for s in seqs:
            outfh.write(s + "\n")
def get_enrich(xls_regs, sgr, winsize, fasta):
    """Annotate motif hits with signal values read from an sgr file.

    For every hit under ``'mtx1_hits'`` / ``'mtx2_hits'`` of each region, an
    info dict is built and stored on the region under ``'mtx1_hits_info'`` /
    ``'mtx2_hits_info'``.  Each info dict ends up with:

    * ``hit_obj`` -- the original hit dict
    * ``loc``     -- ``start`` for '+' hits, ``end`` otherwise
    * ``seq``     -- hit sequence (reverse-complemented for non-'+' hits)
    * ``nearest`` -- ``(position, value)`` of the sgr point closest to ``loc``
    * ``vals``    -- sgr values strictly closer than ``winsize / 2`` to ``loc``
    * ``win_mean`` / ``win_median`` -- numpy mean / median of ``vals``
    * ``enrich_mn`` / ``enrich_md`` -- nearest value divided by those

    Args:
        xls_regs: re-iterable collection of region dicts (iterated three
            times); each hit dict needs 'chr', 'start', 'end', 'strand'.
        sgr: path to a whitespace-separated ``chrom position value`` file.
        winsize: full window width around each hit location.
        fasta: path of the fasta file holding the hit sequences.

    Returns:
        ``xls_regs`` itself, mutated in place.
    """
    fasta_db = fasta_subseq_2.FastaDB()
    fasta_db.openFastaFile(fasta)

    for reg in xls_regs:
        for key in ('mtx1_hits', 'mtx2_hits'):
            hit_info = []
            for h in reg[key]:
                width = abs(h['start'] - h['end'])
                chrom_seq = fasta_db[h['chr']]['sequence']
                if h['strand'] == "+":
                    loc = h['start']
                    seq = chrom_seq[loc:(loc + width)]
                else:
                    ### !!!!! CHANGE IF FIX HIT DATABASE!!!!
                    loc = h['end']
                    seq = fasta_subseq_2.revcomp(chrom_seq[loc:(loc + width)])
                hit_info.append({
                    'hit_obj': h,
                    'loc': loc,
                    # (0, 0) is the "nothing seen yet" placeholder.
                    'nearest': (0, 0),
                    'vals': [],
                    'seq': seq,
                })
            reg[key + '_info'] = hit_info

    # Index the info dicts by chromosome once, so each sgr line only visits
    # hits on its own chromosome instead of every hit in every region.
    # Visiting order within a chromosome matches the original nested loops.
    hits_by_chrom = {}
    for reg in xls_regs:
        for key in ('mtx1_hits_info', 'mtx2_hits_info'):
            for d in reg[key]:
                hits_by_chrom.setdefault(d['hit_obj']['chr'], []).append(d)

    half_win = winsize / 2
    with open(sgr) as sgr_fh:
        for line in sgr_fh:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            (chrom, loc, val) = fields
            loc = int(loc)
            try:
                val = int(val)
            except ValueError:
                # Non-integer signal values used to abort the run.
                val = float(val)
            for d in hits_by_chrom.get(chrom, ()):
                target_loc = d['loc']
                dist = abs(loc - target_loc)
                # Compare both candidates' distance to the hit.  The previous
                # test measured the old candidate against the current sgr
                # position, which let a farther point replace a closer one.
                if dist < abs(d['nearest'][0] - target_loc):
                    d['nearest'] = (loc, val)
                if dist < half_win:
                    d['vals'].append(val)
                    # NOTE(review): debug trace; placement inside the window
                    # branch is inferred from the collapsed source -- confirm.
                    sys.stderr.write("%s\n" % (d,))

    for reg in xls_regs:
        for key in ('mtx1_hits_info', 'mtx2_hits_info'):
            for h in reg[key]:
                # Empty `vals` or a zero median/mean yields nan/inf via numpy
                # rather than an exception.
                h['win_mean'] = np.mean(h['vals'])
                h['win_median'] = np.median(h['vals'])
                h['enrich_md'] = h['nearest'][1] / h['win_median']
                h['enrich_mn'] = h['nearest'][1] / h['win_mean']
                sys.stderr.write("%s\n" % (h,))
    return xls_regs
def _main(args):
    """Build one alignment job per mercator map line and run them in a pool.

    ``args`` is ``[genome_fastas_file, treefile, mercator_map,
    mercator_genome_order, outdir]``.

    * ``genome_fastas_file`` has ``<species> <fasta_path>`` per line; each
      fasta is opened as a ``FastaDB``.
    * The first line of ``mercator_genome_order`` gives the genome order.
    * Each line of ``mercator_map`` starts with an integer map index; the
      remaining fields go to ``make_map_dict``.

    Each job dict carries the map dict, a newick string of the matching
    tree, the map index and the ``(species, FastaDB)`` pairs involved.  Jobs
    run through ``run_aln_mapline`` on a 2-process pool after changing into
    ``outdir``.  Prints the usage string and exits with status 0 when the
    argument count is wrong.
    """
    usage = "pecan_WGA_runner.py <genome_fastas_file> <treefile> <mercator_map> <mercator_genome_order> <outdir>"
    if len(args) != 5:
        print(usage)
        sys.exit(0)

    with open(args[3]) as mercator_order_file:
        mercator_genome_order = mercator_order_file.readline().split()

    genome_dict = {}
    genome_order = []
    with open(args[0]) as genome_list:
        for ln in genome_list:
            fields = ln.split()
            if not fields:
                continue  # blank line
            (sp, fasta) = fields
            genome_order.append(sp)
            genome_dict[sp] = fasta_subseq_2.FastaDB()
            genome_dict[sp].openFastaFile(fasta)

    tree_dict = generate_trees(genome_dict, args[1])

    map_lines = []
    with open(args[2]) as mercator_map:
        for ln in mercator_map:
            map_ln = ln.split()
            if not map_ln:
                continue  # blank line
            (species, map_dict) = make_map_dict(map_ln[1:], genome_order,
                                                mercator_genome_order)
            tree_strIO = StringIO()
            Phylo.write(tree_dict[species], tree_strIO, "newick")
            fastas = [(name, genome_dict[name])
                      for name in genome_order if name in species]
            map_lines.append({
                'map_dict': map_dict,
                'tree': tree_strIO.getvalue(),
                'map_idx': int(map_ln[0]),
                'fastas': fastas,
            })

    # All input paths are opened before this point, so relative paths on the
    # command line still resolve against the original working directory.
    os.chdir(args[4])
    pool = mp.Pool(2)
    try:
        pool.map(run_aln_mapline, map_lines)
    finally:
        # Shut the workers down explicitly instead of relying on
        # interpreter exit.
        pool.close()
        pool.join()
    print("ALL DONE!")
def _buildAlnTables(self, subAlnObj, alnScore):
    """Create the per-chromosome tables and the empty alignment table.

    For each species key of ``subAlnObj.getSeqDict()`` the fasta listed in
    ``SPP_FASTAS`` is opened, an HDF5 group named after the species is
    created, and one table per chromosome (named ``"chr" + <name>``) is
    filled with ``(base, aln_map)`` rows, ``aln_map`` starting at zero.
    Finally ``aln_table`` is created under the HDF5 root with one
    ``(base, chr_key, position)`` column per species.

    Sets ``self.species``, ``self.species_chrs``, ``self.chr_key_arrays``,
    ``self.aln_tbl_dtype``, ``self.aln_table`` and ``self.built_chr_tabs``.
    ``alnScore`` is accepted but not used here.
    """
    chr_row_dtype = np.dtype([('base', 'a1'), ('aln_map', 'u8')])
    aln_columns = []  # alignment-table description, one entry per species
    expected_rows = 0

    spp = subAlnObj.getSeqDict().keys()
    self.species = spp
    for sp in spp:
        # the species' fasta supplies unaligned chromosome names and sequences
        sp_fasta = fs2.FastaDB()
        sp_fasta.openFastaFile(SPP_FASTAS[sp])
        chr_names = sorted(sp_fasta.keys())

        # one HDF5 group holds all chromosome tables of this species
        sp_grp = self.h5.createGroup(self.h5.root, sp,
                                     "%s Chromosomes" % (sp, ))
        chr_tables = {}
        self.species_chrs[sp] = chr_tables
        # slot index doubles as the chromosome key
        key_slots = [None] * len(chr_names)
        self.chr_key_arrays[sp] = key_slots

        for idx, ch in enumerate(chr_names):
            record = sp_fasta[ch]
            n_record = len(record)
            # column vectors: one base per row, alignment map zeroed
            bases = np.matrix(list(record.getFullSeq()),
                              dtype=np.dtype("a1")).T
            # NOTE(review): the map column has len(record) - 1 rows while
            # bases come from getFullSeq(); presumably len(record) is one
            # more than the sequence length -- confirm.
            maps = np.matrix(np.zeros(n_record - 1),
                             dtype=np.dtype("u8")).T
            rows = np.ndarray(shape=(len(maps), 1), dtype=chr_row_dtype)
            rows['base'] = bases
            rows['aln_map'] = maps

            table = self.h5.createTable(sp_grp, "chr" + ch, chr_row_dtype)
            chr_tables[ch] = table
            table.append(rows)
            table.flush()
            key_slots[idx] = [ch, table]
            print("%s %s length = %d added to %s" % (sp, ch, len(bases),
                                                     sp_grp))
            expected_rows += n_record

        # this species' column in the alignment table
        aln_columns.append((sp, [('base', 'a1'), ('chr_key', 'u4'),
                                 ('position', 'u8')]))

    self.aln_tbl_dtype = np.dtype(aln_columns)
    # the alignment table itself starts out empty
    self.aln_table = self.h5.createTable(self.h5.root, 'aln_table',
                                         self.aln_tbl_dtype,
                                         expectedrows=expected_rows)
    self.aln_table.flush()
    self.built_chr_tabs = True
def _main(args):
    """Print a fixed-size window around the best motif hit of each xls region.

    ``args`` is ``[xls, fasta, matrix_file, window]``.  Each whitespace-
    separated xls row supplies chromosome (column 0), start (1), end (2) and
    an enrichment label (7).  The region sequence is scanned with
    ``patser_tools.makePatserAnnotation``; regions that raise ``IOError`` are
    reported on stderr and skipped, as are regions without features.  For
    the highest-scoring feature a window of ``window // 2`` bases either
    side of the hit is printed as a fasta record, reverse-complemented when
    the hit is not on the '+' strand.

    Exits with status 1 (usage on stderr) when fewer than four arguments are
    given.
    """
    if len(args) < 4:
        sys.stderr.write(
            "usage: xls_motif_window.py <xls> <fasta> <matrix_file> <window>\n")
        sys.exit(1)

    # Parse once, up front; floor division keeps the integer half-width the
    # script has always used, independent of interpreter division rules.
    half_win = int(args[3]) // 2

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[1])

    xls_regions = []
    with open(args[0]) as xls:
        for x in xls:
            # split() drops the newline; no need to chop a character first.
            spl = x.split()
            if not spl:
                continue  # blank line
            region = {
                'chr': spl[0],
                'start': int(spl[1]),
                'end': int(spl[2]),
                'enrich': spl[7]
            }
            region['seq'] = fasta[
                region['chr']]['sequence'][region['start']:region['end']]
            xls_regions.append(region)

    for r in xls_regions:
        try:
            annot = patser_tools.makePatserAnnotation(sequence=r['seq'],
                                                      matrix=args[2])
        except IOError:
            sys.stderr.write("Error in seq %s:%d..%d:\n" % (
                r['chr'], r['start'], r['end']))
            continue
        if len(annot.getAllFeatures()) < 1:
            continue

        maxhit = annot.getMaxFeature("score")
        plus_strand = maxhit.tags["strand"] == '+'
        # NOTE(review): the "- 3" offset for non-'+' hits is carried over
        # unchanged; its rationale is not visible here -- confirm.
        anchor = maxhit.start if plus_strand else (maxhit.end - 3)
        winstart = r['start'] + (anchor - half_win)
        winend = r['start'] + (anchor + half_win)
        # NOTE(review): winstart can go negative for hits near the start of
        # a sequence; confirm how the sequence object slices in that case.
        win_seq = fasta[r['chr']]['sequence'][winstart:winend]
        if not plus_strand:
            win_seq = fasta_subseq_2.revcomp(win_seq)

        print(">%s:%d..%d:%s enr=%s mtx=%s" % (
            r['chr'], winstart, winend, maxhit.tags['strand'], r['enrich'],
            maxhit.tags['score']))
        print(win_seq)
def _main(args):
    """Scan every BED interval with a patser matrix and print the features.

    ``args`` is ``[bed_file, seq_file, matrix]``.  For each BED row the
    interval ``[start, end)`` of the named sequence is extracted, reverse-
    complemented when the strand column is ``"-"``, and annotated with
    ``patser_tools.makePatserAnnotation``.  Intervals whose annotation fails
    are skipped with a warning on stderr.  After all rows are read, one
    tab-separated line per feature is printed: interval id
    (``<chrom>_<start>_<end>``), feature start, end, score, p-value, strand.

    Prints usage and exits with status 0 when the argument count is wrong.
    """
    if len(args) != 3:
        print("usage: <bed_file> <seq_file> <matrix>")
        sys.exit(0)

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[1])

    bed_annots = []
    with open(args[0]) as bed_in:
        for line in bed_in:
            spl = line.split()
            if not spl:
                continue  # blank line
            fseq = fasta[spl[0]]["sequence"][int(spl[1]):int(spl[2])]
            # Strand is an optional BED column; rows without it are read as
            # forward strand instead of raising IndexError.
            strand = spl[5] if len(spl) > 5 else "+"
            if strand == "-":
                fseq = fasta_subseq_2.revcomp(fseq)
            try:
                patannot = patser_tools.makePatserAnnotation(sequence=fseq,
                                                             matrix=args[2])
            except Exception as err:
                # Still best-effort, but no longer silent and no longer
                # swallowing KeyboardInterrupt / SystemExit.
                sys.stderr.write("WARNING: skipping %s:%s..%s (%s)\n" % (
                    spl[0], spl[1], spl[2], err))
                continue
            bed_annots.append({
                "seq": spl[0] + "_" + spl[1] + "_" + spl[2],
                "annotation": patannot
            })

    for ann in bed_annots:
        for feat in ann["annotation"].getAllFeatures():
            print("%s\t%i\t%i\t%f\t%f\t%s" % (
                ann["seq"], feat.st, feat.en, feat.tagset["score"],
                feat.tagset["pval"], feat.tagset["strand"]))
def _main(args):
    """Run a patser matrix search over every sequence of a genome fasta.

    ``args`` is ``[genome_seq, matrix_file, matrix_name]``.  One
    ``searchObj`` is built per ``(name, sequence)`` item of the fasta and
    printed; the full job list is printed too, then the jobs are mapped
    through ``search`` on a default-sized multiprocessing pool.  Prints
    usage and exits with status 0 when the argument count is wrong.
    """
    if len(args) != 3:
        print("usage: patser_annotate_genome_noxgrid.py <genome_seq> <matrix_file> <matrix_name>")
        sys.exit(0)

    matrix_file = args[1]
    matrix_name = args[2]

    # load the genome
    genome = fasta_subseq_2.FastaDB()
    genome.openFastaFile(args[0])

    jobs = []
    for seq_name, chrom in genome.items():
        job = searchObj(chrObj=chrom,
                        seq_name=seq_name,
                        matrix=matrix_file,
                        matrix_name=matrix_name)
        print(job)
        jobs.append(job)
    print(jobs)

    workers = mp.Pool()
    # the mapped results are not used by this script
    workers.map(search, jobs)
def _fillWGADBFromFile(self):
    """Rebuild the in-memory lookup tables from an already populated HDF5 file.

    Binds ``self.aln_table`` to the stored ``aln_table`` node.  For every
    species group under the HDF5 root it appends the species to
    ``self.species``, maps each chromosome name to its stored table in
    ``self.species_chrs[sp]``, and fills ``self.chr_key_arrays[sp]`` in the
    sorted chromosome order of the species' fasta (``SPP_FASTAS[sp]``).
    Chromosomes present in the fasta but missing from the file leave a
    ``None`` slot and produce a warning on stderr.  Sets
    ``self.built_chr_tabs`` when done.
    """
    self.aln_table = self.h5.root.aln_table
    for sp in self.h5.root._v_groups.keys():
        sp_tables = {}
        self.species_chrs[sp] = sp_tables
        self.species.append(sp)

        sp_fasta = fs2.FastaDB()
        sp_fasta.openFastaFile(SPP_FASTAS[sp])
        chrs = sorted(sp_fasta.keys())
        key_slots = [None] * len(chrs)
        self.chr_key_arrays[sp] = key_slots

        sp_children = self.h5.root._v_groups[sp]._v_children
        for node_name in sp_children.keys():
            # _buildAlnTables names tables "chr" + <name>, so strip only
            # that leading prefix.  A blanket replace() would also mangle
            # names that contain "chr" themselves (e.g. "chrX" stored as
            # "chrchrX").
            if node_name.startswith("chr"):
                ch = node_name[3:]
            else:
                ch = node_name
            sp_tables[ch] = sp_children[node_name]

        # NOTE(review): _buildAlnTables stores [name, table] pairs in
        # chr_key_arrays, whereas this loader stores the table alone --
        # confirm which shape the consumers expect.
        for (i, chrom) in enumerate(chrs):
            try:
                key_slots[i] = sp_tables[chrom]
            except KeyError:
                # sp_tables is a plain dict, so a missing chromosome is the
                # only failure expected here.
                sys.stderr.write("WARNING: chromsome %s not found\n" % (
                    chrom, ))
    self.built_chr_tabs = True