def replace_results(randoms_old, randoms_replace, directory, suffix,
                    replace_suffix, new_suffix):
    """Merge replacement result lines into per-file copies of old results.

    The entries loaded from ``randoms_old`` are walked in order and compared,
    by ``id``, against the entries loaded from ``randoms_replace`` (also in
    order).  This yields one boolean per old entry: True when that entry is
    replaced.  For every file matching ``directory + "/*" + suffix`` a new
    file ``<file><new_suffix>`` is then written: line ``i`` is taken from
    ``<file><replace_suffix>`` (read sequentially) when entry ``i`` is
    replaced, otherwise it is copied from the original file.

    Nothing is done when ``randoms_replace`` holds no entries.

    Raises:
        IndexError: if a result file has more lines than ``randoms_old`` has
            entries.
    """
    rold = load_enhancers(randoms_old)
    rreplace = load_enhancers(randoms_replace)
    if rreplace == []:
        return

    # Build the per-entry replacement mask.  The cursor only advances while
    # further replacement entries remain, so the last one stays current.
    cursor = 0
    lines = []
    for e in rold:
        replaced = e.id == rreplace[cursor].id
        lines.append(replaced)
        if replaced and cursor + 1 < len(rreplace):
            cursor += 1

    for filename in glob(directory + "/*" + suffix):
        # The output must be opened for writing; the default read mode made
        # every write fail.  ``with`` also closes all three handles on error.
        with open(filename) as fold, \
                open(filename + replace_suffix) as frep, \
                open(filename + new_suffix, "w") as fnew:
            for i, line in enumerate(fold):
                if lines[i]:
                    fnew.write(frep.readline())
                else:
                    fnew.write(line)
def replace_randoms(randoms_old, randoms_replace, randoms_new):
    """Write a copy of the old entries with matching ones swapped out.

    Entries from ``randoms_old`` are visited in order.  Whenever an entry's
    ``id`` equals the ``id`` of the current replacement entry (replacements
    from ``randoms_replace`` are consumed in order), the replacement is
    emitted instead of the original.  The merged list is saved to
    ``randoms_new``.  If there are no replacement entries the function
    returns without saving anything.
    """
    originals = load_enhancers(randoms_old)
    replacements = load_enhancers(randoms_replace)
    if replacements == []:
        return

    merged = []
    cursor = 0  # index of the replacement currently being looked for
    last = len(replacements) - 1
    for original in originals:
        candidate = replacements[cursor]
        if original.id != candidate.id:
            merged.append(original)
            continue
        merged.append(candidate)
        # Only move on while more replacements remain; otherwise the last
        # replacement stays the current candidate.
        if cursor < last:
            cursor += 1

    save_enhancers(merged, randoms_new)
def count_all_means(enhancers_file, directories, name):
    """Run ``count_mean_signal`` over every bigWig file and log the failures.

    All ``*.bigWig`` and ``*.bw`` files found in ``directories`` are
    processed in random order.  For each file whose ``count_mean_signal``
    result is not an empty list, one tab-separated row (file base name
    followed by the returned values) is appended to
    ``DATAPATH + name + ".fails"``.

    A header row (a single space followed by the sequence ids) is written
    when the fails file is missing or empty.  The original tried to do this
    in an ``except`` branch around ``open(..., "a")``, but append mode
    creates a missing file, so that branch was never reached.
    """
    import os

    enhancers = load_enhancers(enhancers_file)
    print("nr of sequences: %d" % len(enhancers))
    names = [str(x.id) for x in enhancers]

    fails_path = DATAPATH + "%s.fails" % (name)
    needs_header = (not os.path.isfile(fails_path)
                    or os.path.getsize(fails_path) == 0)

    # ``with`` guarantees the log is closed even if a signal file fails.
    with open(fails_path, "a") as fails_file:
        if needs_header:
            fails_file.write('\t'.join([' '] + names) + "\n")
        print("saving fails number to %s%s.fails" % (DATAPATH, name))

        filenames = []
        for directory in directories:
            filenames += glob(directory + "/*.bigWig") + glob(directory + "/*.bw")
        random.shuffle(filenames)

        for filename in filenames:
            fails = count_mean_signal(enhancers, filename, name)
            if fails != []:
                row = [filename.split('/')[-1]] + [str(f) for f in fails]
                fails_file.write("\t".join(row) + "\n")
def count_all_means(enhancers_file, directories, name):
    """Run ``count_mean_signal`` over every bigWig file using a thread pool.

    All ``*.bigWig`` and ``*.bw`` files found in ``directories`` are handed
    to a ``ThreadPool`` of ``N_pools`` workers.  For each file whose
    ``count_mean_signal`` result is not an empty list, one tab-separated row
    (file base name followed by the returned values) is appended to
    ``DATAPATH + name + ".fails"``, in the order the files were globbed.

    A header row (a single space followed by the sequence ids) is written
    when the fails file is missing or empty.  The original tried to do this
    in an ``except`` branch around ``open(..., "a")``, but append mode
    creates a missing file, so that branch was never reached.
    """
    import os

    enhancers = load_enhancers(enhancers_file)
    names = [str(x.id) for x in enhancers]

    fails_path = DATAPATH + "%s.fails" % (name)
    needs_header = (not os.path.isfile(fails_path)
                    or os.path.getsize(fails_path) == 0)

    # The original never closed this file; ``with`` closes it on all paths.
    with open(fails_path, "a") as fails_file:
        if needs_header:
            fails_file.write('\t'.join([' '] + names) + "\n")
        print("saving fails number to %s%s.fails" % (DATAPATH, name))

        filenames = []
        for directory in directories:
            filenames += glob(directory + "/*.bigWig") + glob(directory + "/*.bw")

        pool = ThreadPool(N_pools)
        try:
            fail_list = pool.map(
                lambda x: count_mean_signal(enhancers, x, name), filenames)
        finally:
            # Release the worker threads whether or not the map succeeded.
            pool.close()
            pool.join()

        for filename, fails in zip(filenames, fail_list):
            if fails != []:
                row = [filename.split('/')[-1]] + [str(f) for f in fails]
                fails_file.write("\t".join(row) + "\n")
def change_random(randoms_file, to_remove):
    """Draw new random positions for the entries whose id is in ``to_remove``.

    For every entry loaded from ``randoms_file`` whose ``id`` is in
    ``to_remove``, a window of the same length is drawn uniformly on the
    same chromosome until its sequence contains no ``'N'``.  The new
    positions keep the original ``id``.  They are saved to
    ``randoms_file + ".change"`` and returned.

    Entries not listed in ``to_remove`` are not included in the result.

    NOTE(review): there is no retry limit, so a chromosome without any
    N-free window of the required length would loop forever.
    """
    enhancers = load_enhancers(randoms_file)
    record_dict = SeqIO.index(home + "data/female.hg19.fa", "fasta")
    to_return = []
    for e in enhancers:
        if e.id not in to_remove:
            continue

        # These are the same for every retry, so compute them once per
        # entry.  Presumably SeqIO is Bio.SeqIO, whose index re-reads the
        # record on each lookup -- TODO confirm.
        length = e.end - e.start
        max_start = chr_lens[e.chromosome] - length - 1
        chrom_seq = record_dict[e.chromosome].seq

        while True:
            start = random.randint(0, max_start)
            if chrom_seq[start:(start + length)].find('N') == -1:
                to_return.append(
                    Position(e.chromosome, start, start + length, e.id,
                             False, []))
                break
            else:
                # Rejected draw: report the chromosome and try again.
                print(e.chromosome)
    save_enhancers(to_return, randoms_file + ".change")
    return to_return
def change_random(randoms_file, to_remove):
    """Re-sample positions for the entries whose id appears in ``to_remove``.

    Each selected entry is replaced by a uniformly drawn window of equal
    length on the same chromosome.  Draws are repeated until the window's
    sequence contains no ``'N'``.  The resulting positions carry the original
    ``id``, are written to ``randoms_file + ".change"`` and are returned.

    Entries whose id is not in ``to_remove`` are skipped and do not appear
    in the output.

    NOTE(review): sampling is unbounded; if no N-free window of the needed
    length exists on a chromosome this never terminates.
    """
    enhancers = load_enhancers(randoms_file)
    record_dict = SeqIO.index(home + "data/female.hg19.fa", "fasta")
    to_return = []
    for e in enhancers:
        if e.id not in to_remove:
            continue

        # Loop-invariant work hoisted out of the retry loop: the window
        # length, the largest admissible start and the chromosome sequence.
        # Presumably each record_dict lookup re-parses the record from the
        # FASTA file (Bio.SeqIO.index behaviour) -- TODO confirm.
        window = e.end - e.start
        last_start = chr_lens[e.chromosome] - window - 1
        sequence = record_dict[e.chromosome].seq

        found = False
        while not found:
            start = random.randint(0, last_start)
            if sequence[start:(start + window)].find('N') == -1:
                to_return.append(
                    Position(e.chromosome, start, start + window, e.id,
                             False, []))
                found = True
            else:
                # Window overlaps an 'N'; log the chromosome and redraw.
                print(e.chromosome)
    save_enhancers(to_return, randoms_file + ".change")
    return to_return
from ..shared import *
import load_vista
import sys

# Command-line entry point: load positions from a BED or vista-style file
# and pass them, with the output file name, to load_vista.add_seqs.
if __name__ == '__main__':
    if len(sys.argv) < 4:
        print("USAGE: add_sequences.py TYPE inputfile outputfile [start_id] [tissue]")
        print("Genome file: %s%s, files_directory: %s" % (
            DATAPATH, GENOME_FILE, DATAPATH))
        sys.exit(1)

    # A tissue argument (fifth positional) marks the input as positive.
    if len(sys.argv) == 6:
        tissue = [sys.argv[5]]
        positive = True
    else:
        tissue = []
        positive = False

    if sys.argv[1] == 'bed':
        # The bed branch reads sys.argv[4]; without this check a missing
        # start_id surfaced as a bare IndexError.
        if len(sys.argv) < 5:
            sys.exit("add_sequences.py: TYPE 'bed' requires a start_id argument")
        positions = load_vista.load_bed(sys.argv[2], positive, tissue,
                                        startid=int(sys.argv[4]))
    else:
        # vista type
        positions = load_vista.load_enhancers(sys.argv[2])
    load_vista.add_seqs(positions, sys.argv[3])
x.end)) overlaps.append(x.id) break overlaps = list(set(overlaps)) return overlaps if __name__ == "__main__": dist = True if len(sys.argv) < 4: print "USAGE: overlaps.py from_file with_db tissue output" print "output_directory: %s" % (DATAPATH) sys.exit(1) file1 = sys.argv[1] db = sys.argv[2] tissue = sys.argv[3] outname = sys.argv[4] remove = sorted( check_overlap(load_enhancers(file1), load_target(db, tissue, False))) print remove with open(DATAPATH + outname, 'a+') as f: #remove = [x for x in remove] #print "removing %d"% len(set(remove)) f.write("\n".join([str(x) for x in remove]) + "\n") f.close()
import sys

# Command-line entry point: load positions from a BED or vista-style file
# and pass them, with the output file name, to load_vista.add_seqs.
if __name__ == '__main__':
    if len(sys.argv) < 4:
        print("USAGE: add_sequences.py TYPE inputfile outputfile [start_id] [tissue]")
        print("Genome file: %s%s, files_directory: %s" % (DATAPATH, GENOME_FILE, DATAPATH))
        sys.exit(1)

    # A tissue argument (fifth positional) marks the input as positive.
    if len(sys.argv) == 6:
        tissue = [sys.argv[5]]
        positive = True
    else:
        tissue = []
        positive = False

    if sys.argv[1] == 'bed':
        # The bed branch reads sys.argv[4]; without this check a missing
        # start_id surfaced as a bare IndexError.
        if len(sys.argv) < 5:
            sys.exit("add_sequences.py: TYPE 'bed' requires a start_id argument")
        positions = load_vista.load_bed(sys.argv[2], positive, tissue,
                                        startid=int(sys.argv[4]))
    else:
        # vista type
        positions = load_vista.load_enhancers(sys.argv[2])
    load_vista.add_seqs(positions, sys.argv[3])
overlaps=list(set(overlaps)) return overlaps if __name__ == "__main__": dist = True if len(sys.argv) < 4: print "USAGE: overlaps.py from_file with_db tissue output" print "output_directory: %s"% ( DATAPATH) sys.exit(1) file1 = sys.argv[1] db = sys.argv[2] tissue = sys.argv[3] outname = sys.argv[4] remove = sorted(check_overlap(load_enhancers(file1), load_target(db, tissue, False))) print remove with open(DATAPATH+outname, 'a+') as f: #remove = [x for x in remove] #print "removing %d"% len(set(remove)) f.write("\n".join([str(x) for x in remove])+"\n") f.close()