def __init__(self, load_refseq=True, load_canonical=True, load_transcript=False):
    """Load the requested annotation maps, preferring their pickle caches.

    For every enabled flag the annotation path is built from
    ``snapconf.TABIX_DB_PATH`` plus the matching ``snapconf`` file name, and
    ``<path>.pkl`` is tried first through ``snaputil.load_cpickle_file``.
    When that yields a falsy result the annotation file itself is parsed by
    the corresponding ``load_*`` method and the resulting map is written
    back to the pickle with ``snaputil.store_cpickle_file``.

    Args:
        load_refseq: when true, set ``self.gene_map`` from
            ``snapconf.REFSEQ_ANNOTATION``.
        load_canonical: when true, set ``self.canonical_gene_map`` from
            ``snapconf.CANONICAL_ANNOTATION``.
        load_transcript: when true, set ``self.transcript_map`` (per
            transcript exons) from ``snapconf.TABIX_GENE_INTERVAL_DB``.

    Note:
        A map attribute is only assigned when its flag is true; this method
        does not create it otherwise.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about; the compiled pattern is the same.
    self.ensembl_id_patt = re.compile(r'(ENST\d+)')

    if load_refseq:
        gene_file = "%s/%s" % (snapconf.TABIX_DB_PATH, snapconf.REFSEQ_ANNOTATION)
        gene_pickle_file = "%s.pkl" % (gene_file)
        self.gene_map = snaputil.load_cpickle_file(gene_pickle_file)
        if not self.gene_map:
            # presumably load_gene_coords() fills self.gene_map -- confirm
            self.load_gene_coords(gene_file)
            snaputil.store_cpickle_file(gene_pickle_file, self.gene_map)

    if load_canonical:
        canonical_gene_file = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                         snapconf.CANONICAL_ANNOTATION)
        canonical_gene_pickle_file = "%s.pkl" % (canonical_gene_file)
        self.canonical_gene_map = snaputil.load_cpickle_file(
            canonical_gene_pickle_file)
        if not self.canonical_gene_map:
            # presumably load_canonical_gene_coords() fills
            # self.canonical_gene_map -- confirm
            self.load_canonical_gene_coords(canonical_gene_file)
            snaputil.store_cpickle_file(canonical_gene_pickle_file,
                                        self.canonical_gene_map)

    # per transcript exons
    if load_transcript:
        transcript_file = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                     snapconf.TABIX_GENE_INTERVAL_DB)
        transcript_pickle_file = "%s.pkl" % (transcript_file)
        self.transcript_map = snaputil.load_cpickle_file(
            transcript_pickle_file)
        if not self.transcript_map:
            # presumably load_transcripts() fills self.transcript_map -- confirm
            self.load_transcripts(transcript_file)
            snaputil.store_cpickle_file(transcript_pickle_file,
                                        self.transcript_map)
def __init__(self, load_refseq=True, load_canonical=True, load_transcript=False):
    """Populate the annotation maps selected by the three flags.

    Each selected map is first looked up in a pickle cache located at
    ``<annotation file>.pkl`` (annotation files live under
    ``snapconf.TABIX_DB_PATH``).  If the cache load returns something falsy,
    the annotation file is parsed by the matching ``load_*`` method and the
    map is then stored to the pickle so later constructions can reuse it.

    Args:
        load_refseq: load ``self.gene_map``
            (file: ``snapconf.REFSEQ_ANNOTATION``).
        load_canonical: load ``self.canonical_gene_map``
            (file: ``snapconf.CANONICAL_ANNOTATION``).
        load_transcript: load ``self.transcript_map``, the per transcript
            exons (file: ``snapconf.TABIX_GENE_INTERVAL_DB``).

    Note:
        Maps whose flag is false are not assigned here at all.
    """
    # The pattern must be a raw string; a bare '\d' in a normal literal is an
    # invalid escape sequence (warning on current Python versions).
    self.ensembl_id_patt = re.compile(r'(ENST\d+)')

    if load_refseq:
        gene_file = "%s/%s" % (snapconf.TABIX_DB_PATH, snapconf.REFSEQ_ANNOTATION)
        gene_pickle_file = "%s.pkl" % (gene_file)
        self.gene_map = snaputil.load_cpickle_file(gene_pickle_file)
        if not self.gene_map:
            # Cache miss: parse the annotation (expected to set self.gene_map
            # -- TODO confirm), then refresh the cache.
            self.load_gene_coords(gene_file)
            snaputil.store_cpickle_file(gene_pickle_file, self.gene_map)

    if load_canonical:
        canonical_gene_file = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                         snapconf.CANONICAL_ANNOTATION)
        canonical_gene_pickle_file = "%s.pkl" % (canonical_gene_file)
        self.canonical_gene_map = snaputil.load_cpickle_file(
            canonical_gene_pickle_file)
        if not self.canonical_gene_map:
            # Cache miss: parse the annotation (expected to set
            # self.canonical_gene_map -- TODO confirm), then refresh the cache.
            self.load_canonical_gene_coords(canonical_gene_file)
            snaputil.store_cpickle_file(canonical_gene_pickle_file,
                                        self.canonical_gene_map)

    # per transcript exons
    if load_transcript:
        transcript_file = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                     snapconf.TABIX_GENE_INTERVAL_DB)
        transcript_pickle_file = "%s.pkl" % (transcript_file)
        self.transcript_map = snaputil.load_cpickle_file(transcript_pickle_file)
        if not self.transcript_map:
            # Cache miss: parse the annotation (expected to set
            # self.transcript_map -- TODO confirm), then refresh the cache.
            self.load_transcripts(transcript_file)
            snaputil.store_cpickle_file(transcript_pickle_file,
                                        self.transcript_map)
def sample_ids2intron_ids_from_bit_vector(sample_ids):
    """Return the ids flagged in the OR of the per-sample packed vectors.

    For each sample id, ``<snapconf.PACKED_SAMPLE_IDS_PATH>/<sample_id>.pkl``
    is loaded (uncompressed) through ``snaputil.load_cpickle_file``.  All
    vectors that could be loaded are combined with ``|`` and every position
    holding a truthy value is reported.

    Args:
        sample_ids: iterable of sample identifiers; each one is converted
            with ``str()`` to build its pickle file name.

    Returns:
        A set with the positions (as strings) of the truthy entries of the
        combined vector.  The set is empty when ``sample_ids`` is empty or
        when none of the samples has a stored vector.
    """
    combined = None
    for sample_id in sample_ids:
        pickle_path = "%s/%s.pkl" % (snapconf.PACKED_SAMPLE_IDS_PATH, str(sample_id))
        # presumably a bitarray-like object supporting `|` -- confirm
        sample_bits = snaputil.load_cpickle_file(pickle_path, compressed=False)
        # in a few cases we may not have a mapping for a specific sample_id
        if sample_bits is None:
            continue
        combined = sample_bits if combined is None else combined | sample_bits

    # Nothing was loaded at all: there is no vector to enumerate, so report
    # an empty result instead of failing on enumerate(None).
    if combined is None:
        return set()
    return set(str(position) for position, flag in enumerate(combined) if flag)
def load_sample_metadata(file_):
    """Return a dict mapping each line's first tab-separated field to the line.

    The pickle cache ``<file_>.pkl`` is consulted first through
    ``snaputil.load_cpickle_file``; a truthy result is returned as is.
    Otherwise ``file_`` is read line by line, the dict is built, stored to
    the pickle with ``snaputil.store_cpickle_file`` and returned.

    Args:
        file_: path of the tab-separated sample metadata file.

    Returns:
        dict keyed by the first column of each line; the value is the whole
        line with trailing whitespace removed.  If several lines share a
        first column, the last one wins.
    """
    pickle_file = "%s.pkl" % (file_)
    fmd = snaputil.load_cpickle_file(pickle_file)
    if fmd:
        return fmd

    fmd = {}
    # dont need the hash-on-column headers just yet
    with open(file_, "r") as f:
        for line in f:
            # NOTE(review): rstrip() without arguments also removes trailing
            # tabs/spaces, so empty trailing columns are dropped from the
            # stored line -- confirm this is intended.
            line = line.rstrip()
            fmd[line.split("\t")[0]] = line
    snaputil.store_cpickle_file(pickle_file, fmd)
    return fmd
# Whether the per-sample pickles are read as compressed files.
COMPRESSED = False
# Directory holding one "<n>.pkl<suffix>" file per sample.
path = '/data3/snaptron/sample_ids_full'
# Alternative input set: path='/data3/snaptron/sample_ids' with suffix='.gz'
suffix = ''


def orthem(ba1, ba2):
    """Return ``ba1 | ba2``, the OR of the two vectors."""
    return ba1 | ba2


def setthem(ba_final):
    """Return the set of integer positions of ``ba_final`` holding a truthy value."""
    return set(position for position, flag in enumerate(ba_final) if flag)


# OR together the vectors stored for samples 0..4999 and print how many
# positions end up set.
ba_final = su.load_cpickle_file("%s/0.pkl%s" % (path, suffix), compressed=COMPRESSED)
for i in range(1, 5000):
    ba2 = su.load_cpickle_file("%s/%s.pkl%s" % (path, str(i), suffix), compressed=COMPRESSED)
    # Identity test: `!= None` would go through the vector type's own
    # comparison operator instead of simply checking for a missing file.
    if ba2 is None:
        continue
    # The seed (sample 0) may be missing as well; start from the first
    # vector that actually loaded instead of OR-ing with None.
    ba_final = ba2 if ba_final is None else orthem(ba2, ba_final)

s1 = setthem(ba_final) if ba_final is not None else set()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(s1))