def Readanno(filename, annoglb, genome):
    '''
    **Purpose**
        Load a prebuilt annotation index and work out which chromosomes to process.

    **Arguments**
        filename
            Not used by this function; kept for interface compatibility.

        annoglb
            Path of the annotation index, passed to glload().

        genome
            Genome name. 'mm10' and 'hg38' get a fixed chromosome list
            (autosomes plus chrX, chrY and chrM); for any other value the
            chromosome list is taken from the index itself.

    **Returns**
        A tuple (allelement, chr_list, annoglb, glannot): the set of annotation
        names in the index, the chromosome list, the index path as given, and
        the loaded index.

    NOTE(review): a later definition of Readanno in this file replaces this one
    at import time.
    '''
    glannot = glload(annoglb)
    allelement = set(glannot['annot'])

    # Number of autosomes for the genomes that have a fixed chromosome list.
    autosome_counts = {'mm10': 19, 'hg38': 22}

    if genome in autosome_counts:
        chr_list = ['chr' + str(i) for i in range(1, autosome_counts[genome] + 1)]
        chr_list += ['chrX', 'chrY', 'chrM']
    else:
        # Previously chr_list was left unassigned for any other genome, so the
        # return statement raised UnboundLocalError. Fall back to whatever
        # chromosomes the index contains, which also covers custom chromosomes.
        # NOTE(review): names taken from the index may not carry the 'chr'
        # prefix used by the fixed lists above -- confirm against the caller.
        chr_list = list({k['chr'] for k in glannot['loc']})

    return (allelement, chr_list, annoglb, glannot)
def Readanno(filename, annoglb, genome):
    '''
    **Purpose**
        Load a prebuilt annotation index and report the annotation names and
        chromosomes it contains.

    **Arguments**
        filename
            Not used by this function.

        annoglb
            Path of the annotation index, passed to glload().

        genome
            Not used by this function; the chromosome list comes from the
            index rather than from a per-genome table.

    **Returns**
        A tuple (element names, chromosome list, annoglb, loaded index). The
        chromosome list is built from a set, so its order is arbitrary.
    '''
    index = glload(annoglb)
    element_names = set(index['annot'])

    # Take the chromosomes from the index itself instead of a hard-coded
    # mm10/hg38 list, so that custom chromosomes are supported.
    chromosomes = {entry['chr'] for entry in index['loc']}

    return (element_names, list(chromosomes), annoglb, index)
def getanno(filename, genefile, tefile, genome, mode):
    '''
    **Purpose**
        Locate (or build) the annotation index and return the annotation names,
        the chromosome list for the genome, and the index itself.

    **Arguments**
        filename
            Passed through to annoGtf() when a custom annotation is built.

        genefile, tefile
            Either both 'default', in which case the prebuilt index
            '<genome>.<mode>.glb' is looked up in the current directory, or
            paths to existing gene and TE annotation files that are handed to
            annoGtf().

        genome
            'mm10' or 'hg38'. Any other value is reported and the program exits.

        mode
            'exclusive' or 'inclusive' when the prebuilt index is used;
            otherwise passed through to annoGtf().

    **Returns**
        A tuple (allelement, chr_list, all_annot): the set of annotation names,
        the chromosome names (autosomes plus chrX, chrY, chrM), and the index
        path for the prebuilt case or whatever annoGtf() returned otherwise.

    Every invalid-input case logs an error and calls sys.exit(1).
    '''
    # Number of autosomes for each supported genome.
    autosome_counts = {'mm10': 19, 'hg38': 22}

    # Fail fast on an unknown genome. Previously chr_list was left unassigned
    # and the function crashed with UnboundLocalError at the return statement
    # (after already building the index in the custom-annotation case).
    if genome not in autosome_counts:
        logging.error(
            "Unsupported genome: %s, expected one of: %s \n",
            genome, ', '.join(sorted(autosome_counts)))
        sys.exit(1)

    chr_list = ['chr' + str(i) for i in range(1, autosome_counts[genome] + 1)]
    chr_list += ['chrX', 'chrY', 'chrM']

    if genefile == 'default' and tefile == 'default':
        # An unknown mode used to fall through every branch and crash at the
        # return statement; report it explicitly instead.
        if mode not in ('exclusive', 'inclusive'):
            logging.error(
                "Unsupported mode: %s, expected one of: exclusive, inclusive \n",
                mode)
            sys.exit(1)

        all_annot = '%s.%s.glb' % (genome, mode)
        if not os.path.exists(all_annot):
            logging.error(
                "Did not find the annotation index %s, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n",
                all_annot)
            sys.exit(1)
    else:
        # Custom annotation: both input files must exist before the index is built.
        if not os.path.isfile(tefile):
            logging.error("No such file: %s !\n", tefile)
            sys.exit(1)
        if not os.path.isfile(genefile):
            logging.error("No such file: %s !\n", genefile)
            sys.exit(1)
        all_annot = annoGtf(filename, genefile=genefile, tefile=tefile, mode=mode)

    allelement = set(glload(all_annot)['annot'])
    return (allelement, chr_list, all_annot)
def align(chr, filename, all_annot, glannot, whitelist, CB):
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

        This is the speed critical part.

    **Arguments**
        chr
            Chromosome name; 'chr' is prepended to it to form the name used in
            the input and output file names.

        filename
            Sample prefix. Reads are taken from
            '<filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz' and counts are
            written to '<filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz'.

        all_annot
            Index to load with glload() when glannot is not supplied.

        glannot
            An already loaded index, or a false value to load all_annot here.
            Only its .buckets and .linearData attributes are used.

        whitelist
            Container of accepted barcodes; reads with any other barcode are
            skipped.

        CB
            Not used by this function.

    **Returns**
        None. The output file holds one tab-separated line per
        (barcode, annotation, count), sorted by barcode and then annotation.
    '''
    chr = 'chr' + chr

    # Create the output directory without going through a shell, so sample
    # names containing spaces or shell metacharacters work (and cannot inject
    # commands). exist_ok also tolerates another worker creating it first.
    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot:  # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    # NOTE(review): replace() removes every 'chr' occurrence, not only the
    # prefix added above; kept as is because callers may rely on it.
    buckets = glannot.buckets[chr.replace('chr', '')]
    all_annot = glannot.linearData

    # The bucket keys used below are multiples of 10000.
    # NOTE(review): presumably this must match the index's bucket size -- confirm.
    bucket_size = 10000

    res = {}
    with gzip.open('%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr), 'rt') as reads:
        for line in reads:
            t = line.strip().split('\t')

            barcode = t[3]
            if barcode not in whitelist:
                continue

            # Column 0 (chromosome) is not needed: each input file already
            # holds a single chromosome.
            left = int(t[1])
            rite = int(t[2])

            left_buck = ((left - 1) // bucket_size) * bucket_size
            right_buck = (rite // bucket_size) * bucket_size

            # Collect the ids of every annotation in the buckets the read touches.
            loc_ids = set()
            for buck in range(left_buck, right_buck + bucket_size, bucket_size):
                if buck in buckets:
                    loc_ids.update(buckets[buck])
            if not loc_ids:
                continue

            for index in loc_ids:
                entry = all_annot[index]
                span = entry['loc'].loc
                # Closed-interval overlap between the read and the annotation.
                if rite >= span['left'] and left <= span['right']:
                    if barcode not in res:
                        res[barcode] = defaultdict(int)
                    res[barcode][entry['annot']] += 1

    with gzip.open('%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr), 'wt') as out:
        for bc in sorted(res):
            per_barcode = res[bc]
            for gene in sorted(per_barcode):
                out.write('%s\t%s\t%s\n' % (bc, gene, per_barcode[gene]))