def make_gene_info(output_file, gene_model, genome_id, is_grc, add_ref_id):
    """Write a sorted, bgzip-compressed, tabix-indexed BED file of gene spans.

    Every record of the gzipped UCSC gene table returned by
    ``utils.set_ucsc_gene_file(genome_id, gene_model)`` becomes one
    tab-separated line: chromosome, start, end, name, "0", strand.

    Args:
        output_file: Path of the final bgzipped file.  The sibling paths
            ``output_file + ".unsorted.tmp"`` and
            ``output_file + ".sorted.tmp"`` are used as scratch files.
        gene_model: Either "refseq" or "gencode".
        genome_id: Genome identifier forwarded to ``chr_name`` and ``utils``.
        is_grc: When true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped name.
        add_ref_id: Only used for "refseq"; when true the name column is
            ``symbol(gene_id)`` instead of just ``symbol``.

    Raises:
        SystemExit: With status 1 when ``gene_model`` is neither "refseq"
            nor "gencode".
        subprocess.CalledProcessError: When sort, bgzip or tabix fails.
    """
    import os

    # Reject an unknown gene model before any work is done.  Checking inside
    # the read loop would exit with the scratch file still open and left on
    # disk, and would never trigger for an empty gene table.
    if gene_model not in ("refseq", "gencode"):
        sys.stderr.write(
            "the value of gene_model should be refseq or gencode\n")
        sys.exit(1)

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = chr_name.make_ucsc2grc(genome_id) if is_grc else {}

    ucsc_gene_file = utils.set_ucsc_gene_file(genome_id, gene_model)

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        # NOTE(review): lines are handled as str, which matches Python 2;
        # on Python 3 gzip.open(..., 'r') yields bytes -- confirm the
        # target interpreter before porting.
        with open(unsorted_tmp, 'w') as hout, \
                gzip.open(ucsc_gene_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')

                chrom = ucsc2grc.get(F[2], F[2])
                gene_id = F[1]
                strand = F[3]
                gene_start = F[4]
                gene_end = F[5]
                symbol = F[12]

                if gene_model == "refseq":
                    if add_ref_id:
                        gene_print_name = symbol + '(' + gene_id + ')'
                    else:
                        gene_print_name = symbol
                else:  # "gencode"
                    gene_print_name = gene_id

                hout.write('\t'.join([chrom, gene_start, gene_end,
                                      gene_print_name, "0", strand]) + '\n')

        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Scratch files are removed on failure as well as on success.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def make_simple_repeat_info(output_file, genome_id, is_grc):
    """Write a sorted, bgzipped, tabix-indexed BED file of simple repeats.

    Reads the packaged ``data/<genome_id>/simpleRepeat.txt.gz`` table of the
    ``annot_utils`` package, drops its first column, optionally renames the
    chromosome (second column) and writes the remaining columns unchanged.

    Args:
        output_file: Path of the final bgzipped file.  The sibling paths
            ``output_file + ".unsorted.tmp"`` and
            ``output_file + ".sorted.tmp"`` are used as scratch files.
        genome_id: One of "hg19", "hg38" or "mm10".
        is_grc: When true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped name.

    Raises:
        SystemExit: With status 1 when ``genome_id`` is not supported.
        subprocess.CalledProcessError: When sort, bgzip or tabix fails.
    """
    import os

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = chr_name.make_ucsc2grc(genome_id) if is_grc else {}

    if genome_id not in ("hg19", "hg38", "mm10"):
        sys.stderr.write("genome_id shoud be hg19, hg38 or mm10\n")
        sys.exit(1)

    # The supported genomes differ only in the directory name.
    simple_repeat_file = pkg_resources.resource_filename(
        "annot_utils", "data/" + genome_id + "/simpleRepeat.txt.gz")

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        # NOTE(review): lines are handled as str, which matches Python 2;
        # on Python 3 gzip.open(..., 'r') yields bytes -- confirm the
        # target interpreter before porting.
        with open(unsorted_tmp, 'w') as hout, \
                gzip.open(simple_repeat_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                chrom = ucsc2grc.get(F[1], F[1])
                hout.write(chrom + '\t' + '\t'.join(F[2:]) + '\n')

        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Scratch files are removed on failure as well as on success.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def make_exon_info(output_file, gene_model, genome_id, is_grc, add_ref_id):
    """Write a sorted, bgzipped, tabix-indexed BED file with one line per exon.

    Every exon of every record in the gzipped UCSC gene table returned by
    ``utils.set_ucsc_gene_file(genome_id, gene_model)`` becomes one
    tab-separated line: chromosome, exon start, exon end, name, exon index,
    strand.  For "+" records the exon index is the 0-based position in the
    table's exon list; for all other records it is ``exon count - i - 1``,
    and the strand column is written as "-".

    Args:
        output_file: Path of the final bgzipped file.  The sibling paths
            ``output_file + ".unsorted.tmp"`` and
            ``output_file + ".sorted.tmp"`` are used as scratch files.
        gene_model: "refseq" or "gencode".  Any other value is not rejected
            here; the name column is then written as "---".
        genome_id: Genome identifier forwarded to ``chr_name`` and ``utils``.
        is_grc: When true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped name.
        add_ref_id: Only used for "refseq"; when true the name column is
            ``symbol(gene_id)`` instead of just ``symbol``.

    Raises:
        subprocess.CalledProcessError: When sort, bgzip or tabix fails.
    """
    import os

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = chr_name.make_ucsc2grc(genome_id) if is_grc else {}

    ucsc_gene_file = utils.set_ucsc_gene_file(genome_id, gene_model)

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        # NOTE(review): lines are handled as str, which matches Python 2;
        # on Python 3 gzip.open(..., 'r') yields bytes -- confirm the
        # target interpreter before porting.
        with open(unsorted_tmp, 'w') as hout, \
                gzip.open(ucsc_gene_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')

                chrom = ucsc2grc.get(F[2], F[2])
                gene_id = F[1]
                strand = F[3]
                exon_count = int(F[8])
                symbol = F[12]

                # The coordinate lists end with a trailing comma; dropping
                # the empty items keeps every exon even if it is missing.
                exon_starts = [x for x in F[9].split(',') if x != '']
                exon_ends = [x for x in F[10].split(',') if x != '']

                gene_print_name = "---"
                if gene_model == "refseq":
                    if add_ref_id:
                        gene_print_name = symbol + '(' + gene_id + ')'
                    else:
                        gene_print_name = symbol
                elif gene_model == "gencode":
                    gene_print_name = gene_id

                for i, (start, end) in enumerate(zip(exon_starts, exon_ends)):
                    if strand == "+":
                        exon_index, strand_out = i, "+"
                    else:
                        exon_index, strand_out = exon_count - i - 1, "-"
                    hout.write('\t'.join([chrom, start, end, gene_print_name,
                                          str(exon_index), strand_out]) + '\n')

        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Scratch files are removed on failure as well as on success.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def make_boundary_info(output_file, genome_id, is_grc, donor_size, acceptor_size):
    """Write a sorted, bgzipped, tabix-indexed BED file of exon-boundary windows.

    Reads the gzipped gene table returned by
    ``utils.set_ucsc_gene_file(genome_id, "refseq")``.  For every pair of
    consecutive exons it emits a window around the upstream exon's end and a
    window around the downstream exon's start, labelled "donor" or
    "acceptor" according to the strand.  Windows that share chromosome,
    coordinates, symbol, label and strand are merged into a single line.

    Output columns (tab separated): chromosome, window start, window end,
    symbol, "donor"/"acceptor", strand, comma-joined junctions
    (``chr:exon_end-next_exon_start+1``), comma-joined gene ids, comma-joined
    exon indices.

    Args:
        output_file: Path of the final bgzipped file.  The sibling paths
            ``output_file + ".unsorted.tmp"`` and
            ``output_file + ".sorted.tmp"`` are used as scratch files.
        genome_id: Genome identifier forwarded to ``chr_name`` and ``utils``.
        is_grc: When true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped name.
        donor_size: String "<exon bases>,<intron bases>" for donor windows.
        acceptor_size: String "<intron bases>,<exon bases>" for acceptor
            windows.

    Raises:
        ValueError: When ``donor_size`` or ``acceptor_size`` does not hold
            exactly two comma-separated integers.
        subprocess.CalledProcessError: When sort, bgzip or tabix fails.
    """
    import os

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = chr_name.make_ucsc2grc(genome_id) if is_grc else {}

    ucsc_gene_file = utils.set_ucsc_gene_file(genome_id, "refseq")

    donor_size_exon, donor_size_intron = [
        int(x) for x in donor_size.split(',')]
    acceptor_size_intron, acceptor_size_exon = [
        int(x) for x in acceptor_size.split(',')]

    # window key -> (junctions, gene ids, exon indices), kept in step
    key2info = {}

    def add_window(chrom, pos, left, right, symbol, label, strand,
                   junction, gene_id, exon_index):
        # Record one window [pos - left, pos + right] under its merged key.
        key = '\t'.join([chrom, str(pos - left), str(pos + right),
                         symbol, label, strand])
        junctions, gene_ids, exon_nums = key2info.setdefault(
            key, ([], [], []))
        junctions.append(junction)
        gene_ids.append(gene_id)
        exon_nums.append(str(exon_index))

    # NOTE(review): lines are handled as str, which matches Python 2; on
    # Python 3 gzip.open(..., 'r') yields bytes -- confirm the target
    # interpreter before porting.
    with gzip.open(ucsc_gene_file, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')

            chrom = ucsc2grc.get(F[2], F[2])
            starts = [int(x) for x in F[9].split(',') if x != '']
            ends = [int(x) for x in F[10].split(',') if x != '']
            strand = F[3]
            exon_num = int(F[8])
            gene_id = F[1]
            symbol = F[12]

            # Windows centred on the end of each exon except the last.
            for i in range(0, exon_num - 1):
                junction = chrom + ':' + str(ends[i]) + '-' + \
                    str(starts[i + 1] + 1)
                if strand == '+':
                    add_window(chrom, ends[i], donor_size_exon,
                               donor_size_intron, symbol, "donor", strand,
                               junction, gene_id, i)
                else:
                    add_window(chrom, ends[i], acceptor_size_exon,
                               acceptor_size_intron, symbol, "acceptor",
                               strand, junction, gene_id, i)

            # Windows centred on the start of each exon except the first.
            for i in range(1, exon_num):
                junction = chrom + ':' + str(ends[i - 1]) + '-' + \
                    str(starts[i] + 1)
                if strand == '+':
                    add_window(chrom, starts[i], acceptor_size_intron,
                               acceptor_size_exon, symbol, "acceptor",
                               strand, junction, gene_id, i)
                else:
                    add_window(chrom, starts[i], donor_size_intron,
                               donor_size_exon, symbol, "donor", strand,
                               junction, gene_id, i)

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        with open(unsorted_tmp, 'w') as hout:
            for key in sorted(key2info):
                junctions, gene_ids, exon_nums = key2info[key]
                hout.write('\t'.join([key,
                                      ','.join(junctions),
                                      ','.join(gene_ids),
                                      ','.join(exon_nums)]) + '\n')

        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Scratch files are removed on failure as well as on success.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)