示例#1
0
def make_gene_info(output_file, gene_model, genome_id, is_grc, add_ref_id):
    """Write a sorted, bgzip-compressed, tabix-indexed BED file of gene spans.

    Every line of the gzipped gene table returned by
    ``utils.set_ucsc_gene_file(genome_id, gene_model)`` yields one output
    record: chromosome (0-based column 2), start (column 4), end (column 5),
    gene name, the constant ``0`` and strand (column 3).
    NOTE(review): the column layout looks like a UCSC genePred table with a
    leading ``bin`` column -- confirm against the bundled data files.

    Args:
        output_file: Path of the final bgzip file. The scratch files
            ``<output_file>.unsorted.tmp`` and ``<output_file>.sorted.tmp``
            are created next to it and removed before returning.
        gene_model: ``"refseq"`` or ``"gencode"``. For ``"refseq"`` the gene
            name is column 12 (optionally followed by column 1 in
            parentheses); for ``"gencode"`` it is column 1.
        genome_id: Genome identifier forwarded to ``utils`` and ``chr_name``.
        is_grc: If true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped names.
        add_ref_id: For ``"refseq"`` only, append ``(<column 1>)`` to the
            gene name.

    Raises:
        SystemExit: With status 1 if ``gene_model`` is neither ``"refseq"``
            nor ``"gencode"``.
        subprocess.CalledProcessError: If ``sort``, ``bgzip`` or ``tabix``
            exits with a non-zero status.
    """
    import os

    # Validate once, up front: doing this inside the read loop (as before)
    # exited with the scratch file still open and left on disk, and never
    # triggered at all for an empty input table.
    if gene_model not in ("refseq", "gencode"):
        sys.stderr.write(
            "the value of gene_model should be refseq or gencode\n")
        sys.exit(1)

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = {}
    if is_grc:
        ucsc2grc = chr_name.make_ucsc2grc(genome_id)

    ucsc_gene_file = utils.set_ucsc_gene_file(genome_id, gene_model)

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        with open(unsorted_tmp, 'w') as hout:
            # NOTE(review): mode 'r' yields text lines under Python 2, which
            # is what the rest of this file targets.
            with gzip.open(ucsc_gene_file, 'r') as hin:
                for line in hin:
                    F = line.rstrip('\n').split('\t')

                    chrom = ucsc2grc.get(F[2], F[2])
                    gene_id = F[1]
                    strand = F[3]
                    gene_start = F[4]
                    gene_end = F[5]
                    symbol = F[12]

                    if gene_model == "refseq":
                        if add_ref_id:
                            gene_print_name = symbol + '(' + gene_id + ')'
                        else:
                            gene_print_name = symbol
                    else:  # "gencode" (validated above)
                        gene_print_name = gene_id

                    hout.write('\t'.join([
                        chrom, gene_start, gene_end, gene_print_name, "0",
                        strand
                    ]) + '\n')

        # tabix requires position-sorted input: chromosome, then start, end.
        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Remove scratch files on success and on failure alike.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
示例#2
0
def make_simple_repeat_info(output_file, genome_id, is_grc):
    """Write a sorted, bgzip-compressed, tabix-indexed BED file of repeats.

    Reads the packaged ``data/<genome_id>/simpleRepeat.txt.gz`` table of the
    ``annot_utils`` package. For every line, the first column is dropped, the
    second column is used as the chromosome name, and all remaining columns
    are copied through unchanged.

    Args:
        output_file: Path of the final bgzip file. The scratch files
            ``<output_file>.unsorted.tmp`` and ``<output_file>.sorted.tmp``
            are created next to it and removed before returning.
        genome_id: One of ``"hg19"``, ``"hg38"`` or ``"mm10"``.
        is_grc: If true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped names.

    Raises:
        SystemExit: With status 1 if ``genome_id`` is not supported.
        subprocess.CalledProcessError: If ``sort``, ``bgzip`` or ``tabix``
            exits with a non-zero status.
    """
    import os

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = {}
    if is_grc:
        ucsc2grc = chr_name.make_ucsc2grc(genome_id)

    if genome_id not in ("hg19", "hg38", "mm10"):
        sys.stderr.write("genome_id shoud be hg19, hg38 or mm10\n")
        sys.exit(1)

    # The packaged tables differ only in the genome directory name.
    simple_repeat_file = pkg_resources.resource_filename(
        "annot_utils", "data/" + genome_id + "/simpleRepeat.txt.gz")

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        with open(unsorted_tmp, 'w') as hout:
            # NOTE(review): mode 'r' yields text lines under Python 2, which
            # is what the rest of this file targets.
            with gzip.open(simple_repeat_file, 'r') as hin:
                for line in hin:
                    F = line.rstrip('\n').split('\t')
                    chrom = ucsc2grc.get(F[1], F[1])
                    hout.write(chrom + '\t' + '\t'.join(F[2:]) + '\n')

        # tabix requires position-sorted input: chromosome, then start, end.
        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Remove scratch files on success and on failure alike; previously a
        # failing sort/bgzip/tabix left them (and an open handle) behind.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
示例#3
0
def make_exon_info(output_file, gene_model, genome_id, is_grc, add_ref_id):
    """Write a sorted, bgzip-compressed, tabix-indexed BED file of exons.

    Every line of the gzipped gene table returned by
    ``utils.set_ucsc_gene_file(genome_id, gene_model)`` yields one output
    record per exon: chromosome (0-based column 2), exon start (from column
    9), exon end (from column 10), gene name, exon index and strand.
    For strand ``"+"`` the exon index is the 0-based position in the exon
    list; for any other strand it is ``int(column 8) - position - 1`` and the
    strand is written as ``"-"``.
    NOTE(review): the column layout looks like a UCSC genePred table with a
    leading ``bin`` column -- confirm against the bundled data files.

    Args:
        output_file: Path of the final bgzip file. The scratch files
            ``<output_file>.unsorted.tmp`` and ``<output_file>.sorted.tmp``
            are created next to it and removed before returning.
        gene_model: ``"refseq"`` (name is column 12, optionally followed by
            column 1 in parentheses) or ``"gencode"`` (name is column 1).
            Any other value is not rejected here; the name is then written
            as ``"---"``.
        genome_id: Genome identifier forwarded to ``utils`` and ``chr_name``.
        is_grc: If true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped names.
        add_ref_id: For ``"refseq"`` only, append ``(<column 1>)`` to the
            gene name.

    Raises:
        subprocess.CalledProcessError: If ``sort``, ``bgzip`` or ``tabix``
            exits with a non-zero status.
    """
    import os

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = {}
    if is_grc:
        ucsc2grc = chr_name.make_ucsc2grc(genome_id)

    ucsc_gene_file = utils.set_ucsc_gene_file(genome_id, gene_model)

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        with open(unsorted_tmp, 'w') as hout:
            # NOTE(review): mode 'r' yields text lines under Python 2, which
            # is what the rest of this file targets.
            with gzip.open(ucsc_gene_file, 'r') as hin:
                for line in hin:
                    F = line.rstrip('\n').split('\t')

                    chrom = ucsc2grc.get(F[2], F[2])
                    gene_id = F[1]
                    strand = F[3]
                    exon_count = int(F[8])
                    symbol = F[12]

                    # Drop empty fields instead of assuming a trailing comma
                    # and slicing off the last element: a list written
                    # without the trailing comma used to lose its last exon.
                    exon_starts = [x for x in F[9].split(',') if x != '']
                    exon_ends = [x for x in F[10].split(',') if x != '']

                    gene_print_name = "---"
                    if gene_model == "refseq":
                        if add_ref_id:
                            gene_print_name = symbol + '(' + gene_id + ')'
                        else:
                            gene_print_name = symbol
                    elif gene_model == "gencode":
                        gene_print_name = gene_id

                    for i, (start, end) in enumerate(
                            zip(exon_starts, exon_ends)):
                        if strand == "+":
                            exon_index, out_strand = i, "+"
                        else:
                            # Count exons from the other end of the list for
                            # the non-plus strand.
                            exon_index, out_strand = exon_count - i - 1, "-"
                        hout.write('\t'.join([
                            chrom, start, end, gene_print_name,
                            str(exon_index), out_strand
                        ]) + '\n')

        # tabix requires position-sorted input: chromosome, then start, end.
        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Remove scratch files on success and on failure alike.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
示例#4
0
def make_boundary_info(output_file, genome_id, is_grc, donor_size,
                       acceptor_size):
    """Write a sorted, bgzip-compressed, tabix-indexed BED file of exon
    boundary windows ("donor" / "acceptor") from the refseq gene table.

    For every pair of consecutive exons of every line in the table returned
    by ``utils.set_ucsc_gene_file(genome_id, "refseq")`` two windows are
    produced: one around the end of the earlier exon and one around the
    start of the later exon. On strand ``"+"`` the former is labelled
    ``"donor"`` and the latter ``"acceptor"``; on any other strand the labels
    are swapped. Windows that share chromosome, coordinates, symbol, label
    and strand are merged into one record whose last three columns are
    comma-joined lists of junctions (``chr:end-start+1``), column-1 ids and
    exon indices.

    Args:
        output_file: Path of the final bgzip file. The scratch files
            ``<output_file>.unsorted.tmp`` and ``<output_file>.sorted.tmp``
            are created next to it and removed before returning.
        genome_id: Genome identifier forwarded to ``utils`` and ``chr_name``.
        is_grc: If true, chromosome names present in the table built by
            ``chr_name.make_ucsc2grc`` are replaced by their mapped names.
        donor_size: String ``"<exon>,<intron>"``: bases taken on the exon
            side and on the intron side of a donor boundary.
        acceptor_size: String ``"<intron>,<exon>"``: bases taken on the
            intron side and on the exon side of an acceptor boundary.

    Raises:
        ValueError: If ``donor_size`` or ``acceptor_size`` is not exactly two
            comma-separated integers.
        subprocess.CalledProcessError: If ``sort``, ``bgzip`` or ``tabix``
            exits with a non-zero status.
    """
    import os

    # Parse the window sizes first so that a malformed argument fails before
    # any table is loaded or any file is created.
    donor_size_exon, donor_size_intron = [
        int(x) for x in donor_size.split(',')
    ]
    acceptor_size_intron, acceptor_size_exon = [
        int(x) for x in acceptor_size.split(',')
    ]

    # create UCSC to GRC chr name corresponding table
    ucsc2grc = {}
    if is_grc:
        ucsc2grc = chr_name.make_ucsc2grc(genome_id)

    ucsc_gene_file = utils.set_ucsc_gene_file(genome_id, "refseq")

    # BED key -> (junctions, gene ids, exon indices); the three lists are
    # appended to in lockstep.
    key2info = {}

    def add_boundary(fields, junction, gene_id, exon_index):
        junctions, gene_ids, exon_nums = key2info.setdefault(
            '\t'.join(fields), ([], [], []))
        junctions.append(junction)
        gene_ids.append(gene_id)
        exon_nums.append(str(exon_index))

    # NOTE(review): mode 'r' yields text lines under Python 2, which is what
    # the rest of this file targets.
    with gzip.open(ucsc_gene_file, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')

            chrom = ucsc2grc.get(F[2], F[2])
            starts = [int(x) for x in F[9].split(',') if x != '']
            ends = [int(x) for x in F[10].split(',') if x != '']
            strand = F[3]
            exon_num = int(F[8])
            gene_id = F[1]
            symbol = F[12]

            # Window around the end of every exon except the last one.
            for i in range(exon_num - 1):
                if strand == '+':
                    label = "donor"
                    left, right = donor_size_exon, donor_size_intron
                else:
                    label = "acceptor"
                    left, right = acceptor_size_exon, acceptor_size_intron

                add_boundary(
                    [chrom, str(ends[i] - left), str(ends[i] + right),
                     symbol, label, strand],
                    chrom + ':' + str(ends[i]) + '-' + str(starts[i + 1] + 1),
                    gene_id, i)

            # Window around the start of every exon except the first one.
            for i in range(1, exon_num):
                if strand == '+':
                    label = "acceptor"
                    left, right = acceptor_size_intron, acceptor_size_exon
                else:
                    label = "donor"
                    left, right = donor_size_intron, donor_size_exon

                add_boundary(
                    [chrom, str(starts[i] - left), str(starts[i] + right),
                     symbol, label, strand],
                    chrom + ':' + str(ends[i - 1]) + '-' + str(starts[i] + 1),
                    gene_id, i)

    unsorted_tmp = output_file + ".unsorted.tmp"
    sorted_tmp = output_file + ".sorted.tmp"

    try:
        with open(unsorted_tmp, 'w') as hout:
            for key in sorted(key2info):
                junctions, gene_ids, exon_nums = key2info[key]
                hout.write('\t'.join([
                    key, ','.join(junctions), ','.join(gene_ids),
                    ','.join(exon_nums)
                ]) + '\n')

        # tabix requires position-sorted input: chromosome, then start, end.
        with open(sorted_tmp, 'w') as hout:
            subprocess.check_call(
                ["sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_tmp],
                stdout=hout)

        with open(output_file, 'w') as hout:
            subprocess.check_call(["bgzip", "-f", "-c", sorted_tmp],
                                  stdout=hout)

        subprocess.check_call(["tabix", "-p", "bed", output_file])
    finally:
        # Remove scratch files on success and on failure alike.
        for tmp_path in (unsorted_tmp, sorted_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)