Пример #1
0
def main():
    """Write promoter intervals for a GTF file and extract their sequences.

    Parses the command line, calls ``gff.promoters`` to write
    ``<prefix>.gff`` and then runs ``gff2fa.py`` on that file with its
    standard output captured in ``<prefix>.fa``.

    Raises:
        subprocess.CalledProcessError: if ``gff2fa.py`` exits with a
            non-zero status.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=0, help='Downstream promoter length [Default: %default]')
    parser.add_option('-u', dest='upstream', type='int', default=2000, help='Upstream promoter length [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='promoter', help='Output file prefix [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() prints the message and exits, so no else branch
        # is needed for the success path.
        parser.error('Must provide gtf file')
    gtf_file = args[0]

    gff_file = '%s.gff' % options.output_pre
    fa_file = '%s.fa' % options.output_pre

    gff.promoters(gtf_file, options.upstream, options.downstream, gff_file)

    # Pass an argument list and redirect stdout ourselves instead of
    # interpolating the user-supplied prefix into a shell command line:
    # prefixes containing spaces or shell metacharacters would otherwise
    # break (or hijack) the command.  check_call also waits for the child
    # and surfaces a failing exit status instead of silently ignoring it.
    with open(fa_file, 'w') as fa_out:
        subprocess.check_call(['gff2fa.py', gff_file], stdout=fa_out)
Пример #2
0
def intersect_gene_te(gtf_file, upstream, downstream):
    """Count repeat-overlapping nucleotides in the promoter of each transcript.

    Promoter intervals for ``gtf_file`` are written by ``gff.promoters`` to a
    temporary file, which is intersected with ``hg19_reps_gff`` using
    ``intersectBed -wo``.  The temporary file is removed before returning.

    Args:
        gtf_file: Path to a GTF file; its attribute column (column 9) must
            provide ``transcript_id``.
        upstream: Passed through to ``gff.promoters``.
        downstream: Passed through to ``gff.promoters``.

    Returns:
        A dict mapping transcript id to a dict of overlap totals keyed by
        ``(repeat, family)``, ``('*', family)`` and ``('*', '*')``.
        Transcripts in ``gtf_file`` with no intersection record map to
        ``{('n', 'n'): 1}``.
    """
    import os  # local import: needed only for temporary-file cleanup

    gene_trans = {}

    # focus on promoter
    tmp_fd, tmp_file = tempfile.mkstemp()
    # mkstemp() returns an already-open descriptor; only the path is used
    # below, so close it right away instead of leaking it.
    os.close(tmp_fd)
    try:
        gff.promoters(gtf_file, upstream, downstream, tmp_file)

        # intersect genes w/ repeats
        # universal_newlines=True makes the pipe yield text, so the str
        # split below also works on Python 3.
        p = subprocess.Popen('intersectBed -wo -a %s -b %s' %
                             (tmp_file, hg19_reps_gff),
                             shell=True,
                             stdout=subprocess.PIPE,
                             universal_newlines=True)

        # hash transposon nt by gene
        for line in p.stdout:
            a = line.split('\t')

            # get names
            gene = gff.gtf_kv(a[8])['transcript_id']
            rep_kv = gff.gtf_kv(a[17])
            rep = rep_kv['repeat']
            fam = rep_kv['family']

            # add nt
            # presumably a[18] is the overlap length appended by -wo after
            # two 9-column records -- confirm against the input formats
            nt = int(a[18])
            counts = gene_trans.setdefault(gene, {})
            for key in ((rep, fam), ('*', fam), ('*', '*')):
                counts[key] = counts.get(key, 0) + nt

        p.communicate()
    finally:
        # the promoter file is private to this call; do not leave it behind
        os.remove(tmp_file)

    # create a fake family for dTE-lncRNAs
    # (i.e. transcripts that produced no intersection record above)
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            if tid not in gene_trans:
                gene_trans[tid] = {('n', 'n'): 1}

    return gene_trans
Пример #3
0
def intersect_gene_te(gtf_file, upstream, downstream):
    """Count repeat-overlapping nucleotides in the promoter of each transcript.

    Promoter intervals for ``gtf_file`` are written by ``gff.promoters`` to a
    temporary file, which is intersected with ``hg19_reps_gff`` using
    ``intersectBed -wo``.  The temporary file is removed before returning.

    Args:
        gtf_file: Path to a GTF file; its attribute column (column 9) must
            provide ``transcript_id``.
        upstream: Passed through to ``gff.promoters``.
        downstream: Passed through to ``gff.promoters``.

    Returns:
        A dict mapping transcript id to a dict of overlap totals keyed by
        ``(repeat, family)``, ``('*', family)`` and ``('*', '*')``.
        Transcripts in ``gtf_file`` with no intersection record map to
        ``{('n', 'n'): 1}``.
    """
    import os  # local import: needed only for temporary-file cleanup

    gene_trans = {}

    # focus on promoter
    tmp_fd, tmp_file = tempfile.mkstemp()
    # mkstemp() returns an already-open descriptor; only the path is used
    # below, so close it right away instead of leaking it.
    os.close(tmp_fd)
    try:
        gff.promoters(gtf_file, upstream, downstream, tmp_file)

        # intersect genes w/ repeats
        # universal_newlines=True makes the pipe yield text, so the str
        # split below also works on Python 3.
        p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (tmp_file,hg19_reps_gff),
                             shell=True,
                             stdout=subprocess.PIPE,
                             universal_newlines=True)

        # hash transposon nt by gene
        for line in p.stdout:
            a = line.split('\t')

            # get names
            gene = gff.gtf_kv(a[8])['transcript_id']
            rep_kv = gff.gtf_kv(a[17])
            rep = rep_kv['repeat']
            fam = rep_kv['family']

            # add nt
            # presumably a[18] is the overlap length appended by -wo after
            # two 9-column records -- confirm against the input formats
            nt = int(a[18])
            counts = gene_trans.setdefault(gene, {})
            for key in ((rep,fam), ('*',fam), ('*','*')):
                counts[key] = counts.get(key, 0) + nt

        p.communicate()
    finally:
        # the promoter file is private to this call; do not leave it behind
        os.remove(tmp_file)

    # create a fake family for dTE-lncRNAs
    # (i.e. transcripts that produced no intersection record above)
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            if tid not in gene_trans:
                gene_trans[tid] = {('n','n'):1}

    return gene_trans