Example #1
from optparse import OptionParser
import os
import subprocess
import sys
import tempfile

from scipy.stats import norm

# project-local modules for summary statistics (mean_sd) and FDR correction
# (ben_hoch), assumed to sit alongside this script; the helper functions
# count_gff, count_hg19, feature_stats, hash_te and intersect_hash used below
# are likewise assumed to be defined elsewhere in the project
import fdr
import stats

# used to build the default repeat annotation path (cf. Examples #2/#3)
home_dir = os.environ['HOME']

def main():
    usage = 'usage: %prog [options] <feature gff/bed>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gff_file', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % home_dir)
    parser.add_option('-n', dest='null_iterations', type=int, default=50, help='Number of shuffles to perform to estimate null distribution [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.gff_file:
        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, options.gff_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF; keep the original suffix so the gtf/gff/bed
        # check below still recognizes the filtered file's format
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp(suffix=feature_gff[-4:])
        subprocess.call('intersectBed -s -u -f 0.5 -a %s -b %s > %s' % (feature_gff, options.gff_file, feature_gff_gff_file), shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute size of search space
    if options.gff_file:
        genome_length = count_gff(options.gff_file)
    else:
        genome_length = count_hg19()

    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # hash counted repeat genomic bp
    te_in = open(options.repeats_gff)
    genome_te_bp = hash_te(te_in)
    te_in.close()

    ############################################
    # convert feature gff to bed
    ############################################
    if feature_gff[-3:] == 'gtf':
        feature_bed_fd, feature_bed_file = tempfile.mkstemp()
        subprocess.call('gtf2bed.py %s > %s' % (feature_gff,feature_bed_file), shell=True)

    elif feature_gff[-3:] == 'gff':
        feature_bed_fd, feature_bed_file = tempfile.mkstemp()
        subprocess.call('gff2bed.py %s > %s' % (feature_gff,feature_bed_file), shell=True)

    elif feature_gff[-3:] == 'bed':
        feature_bed_file = feature_gff

    else:
        parser.error('Cannot recognize feature file suffix; expected gtf, gff, or bed')

    ############################################
    # null distribution
    ############################################
    shuffle_bed_fd, shuffle_bed_file = tempfile.mkstemp()

    te_null_bp = {}
    for ni in range(options.null_iterations):
        print >> sys.stderr, ni

        # shuffle feature bed
        subprocess.call('shuffleBed -i %s -g %s/research/common/data/genomes/hg19/assembly/human.hg19.genome -excl %s/research/common/data/genomes/hg19/assembly/hg19_gaps.bed > %s' % (feature_bed_file, home_dir, home_dir, shuffle_bed_file), shell=True)

        # intersect w/ TEs and hash overlaps
        te_tmp_bp = intersect_hash(options.repeats_gff, shuffle_bed_file)
        for te in genome_te_bp:
            te_null_bp.setdefault(te,[]).append(te_tmp_bp.get(te,0))

    ############################################
    # actual
    ############################################
    te_bp = intersect_hash(options.repeats_gff, feature_gff)

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in genome_te_bp:
        feature_freq = float(te_bp.get(te,0))/feature_len
        genome_freq = float(genome_te_bp[te])/genome_length
        fold_change = feature_freq / genome_freq

        #print te, stats.mean(te_null_bp[te]), stats.sd(te_null_bp[te])

        null_u, null_sd = stats.mean_sd(te_null_bp[te])
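        # guard against a degenerate (zero-variance) null, then run a one-tailed
        # Gaussian test: upper tail for apparent enrichment, lower tail for depletion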
        if null_sd == 0:
            null_sd = 1.0
            
        if fold_change > 1:
            p = norm.sf(te_bp[te]-1, loc=null_u, scale=null_sd)
        else:
            p = norm.cdf(te_bp.get(te,0), loc=null_u, scale=null_sd)

        p_vals.append(p)

        cols = (te[0], te[1], te_bp.get(te,0), feature_freq, genome_freq, fold_change, p)
        lines.append('%-18s %-18s %8d %11.2e %11.2e %9.2f %10.2e' % cols)

    # correct for multiple hypotheses
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    os.close(shuffle_bed_fd)
    os.remove(shuffle_bed_file)
    if feature_gff[-3:] != 'bed':
        os.close(feature_bed_fd)
        os.remove(feature_bed_file)
    if options.gff_file:
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
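
Example #1 calls several helpers that are not shown here (count_gff, count_hg19, feature_stats, hash_te, intersect_hash, stats.mean_sd, fdr.ben_hoch). To make the data flow concrete, below is a minimal sketch of what intersect_hash could look like, assuming the repeats GFF carries 'repeat' and 'family' attributes parsed by the project-local gff.gtf_kv helper used in Examples #2 and #3; it illustrates the expected interface, not the original implementation.

import subprocess

import gff  # project-local attribute parser (gtf_kv), as used in Examples #2/#3


def intersect_hash(repeats_gff, feature_file):
    # Sum overlapping bp per (repeat, family), keyed like genome_te_bp above.
    # Hypothetical sketch, not the original code.
    te_bp = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (repeats_gff, feature_file),
                         shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])          # GFF attribute column of the TE entry
        overlap_bp = int(a[-1])        # intersectBed -wo appends the overlap length
        te = (kv['repeat'], kv['family'])
        te_bp[te] = te_bp.get(te, 0) + overlap_bp
    p.communicate()
    return te_bp

hash_te presumably follows the same pattern over the repeats GFF alone, summing each entry's length per (repeat, family).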
Example #2
from optparse import OptionParser
import os
import subprocess
import sys
import tempfile

from scipy.stats import binom

# project-local modules for GFF attribute parsing (gtf_kv) and FDR correction
# (ben_hoch), assumed to sit alongside this script; the helper functions
# count_bed, count_hg19, feature_stats and te_target_size used below are
# likewise assumed to be defined elsewhere in the project
import fdr
import gff

def main():
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % os.environ['HOME'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -u -f 0.5 -a %s -b %s > %s' % (feature_gff, filter_merged_bed_file, feature_gff_gff_file), shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)    

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, feature_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    te_lengths = te_target_size(options.repeats_gff, feature_len)

    ############################################
    # hash TE/feature overlaps
    ############################################
    # initialize
    te_features = {}
    for rep, fam in te_lengths:
        if options.strand_split:
            te_features[(rep+'+',fam)] = set()
            te_features[('*+',fam)] = set()
            te_features[('*+','*')] = set()
            te_features[(rep+'-',fam)] = set()
            te_features[('*-',fam)] = set()
            te_features[('*-','*')] = set()
        else:
            te_features[(rep,fam)] = set()
            te_features[('*',fam)] = set()
            te_features[('*','*')] = set()
        
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (options.repeats_gff,feature_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')
        
        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        fchrom = a[9]
        fstart = int(a[12])
        fend = int(a[13])

        rep_star = '*'
        if options.strand_split:
            tstrand = a[6]
            fstrand = a[15]
            if tstrand == fstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        te_features[(rep,fam)].add((fchrom,fstart,fend))
        te_features[(rep_star,fam)].add((fchrom,fstart,fend))
        te_features[(rep_star,'*')].add((fchrom,fstart,fend))

    p.communicate()

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in te_features:
        rep, fam = te

        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
            te_p = float(te_len) / (2*genome_length)
        else:
            te_len = te_lengths[(rep,fam)]
            te_p = float(te_len) / genome_length
        
        te_count = len(te_features.get(te,[]))
        exp_count = te_p * feature_num

        fold_change = te_count / exp_count
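        # one-tailed binomial test: each of the feature_num features hits this
        # TE with probability te_p under the null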

        if fold_change > 1:
            p_val = binom.sf(te_count-1, feature_num, te_p)
        else:
            p_val = binom.cdf(te_count, feature_num, te_p)
        
        p_vals.append(p_val)

        cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
        lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

    # correct for multiple hypotheses
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
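
All three examples defer multiple-testing correction to a project-local fdr module. Its ben_hoch function is not shown; judging from how it is called, it takes a list of p-values and returns Benjamini-Hochberg q-values in the same order. A self-contained sketch under that assumption:

def ben_hoch(p_vals):
    # Benjamini-Hochberg q-values, returned in the input order.
    # Assumed stand-in for the project-local fdr.ben_hoch; not the original code.
    m = len(p_vals)
    order = sorted(range(m), key=lambda i: p_vals[i])   # indices sorted by p-value
    q_vals = [0.0] * m
    prev_q = 1.0
    # walk from the largest p-value down, enforcing monotone q-values
    for rank in range(m, 0, -1):
        i = order[rank - 1]
        q = min(prev_q, p_vals[i] * m / rank)
        q_vals[i] = q
        prev_q = q
    return q_vals

The same adjustment is available off the shelf as statsmodels.stats.multitest.multipletests with method='fdr_bh'.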
Example #3
from optparse import OptionParser
import os
import subprocess
import sys
import tempfile

from scipy.stats import binom

# project-local modules for GFF attribute parsing (gtf_kv) and FDR correction
# (ben_hoch), assumed to sit alongside this script; the helper functions
# count_bed, count_hg19, feature_stats and te_target_size used below are
# likewise assumed to be defined elsewhere in the project
import fdr
import gff

def main():
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g',
        dest='filter_gff',
        help=
        'Filter the TEs by overlap with genes in the given gff file [Default: %default]'
    )
    parser.add_option(
        '-r',
        dest='repeats_gff',
        default=
        '%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff'
        % os.environ['HOME'])
    parser.add_option('-s',
                      dest='strand_split',
                      default=False,
                      action='store_true',
                      help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' %
                        (options.filter_gff, filter_merged_bed_file),
                        shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call(
            'intersectBed -a %s -b %s > %s' %
            (options.repeats_gff, filter_merged_bed_file, te_gff_file),
            shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
        subprocess.call(
            'intersectBed -u -f 0.5 -a %s -b %s > %s' %
            (feature_gff, filter_merged_bed_file, feature_gff_gff_file),
            shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, feature_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    te_lengths = te_target_size(options.repeats_gff, feature_len)

    ############################################
    # hash TE/feature overlaps
    ############################################
    # initialize
    te_features = {}
    for rep, fam in te_lengths:
        if options.strand_split:
            te_features[(rep + '+', fam)] = set()
            te_features[('*+', fam)] = set()
            te_features[('*+', '*')] = set()
            te_features[(rep + '-', fam)] = set()
            te_features[('*-', fam)] = set()
            te_features[('*-', '*')] = set()
        else:
            te_features[(rep, fam)] = set()
            te_features[('*', fam)] = set()
            te_features[('*', '*')] = set()

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' %
                         (options.repeats_gff, feature_gff),
                         shell=True,
                         stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        fchrom = a[9]
        fstart = int(a[12])
        fend = int(a[13])

        rep_star = '*'
        if options.strand_split:
            tstrand = a[6]
            fstrand = a[15]
            if tstrand == fstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        te_features[(rep, fam)].add((fchrom, fstart, fend))
        te_features[(rep_star, fam)].add((fchrom, fstart, fend))
        te_features[(rep_star, '*')].add((fchrom, fstart, fend))

    p.communicate()

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in te_features:
        rep, fam = te

        if options.strand_split:
            te_len = te_lengths[(rep[:-1], fam)]
            te_p = float(te_len) / (2 * genome_length)
        else:
            te_len = te_lengths[(rep, fam)]
            te_p = float(te_len) / genome_length

        te_count = len(te_features.get(te, []))
        exp_count = te_p * feature_num

        fold_change = te_count / exp_count

        if fold_change > 1:
            p_val = binom.sf(te_count - 1, feature_num, te_p)
        else:
            p_val = binom.cdf(te_count, feature_num, te_p)

        p_vals.append(p_val)

        cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
        lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

    # correct for multiple hypotheses
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
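
Examples #2 and #3 model each feature as an independent trial that overlaps a given TE with probability te_p, the TE's share of the searchable genome, and test the observed overlap count against that binomial null (upper tail for enrichment, lower tail for depletion). A small standalone check of that calculation with scipy.stats.binom, using made-up numbers:

from scipy.stats import binom

# made-up illustration: 10,000 features, a TE family covering 1% of the
# search space, 140 observed overlapping features
feature_num = 10000
te_p = 0.01
te_count = 140

exp_count = te_p * feature_num      # expected overlaps under the null: 100.0
fold_change = te_count / exp_count  # 1.4x enrichment (exp_count is a float)

if fold_change > 1:
    # upper tail, including te_count itself
    p_val = binom.sf(te_count - 1, feature_num, te_p)
else:
    # lower tail for depletion
    p_val = binom.cdf(te_count, feature_num, te_p)

print('fold change %.2f, p-value %.2e' % (fold_change, p_val))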