Exemplo n.º 1
0
def main():
    """Compare repeat (TE) overlap of a feature set against a shuffled null.

    Command line: ``[options] <feature gff/bed>``.  The single positional
    argument must end in ``gtf``, ``gff`` or ``bed``.

    Steps performed:
      1. If ``-g`` is given, intersect both the repeat annotation and the
         features with that GFF (via intersectBed) into temporary files.
      2. Compute the search-space length, the feature length/count and the
         per-TE genomic totals using helpers defined elsewhere in the file.
      3. Convert the features to BED if needed and shuffle them
         ``-n`` times with shuffleBed, recording the per-TE overlap of each
         shuffle.
      4. Print one line per TE key with the observed overlap, feature and
         genome frequencies, fold change, a p-value from a normal fit to the
         shuffled overlaps, and the value returned by ``fdr.ben_hoch``.

    All temporary files are removed on exit, including early exits.
    """
    usage = 'usage: %prog [options] <feature gff/bed>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gff_file', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % home_dir)
    parser.add_option('-n', dest='null_iterations', type=int, default=50, help='Number of shuffles to perform to estimate null distribution [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    feature_gff = args[0]

    # Validate the format suffix of the user-supplied file up front.  The
    # check used to run on whatever feature_gff pointed to later, which is a
    # suffix-less temporary file whenever -g is used.
    feature_suffix = feature_gff[-3:]
    if feature_suffix not in ('gtf', 'gff', 'bed'):
        parser.error('Cannot recognize gff format suffix')

    # (fd, path) pairs from mkstemp, released in the finally clause below
    temp_files = []

    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.gff_file:
            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, options.gff_file, te_gff_file), shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF; keep the original suffix so the format of
            # the filtered file can still be recognized downstream
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp(suffix='.' + feature_suffix)
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call('intersectBed -s -u -f 0.5 -a %s -b %s > %s' % (feature_gff, options.gff_file, feature_gff_gff_file), shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute size of search space
        if options.gff_file:
            genome_length = count_gff(options.gff_file)
        else:
            genome_length = count_hg19()

        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            sys.exit()

        # hash counted repeat genomic bp
        te_in = open(options.repeats_gff)
        try:
            genome_te_bp = hash_te(te_in)
        finally:
            te_in.close()

        ############################################
        # convert feature gff to bed
        ############################################
        if feature_suffix == 'gtf':
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call('gtf2bed.py %s > %s' % (feature_gff, feature_bed_file), shell=True)

        elif feature_suffix == 'gff':
            feature_bed_fd, feature_bed_file = tempfile.mkstemp()
            temp_files.append((feature_bed_fd, feature_bed_file))
            subprocess.call('gff2bed.py %s > %s' % (feature_gff, feature_bed_file), shell=True)

        else:
            # already BED
            feature_bed_file = feature_gff

        ############################################
        # null distribution
        ############################################
        shuffle_bed_fd, shuffle_bed_file = tempfile.mkstemp()
        temp_files.append((shuffle_bed_fd, shuffle_bed_file))

        te_null_bp = {}
        for ni in range(options.null_iterations):
            # progress indicator
            sys.stderr.write('%d\n' % ni)

            # shuffle feature bed
            subprocess.call('shuffleBed -i %s -g %s/research/common/data/genomes/hg19/assembly/human.hg19.genome -excl %s/research/common/data/genomes/hg19/assembly/hg19_gaps.bed > %s' % (feature_bed_file, home_dir, home_dir, shuffle_bed_file), shell=True)

            # intersect w/ TEs and hash overlaps; TEs missing from this
            # shuffle contribute an explicit zero to their null sample
            te_tmp_bp = intersect_hash(options.repeats_gff, shuffle_bed_file)
            for te in genome_te_bp:
                te_null_bp.setdefault(te, []).append(te_tmp_bp.get(te, 0))

        ############################################
        # actual
        ############################################
        te_bp = intersect_hash(options.repeats_gff, feature_gff)

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for te in genome_te_bp:
            te_overlap = te_bp.get(te, 0)

            feature_freq = float(te_overlap) / feature_len
            genome_freq = float(genome_te_bp[te]) / genome_length
            fold_change = feature_freq / genome_freq

            null_u, null_sd = stats.mean_sd(te_null_bp[te])
            if null_sd == 0:
                # avoid a degenerate (zero-width) normal
                null_sd = 1.0

            # one-sided test in the direction of the observed change
            if fold_change > 1:
                p = norm.sf(te_overlap - 1, loc=null_u, scale=null_sd)
            else:
                p = norm.cdf(te_overlap, loc=null_u, scale=null_sd)

            p_vals.append(p)

            cols = (te[0], te[1], te_overlap, feature_freq, genome_freq, fold_change, p)
            lines.append('%-18s %-18s %8d %11.2e %11.2e %9.2f %10.2e' % cols)

        # correct for multiple hypotheses
        q_vals = fdr.ben_hoch(p_vals)
        for i, line in enumerate(lines):
            print(line + ' %10.2e' % q_vals[i])

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_file in temp_files:
            os.close(temp_fd)
            os.remove(temp_file)
Exemplo n.º 2
0
def main():
    """Measure how uniformly reads cover each transcript in fixed windows.

    Command line: ``[options] <bam> <ref_gtf>``.

    Unless ``--cuff_done`` is given, Cufflinks is run on the BAM against a
    GTF produced by ``clip_peaks.prerna_gtf``.  Reads are then assigned to
    windows of ``-w`` positions along every isoform they hit, split between
    isoforms in proportion to FPKM.  For each transcript with FPKM > 1 and
    more than five windows, the mean ``u`` and ``sd`` from ``stats.mean_sd``
    (last window excluded) are written to ``<out_dir>/uniformity_table.txt``
    together with ``sd*sd/u``.  Finally the median, mean, FPKM-weighted mean
    and log2(FPKM+1)-weighted mean of that ratio are printed.

    If no transcript passes the filters a message is written to stderr and
    the summary is skipped instead of failing on a division by zero.
    """
    usage = 'usage: %prog [options] <bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option('-o',
                      dest='out_dir',
                      default='uniform',
                      help='Output directory [Default: %default]')

    # window options
    parser.add_option('-w',
                      dest='window_size',
                      type='int',
                      default=25,
                      help='Window size for counting [Default: %default]')
    parser.add_option(
        '-i',
        '--ignore',
        dest='ignore_gff',
        help=
        'Ignore reads overlapping overlapping troublesome regions in the given GFF file'
    )
    parser.add_option('-u',
                      '--unstranded',
                      dest='unstranded',
                      action='store_true',
                      default=False,
                      help='Sequencing is unstranded [Default: %default]')

    # cufflinks options
    parser.add_option(
        '--cuff_done',
        dest='cuff_done',
        action='store_true',
        default=False,
        help=
        'The Cufflinks run to estimate the model parameters is already done [Default: %default]'
    )
    parser.add_option('-t',
                      dest='threads',
                      type='int',
                      default=2,
                      help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option('-v',
                      '--verbose',
                      dest='verbose',
                      action='store_true',
                      default=False,
                      help='Verbose output [Default: %default]')
    parser.add_option('-g',
                      '--gene',
                      dest='gene_only',
                      help='Call peaks on the specified gene only')
    #parser.add_option('--print_windows', dest='print_windows', default=False, action='store_true', help='Print statistics for all windows [Default: %default]')

    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        bam = args[0]
        ref_gtf = args[1]

    clip_peaks.out_dir = options.out_dir

    if not os.path.isdir(clip_peaks.out_dir):
        os.mkdir(clip_peaks.out_dir)

    ############################################
    # parameterize
    ############################################
    if not options.cuff_done:
        # make a new gtf w/ unspliced RNAs
        update_ref_gtf = clip_peaks.prerna_gtf(ref_gtf)

        subprocess.call(
            'cufflinks -o %s -p %d -G %s %s' %
            (clip_peaks.out_dir, options.threads, update_ref_gtf, bam),
            shell=True)

    # store transcripts
    transcripts = clip_peaks.read_genes('%s/transcripts.gtf' %
                                        clip_peaks.out_dir,
                                        key_id='transcript_id')

    # merge overlapping genes
    g2t_merge, antisense_clusters = clip_peaks.merged_g2t(
        '%s/transcripts.gtf' % clip_peaks.out_dir, options.unstranded)

    if options.unstranded:
        # alter strands
        clip_peaks.ambiguate_strands(transcripts, g2t_merge,
                                     antisense_clusters)

    # set transcript FPKMs
    clip_peaks.set_transcript_fpkms(transcripts,
                                    clip_peaks.out_dir,
                                    missing_fpkm=0)

    # possibly limit genes to examine; merged gene keys are comma-separated
    # lists of gene ids, so match against the individual ids
    if options.gene_only:
        gene_ids = []
        for gids in g2t_merge.keys():
            if options.gene_only in gids.split(','):
                gene_ids.append(gids)
        if len(gene_ids) == 0:
            sys.stderr.write('gene_id %s not found\n' % options.gene_only)
            sys.exit(1)
    else:
        gene_ids = g2t_merge.keys()

    ############################################
    # filter BAM
    ############################################
    if options.ignore_gff:
        bam_ignore_fd, bam_ignore_file = tempfile.mkstemp(
            dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -v -abam %s -b %s > %s' %
                        (bam, options.ignore_gff, bam_ignore_file),
                        shell=True)
        bam = bam_ignore_file

    ############################################
    # process genes
    ############################################
    # index
    subprocess.call('samtools index %s' % bam, shell=True)

    # initialize stats
    table_out = open('%s/uniformity_table.txt' % clip_peaks.out_dir, 'w')
    id_list = []
    fpkm_list = []

    # open bam
    bam_in = pysam.Samfile(bam, 'rb')

    try:
        # for each gene
        for gene_id in gene_ids:
            # make a more focused transcript hash for this gene
            gene_transcripts = {}
            for tid in g2t_merge[gene_id]:
                gene_transcripts[tid] = transcripts[tid]

            # obtain basic gene attributes
            (gchrom, gstrand, gstart,
             gend) = clip_peaks.gene_attrs(gene_transcripts)

            # initialize window counts
            transcript_isoform_counts = {}
            for tid in gene_transcripts:
                transcript_isoform_counts[tid] = []

            # choose a single event position and weight the reads
            read_pos_weights = clip_peaks.position_reads(bam_in,
                                                         gchrom,
                                                         gstart,
                                                         gend,
                                                         gstrand,
                                                         mapq_zero=True)

            # process read alignments
            for (pos, weight, mm) in read_pos_weights:
                # map pos to isoforms
                iso_pos = {}
                for tid in gene_transcripts:
                    iso_pos[tid] = isoform_position(gene_transcripts[tid],
                                                    pos)

                # sum fpkms for hit isoforms
                fpkm_sum = sum([
                    gene_transcripts[tid].fpkm for tid in gene_transcripts
                    if iso_pos[tid] is not None
                ])

                # reads hitting only zero-FPKM isoforms cannot be
                # apportioned and are skipped
                if fpkm_sum <= 0:
                    continue

                # distribute read to isoform counts
                for tid in gene_transcripts:
                    if iso_pos[tid] is not None:
                        win_i = int(iso_pos[tid] / options.window_size)

                        # grow the window list on demand
                        while win_i >= len(transcript_isoform_counts[tid]):
                            transcript_isoform_counts[tid].append(0)

                        transcript_isoform_counts[tid][
                            win_i] += weight * gene_transcripts[
                                tid].fpkm / fpkm_sum

            # compute window stats
            for tid in gene_transcripts:
                if gene_transcripts[tid].fpkm > 1 and len(
                        transcript_isoform_counts[tid]) > 5:
                    # the last window is left out
                    # NOTE(review): presumably because it is only
                    # partially covered - confirm
                    u, sd = stats.mean_sd(transcript_isoform_counts[tid][:-1])
                    if u > 0:
                        id_list.append(sd * sd / u)
                        fpkm_list.append(gene_transcripts[tid].fpkm)

                        cols = (tid, gene_transcripts[tid].fpkm,
                                len(transcript_isoform_counts[tid]) - 1, u, sd,
                                id_list[-1])
                        table_out.write(
                            '%-20s  %8.2f  %6d  %7.2f  %7.2f  %5.3f' % cols +
                            '\n')
    finally:
        bam_in.close()
        table_out.close()

    ############################################
    # summary stats
    ############################################
    if not id_list:
        # nothing passed the filters: the weighted means below would divide
        # by zero, so report and fall through to the cleanup
        sys.stderr.write(
            'No transcripts passed the FPKM and window filters; skipping summary statistics\n'
        )
    else:
        median = stats.median(id_list)
        mean = stats.mean(id_list)

        fpkm_cv_sum = sum(
            [id_list[i] * fpkm_list[i] for i in range(len(id_list))])
        fpkm_sum = sum(fpkm_list)
        fpkm_mean = fpkm_cv_sum / fpkm_sum

        logfpkm_cv_sum = sum([
            id_list[i] * math.log(fpkm_list[i] + 1, 2)
            for i in range(len(id_list))
        ])
        logfpkm_sum = sum([math.log(f + 1, 2) for f in fpkm_list])
        logfpkm_mean = logfpkm_cv_sum / logfpkm_sum

        # print
        print('Median:                %7.4f' % median)
        print('Mean:                  %7.4f' % mean)
        print('FPKM-weighted mean:    %7.4f' % fpkm_mean)
        print('logFPKM-weighted mean: %7.4f' % logfpkm_mean)

    # clean cufflinks output
    if not options.cuff_done:
        os.remove(update_ref_gtf)
        os.remove('%s/skipped.gtf' % clip_peaks.out_dir)
        os.remove('%s/genes.fpkm_tracking' % clip_peaks.out_dir)

    if options.ignore_gff:
        os.close(bam_ignore_fd)
        os.remove(bam_ignore_file)

        # samtools index wrote an index next to the temporary BAM
        bam_index_file = bam_ignore_file + '.bai'
        if os.path.isfile(bam_index_file):
            os.remove(bam_index_file)
Exemplo n.º 3
0
def main():
    """Cross-validate pegasos over a geometric sweep of lambda values.

    Command line: ``[options] <input_file>``.

    For every replicate a directory ``<input_base>_rep<r>`` is (re)created,
    the data are split into ``-k`` folds by ``divide_data``, and one pegasos
    command per (lambda, fold) is run through ``util.exec_par``.  Afterwards,
    for each lambda, the tp/fp/fn counts gathered by ``compute_accuracy`` are
    pooled over folds and one line is printed: lambda, mean recall and its
    standard error, mean precision and its standard error, F1, and F1 minus
    the positive-example fraction.  A lambda for which ``compute_accuracy``
    returns a false value for any fold is reported as ``NA``.

    With ``-w`` only ``summarize_weights`` is run.
    """
    usage = "usage: %prog [options] <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-k",
        dest="k_fold",
        type="int",
        default=10,
        help="Number of folds to use for cross-validation [Default: %default]",
    )
    parser.add_option(
        "--lambda_min",
        dest="lambda_min",
        type="float",
        default=0.01,
        help="Minimum -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "--lambda_max",
        dest="lambda_max",
        type="float",
        default=10.0,
        help="Maximum -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "--lambda_mult",
        dest="lambda_mult",
        type="float",
        default=2.0,
        help="Multiplier for next -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="lesser_kmers",
        action="store_true",
        default=False,
        help="Use all kmers of length less than and equal to that given by -k [Default: %default]",
    )
    # parser.add_option('-m', dest='model_file', help='File to output model to')
    parser.add_option(
        "-p", dest="parallel", type="int", default=4, help="Number of parallel threads to run [Default: %default]"
    )
    parser.add_option(
        "-r",
        dest="replicates",
        type="int",
        default=1,
        help="Number of times to repeat the optimization for each fold [Default: %default]",
    )
    parser.add_option(
        "-w", dest="weights", action="store_true", default=False, help="Print a summary of the weight vectors"
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide input file")
    else:
        input_file = args[0]
    input_base = os.path.splitext(input_file)[0]

    if options.weights:
        summarize_weights(input_base, options)
        exit()

    # The lambda sweep multiplies by lambda_mult until lambda_max is passed,
    # so it only terminates for a positive start and a multiplier above 1.
    if options.lambda_min <= 0 or options.lambda_mult <= 1:
        parser.error("--lambda_min must be positive and --lambda_mult must be greater than 1")
    if options.k_fold < 1 or options.replicates < 1:
        parser.error("-k and -r must both be at least 1")

    # determine % of positive examples
    input_pos, input_total = positive_percent(input_file)
    # NOTE(review): presumably the expected F1 of a random classifier that
    # predicts positive at the base rate - confirm
    f1_base = input_pos / float(input_total)

    # Resolve paths before changing directory: the replicate directory may be
    # nested (input given with a directory component), in which case neither
    # "../" + input_file nor chdir("..") points where it should.
    input_path = os.path.abspath(input_file)
    start_dir = os.getcwd()

    for r in range(options.replicates):
        rep_dir = "%s_rep%d" % (input_base, r)
        if os.path.isdir(rep_dir):
            shutil.rmtree(rep_dir)
        os.mkdir(rep_dir)
        os.chdir(rep_dir)

        try:
            # divide data into folds
            divide_data(input_path, options.k_fold)

            # collect pegasos commands
            cmds = []
            peg_lambda = options.lambda_min
            while peg_lambda <= options.lambda_max:
                # run on each fold
                for f in range(options.k_fold):
                    cmds.append(
                        "pegasos -lambda %f -modelFile fold%d/train_%.1e.mod fold%d/train.dat &> /dev/null"
                        % (peg_lambda, f, peg_lambda, f)
                    )

                # increase lambda
                peg_lambda *= options.lambda_mult

            # execute pegasos commands
            util.exec_par(cmds, options.parallel)

            # start to clean up space
            for f in range(options.k_fold):
                os.remove("fold%d/train.dat" % f)
        finally:
            # always return to the starting directory, even on failure
            os.chdir(start_dir)

    # collect results; the sweep repeats the same multiplications as above so
    # the lambda values line up with the model files written per fold
    peg_lambda = options.lambda_min
    while peg_lambda <= options.lambda_max:
        recalls = []
        precisions = []
        failed = False

        for r in range(options.replicates):
            outcomes = {"tp": 0, "fp": 0, "fn": 0}

            # collect each fold
            for f in range(options.k_fold):
                if not compute_accuracy(outcomes, "%s_rep%d/fold%d" % (input_base, r, f), peg_lambda):
                    failed = True
                    break

            if failed:
                break

            # save; an undefined ratio (no actual or no predicted positives)
            # is recorded as 0 rather than raising ZeroDivisionError
            true_pos = outcomes["tp"]
            actual_pos = true_pos + outcomes["fn"]
            predicted_pos = true_pos + outcomes["fp"]
            recalls.append(float(true_pos) / actual_pos if actual_pos else 0.0)
            precisions.append(float(true_pos) / predicted_pos if predicted_pos else 0.0)

        # summarize and print
        if failed:
            print("%.1e %8s %7s %8s %7s %8s %8s" % (peg_lambda, "NA", "NA", "NA", "NA", "NA", "NA"))
        else:
            # standard error of the mean across replicates
            recall, rsd = stats.mean_sd(recalls)
            rsd /= math.sqrt(len(recalls))
            precision, psd = stats.mean_sd(precisions)
            psd /= math.sqrt(len(precisions))

            if recall + precision > 0:
                f1 = 2 * recall * precision / (recall + precision)
            else:
                f1 = 0.0

            print(
                "%.1e %8.4f %7.4f %8.4f %7.4f %8.4f %8.4f"
                % (
                    peg_lambda,
                    recall,
                    rsd,
                    precision,
                    psd,
                    f1,
                    (f1 - f1_base),
                )
            )

        peg_lambda *= options.lambda_mult
Exemplo n.º 4
0
def main():
    """Measure how uniformly reads cover each transcript in fixed windows.

    Command line: ``[options] <bam> <ref_gtf>``.

    Unless ``--cuff_done`` is given, Cufflinks is run on the BAM against a
    GTF produced by ``clip_peaks.prerna_gtf``.  Reads are then assigned to
    windows of ``-w`` positions along every isoform they hit, split between
    isoforms in proportion to FPKM.  For each transcript with FPKM > 1 and
    more than five windows, the mean ``u`` and ``sd`` from ``stats.mean_sd``
    (last window excluded) are written to ``<out_dir>/uniformity_table.txt``
    together with ``sd*sd/u``.  Finally the median, mean, FPKM-weighted mean
    and log2(FPKM+1)-weighted mean of that ratio are printed.

    If no transcript passes the filters a message is written to stderr and
    the summary is skipped instead of failing on a division by zero.
    """
    usage = 'usage: %prog [options] <bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option('-o', dest='out_dir', default='uniform', help='Output directory [Default: %default]')

    # window options
    parser.add_option('-w', dest='window_size', type='int', default=25, help='Window size for counting [Default: %default]')
    parser.add_option('-i', '--ignore', dest='ignore_gff', help='Ignore reads overlapping overlapping troublesome regions in the given GFF file')
    parser.add_option('-u', '--unstranded', dest='unstranded', action='store_true', default=False, help='Sequencing is unstranded [Default: %default]')

    # cufflinks options
    parser.add_option('--cuff_done', dest='cuff_done', action='store_true', default=False, help='The Cufflinks run to estimate the model parameters is already done [Default: %default]')
    parser.add_option('-t', dest='threads', type='int', default=2, help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Verbose output [Default: %default]')
    parser.add_option('-g', '--gene', dest='gene_only', help='Call peaks on the specified gene only')
    #parser.add_option('--print_windows', dest='print_windows', default=False, action='store_true', help='Print statistics for all windows [Default: %default]')

    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        bam = args[0]
        ref_gtf = args[1]

    clip_peaks.out_dir = options.out_dir

    if not os.path.isdir(clip_peaks.out_dir):
        os.mkdir(clip_peaks.out_dir)

    ############################################
    # parameterize
    ############################################
    if not options.cuff_done:
        # make a new gtf w/ unspliced RNAs
        update_ref_gtf = clip_peaks.prerna_gtf(ref_gtf)

        subprocess.call('cufflinks -o %s -p %d -G %s %s' % (clip_peaks.out_dir, options.threads, update_ref_gtf, bam), shell=True)

    # store transcripts
    transcripts = clip_peaks.read_genes('%s/transcripts.gtf'%clip_peaks.out_dir, key_id='transcript_id')

    # merge overlapping genes
    g2t_merge, antisense_clusters = clip_peaks.merged_g2t('%s/transcripts.gtf'%clip_peaks.out_dir, options.unstranded)

    if options.unstranded:
        # alter strands
        clip_peaks.ambiguate_strands(transcripts, g2t_merge, antisense_clusters)

    # set transcript FPKMs
    clip_peaks.set_transcript_fpkms(transcripts, clip_peaks.out_dir, missing_fpkm=0)

    # possibly limit genes to examine; merged gene keys are comma-separated
    # lists of gene ids, so match against the individual ids
    if options.gene_only:
        gene_ids = []
        for gids in g2t_merge.keys():
            if options.gene_only in gids.split(','):
                gene_ids.append(gids)
        if len(gene_ids) == 0:
            sys.stderr.write('gene_id %s not found\n' % options.gene_only)
            sys.exit(1)
    else:
        gene_ids = g2t_merge.keys()


    ############################################
    # filter BAM
    ############################################
    if options.ignore_gff:
        bam_ignore_fd, bam_ignore_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -v -abam %s -b %s > %s' % (bam, options.ignore_gff, bam_ignore_file), shell=True)
        bam = bam_ignore_file

    ############################################
    # process genes
    ############################################
    # index
    subprocess.call('samtools index %s' % bam, shell=True)

    # initialize stats
    table_out = open('%s/uniformity_table.txt' % clip_peaks.out_dir, 'w')
    id_list = []
    fpkm_list = []

    # open bam
    bam_in = pysam.Samfile(bam, 'rb')

    try:
        # for each gene
        for gene_id in gene_ids:
            # make a more focused transcript hash for this gene
            gene_transcripts = {}
            for tid in g2t_merge[gene_id]:
                gene_transcripts[tid] = transcripts[tid]

            # obtain basic gene attributes
            (gchrom, gstrand, gstart, gend) = clip_peaks.gene_attrs(gene_transcripts)

            # initialize window counts
            transcript_isoform_counts = {}
            for tid in gene_transcripts:
                transcript_isoform_counts[tid] = []

            # choose a single event position and weight the reads
            read_pos_weights = clip_peaks.position_reads(bam_in, gchrom, gstart, gend, gstrand, mapq_zero=True)

            # process read alignments
            for (pos, weight, mm) in read_pos_weights:
                # map pos to isoforms
                iso_pos = {}
                for tid in gene_transcripts:
                    iso_pos[tid] = isoform_position(gene_transcripts[tid], pos)

                # sum fpkms for hit isoforms
                fpkm_sum = sum([gene_transcripts[tid].fpkm for tid in gene_transcripts if iso_pos[tid] is not None])

                # reads hitting only zero-FPKM isoforms cannot be
                # apportioned and are skipped
                if fpkm_sum <= 0:
                    continue

                # distribute read to isoform counts
                for tid in gene_transcripts:
                    if iso_pos[tid] is not None:
                        win_i = int(iso_pos[tid] / options.window_size)

                        # grow the window list on demand
                        while win_i >= len(transcript_isoform_counts[tid]):
                            transcript_isoform_counts[tid].append(0)

                        transcript_isoform_counts[tid][win_i] += weight*gene_transcripts[tid].fpkm/fpkm_sum

            # compute window stats
            for tid in gene_transcripts:
                if gene_transcripts[tid].fpkm > 1 and len(transcript_isoform_counts[tid]) > 5:
                    # the last window is left out
                    # NOTE(review): presumably because it is only
                    # partially covered - confirm
                    u, sd = stats.mean_sd(transcript_isoform_counts[tid][:-1])
                    if u > 0:
                        id_list.append(sd*sd/u)
                        fpkm_list.append(gene_transcripts[tid].fpkm)

                        cols = (tid, gene_transcripts[tid].fpkm, len(transcript_isoform_counts[tid])-1, u, sd, id_list[-1])
                        table_out.write('%-20s  %8.2f  %6d  %7.2f  %7.2f  %5.3f' % cols + '\n')
    finally:
        bam_in.close()
        table_out.close()

    ############################################
    # summary stats
    ############################################
    if not id_list:
        # nothing passed the filters: the weighted means below would divide
        # by zero, so report and fall through to the cleanup
        sys.stderr.write('No transcripts passed the FPKM and window filters; skipping summary statistics\n')
    else:
        median = stats.median(id_list)
        mean = stats.mean(id_list)

        fpkm_cv_sum = sum([id_list[i]*fpkm_list[i] for i in range(len(id_list))])
        fpkm_sum = sum(fpkm_list)
        fpkm_mean = fpkm_cv_sum / fpkm_sum

        logfpkm_cv_sum = sum([id_list[i]*math.log(fpkm_list[i]+1,2) for i in range(len(id_list))])
        logfpkm_sum = sum([math.log(f+1,2) for f in fpkm_list])
        logfpkm_mean = logfpkm_cv_sum / logfpkm_sum

        # print
        print('Median:                %7.4f' % median)
        print('Mean:                  %7.4f' % mean)
        print('FPKM-weighted mean:    %7.4f' % fpkm_mean)
        print('logFPKM-weighted mean: %7.4f' % logfpkm_mean)

    # clean cufflinks output
    if not options.cuff_done:
        os.remove(update_ref_gtf)
        os.remove('%s/skipped.gtf' % clip_peaks.out_dir)
        os.remove('%s/genes.fpkm_tracking' % clip_peaks.out_dir)

    if options.ignore_gff:
        os.close(bam_ignore_fd)
        os.remove(bam_ignore_file)

        # samtools index wrote an index next to the temporary BAM
        bam_index_file = bam_ignore_file + '.bai'
        if os.path.isfile(bam_index_file):
            os.remove(bam_index_file)