Exemplo n.º 1
0
def main(argv=None):
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions """

    parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n" +
                                   main.__doc__)
    parser.add_option("--genome",
                      '-g',
                      dest="genome_resource",
                      type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        "--motif_file",
        '-m',
        dest="motif_file",
        type="string",
        help=
        """The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g.,
                      {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option(
        "--consensus_file",
        '-c',
        dest="consensus_file",
        type="string",
        help="""index file for consensus motifs (IUPAC format, one
                      per line in the file""")
    parser.add_option(
        "--motif_key",
        '-k',
        dest="motif_key",
        type="string",
        help="""The key for the current motif in motif_file, default=all""")
    parser.add_option(
        '--zscore',
        '-z',
        dest='zscore',
        type='float',
        default=4.29,
        help=
        """Calculate threshold score estimate from this Z-score. [default=%default]"""
    )
    parser.add_option(
        '--overlap_resource',
        dest='overlap_resource',
        type='string',
        help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option(
        '--bg_samples',
        dest='bg_samples',
        type='string',
        help=
        """Pickled or Fasta file of background sequences to use instead of sampling the genome"""
    )
    parser.add_option('--no_bg',
                      dest='no_bg',
                      action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option(
        '--report_region',
        type='string',
        help=
        'Report the genomic regions of peaks with motif instances to this file'
    )
    parser.add_option(
        "--output_file",
        '-f',
        dest="output_file",
        type="string",
        help="""Append the zscore information to the given file""")
    parser.add_option('--search_genome', action='store_true')
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print 'Specify the peaks bed file!'
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print 'Specify the motif file!'
        sys.exit(-1)

    updated_motifs = False
    print '# Loading resources...'
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')

    allMotifs = {}
    # load pickled dict of motifs
    if opts.motif_file:
        if opts.motif_file.endswith('.transfac'):
            allMotifs.update(
                parseMotifsFromTransfac(open(opts.motif_file, 'r').read()))
        else:
            allMotifs.update(pickle.load(open(opts.motif_file)))
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                name, consensus = line.strip().split('\t')
                allMotifs.update({name: makePWMFromIUPAC(consensus)})

    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = allMotifs.keys()

    # write a header
    if opts.output_file:
        outstr = '\t'.join([
            'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
            'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize',
            'bgMatches', 'bgSize'
        ])
        open(opts.output_file, 'w').write(outstr)

    for motifKey in allKeys:
        print '# Loaded motif %s...' % motifKey
        pwm = allMotifs[motifKey]
        if isinstance(pwm, list):
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print '# Calculating motif background distribution...'
            pwm.calculate_background(genome)
            updated_motifs = True
        print 'motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % (
            motifKey, len(pwm), pwm.get_threshold(
                opts.zscore), pwm._mean, pwm._sd, pwm.max_score())

        if opts.search_genome and opts.report_region is not None:
            # search the genome with the motif
            print 'searching genome!'
            with open(opts.report_region, 'w') as outfile:
                for chrom in genome:
                    for match in pwm.find_in_region(genome[chrom]):
                        outstr = '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format(
                            chrom=chrom,
                            start=match[0],
                            stop=match[1],
                            name=motifKey,
                            score=pwm.calc_score(match[3]),
                            strand='+' if match[2] == 1 else '-')
                        outfile.write(outstr)
            continue

        allPeaks = open(args[0]).readlines()
        allPeaks = list(readBedLines(allPeaks))
        peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print '# Searching foreground sequence...'
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop]
                       for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # check to see if the bed line overlaps the resource
            overlappingRegions = [region for region in peakRegions \
                                        if len(annotMap[region]) > 0]
            # run a search in each of the overlapping regions
            motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \
                                        for region in overlappingRegions]
            fgSize = len(overlappingRegions)
            # count the number of peaks with at least one motif instance
            fgMatches = len(
                filter(lambda matches: len(matches) > 0,
                       motifInstancesInOverlap))
        else:
            matchingPeaks = [region for region in peakRegions \
                                        if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
            fgMatches = len(matchingPeaks)
            fgSize = len(allPeaks)

        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines('%s\t%s\t%s\n' %
                                   (region.id, region.start, region.stop)
                                   for region in matchingPeaks)

        if opts.no_bg:
            outstr = '\t'.join([args[0], motifKey] + map(
                str,
                [opts.zscore, fgMatches, fgSize,
                 float(fgMatches) / fgSize]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >> sys.stderr, outstr
        else:
            print '# Searching background sequence...'
            sys.stdout.flush()
            if opts.bg_samples:
                try:
                    bgSamples = pickle.load(open(opts.bg_samples))
                except:
                    try:
                        bgSamples = parseFastaLines(open(opts.bg_samples))
                    except:
                        raise RuntimeError(
                            "specified background samples file %s"
                            "was niether a pickled file nor a fasta file!" %
                            opts.bg_samples)

            elif opts.overlap_resource:
                bgSamples = sample_resource(annotDB,
                                            peakSizes,
                                            sampleSize=100000)
            else:
                bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)
                #bgSamples = sample_genome(genome, peakSizes, sampleSize=100)
            bgSize = 0
            bgMatches = 0
            for region in bgSamples:
                bgSize += 1
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                    bgMatches += 1

            #calculate significance of foreground vs. background
            zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
            pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches,
                                           bgSize)
            outstr = '\t'.join([args[0], motifKey] +
                               map(str, [
                                   'thesh_z=' + str(opts.zscore), zscore,
                                   pvalue, fgMatches, fgSize,
                                   float(fgMatches) / fgSize, bgMatches, bgSize
                               ]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >> sys.stderr, outstr
    if updated_motifs:
        print '# Saving motif info back to %s' % opts.motif_file
        pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
def main():
    """ Calculate significance of the intersection between two sets of regions.
        Regions may be either BED files or pygr AnnotationDB's.
    """

    parser = optparse.OptionParser("%prog [options] resource1 resource2 \n" +
                                   main.__doc__)
    parser.add_option("--genome_resource",
                      '-g',
                      dest="genome_resource",
                      type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        '--filter_fxn1',
        dest='filter_fxn1',
        type='string',
        default='',
        help=
        """Use the given function as a filter on what is considered a hit from resource1.
                      available variables are seq1,annot1, edge1.  e.g.,
                      --filter_fxn1="len(seq1) > 10" """)
    parser.add_option(
        '--filter_fxn2',
        dest='filter_fxn2',
        type='string',
        default='',
        help=
        """Use the given function as a filter on what is considered a hit from resource2.
                      available variables are seq2,annot2, edge2.  e.g.,
                      --filter_fxn2="float(annot2.FDR) < .25" """)
    parser.add_option(
        "--format1",
        dest="format1",
        type="string",
        default='BED',
        help="""Format of resource1. One of [bed, resource, file] corresponding
                      to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default"""
    )
    parser.add_option("--format2",
                      dest="format2",
                      type="string",
                      default='BED',
                      help="""Format of resource2. See help for format1.""")
    parser.add_option(
        "--name1",
        dest="name1",
        type="string",
        default='',
        help=
        """Override the name for resource1.  Default=file or resource name""")
    parser.add_option(
        "--name2",
        dest="name2",
        type="string",
        default='',
        help=
        """Override the name for resource2.  Default=file or resource name""")
    parser.add_option(
        '--overlap_resource',
        dest='overlap_resource',
        type='string',
        default='',
        help=
        """Only count regions (both res1 and res2) that overlap with this worldbase ID"""
    )

    #parser.add_option("--sample_size", '-s', dest="sample_size", type="int", default=10000,
    #                  help="""Total number of background samples to check for overlap""")
    parser.add_option(
        "--output_file",
        '-f',
        dest="output_file",
        type="string",
        help="""Write the significance calculation to the given file""")
    parser.add_option("--quiet",
                      '-q',
                      dest="quiet",
                      action="store_true",
                      help="""Suppress progress reports from stdout""")

    opts, args = parser.parse_args()
    print opts, args
    log = Logger(opts.quiet)
    if len(args) != 2:
        parser.print_help()
        log.error(
            'Need two genomic annotations! Please specify both resource1 and resource2 '
        )
        sys.exit(-1)
    print opts, args
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    log.log('# Loading genome resource %s' % opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        log.log('# Loading overlap resources %s and %s' %
                (opts.overlap_resource, opts.overlap_resource + '_db'))
        overlapMap = worldbase(opts.overlap_resource)
        overlapDB = worldbase(opts.overlap_resource + '_db')

    AllRes1Names, AllRes2Names = args
    for res1Name in AllRes1Names.split(','):

        if len(res1Name) == 0:
            continue
        opts.format1 = opts.format1.lower()
        if opts.format1 == 'bed':
            #if os.path.exists(res1Name):
            log.log('# Building resource1 from BED file %s' % res1Name)
            res1File = open(res1Name)
            res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome)
            res1File.close()
        elif opts.format1 == 'resource':
            log.log('# Loading resource1 %s from worldbase' % res1Name)
            res1Map = worldbase(res1Name)
        elif opts.format1 == 'file':
            res1_allVars = open(res1Name).readlines()
            log.log('# List for resource1 includes %s resources' %
                    len(res1_allVars))
        else:
            parser.print_help()
            log.error(
                'Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]'
                % (opts.format1, res1Name))
            sys.exit(-1)

        for res2Name in AllRes2Names.split(','):
            if len(res2Name) == 0:
                continue
            if opts.format2 == 'bed':
                #if os.path.exists(res2Name):
                log.log('# Building resource2 from BED file %s' % res2Name)
                res2File = open(res2Name)
                res2Table, res2DB, res2Map = makeResourceFromBed(
                    res2File, genome)
                res2File.close()
            elif opts.format2 == 'resource':
                log.log('# Loading resource2 %s from worldbase' % res2Name)
                res2Map = worldbase(res2Name)
                try:
                    res2DB = worldbase(res2Name + '_db')
                except:
                    log.log('No DB found for resource2 at %s' % res2Name +
                            '_db')
                    res2DB = None
            elif opts.format1 == 'file':
                log.error('several resource iteration not implemented yet...')
            else:
                parser.print_help()
                log.error(
                    'Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]'
                    % (opts.format2, res2Name))
                sys.exit(-1)

            # Unescape if filter functions have been escaped
            for key, value in escapedOperators.items():
                if opts.filter_fxn1:
                    opts.filter_fxn1 = opts.filter_fxn1.replace(key, value)
                if opts.filter_fxn2:
                    opts.filter_fxn2 = opts.filter_fxn2.replace(key, value)

            res1Lengths = []
            res12Intersect = 0
            res2Count = 0

            #res1Size, res2Size, resIntersectSize = 0,0,0
            #res2SizeInBP = 0
            log.log(
                '# Calculating overlap between resources... Iterating over resource 1'
            )
            sys.stdout.flush()
            for seq1, annot1, edge1 in res1Map.edges():
                if not opts.filter_fxn1 or eval(
                        opts.filter_fxn1):  # no filter1 or passed it
                    if not opts.overlap_resource or len(
                            list(get_overlap_edges_seq_msa(overlapMap, seq1))
                    ) > 0:  # no overlap req'd or seq1 overlaps
                        res1Lengths.append(len(annot1))
                        for seq2, annot2, edge2 in get_overlap_edges_seq_msa(
                                seq1, res2Map):
                            if not opts.filter_fxn2 or eval(
                                    opts.filter_fxn2
                            ):  # no filter2 or passed it
                                if not opts.overlap_resource or len(
                                        list(
                                            get_overlap_edges_seq_msa(
                                                overlapMap, seq2))
                                ) > 0:  # no overlap req'd or seq2 overlaps
                                    #res12Intersect.append(len(annot2))  # only counting the bases that actually overlap
                                    res12Intersect += 1
            # only iterate over res2 if we don't have a db resource for it or there is some filtering necessary
            if not res2DB or opts.filter_fxn2 or opts.overlap_resource:
                log.log('# Iterating over resource 2')
                sys.stdout.flush()
                for seq2, annot2, edge2 in res2Map.edges():
                    #sys.stdout.flush()
                    #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource,
                    if not opts.filter_fxn2 or eval(
                            opts.filter_fxn2):  # no filter2 or passed it
                        if not opts.overlap_resource or len(
                                list(
                                    get_overlap_edges_seq_msa(
                                        seq2, overlapMap))) > 0:
                            # instance of res2 found
                            #if res2Size % 1000 == 0:
                            #    print res2Size,
                            res2Count += 1
            else:
                res2Count = len(res2DB)
            log.log('# Calculating enrichment...')
            fgOverlap, fgSize = res12Intersect, sum(res1Lengths)
            bgOverlap, bgSize = res2Count, sum(
                len(chromSeq) for chromName, chromSeq in genome.iteritems()
                if '_' not in chromName)
            if fgSize == 0:
                log.error(
                    'ERROR: Empty resource1 or no hits passed filtering step!')
                log.error(
                    'fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' %
                    (fgOverlap, fgSize, bgOverlap, bgSize))
            else:
                zscore = sequence_motif.zscore_hypergeometric(
                    fgOverlap, fgSize, bgOverlap, bgSize)
                pvalue = sequence_motif.pvalue_hypergeometric(
                    fgOverlap, fgSize, bgOverlap, bgSize)
                fold_enrichment = sequence_motif.fold_enrichment(
                    fgOverlap, fgSize, bgOverlap, bgSize)
                if opts.name1:
                    curName1 = opts.name1
                else:
                    curName1 = res1Name
                if opts.name2:
                    curName2 = opts.name2
                else:
                    curName2 = res2Name
                outstr = '\t'.join(
                    map(str, [
                        curName1, curName2, zscore, pvalue, fold_enrichment,
                        fgOverlap, fgSize, bgOverlap, bgSize
                    ]))

            #print '# Now sampling %s times...' % opts.sample_size
            #sys.stdout.flush()
            #bgMatches = 0
            #genomicSamples = sampling.sample_genome(genome, res1Lengths, sampleSize=opts.sample_size, excludeRepeat=False, excludeN=False)
            #for seq in genomicSamples:
            #    for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq, res2Map):
            #        if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
            #            if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq, overlapMap))) > 0:
            #                # instance of res2 found
            #                #if res2Size % 1000 == 0:
            #                #    print res2Size,
            #                bgMatches += 1
            #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, bgMatches, opts.sample_size)
            #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, bgMatches, opts.sample_size)
            #outstr = '\t'.join(map(str, [res1Name, res2Name, zscore, pvalue, resIntersectSize, res1Size, bgMatches, opts.sample_size]))

            #print 'Iterating over resource 2'
            #for seq2, annot2, edge2 in res2Map.edges():
            #    #sys.stdout.flush()
            #    #print '# iterating over res2 %s...' % res2Name, opts.overlap_resource,
            #    if not opts.filter_fxn2 or eval(opts.filter_fxn2):  # no filter2 or passed it
            #        if not opts.overlap_resource or len(list(get_overlap_edges_seq_msa(seq2, overlapMap))) > 0:
            #            # instance of res2 found
            #            #if res2Size % 1000 == 0:
            #            #    print res2Size,
            #            res2Size += 1
            #            res2SizeInBP += len(seq2)
            #avgRes2Size = float(res2SizeInBP) / res2Size
            #genomeSize = sum(map(len, genome.itervalues()))
            #genomeTotalPartitions = float(genomeSize) / avgRes2Size
            #print '# Calculating enrichment significance...'
            #zscore = sequence_motif.zscore_normal(resIntersectSize, res1Size, res2Size, genomeTotalPartitions)
            #pvalue = sequence_motif.pvalue_hypergeometric(resIntersectSize, res1Size, res2Size, genomeTotalPartitions)
            #outstr = '\t'.join(map(str, [zscore, pvalue, resIntersectSize, res1Size, res2Size, genomeTotalPartitions]))

            print outstr
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
Exemplo n.º 3
0
def main(argv=None):
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions """

    parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n"+main.__doc__)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option("--motif_file", '-m', dest="motif_file", type="string",
                      help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g.,
                      {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option("--consensus_file", '-c', dest="consensus_file", type="string",
                      help="""index file for consensus motifs (IUPAC format, one
                      per line in the file""")
    parser.add_option("--motif_key", '-k', dest="motif_key", type="string",
                      help="""The key for the current motif in motif_file, default=all""")
    parser.add_option('--zscore', '-z', dest='zscore', type='float', default=4.29,
                      help="""Calculate threshold score estimate from this Z-score. [default=%default]""")
    parser.add_option('--overlap_resource', dest='overlap_resource', type='string',
                      help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option('--bg_samples', dest='bg_samples', type='string',
                      help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""")
    parser.add_option('--no_bg', dest='no_bg', action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option('--report_region', type='string', help='Report the genomic regions of peaks with motif instances to this file')
    parser.add_option("--output_file", '-f', dest="output_file", type="string",
                      help="""Append the zscore information to the given file""")
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print 'Specify the peaks bed file!'
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print 'Specify the motif file!'
        sys.exit(-1)

    updated_motifs = False
    print '# Loading resources...'
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')
    
    allMotifs = {}
    # load pickled dict of motifs
    if opts.motif_file:
        allMotifs.update(pickle.load(file(opts.motif_file, 'rb')))
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                name, consensus = line.strip().split('\t')
                allMotifs.update({name:makePWMFromIUPAC(consensus)})

    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = allMotifs.keys()
    
    # write a header
    if opts.output_file:
        outstr = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
                            'hypergeo_pvalue', 'fgMatches', 'fgSize',
                            'fgMatches/fgSize', 'bgMatches', 'bgSize'])
        open(opts.output_file, 'w').write(outstr)

    for motifKey in allKeys:
        print '# Loaded motif %s...' % motifKey
        pwm = allMotifs[motifKey]
        if type(pwm) is list:
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print '# Calculating motif background distribution...'
            pwm.calculate_background(genome)
            updated_motifs = True
        print 'motif %s: length=%s threshold=%s mean=%s sd=%s' % (motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean, pwm._sd)
        allPeaks = open(args[0]).readlines()
        allPeaks = list(readBedLines(allPeaks))
        peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print '# Searching foreground sequence...'
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # check to see if the bed line overlaps the resource
            overlappingRegions = [region for region in peakRegions \
                                        if len(annotMap[region]) > 0]
            # run a search in each of the overlapping regions
            motifInstancesInOverlap = [pwm.find_in_region(region, zscore=opts.zscore) \
                                        for region in overlappingRegions]
            fgSize = len(overlappingRegions)
            # count the number of peaks with at least one motif instance
            fgMatches = len(filter(lambda matches: len(matches) > 0, motifInstancesInOverlap))
        else:
            matchingPeaks = [region for region in peakRegions \
                                        if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
            fgMatches = len(matchingPeaks)
            fgSize = len(allPeaks)
            
        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop) for region in matchingPeaks)

        if opts.no_bg:
            outstr = '\t'.join([args[0], motifKey] + map(str, [opts.zscore, fgMatches, fgSize, 
                                                      float(fgMatches)/fgSize]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >>sys.stderr, outstr
        else:
            print '# Searching background sequence...'
            sys.stdout.flush()
            if opts.bg_samples:
                try:
                    bgSamples = pickle.load(open(opts.bg_samples))
                except:
                    try:
                        bgSamples = parseFastaLines(open(opts.bg_samples))
                    except:
                        raise RuntimeError("specified background samples file %s"
                                           "was niether a pickled file nor a fasta file!" %
                                           opts.bg_samples)
                
            elif opts.overlap_resource:
                bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000)
            else:
                bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)
                #bgSamples = sample_genome(genome, peakSizes, sampleSize=100)
            bgSize = 0
            bgMatches = 0
            for region in bgSamples:
                bgSize += 1
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                    bgMatches += 1
    
            #calculate significance of foreground vs. background
            zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
            pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize)
            outstr = '\t'.join([args[0], motifKey] + map(str, ['thesh_z='+str(opts.zscore), zscore, pvalue, fgMatches, fgSize, float(fgMatches)/fgSize,bgMatches, bgSize]))
            if opts.output_file:
                open(opts.output_file, 'a').write(outstr + '\n')
            else:
                print >>sys.stderr, outstr
    if updated_motifs:
        print '# Saving motif info back to %s' % opts.motif_file
        pickle.dump(allMotifs, open(opts.motif_file, 'wb'))
Exemplo n.º 4
0
def _seq_overlaps_map(seq, overlapMap):
    """Return True if `seq` has at least one overlapping edge in `overlapMap`."""
    # Stop at the first hit instead of materializing every overlapping edge.
    for _edge in get_overlap_edges_seq_msa(seq, overlapMap):
        return True
    return False


def _append_output_line(path, line):
    """Append `line` plus a newline to the file at `path`, closing it afterwards."""
    outfile = open(path, 'a')
    try:
        outfile.write(line + '\n')
    finally:
        outfile.close()


def main():
    """ Calculate significance of the intersection between two sets of regions.
        Regions may be either BED files or pygr AnnotationDB's.
    """
    # NOTE: the docstring above is appended to the optparse usage string, so
    # it is user-facing text; maintainer notes belong in comments like these.
    #
    # Flow:
    #   1. Parse options; exactly two positional arguments are required, each
    #      a comma-separated list of resource names (empty names are skipped).
    #   2. Load the genome and, if requested, the overlap resource.
    #   3. For every (resource1, resource2) pair, count resource2 edges that
    #      overlap resource1 edges and print one tab-separated line:
    #        name1, name2, zscore, pvalue, fold_enrichment,
    #        fgOverlap, fgSize, bgOverlap, bgSize
    #      The same line is appended to --output_file when that is given.
    # Exits with status -1 on a wrong argument count or an unusable format.

    parser = optparse.OptionParser("%prog [options] resource1 resource2 \n"+main.__doc__)
    parser.add_option("--genome_resource", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option('--filter_fxn1', dest='filter_fxn1', type='string', default='',
                      help="""Use the given function as a filter on what is considered a hit from resource1.
                      available variables are seq1,annot1, edge1.  e.g.,
                      --filter_fxn1="len(seq1) > 10" """)
    parser.add_option('--filter_fxn2', dest='filter_fxn2', type='string', default='',
                      help="""Use the given function as a filter on what is considered a hit from resource2.
                      available variables are seq2,annot2, edge2.  e.g.,
                      --filter_fxn2="float(annot2.FDR) < .25" """)
    parser.add_option("--format1", dest="format1", type="string", default='BED',
                      help="""Format of resource1. One of [bed, resource, file] corresponding
                      to a single BED file, a worldbase resource ID, or a list of IDs in a file. default:%default""")
    parser.add_option("--format2", dest="format2", type="string", default='BED',
                      help="""Format of resource2. See help for format1.""")
    parser.add_option("--name1", dest="name1", type="string", default='',
                      help="""Override the name for resource1.  Default=file or resource name""")
    parser.add_option("--name2", dest="name2", type="string", default='',
                      help="""Override the name for resource2.  Default=file or resource name""")
    parser.add_option('--overlap_resource', dest='overlap_resource', type='string', default='',
                      help="""Only count regions (both res1 and res2) that overlap with this worldbase ID""")
    parser.add_option("--output_file", '-f', dest="output_file", type="string",
                      help="""Write the significance calculation to the given file""")
    parser.add_option("--quiet", '-q', dest="quiet", action="store_true",
                      help="""Suppress progress reports from stdout""")

    opts, args = parser.parse_args()
    # Single-argument print form so the line behaves the same as the old
    # `print opts, args` statement (it used to be emitted twice).
    print('%s %s' % (opts, args))
    log = Logger(opts.quiet)
    if len(args) != 2:
        parser.print_help()
        log.error('Need two genomic annotations! Please specify both resource1 and resource2 ')
        sys.exit(-1)

    # Normalize both formats once; the default 'BED' must match 'bed' below.
    opts.format1 = opts.format1.lower()
    opts.format2 = opts.format2.lower()

    # Unescape the filter expressions once, before any pair is processed.
    for key, value in escapedOperators.items():
        if opts.filter_fxn1:
            opts.filter_fxn1 = opts.filter_fxn1.replace(key, value)
        if opts.filter_fxn2:
            opts.filter_fxn2 = opts.filter_fxn2.replace(key, value)

    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    log.log('# Loading genome resource %s' % opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    overlapMap = None
    if opts.overlap_resource:
        log.log('# Loading overlap resources %s and %s' % (opts.overlap_resource, opts.overlap_resource + '_db'))
        overlapMap = worldbase(opts.overlap_resource)
        # Not used below, but still loaded so a missing '_db' resource fails
        # here exactly as it did before.
        overlapDB = worldbase(opts.overlap_resource + '_db')

    # Total length of chromosomes whose name has no '_'; computed lazily on
    # the first pair that needs it and then reused for every later pair.
    genomeSize = None

    AllRes1Names, AllRes2Names = args
    for res1Name in AllRes1Names.split(','):
        if len(res1Name) == 0:
            continue
        if opts.format1 == 'bed':
            log.log('# Building resource1 from BED file %s' % res1Name)
            res1File = open(res1Name)
            try:
                res1Table, res1DB, res1Map = makeResourceFromBed(res1File, genome)
            finally:
                res1File.close()
        elif opts.format1 == 'resource':
            log.log('# Loading resource1 %s from worldbase' % res1Name)
            res1Map = worldbase(res1Name)
        elif opts.format1 == 'file':
            res1ListFile = open(res1Name)
            try:
                res1_allVars = res1ListFile.readlines()
            finally:
                res1ListFile.close()
            log.log('# List for resource1 includes %s resources' % len(res1_allVars))
            # Nothing below consumes res1_allVars, so res1Map would be unbound
            # (or stale from a previous name); stop with a clear message.
            log.error('several resource iteration not implemented yet...')
            sys.exit(-1)
        else:
            parser.print_help()
            log.error('Unrecognized format specified for resource1: %s %s should be one of [bed, resource, file]' % (opts.format1, res1Name))
            sys.exit(-1)

        for res2Name in AllRes2Names.split(','):
            if len(res2Name) == 0:
                continue
            if opts.format2 == 'bed':
                log.log('# Building resource2 from BED file %s' % res2Name)
                res2File = open(res2Name)
                try:
                    res2Table, res2DB, res2Map = makeResourceFromBed(res2File, genome)
                finally:
                    res2File.close()
            elif opts.format2 == 'resource':
                log.log('# Loading resource2 %s from worldbase' % res2Name)
                res2Map = worldbase(res2Name)
                try:
                    res2DB = worldbase(res2Name + '_db')
                except Exception:
                    # A missing DB is tolerated: resource2 is then counted by
                    # iterating over its edges instead of using len(res2DB).
                    log.log('No DB found for resource2 at %s' % (res2Name + '_db'))
                    res2DB = None
            elif opts.format2 == 'file':
                # No res2Map can be built for this format yet, so stop here
                # rather than continue with an unbound or stale resource.
                log.error('several resource iteration not implemented yet...')
                sys.exit(-1)
            else:
                parser.print_help()
                log.error('Unrecognized format specified for resource2: %s %s should be one of [bed, resource, file]' % (opts.format2, res2Name))
                sys.exit(-1)

            res1Lengths = []
            res12Intersect = 0
            res2Count = 0

            # SECURITY NOTE: the filter expressions come straight from the
            # command line and are passed to eval(); only run this script
            # with trusted arguments. The evals stay inline in this function
            # so seq1/annot1/edge1 and seq2/annot2/edge2 are visible to them.
            log.log('# Calculating overlap between resources... Iterating over resource 1')
            sys.stdout.flush()
            for seq1, annot1, edge1 in res1Map.edges():
                if opts.filter_fxn1 and not eval(opts.filter_fxn1):
                    continue  # rejected by filter1
                if opts.overlap_resource and not _seq_overlaps_map(seq1, overlapMap):
                    continue  # overlap required but seq1 has none
                res1Lengths.append(len(annot1))
                for seq2, annot2, edge2 in get_overlap_edges_seq_msa(seq1, res2Map):
                    if opts.filter_fxn2 and not eval(opts.filter_fxn2):
                        continue  # rejected by filter2
                    if opts.overlap_resource and not _seq_overlaps_map(seq2, overlapMap):
                        continue  # overlap required but seq2 has none
                    res12Intersect += 1

            # Only iterate over res2 if there is no db resource for it or some
            # filtering is necessary; otherwise its size is simply len(res2DB).
            if not res2DB or opts.filter_fxn2 or opts.overlap_resource:
                log.log('# Iterating over resource 2')
                sys.stdout.flush()
                for seq2, annot2, edge2 in res2Map.edges():
                    if opts.filter_fxn2 and not eval(opts.filter_fxn2):
                        continue
                    if opts.overlap_resource and not _seq_overlaps_map(seq2, overlapMap):
                        continue
                    res2Count += 1
            else:
                res2Count = len(res2DB)

            log.log('# Calculating enrichment...')
            if genomeSize is None:
                genomeSize = sum(len(chromSeq) for chromName, chromSeq in genome.iteritems() if '_' not in chromName)
            # fgSize is the summed length of accepted resource1 annotations and
            # bgSize the summed chromosome length, while fgOverlap and
            # bgOverlap are edge counts.
            fgOverlap, fgSize = res12Intersect, sum(res1Lengths)
            bgOverlap, bgSize = res2Count, genomeSize
            if fgSize == 0:
                log.error('ERROR: Empty resource1 or no hits passed filtering step!')
                log.error('fgOverlap, fgSize, bgOverlap, bgSize = %s %s %s %s' % (fgOverlap, fgSize, bgOverlap, bgSize))
                # No result exists for this pair; skip it instead of printing
                # an unbound or stale line left over from a previous pair.
                continue

            zscore = sequence_motif.zscore_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize)
            pvalue = sequence_motif.pvalue_hypergeometric(fgOverlap, fgSize, bgOverlap, bgSize)
            fold_enrichment = sequence_motif.fold_enrichment(fgOverlap, fgSize, bgOverlap, bgSize)
            if opts.name1:
                curName1 = opts.name1
            else:
                curName1 = res1Name
            if opts.name2:
                curName2 = opts.name2
            else:
                curName2 = res2Name
            outstr = '\t'.join(map(str, [curName1, curName2, zscore, pvalue, fold_enrichment, fgOverlap, fgSize, bgOverlap, bgSize]))

            # Earlier sampling-based and genome-partition background estimates
            # lived here as commented-out code and were dropped as dead code.
            print(outstr)
            if opts.output_file:
                _append_output_line(opts.output_file, outstr)