示例#1
0
def main():
    """Command-line entry point for the MDSeqPos motif analysis.

    Reads ``[options] BEDFILE GENOME`` from ``sys.argv`` and then:

    1. builds a ``ChipRegions`` object from the BED file and genome name;
    2. optionally loads known motif databases (``-m``) and/or runs a de novo
       motif search (``-d``), saving the de novo motifs as XML;
    3. runs ``seqpos`` on the motifs (or the pre-clustered variant when
       ``-c`` is given together with ``-m cistrome.xml``);
    4. culls the motifs by p-value / maximum count, optionally filters them
       by species, and writes the HTML report plus ``motif_list.json`` into
       the output directory.

    Exits with status -1 (after printing the help) when fewer than two
    positional arguments are supplied, and with status 1 when there are no
    motifs to analyse.

    Raises:
        ValueError: if ``--width``, ``--pval`` or ``--maxmotif`` cannot be
            converted to a number.
    """
    #ALWAYS PRINT OUT VERSION INFO:
    print(mdseqpos.__version__)
    print('Library path:', mdseqpos.__file__)

    USAGE = """USAGE: MDSeqPos.py [options] BEDFILE GENOME
    
    BEDFILE - regions file
    GENOME  - assembly which the regions pertain to, e.g. 'hg18', 'mm9', etc.
              as defined in BUILD_DICT in lib/settings.py"""

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-g',
                      '--genome-dir',
                      dest="genome_dir",
                      default=None,
                      help="Path to the genome assembly dir")
    parser.add_option('-d',
                      '--denovo',
                      default=False,
                      action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option(
        '-m',
        '--known-motifs',
        default=None,
        help=
        "comma separated list of known motifs dbs to use in the motif search, e.g. -m pbm.xml,transfac.xml"
    )
    parser.add_option(
        '-n',
        '--new-motifs',
        default='denovo.xml',
        help=
        "name of the output XML file which stores new motifs found during adenovo search, e.g. -n foo.xml (default: denovo.xml)"
    )
    parser.add_option(
        '-p',
        '--pval',
        default=0.001,
        help="pvalue cutoff for motif significance, (default: 0.001)")
    parser.add_option(
        '-s',
        '--species-list',
        default=None,
        help=
        "name of species to filter the results with--if multuple species, comma-separate them, e.g. hs,mm,dm"
    )
    parser.add_option(
        '-w',
        '--width',
        default=600,
        help=
        "width of the region to be scanned for motifs; depends on resoution of assay, (default: 600 basepairs)"
    )
    parser.add_option('-v',
                      '--verbose',
                      default=False,
                      action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option(
        '--hcluster',
        type='float',
        default=0.8,
        help=
        "The similarity cutoff for hierarchical clustering of the result, (default: 0.8, The higher, the more groups, 0 ~ 1)"
    )
    parser.add_option(
        '-c',
        '--cluster',
        default=False,
        action="store_true",
        help=
        "This option only for know-motifs cistrome.xml, If you want to use pre-clustered database to accelerate seqpos. default (not set): False"
    )
    parser.add_option(
        '--maxmotif',
        default=0,
        help="maximum number of motifs to report, (default: 0, i.e. no max)")
    parser.add_option('-O',
                      '--output-directory',
                      default="results",
                      help="output directory name (default: results)")

    # Parse the command line. sys.argv is passed whole, so args[0] is the
    # script name and the two positionals live at args[1] and args[2].
    (opts, args) = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name = args[1]
    genome = args[2]
    _DEBUG = opts.verbose
    output_dir = opts.output_directory

    # These options have no optparse type, so they arrive as strings when
    # given on the command line. Convert them once, up front, so a malformed
    # value fails before the expensive region loading / motif search.
    width = int(opts.width)
    pval = float(opts.pval)
    max_motifs = int(opts.maxmotif)

    #READ in the regions that are specified in the BED file
    print("read regions start time: %s" % time.ctime())
    #HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name,
                               genome,
                               genome_dir=opts.genome_dir)
    print("read regions end time: %s" % time.ctime())

    #LOAD the motifs (both known and denovo)
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    #Run seqpos stats on all_motifs
    print("starting seqpos stats...(time: %s)" % time.ctime())
    if new_motifs:
        for m in new_motifs:
            m.seqpos(chip_regions, width=width)
    if opts.cluster and opts.known_motifs == 'cistrome.xml':
        # Only cistrome.xml has a pre-clustered companion database.
        known_motifs = seqpos_cluster_known_motifs(known_motifs, chip_regions,
                                                   width, pval)
    elif known_motifs:
        for m in known_motifs:
            m.seqpos(chip_regions, width=width)
    print("completed seqpos stats...(time: %s)" % time.ctime())

    #Combine both known and new motifs
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Without -m or -d (or when nothing was loaded) there is nothing to
    # cull; stop cleanly instead of failing with AttributeError on None.
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    #CULL the results to see only the relevant results, and output
    sig_motifs = all_motifs.cull(pval, max_motifs)

    #filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    save_to_html_plain(output_dir, sig_motifs, opts.hcluster)

    # Serialise every motif before opening the file, then write one JSON
    # document per line; the context manager closes the file even if a
    # write fails.
    json_list = [t.to_json() for t in sig_motifs]
    with open(os.path.join(output_dir, 'motif_list.json'), 'w') as jsonf:
        jsonf.writelines(js + '\n' for js in json_list)
示例#2
0
文件: MDSeqPos.py 项目: cfce/chilin
def main():
    """Command-line entry point for the MDSeqPos motif analysis.

    Reads ``[options] BEDFILE GENOME`` from ``sys.argv`` and then:

    1. builds a ``ChipRegions`` object from the BED file and genome name;
    2. optionally loads known motif databases (``-m``) and/or runs a de novo
       motif search (``-d``), saving the de novo motifs as XML;
    3. runs ``seqpos`` on the motifs (or the pre-clustered variant when
       ``-c`` is given together with ``-m cistrome.xml``);
    4. culls the motifs by p-value / maximum count, optionally filters them
       by species, and writes the HTML report plus ``motif_list.json`` into
       the output directory.

    Exits with status -1 (after printing the help) when fewer than two
    positional arguments are supplied, and with status 1 when there are no
    motifs to analyse.

    Every print below passes a single parenthesised argument, which produces
    the same output under the Python 2 print statement and the Python 3
    print function.

    Raises:
        ValueError: if ``--width``, ``--pval`` or ``--maxmotif`` cannot be
            converted to a number.
    """
    #ALWAYS PRINT OUT VERSION INFO:
    print(mdseqpos.__version__)
    print('Library path: %s' % mdseqpos.__file__)
    print('')

    USAGE = """USAGE: MDSeqPos.py [options] BEDFILE GENOME
    
    BEDFILE - regions file
    GENOME  - assembly which the regions pertain to, e.g. 'hg18', 'mm9', etc.
              as defined in BUILD_DICT in lib/settings.py"""

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-g', '--genome-dir', dest="genome_dir", default=None,
                      help="Path to the genome assembly dir")
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new motifs found during adenovo search, e.g. -n foo.xml (default: denovo.xml)")
    parser.add_option('-p', '--pval', default=0.001,
                      help="pvalue cutoff for motif significance, (default: 0.001)")
    parser.add_option('-s', '--species-list', default=None,
                      help="name of species to filter the results with--if multuple species, comma-separate them, e.g. hs,mm,dm")
    parser.add_option('-w', '--width', default=600,
                      help="width of the region to be scanned for motifs; depends on resoution of assay, (default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--hcluster', type='float', default=0.8,
                      help="The similarity cutoff for hierarchical clustering of the result, (default: 0.8, The higher, the more groups, 0 ~ 1)")
    parser.add_option('-c', '--cluster', default=False, action="store_true",
                      help="This option only for know-motifs cistrome.xml, If you want to use pre-clustered database to accelerate seqpos. default (not set): False")
    parser.add_option('--maxmotif', default=0,
                      help="maximum number of motifs to report, (default: 0, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    # Parse the command line. sys.argv is passed whole, so args[0] is the
    # script name and the two positionals live at args[1] and args[2].
    (opts, args) = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name = args[1]
    genome = args[2]
    _DEBUG = opts.verbose
    output_dir = opts.output_directory
    print(opts.genome_dir)

    # These options have no optparse type, so they arrive as strings when
    # given on the command line. Convert them once, up front, so a malformed
    # value fails before the expensive region loading / motif search.
    width = int(opts.width)
    pval = float(opts.pval)
    max_motifs = int(opts.maxmotif)

    #READ in the regions that are specified in the BED file
    print("read regions start time: %s" % time.ctime())
    #HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome, genome_dir=opts.genome_dir)
    print("read regions end time: %s" % time.ctime())

    #LOAD the motifs (both known and denovo)
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    #Run seqpos stats on all_motifs
    print("starting seqpos stats...(time: %s)" % time.ctime())
    if new_motifs:
        for m in new_motifs:
            m.seqpos(chip_regions, width=width)
    if opts.cluster and opts.known_motifs == 'cistrome.xml':
        # Only cistrome.xml has a pre-clustered companion database.
        known_motifs = seqpos_cluster_known_motifs(known_motifs, chip_regions,
                                                   width, pval)
    elif known_motifs:
        for m in known_motifs:
            m.seqpos(chip_regions, width=width)
    print("completed seqpos stats...(time: %s)" % time.ctime())

    #Combine both known and new motifs
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Without -m or -d (or when nothing was loaded) there is nothing to
    # cull; stop cleanly instead of failing with AttributeError on None.
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    #CULL the results to see only the relevant results, and output
    sig_motifs = all_motifs.cull(pval, max_motifs)

    #filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    save_to_html_plain(output_dir, sig_motifs, opts.hcluster)

    # Serialise every motif before opening the file, then write one JSON
    # document per line; the context manager closes the file even if a
    # write fails.
    json_list = [t.to_json() for t in sig_motifs]
    with open(os.path.join(output_dir, 'motif_list.json'), 'w') as jsonf:
        jsonf.writelines(js + '\n' for js in json_list)
示例#3
0
def main():
    """Command-line entry point for the MDSeqPos motif analysis.

    Reads ``[options] BEDFILE GENOME`` from ``sys.argv`` and then:

    1. strips double quotes from the two positional arguments, normalises
       backslashes to forward slashes in the output directory and genome
       values, and creates the output directory when it is missing;
    2. builds a ``ChipRegions`` object from the BED file and genome;
    3. optionally loads known motif databases (``-m``) and/or runs a de novo
       motif search (``-d``), saving the de novo motifs as XML;
    4. runs ``seqpos`` on every motif, culls by p-value / maximum count,
       optionally filters by species, and writes the HTML report.

    Progress timestamps are printed only with ``-v``. Exits with status -1
    (after printing the help) when fewer than two positional arguments are
    supplied, and with status 1 when no motifs are available.

    Every print below passes a single parenthesised argument, which produces
    the same output under the Python 2 print statement and the Python 3
    print function.

    Raises:
        ValueError: if ``--width``, ``--pval`` or ``--maxmotif`` cannot be
            converted to a number.
    """
    #ALWAYS PRINT OUT VERSION INFO:
    print(mdseqpos.__version__)

    # NOTE(review): the backslash continuations inside the help strings below
    # embed the next line's leading indentation in the help text. They are
    # kept byte-identical here; consider adjacent string literals instead.
    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use \
                      in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new \
                      motifs found during adenovo search, e.g. -n foo.xml \
                      (default: denovo.xml)")
    parser.add_option('-p', '--pval', default=0.001,
                      help="pvalue cutoff for motif significance, \
                      (default: 0.001)")
    parser.add_option('-s', '--species-list', default=None, help="name of \
                      species to filter the results with--if multuple \
                      species, comma-separate them, e.g. hs,mm,dm")
    parser.add_option('-w', '--width', default=600,
                      help="width of the region to be scanned for motifs; \
                      depends on resoution of assay, (default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--maxmotif', default=-1,
                      help="maximum number of motifs to report, \
                      (default: -1, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    # Parse the command line. sys.argv is passed whole, so args[0] is the
    # script name and the two positionals live at args[1] and args[2].
    (opts, args) = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)

    # Quoted-argument fix: drop any literal double quotes that survived the
    # shell, then normalise backslashes to forward slashes.
    bedfile_name = args[1].replace('"', '')
    genome = args[2].replace('"', '').replace('\\', '/')
    _DEBUG = opts.verbose
    output_dir = opts.output_directory.replace('\\', '/')

    # width is needed by both the de novo search and every seqpos call, so
    # convert it once rather than once per motif.
    width = int(opts.width)

    # makedirs (rather than mkdir) so a nested output path also works.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    #READ in the regions that are specified in the BED file
    if _DEBUG:
        print("read regions start time: %s" % time.ctime())
    #HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome)
    if _DEBUG:
        print("read regions end time: %s" % time.ctime())

    #LOAD the motifs (both known and denovo)
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        if _DEBUG:
            print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        if _DEBUG:
            print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    #Combine both known and new motifs
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Identity test: `== None` would go through any custom __eq__ on the
    # motif list. sys.exit is used because the bare `exit` helper is only
    # installed by the site module.
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    #Run seqpos stats on all_motifs
    if _DEBUG:
        print("starting seqpos stats...(time: %s)" % time.ctime())
    for m in all_motifs:
        m.seqpos(chip_regions, width=width)
    if _DEBUG:
        print("completed seqpos stats...(time: %s)" % time.ctime())

    #CULL the results to see only the relevant results, and output
    sig_motifs = all_motifs.cull(float(opts.pval), int(opts.maxmotif))

    #filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    dists = calc_motif_dist(sig_motifs)
    save_to_html_plain(output_dir, sig_motifs, dists)