示例#1
0
def seqpos_cluster_known_motifs(known_motifs, chip_regions, width, cutoff):
    """Run seqpos on pre-clustered known motifs, skipping weak clusters.

    The cluster table ``<settings.DEPLOY_DIR>/database/cistrome.cluster`` is
    read as tab-separated ``motif_id<TAB>cluster_id`` lines.  For every
    cluster, the first listed motif that is present in ``known_motifs`` acts
    as the cluster representative and is scanned first.  Only when its
    p-value is ``<= cutoff`` are the remaining motifs of that cluster scanned
    and returned; otherwise the whole cluster is dropped without scanning its
    other members.  Motifs whose id does not appear in the cluster table are
    not scanned and not returned.

    Args:
        known_motifs: iterable of motif objects exposing ``id``,
            ``seqpos(chip_regions, width)`` and ``seqpos_results``.
        chip_regions: regions object handed unchanged to ``motif.seqpos``.
        width: scan width handed unchanged to ``motif.seqpos``.
        cutoff: largest representative p-value for which a cluster is kept.

    Returns:
        A ``MotifList`` holding every motif of every retained cluster.

    Raises:
        ValueError: if a non-blank line of the cluster table does not consist
            of exactly two tab-separated fields.
    """
    cluster_path = os.path.join(settings.DEPLOY_DIR, 'database',
                                'cistrome.cluster')
    id2motif = {}
    for m in known_motifs:
        id2motif[m.id] = m

    cluster_motifs = {}  # cluster_id -> motifs, in cluster-table order
    # The context manager closes the table even when a line fails to parse.
    with open(cluster_path, 'r') as cluster_file:
        for line in cluster_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            mid, cid = line.split('\t')
            if mid in id2motif:
                cluster_motifs.setdefault(cid, []).append(id2motif[mid])

    filtered_motifs = []
    for motifs in cluster_motifs.values():
        representative = motifs[0]
        representative.seqpos(chip_regions, width)
        if representative.seqpos_results['pvalue'] <= cutoff:
            filtered_motifs.append(representative)
            # The representative passed, so the rest of the cluster is
            # scanned and kept regardless of the members' own p-values.
            for m in motifs[1:]:
                m.seqpos(chip_regions, width)
                filtered_motifs.append(m)
    return MotifList(filtered_motifs)
示例#2
0
def read_known_motifs(motif_dbs, _DEBUG=False):
    """Load and concatenate the motifs stored in the given XML databases.

    Each entry of ``motif_dbs`` is a file name resolved relative to
    ``<settings.DEPLOY_DIR>/database``.  The databases are read in the order
    given and their motifs appended to a single ``MotifList``, which is
    returned.  When ``_DEBUG`` is true, a timestamped message is printed
    before and after each database is loaded.
    """
    database_dir = os.path.join(settings.DEPLOY_DIR, 'database')
    collected = MotifList()

    for db_name in motif_dbs:
        if _DEBUG:
            print("loading (time): %s (%s)" % (db_name, time.ctime()))

        # Read one database into its own list, then merge it in.
        db_motifs = MotifList()
        db_path = os.path.join(database_dir, db_name)
        db_motifs.from_xml_file(db_path)
        collected.extend(db_motifs)

        if _DEBUG:
            print("load Complete (time): %s (%s)" % (db_name, time.ctime()))

    return collected
示例#3
0
文件: MDSeqPos.py 项目: cfce/chilin
def read_known_motifs(motif_dbs, _DEBUG=False):
    """Load and concatenate the motifs stored in the given XML databases.

    Args:
        motif_dbs: iterable of XML file names, each resolved relative to
            ``<settings.DEPLOY_DIR>/database``.
        _DEBUG: when true, print a timestamped message before and after each
            database is loaded.

    Returns:
        A ``MotifList`` with the motifs of all databases, in the order the
        databases were given.
    """
    DATA_DIR = os.path.join(settings.DEPLOY_DIR, 'database')

    known_motifs = MotifList()
    for db in motif_dbs:
        # print(...) with a single pre-formatted string produces the same
        # output under Python 2 and is also valid Python 3 syntax.
        if _DEBUG:
            print("loading (time): %s (%s)" % (db, time.ctime()))
        tmp = MotifList()
        tmp.from_xml_file(os.path.join(DATA_DIR, db))
        known_motifs.extend(tmp)
        if _DEBUG:
            print("load Complete (time): %s (%s)" % (db, time.ctime()))
    return known_motifs
示例#4
0
文件: MDSeqPos.py 项目: cfce/chilin
def main():
    """Command-line entry point of MDSeqPos.

    Steps, in order:
      1. print the mdseqpos version and library path;
      2. parse the options plus the two positionals BEDFILE and GENOME
         (prints the help and exits with status -1 if either is missing);
      3. read the regions of BEDFILE into a ``ChipRegions``;
      4. load the known motif databases given with ``-m`` and, with ``-d``,
         run the de novo search and save its motifs as XML in the output
         directory;
      5. run seqpos on every motif (with ``-c`` and ``-m cistrome.xml`` the
         known motifs go through ``seqpos_cluster_known_motifs`` instead);
      6. cull by p-value / maximum count, optionally filter by species;
      7. write the HTML report and ``motif_list.json`` (one JSON document per
         line) into the output directory.

    Exits with status 1 if neither known nor de novo motifs are available.
    """
    #ALWAYS PRINT OUT VERSION INFO:
    # Single-string print(...) calls give identical output on Python 2 and
    # are also valid Python 3 syntax.
    print(mdseqpos.__version__)
    print('Library path: %s' % mdseqpos.__file__)
    print('')

    USAGE = """USAGE: MDSeqPos.py [options] BEDFILE GENOME
    
    BEDFILE - regions file
    GENOME  - assembly which the regions pertain to, e.g. 'hg18', 'mm9', etc.
              as defined in BUILD_DICT in lib/settings.py"""

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-g', '--genome-dir', dest="genome_dir", default=None,
                      help="Path to the genome assembly dir")
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new motifs found during adenovo search, e.g. -n foo.xml (default: denovo.xml)")
    parser.add_option('-p', '--pval', default=0.001,
                      help="pvalue cutoff for motif significance, (default: 0.001)")
    parser.add_option('-s', '--species-list', default=None,
                      help="name of species to filter the results with--if multuple species, comma-separate them, e.g. hs,mm,dm")
    parser.add_option('-w', '--width', default=600,
                      help="width of the region to be scanned for motifs; depends on resoution of assay, (default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--hcluster', type='float', default=0.8,
                      help="The similarity cutoff for hierarchical clustering of the result, (default: 0.8, The higher, the more groups, 0 ~ 1)")
    parser.add_option('-c', '--cluster', default=False, action="store_true",
                      help="This option only for know-motifs cistrome.xml, If you want to use pre-clustered database to accelerate seqpos. default (not set): False")
    parser.add_option('--maxmotif', default=0,
                      help="maximum number of motifs to report, (default: 0, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    #parse the command line options
    # sys.argv is passed whole, so args[0] is the program name and the two
    # positionals sit at indices 1 and 2.
    (opts, args) = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name = args[1]
    genome = args[2]
    _DEBUG = opts.verbose
    output_dir = opts.output_directory
    print(opts.genome_dir)

    # These options carry no optparse type, so command-line values arrive as
    # strings.  Convert them once, up front: a malformed value fails before
    # any expensive work, and the conversion is not repeated per motif.
    width = int(opts.width)
    pval = float(opts.pval)
    maxmotif = int(opts.maxmotif)

    #READ in the regions that are specified in the BED file
    print("read regions start time: %s" % time.ctime())
    #HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome, genome_dir=opts.genome_dir)
    print("read regions end time: %s" % time.ctime())

    #LOAD the motifs (both known and denovo)
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    #Run seqpos stats on all_motifs
    print("starting seqpos stats...(time: %s)" % time.ctime())
    if new_motifs:
        for m in new_motifs:
            m.seqpos(chip_regions, width=width)
    if opts.cluster and opts.known_motifs == 'cistrome.xml': #only for cistrome.xml to use cistrome.cluster
        known_motifs = seqpos_cluster_known_motifs(known_motifs, chip_regions,
                                                   width, pval)
    elif known_motifs:
        for m in known_motifs:
            m.seqpos(chip_regions, width=width)
    print("completed seqpos stats...(time: %s)" % time.ctime())

    #Combine both known and new motifs
    all_motifs = None
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Without -m and -d (or with an empty known set and no de novo run) there
    # is nothing to report; stop cleanly instead of failing on None.cull().
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    #CULL the results to see only the relevant results, and output
    sig_motifs = all_motifs.cull(pval, maxmotif)

    #filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    #dists = calc_motif_dist_pcc(sig_motifs)
    #save_to_html(output_dir, sig_motifs, dists)
    save_to_html_plain(output_dir, sig_motifs, opts.hcluster)

    # Serialise everything before opening the file, so a failing to_json()
    # does not leave a truncated motif_list.json behind.
    json_list = [t.to_json() for t in sig_motifs]
    with open(os.path.join(output_dir, 'motif_list.json'), 'w') as jsonf:
        for js in json_list:
            jsonf.write(js + '\n')
示例#5
0
def main():
    """Command-line entry point of MDSeqPos.

    Steps, in order:
      1. print the mdseqpos version and library path;
      2. parse the options plus the two positionals BEDFILE and GENOME
         (prints the help and exits with status -1 if either is missing);
      3. read the regions of BEDFILE into a ``ChipRegions``;
      4. load the known motif databases given with ``-m`` and, with ``-d``,
         run the de novo search and save its motifs as XML in the output
         directory;
      5. run seqpos on every motif (with ``-c`` and ``-m cistrome.xml`` the
         known motifs go through ``seqpos_cluster_known_motifs`` instead);
      6. cull by p-value / maximum count, optionally filter by species;
      7. write the HTML report and ``motif_list.json`` (one JSON document per
         line) into the output directory.

    Exits with status 1 if neither known nor de novo motifs are available.
    """
    #ALWAYS PRINT OUT VERSION INFO:
    print(mdseqpos.__version__)
    print('Library path:', mdseqpos.__file__)

    USAGE = """USAGE: MDSeqPos.py [options] BEDFILE GENOME
    
    BEDFILE - regions file
    GENOME  - assembly which the regions pertain to, e.g. 'hg18', 'mm9', etc.
              as defined in BUILD_DICT in lib/settings.py"""

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-g', '--genome-dir', dest="genome_dir", default=None,
                      help="Path to the genome assembly dir")
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option(
        '-m', '--known-motifs', default=None,
        help="comma separated list of known motifs dbs to use in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option(
        '-n', '--new-motifs', default='denovo.xml',
        help="name of the output XML file which stores new motifs found during adenovo search, e.g. -n foo.xml (default: denovo.xml)")
    parser.add_option(
        '-p', '--pval', default=0.001,
        help="pvalue cutoff for motif significance, (default: 0.001)")
    parser.add_option(
        '-s', '--species-list', default=None,
        help="name of species to filter the results with--if multuple species, comma-separate them, e.g. hs,mm,dm")
    parser.add_option(
        '-w', '--width', default=600,
        help="width of the region to be scanned for motifs; depends on resoution of assay, (default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option(
        '--hcluster', type='float', default=0.8,
        help="The similarity cutoff for hierarchical clustering of the result, (default: 0.8, The higher, the more groups, 0 ~ 1)")
    parser.add_option(
        '-c', '--cluster', default=False, action="store_true",
        help="This option only for know-motifs cistrome.xml, If you want to use pre-clustered database to accelerate seqpos. default (not set): False")
    parser.add_option(
        '--maxmotif', default=0,
        help="maximum number of motifs to report, (default: 0, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    #parse the command line options
    # sys.argv is passed whole, so args[0] is the program name and the two
    # positionals sit at indices 1 and 2.
    (opts, args) = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name = args[1]
    genome = args[2]
    _DEBUG = opts.verbose
    output_dir = opts.output_directory

    # These options carry no optparse type, so command-line values arrive as
    # strings.  Convert them once, up front: a malformed value fails before
    # any expensive work, and the conversion is not repeated per motif.
    width = int(opts.width)
    pval = float(opts.pval)
    maxmotif = int(opts.maxmotif)

    #READ in the regions that are specified in the BED file
    print("read regions start time: %s" % time.ctime())
    #HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name,
                               genome,
                               genome_dir=opts.genome_dir)
    print("read regions end time: %s" % time.ctime())

    #LOAD the motifs (both known and denovo)
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    #Run seqpos stats on all_motifs
    print("starting seqpos stats...(time: %s)" % time.ctime())
    if new_motifs:
        for m in new_motifs:
            m.seqpos(chip_regions, width=width)
    if opts.cluster and opts.known_motifs == 'cistrome.xml':  #only for cistrome.xml to use cistrome.cluster
        known_motifs = seqpos_cluster_known_motifs(known_motifs, chip_regions,
                                                   width, pval)
    elif known_motifs:
        for m in known_motifs:
            m.seqpos(chip_regions, width=width)
    print("completed seqpos stats...(time: %s)" % time.ctime())

    #Combine both known and new motifs
    all_motifs = None
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Without -m and -d (or with an empty known set and no de novo run) there
    # is nothing to report; stop cleanly instead of failing on None.cull().
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    #CULL the results to see only the relevant results, and output
    sig_motifs = all_motifs.cull(pval, maxmotif)

    #filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    #dists = calc_motif_dist_pcc(sig_motifs)
    #save_to_html(output_dir, sig_motifs, dists)
    save_to_html_plain(output_dir, sig_motifs, opts.hcluster)

    # Serialise everything before opening the file, so a failing to_json()
    # does not leave a truncated motif_list.json behind.
    json_list = [t.to_json() for t in sig_motifs]
    with open(os.path.join(output_dir, 'motif_list.json'), 'w') as jsonf:
        for js in json_list:
            jsonf.write(js + '\n')
示例#6
0
        new_motifs_path = None
    else:
        if opts.new_motifs_dir is None:
            new_motifs_path = opts.new_motifs_file
        else:
            new_motifs_path = os.path.join(opts.new_motifs_dir,
                                           opts.new_motifs_file)

    # set BED file path
    bed_path = os.path.join(opts.bed_dir, opts.bed_file)

    # set FASTA file path
    fasta_path = os.path.join(opts.fasta_dir, opts.fasta_file)

    # retrieve known motifs
    motifs = MotifList()
    if known_motifs_path is not None:
        motifs.from_xml_file(known_motifs_path)

    # retrieve new motifs if new motifs file is specified by user
    if new_motifs_path is not None:
        new_motifs = MotifList()
        new_motifs.from_xml_file(new_motifs_path)
        #motifs.append(new_motifs)
        motifs += new_motifs

    # scan all motifs for the desired motif ID
    if opts.pssm_file is None:
        for motif in motifs:
            if motif.id == motif_id:
                desired_motif = motif
示例#7
0
文件: MotifScan.py 项目: cfce/chilin
 if opts.new_motifs_file == 'NULL':
     new_motifs_path = None
 else:
     if opts.new_motifs_dir is None:
         new_motifs_path = opts.new_motifs_file
     else:
         new_motifs_path = os.path.join(opts.new_motifs_dir, opts.new_motifs_file)
         
 # set BED file path
 bed_path = os.path.join(opts.bed_dir, opts.bed_file)
     
 # set FASTA file path
 fasta_path = os.path.join(opts.fasta_dir, opts.fasta_file)
 
 # retrieve known motifs
 motifs = MotifList()
 if known_motifs_path is not None:
     motifs.from_xml_file(known_motifs_path)
 
 # retrieve new motifs if new motifs file is specified by user
 if new_motifs_path is not None:
     new_motifs = MotifList()
     new_motifs.from_xml_file(new_motifs_path)
     #motifs.append(new_motifs)
     motifs += new_motifs
 
 # scan all motifs for the desired motif ID
 if opts.pssm_file is None:
     for motif in motifs:
         if motif.id == motif_id:
             desired_motif = motif
示例#8
0
def main():
    """Command-line entry point of MDSeqPos.

    Steps, in order:
      1. print the mdseqpos version;
      2. parse the options plus the two positionals BEDFILE and GENOME
         (prints the help and exits with status -1 if either is missing),
         stripping double quotes from both and normalising backslashes to
         forward slashes in GENOME and the output directory;
      3. create the output directory if it does not exist;
      4. read the regions of BEDFILE into a ``ChipRegions``;
      5. load the known motif databases given with ``-m`` and, with ``-d``,
         run the de novo search and save its motifs as XML in the output
         directory;
      6. run seqpos on every motif, cull by p-value / maximum count and
         optionally filter by species;
      7. compute the motif distances and write the HTML report.

    Exits with status 1 if neither known nor de novo motifs are available.
    Timing messages are printed only with ``-v``.
    """
    #ALWAYS PRINT OUT VERSION INFO:
    # Single-string print(...) calls give identical output on Python 2 and
    # are also valid Python 3 syntax.
    print(mdseqpos.__version__)

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use \
                      in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new \
                      motifs found during adenovo search, e.g. -n foo.xml \
                      (default: denovo.xml)")
    parser.add_option('-p', '--pval', default=0.001,
                      help="pvalue cutoff for motif significance, \
                      (default: 0.001)")
    parser.add_option('-s', '--species-list', default=None, help="name of \
                      species to filter the results with--if multuple \
                      species, comma-separate them, e.g. hs,mm,dm")
    parser.add_option('-w', '--width', default=600,
                      help="width of the region to be scanned for motifs; \
                      depends on resoution of assay, (default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--maxmotif', default=-1,
                      help="maximum number of motifs to report, \
                      (default: -1, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    #parse the command line options
    # sys.argv is passed whole, so args[0] is the program name and the two
    # positionals sit at indices 1 and 2.
    (opts, args) = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name = args[1]
    genome = args[2]

    # These options carry no optparse type, so command-line values arrive as
    # strings.  Convert them once, up front: a malformed value fails before
    # any expensive work, and the conversion is not repeated per motif.
    width = int(opts.width)
    pval = float(opts.pval)
    maxmotif = int(opts.maxmotif)

    #quoted bug fix
    if bedfile_name and '"' in bedfile_name:
        bedfile_name = bedfile_name.replace('"','')
    if genome and '"' in genome:
        genome = genome.replace('"','')
    _DEBUG = opts.verbose
    output_dir = opts.output_directory

    output_dir = output_dir.replace('\\', '/')
    genome = genome.replace('\\', '/')

    # makedirs also creates missing parent directories, so a nested
    # -O path such as "out/run1" works.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    #READ in the regions that are specified in the BED file
    if _DEBUG:
        print("read regions start time: %s" % time.ctime())
    #HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome)
    if _DEBUG:
        print("read regions end time: %s" % time.ctime())

    #LOAD the motifs (both known and denovo)
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        if _DEBUG:
            print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        if _DEBUG:
            print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    #Combine both known and new motifs
    all_motifs = None
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Identity test for None; sys.exit instead of the site-provided exit(),
    # which is not guaranteed to exist (e.g. under "python -S").
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    #Run seqpos stats on all_motifs
    if _DEBUG:
        print("starting seqpos stats...(time: %s)" % time.ctime())
    for m in all_motifs:
        m.seqpos(chip_regions, width=width)
    if _DEBUG:
        print("completed seqpos stats...(time: %s)" % time.ctime())

    #CULL the results to see only the relevant results, and output
    sig_motifs = all_motifs.cull(pval, maxmotif)

    #filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    dists = calc_motif_dist(sig_motifs)
    #save_to_html(output_dir, sig_motifs, dists)
    save_to_html_plain(output_dir, sig_motifs, dists)