def main():
    """Run the MDSeqPos command-line pipeline.

    Parses ``sys.argv`` (options plus the BEDFILE and GENOME positionals),
    loads the regions into a ``ChipRegions`` object, gathers motifs from the
    known-motif databases (``-m``) and/or a de novo search (``-d``), runs
    the seqpos statistics on them, culls the result by p-value and maximum
    count, optionally filters by species, and finally writes the HTML
    report and ``motif_list.json`` into the output directory.

    Exits with status -1 (after printing the help) when fewer than two
    positional arguments are supplied, and with status 1 when no motif set
    is available (for example when neither ``-m`` nor ``-d`` was given).
    Malformed numeric option values are rejected by optparse itself.
    """
    # ALWAYS print out version info.
    print(mdseqpos.__version__)
    print('Library path:', mdseqpos.__file__)

    USAGE = (
        "USAGE: MDSeqPos.py [options] BEDFILE GENOME\n"
        "    BEDFILE - regions file\n"
        "    GENOME - assembly which the regions pertain to, e.g. 'hg18', "
        "'mm9', etc.\n"
        "             as defined in BUILD_DICT in lib/settings.py"
    )
    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-g', '--genome-dir', dest="genome_dir", default=None,
                      help="Path to the genome assembly dir")
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use "
                           "in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new "
                           "motifs found during adenovo search, e.g. "
                           "-n foo.xml (default: denovo.xml)")
    # Numeric options are typed so that optparse validates them up front and
    # reports a usage error instead of a ValueError traceback later on.
    parser.add_option('-p', '--pval', type='float', default=0.001,
                      help="pvalue cutoff for motif significance, "
                           "(default: 0.001)")
    parser.add_option('-s', '--species-list', default=None,
                      help="name of species to filter the results with--if "
                           "multuple species, comma-separate them, "
                           "e.g. hs,mm,dm")
    parser.add_option('-w', '--width', type='int', default=600,
                      help="width of the region to be scanned for motifs; "
                           "depends on resoution of assay, "
                           "(default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--hcluster', type='float', default=0.8,
                      help="The similarity cutoff for hierarchical clustering "
                           "of the result, (default: 0.8, The higher, the "
                           "more groups, 0 ~ 1)")
    parser.add_option('-c', '--cluster', default=False, action="store_true",
                      help="This option only for know-motifs cistrome.xml, "
                           "If you want to use pre-clustered database to "
                           "accelerate seqpos. default (not set): False")
    parser.add_option('--maxmotif', type='int', default=0,
                      help="maximum number of motifs to report, "
                           "(default: 0, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    # Parse the command line options.  The whole of sys.argv is handed over,
    # so args[0] is the script name and the positionals start at args[1].
    opts, args = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name, genome = args[1], args[2]
    _DEBUG = opts.verbose
    output_dir = opts.output_directory
    width = opts.width

    # The XML and JSON outputs below are written straight into output_dir,
    # so make sure it exists before anything tries to write there.
    os.makedirs(output_dir, exist_ok=True)

    # READ in the regions that are specified in the BED file.
    print("read regions start time: %s" % time.ctime())
    # HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome,
                               genome_dir=opts.genome_dir)
    print("read regions end time: %s" % time.ctime())

    # LOAD the motifs (both known and denovo).
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    # Run seqpos stats on all motifs.
    print("starting seqpos stats...(time: %s)" % time.ctime())
    if new_motifs:
        for m in new_motifs:
            m.seqpos(chip_regions, width=width)

    if opts.cluster and opts.known_motifs == 'cistrome.xml':
        # Only for cistrome.xml, to use cistrome.cluster.
        known_motifs = seqpos_cluster_known_motifs(known_motifs, chip_regions,
                                                   width, opts.pval)
    elif known_motifs:
        for m in known_motifs:
            m.seqpos(chip_regions, width=width)
    print("completed seqpos stats...(time: %s)" % time.ctime())

    # Combine both known and new motifs.
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Without -m or -d there is nothing to report; bail out cleanly instead
    # of failing with an AttributeError on None below.
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    # CULL the results to see only the relevant results, and output.
    sig_motifs = all_motifs.cull(opts.pval, opts.maxmotif)

    # Filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    save_to_html_plain(output_dir, sig_motifs, opts.hcluster)

    # Serialise first, then write, so a failing to_json() leaves no
    # half-written file; the context manager closes the handle on any error.
    json_list = [t.to_json() for t in sig_motifs]
    with open(os.path.join(output_dir, 'motif_list.json'), 'w') as jsonf:
        jsonf.writelines(js + '\n' for js in json_list)
def main():
    """Run the MDSeqPos command-line pipeline.

    Parses ``sys.argv`` (options plus the BEDFILE and GENOME positionals),
    loads the regions into a ``ChipRegions`` object, gathers motifs from the
    known-motif databases (``-m``) and/or a de novo search (``-d``), runs
    the seqpos statistics on them, culls the result by p-value and maximum
    count, optionally filters by species, and finally writes the HTML
    report and ``motif_list.json`` into the output directory.

    Every ``print`` call takes a single pre-formatted string so that the
    function produces the same output on Python 2 and Python 3.

    Exits with status -1 (after printing the help) when fewer than two
    positional arguments are supplied, and with status 1 when no motif set
    is available (for example when neither ``-m`` nor ``-d`` was given).
    Malformed numeric option values are rejected by optparse itself.
    """
    # ALWAYS print out version info.
    print(mdseqpos.__version__)
    print("Library path: %s" % mdseqpos.__file__)
    print("")

    USAGE = (
        "USAGE: MDSeqPos.py [options] BEDFILE GENOME\n"
        "    BEDFILE - regions file\n"
        "    GENOME - assembly which the regions pertain to, e.g. 'hg18', "
        "'mm9', etc.\n"
        "             as defined in BUILD_DICT in lib/settings.py"
    )
    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-g', '--genome-dir', dest="genome_dir", default=None,
                      help="Path to the genome assembly dir")
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use "
                           "in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new "
                           "motifs found during adenovo search, e.g. "
                           "-n foo.xml (default: denovo.xml)")
    # Numeric options are typed so that optparse validates them up front and
    # reports a usage error instead of a ValueError traceback later on.
    parser.add_option('-p', '--pval', type='float', default=0.001,
                      help="pvalue cutoff for motif significance, "
                           "(default: 0.001)")
    parser.add_option('-s', '--species-list', default=None,
                      help="name of species to filter the results with--if "
                           "multuple species, comma-separate them, "
                           "e.g. hs,mm,dm")
    parser.add_option('-w', '--width', type='int', default=600,
                      help="width of the region to be scanned for motifs; "
                           "depends on resoution of assay, "
                           "(default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--hcluster', type='float', default=0.8,
                      help="The similarity cutoff for hierarchical clustering "
                           "of the result, (default: 0.8, The higher, the "
                           "more groups, 0 ~ 1)")
    parser.add_option('-c', '--cluster', default=False, action="store_true",
                      help="This option only for know-motifs cistrome.xml, "
                           "If you want to use pre-clustered database to "
                           "accelerate seqpos. default (not set): False")
    parser.add_option('--maxmotif', type='int', default=0,
                      help="maximum number of motifs to report, "
                           "(default: 0, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    # Parse the command line options.  The whole of sys.argv is handed over,
    # so args[0] is the script name and the positionals start at args[1].
    opts, args = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)
    bedfile_name, genome = args[1], args[2]
    _DEBUG = opts.verbose
    output_dir = opts.output_directory
    width = opts.width
    print(opts.genome_dir)

    # The XML and JSON outputs below are written straight into output_dir,
    # so make sure it exists before anything tries to write there.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # READ in the regions that are specified in the BED file.
    print("read regions start time: %s" % time.ctime())
    # HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome,
                               genome_dir=opts.genome_dir)
    print("read regions end time: %s" % time.ctime())

    # LOAD the motifs (both known and denovo).
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    # Run seqpos stats on all motifs.
    print("starting seqpos stats...(time: %s)" % time.ctime())
    if new_motifs:
        for m in new_motifs:
            m.seqpos(chip_regions, width=width)

    if opts.cluster and opts.known_motifs == 'cistrome.xml':
        # Only for cistrome.xml, to use cistrome.cluster.
        known_motifs = seqpos_cluster_known_motifs(known_motifs, chip_regions,
                                                   width, opts.pval)
    elif known_motifs:
        for m in known_motifs:
            m.seqpos(chip_regions, width=width)
    print("completed seqpos stats...(time: %s)" % time.ctime())

    # Combine both known and new motifs.
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Without -m or -d there is nothing to report; bail out cleanly instead
    # of failing with an AttributeError on None below.
    if all_motifs is None:
        print("No motifs found")
        sys.exit(1)

    # CULL the results to see only the relevant results, and output.
    sig_motifs = all_motifs.cull(opts.pval, opts.maxmotif)

    # Filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    save_to_html_plain(output_dir, sig_motifs, opts.hcluster)

    # Serialise first, then write, so a failing to_json() leaves no
    # half-written file; the context manager closes the handle on any error.
    json_list = [t.to_json() for t in sig_motifs]
    with open(os.path.join(output_dir, 'motif_list.json'), 'w') as jsonf:
        jsonf.writelines(js + '\n' for js in json_list)
def main():
    """Run the MDSeqPos command-line pipeline.

    Parses ``sys.argv`` (options plus the BEDFILE and GENOME positionals),
    strips double quotes from both positionals, converts backslashes to
    forward slashes in the output directory and genome arguments, creates
    the output directory if needed, and loads the regions into a
    ``ChipRegions`` object.  Motifs are gathered from the known-motif
    databases (``-m``) and/or a de novo search (``-d``), scored with seqpos,
    culled by p-value and maximum count, optionally filtered by species,
    and written out as a plain HTML report.

    Timing messages are printed only when ``-v`` is given.  Every ``print``
    call takes a single pre-formatted string so the output is the same on
    Python 2 and Python 3.

    Exits with status -1 (after printing the help) when fewer than two
    positional arguments are supplied, and with status 1 when no motif set
    is available.  Malformed numeric option values are rejected by optparse
    itself.
    """
    # ALWAYS print out version info.
    print(mdseqpos.__version__)

    # Help strings are joined with implicit concatenation; a backslash
    # continuation inside the literal would embed the source indentation in
    # the help text.
    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-d', '--denovo', default=False, action="store_true",
                      help="flag to run denovo motif search (default: False)")
    parser.add_option('-m', '--known-motifs', default=None,
                      help="comma separated list of known motifs dbs to use "
                           "in the motif search, e.g. -m pbm.xml,transfac.xml")
    parser.add_option('-n', '--new-motifs', default='denovo.xml',
                      help="name of the output XML file which stores new "
                           "motifs found during adenovo search, e.g. "
                           "-n foo.xml (default: denovo.xml)")
    # Numeric options are typed so that optparse validates them up front and
    # reports a usage error instead of a ValueError traceback later on.
    parser.add_option('-p', '--pval', type='float', default=0.001,
                      help="pvalue cutoff for motif significance, "
                           "(default: 0.001)")
    parser.add_option('-s', '--species-list', default=None,
                      help="name of species to filter the results with--if "
                           "multuple species, comma-separate them, "
                           "e.g. hs,mm,dm")
    parser.add_option('-w', '--width', type='int', default=600,
                      help="width of the region to be scanned for motifs; "
                           "depends on resoution of assay, "
                           "(default: 600 basepairs)")
    parser.add_option('-v', '--verbose', default=False, action="store_true",
                      help="flag to print debugging info (default: False)")
    parser.add_option('--maxmotif', type='int', default=-1,
                      help="maximum number of motifs to report, "
                           "(default: -1, i.e. no max)")
    parser.add_option('-O', '--output-directory', default="results",
                      help="output directory name (default: results)")

    # Parse the command line options.  The whole of sys.argv is handed over,
    # so args[0] is the script name and the positionals start at args[1].
    opts, args = parser.parse_args(sys.argv)
    if len(args) < 3:
        parser.print_help()
        sys.exit(-1)

    # Quoted bug fix: drop any double quotes that survived into the
    # positional arguments.
    bedfile_name = args[1].replace('"', '')
    genome = args[2].replace('"', '')

    _DEBUG = opts.verbose
    width = opts.width
    output_dir = opts.output_directory.replace('\\', '/')
    genome = genome.replace('\\', '/')

    # makedirs (rather than mkdir) so a nested output path also works.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # READ in the regions that are specified in the BED file.
    if _DEBUG:
        print("read regions start time: %s" % time.ctime())
    # HERE we should rely on a standard package to read in bed files; stub it
    chip_regions = ChipRegions(bedfile_name, genome)
    if _DEBUG:
        print("read regions end time: %s" % time.ctime())

    # LOAD the motifs (both known and denovo).
    known_motifs, new_motifs = None, None
    if opts.known_motifs:
        motif_dbs = [x.strip() for x in opts.known_motifs.split(',')]
        known_motifs = read_known_motifs(motif_dbs, _DEBUG)

    if opts.denovo:
        if _DEBUG:
            print("starting denovo search...(time: %s)" % time.ctime())
        new_motifs = chip_regions.mdmodule(width=width)
        if _DEBUG:
            print("completed denovo search...(time: %s)" % time.ctime())
        new_motifs.save_to_xml(os.path.join(output_dir, opts.new_motifs))

    # Combine both known and new motifs.
    if known_motifs and new_motifs:
        all_motifs = MotifList(known_motifs + new_motifs)
    elif known_motifs:
        all_motifs = known_motifs
    else:
        all_motifs = new_motifs

    # Identity test: avoids invoking any custom __eq__ on the motif list.
    if all_motifs is None:
        print("No motifs found")
        # sys.exit rather than the site-provided exit() helper, which is
        # intended for the interactive interpreter.
        sys.exit(1)

    # Run seqpos stats on all motifs.
    if _DEBUG:
        print("starting seqpos stats...(time: %s)" % time.ctime())
    for m in all_motifs:
        m.seqpos(chip_regions, width=width)
    if _DEBUG:
        print("completed seqpos stats...(time: %s)" % time.ctime())

    # CULL the results to see only the relevant results, and output.
    sig_motifs = all_motifs.cull(opts.pval, opts.maxmotif)

    # Filter by species?
    if opts.species_list:
        species_list = opts.species_list.split(',')
        sig_motifs = sig_motifs.filterBySpecies(species_list)

    dists = calc_motif_dist(sig_motifs)
    save_to_html_plain(output_dir, sig_motifs, dists)