def input_parser(filepath):
    """Read the experiment description from the config file at *filepath*.

    The "rep1", "rep2", "genome", "chrom_sizes", "inputs1" and "inputs2"
    blocks are fetched with ``get_data_block`` and passed through ``npath``.

    Returns a 5-tuple ``(bamfiles, genome, chrom_sizes, inputs, dims)``:

      - bamfiles -- list with the rep1 entries followed by the rep2 entries.
      - genome -- ``npath`` of the genome block, or None if the block is empty.
      - chrom_sizes -- ``npath`` of the chrom_sizes block.
      - inputs -- inputs1 entries followed by inputs2 entries, or None if
        both blocks are empty.
      - dims -- ``[len(rep1), len(rep2)]``.
    """
    # Build real lists. On Python 3 ``map`` returns a lazy iterator, which
    # supports neither ``len()`` nor ``+`` and is always truthy, so the
    # length, concatenation and emptiness checks below would all misbehave.
    bamfiles_1 = [npath(path) for path in get_data_block(filepath, "rep1")]
    bamfiles_2 = [npath(path) for path in get_data_block(filepath, "rep2")]

    # genome is optional, so if we get an empty block we set it to None,
    # otherwise we normalise the path
    genome = get_data_block(filepath, "genome")
    genome = npath(genome) if genome else None

    # the chrom sizes are not optional, but right now it's undefined what
    # happens if the user doesn't specify them, or specifies more than one.
    # NOTE(review): the block is passed to npath unconditionally (and a second
    # time when truthy); kept as-is because npath's handling of an empty or
    # multi-entry block is not visible here.
    chrom_sizes = npath(get_data_block(filepath, "chrom_sizes"))
    chrom_sizes = npath(chrom_sizes) if chrom_sizes else chrom_sizes

    inputs1 = [npath(path) for path in get_data_block(filepath, "inputs1")]
    inputs2 = [npath(path) for path in get_data_block(filepath, "inputs2")]

    dims = [len(bamfiles_1), len(bamfiles_2)]

    if not inputs1 and not inputs2:
        inputs = None
    else:
        inputs = inputs1 + inputs2

    return bamfiles_1 + bamfiles_2, genome, chrom_sizes, inputs, dims
def read_enrichment(self, enrichment_files, threshold=1):
    """
    Reads current output of motif enrichment analysis to get gene targets.

    For every matched file, the name of the directory containing it is used as
    the condition name. The condition is appended to self.conditions, the
    motif -> genes mapping of the file is stored in self.networks[condition],
    and the p-value of every known motif is stored in
    self.motifs_enrichment[motif][condition].

    Each file is read as a tab-separated table with one header row; the motif
    name is taken from column 0, the p-value from column 2 and a
    comma-separated gene list from column 9. Motifs that are not in
    self.motifs_map are reported on stdout and skipped.

    *Keyword arguments:*

      - enrichment_files -- One string, or a list of strings, representing enrichment file paths
        (glob patterns are expanded).
      - threshold -- P-value threshold for motif acceptance. Only motifs with a p-value
        not above it enter the network; p-values are recorded regardless.
    """
    if not isinstance(enrichment_files, list):
        enrichment_files = [enrichment_files]

    file_list = [filename
                 for pattern in enrichment_files
                 for filename in glob.glob(npath(pattern))]

    # reading networks
    for filename in file_list:
        # use last dir name as name for condition
        condition = os.path.dirname(filename).split("/")[-1]
        self.conditions.append(condition)

        network = {}

        # the context manager closes the file even if a row fails to parse
        with open(filename, "r") as f:
            # skip header (the default keeps an empty file from raising StopIteration)
            next(f, None)

            for line in f:
                values = line.strip("\n").split("\t")
                motif = values[0]

                if motif not in self.motifs_map:
                    print("motif not found: " + motif)
                    continue

                p_value = float(values[2])
                genes = values[9].split(",")

                if threshold >= p_value:
                    network[motif] = genes

                self.motifs_enrichment.setdefault(motif, {})[condition] = p_value

        self.networks[condition] = network
def write_enrichment(self, out_file, threshold=1):
    """
    Writes enrichment table for network generation.

    The table is tab-separated. The header row has an empty first cell followed
    by the names in self.conditions. Each following row starts with
    "<motif>|<gene names joined by '|'>" and then holds one p-value per
    condition, with "1" written for conditions in which the motif has no
    recorded p-value. A motif is only written if at least one of its recorded
    p-values is <= threshold, it is present in self.motifs_map, and its
    annotation has gene names.

    *Keyword arguments:*

      - out_file -- Output file name.
      - threshold -- P-value threshold for motif acceptance.
    """
    # The original never closed this handle, so buffered rows could be lost;
    # the context manager guarantees flush and close.
    with open(npath(out_file), "w") as f:
        f.write("\t" + ("\t".join(self.conditions)) + "\n")

        for motif, values in self.motifs_enrichment.items():
            accepted = False
            p_values = []

            for condition in self.conditions:
                if condition in values:
                    pvalue = values[condition]
                    p_values.append(str(pvalue))
                    if pvalue <= threshold:
                        accepted = True
                else:
                    # no measurement for this condition
                    p_values.append("1")

            if accepted and (motif in self.motifs_map) and self.motifs_map[motif].gene_names:
                genes = "|".join(self.motifs_map[motif].gene_names)
                f.write(motif + "|" + genes + "\t" + ("\t".join(p_values)) + "\n")
def merge_output(bamfiles, dims, options, no_bw_files, chrom_sizes):
    """Produce one '<name>-s<sig>-rep<rep>.bw' file per entry of *bamfiles*.

    For index i, the signal number is 1 while i < dims[0] and 2 afterwards;
    the replicate number restarts at 0 for the second signal. The source files
    are '<name>-<j>-s<sig>-rep<rep>.bw' for every j in *no_bw_files*.

    If there are more entries in *no_bw_files* than in *bamfiles*, the existing
    source files are merged with the external tools bigWigMerge, sort and
    bedGraphToBigWig (the latter is given *chrom_sizes*), and the source and
    temporary files are removed afterwards. Otherwise each source file is
    renamed to the target with ``mv``.

    NOTE: the commands are executed as shell strings through os.system, so
    paths containing whitespace or shell metacharacters are not supported.
    """
    for i in range(len(bamfiles)):
        first_signal = i < dims[0]
        rep = i if first_signal else i - dims[0]
        sig = 1 if first_signal else 2

        suffix = '-s%s-rep%s' % (sig, rep)
        target = options.name + suffix + '.bw'
        temp_bed = npath(options.name + suffix + '_temp.bed')
        sorted_bed = temp_bed + '.sort'

        files = [options.name + '-' + str(j) + suffix + '.bw' for j in no_bw_files]

        if len(no_bw_files) > len(bamfiles):
            # A real list is needed: on Python 3 ``filter`` returns an iterator
            # that cannot be concatenated to a list nor iterated twice.
            files = [name for name in files if isfile(name)]

            os.system(" ".join(['bigWigMerge'] + files + [temp_bed]))

            os.system("LC_COLLATE=C sort -k1,1 -k2,2n " + temp_bed + ' > ' + sorted_bed)

            os.system(" ".join(['bedGraphToBigWig', sorted_bed, chrom_sizes, target]))

            for name in files:
                os.remove(name)
            os.remove(temp_bed)
            os.remove(sorted_bed)
        else:
            # Every source file is moved onto the same target name; a separate
            # loop variable avoids clobbering the outer index ``i``.
            targets = [target for _ in no_bw_files]
            for source, destination in zip(files, targets):
                os.system(" ".join(['mv', source, destination]))
def write_enrichment(self, out_file, threshold=1):
    """
    Writes enrichment table for network generation.

    The table is tab-separated. The header row has an empty first cell followed
    by the names in self.conditions. Each following row starts with
    "<motif>|<gene names joined by '|'>" and then holds one p-value per
    condition, with "1" written for conditions in which the motif has no
    recorded p-value. A motif is only written if at least one of its recorded
    p-values is <= threshold, it is present in self.motifs_map, and its
    annotation has gene names.

    *Keyword arguments:*

      - out_file -- Output file name.
      - threshold -- P-value threshold for motif acceptance.
    """
    # The original never closed this handle, so buffered rows could be lost;
    # the context manager guarantees flush and close.
    with open(npath(out_file), "w") as f:
        f.write("\t" + ("\t".join(self.conditions)) + "\n")

        for motif, values in self.motifs_enrichment.items():
            accepted = False
            p_values = []

            for condition in self.conditions:
                if condition in values:
                    pvalue = values[condition]
                    p_values.append(str(pvalue))
                    if pvalue <= threshold:
                        accepted = True
                else:
                    # no measurement for this condition
                    p_values.append("1")

            if accepted and (motif in self.motifs_map) and self.motifs_map[motif].gene_names:
                genes = "|".join(self.motifs_map[motif].gene_names)
                f.write(motif + "|" + genes + "\t" + ("\t".join(p_values)) + "\n")
def read_mtf(self, mtf_filenames):
    """
    Reads TF annotation in mtf (internal format; check manual) format.

    Every non-blank line is read as ten tab-separated columns: id, name,
    version, gene names ('+'-separated), class, UniProt ids (';'-separated),
    data source, taxonomic group, species, and six comma-separated thresholds.
    The thresholds are keyed by the false positive rates 0.005, 0.001, 0.0005,
    0.0001, 0.00005 and 0.00001, in that order. The database name is the file
    name without directory and extension. One MotifAnnotation per line is
    passed to self.add.

    *Keyword arguments:*

      - mtf_filenames -- A string, or a list of strings, representing .mtf file paths
        (glob patterns are expanded).
    """
    if not isinstance(mtf_filenames, list):
        mtf_filenames = [mtf_filenames]

    file_list = [filename
                 for pattern in mtf_filenames
                 for filename in glob.glob(npath(pattern))]

    # constant for every line, so it is built only once
    fpr_list = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]

    # Iterating over the file name list
    for filename in file_list:

        database = os.path.splitext(os.path.basename(filename))[0]

        # the context manager closes the file even if a line is malformed
        with open(filename, "r") as mtf_file:

            # Reading file
            for line in mtf_file:
                stripped = line.strip()

                # blank lines (e.g. a trailing newline) carry no annotation
                if not stripped:
                    continue

                # Processing line
                line_list = stripped.split("\t")
                tf_id = line_list[0].strip()
                name = line_list[1].strip()
                version = line_list[2].strip()
                gene_names = line_list[3].strip().split("+")
                tf_class = line_list[4].strip()
                uniprot_ids = line_list[5].strip().split(";")
                data_source = line_list[6].strip()
                tax_group = line_list[7].strip()
                species = line_list[8].strip()
                threshold_list = line_list[9].strip().split(",")

                # indexing (rather than zip) keeps the IndexError for lines
                # that list fewer than six thresholds
                thresholds = {fpr: float(threshold_list[i]) for i, fpr in enumerate(fpr_list)}

                self.add(MotifAnnotation(tf_id, name, database, version, gene_names, tf_class, uniprot_ids,
                                         data_source, tax_group, species, thresholds))
def read_enrichment(self, enrichment_files, threshold=1):
    """
    Reads current output of motif enrichment analysis to get gene targets.

    For every matched file, the name of the directory containing it is used as
    the condition name. The condition is appended to self.conditions, the
    motif -> genes mapping of the file is stored in self.networks[condition],
    and the p-value of every known motif is stored in
    self.motifs_enrichment[motif][condition].

    Each file is read as a tab-separated table with one header row; the motif
    name is taken from column 0, the p-value from column 2 and a
    comma-separated gene list from column 9. Motifs that are not in
    self.motifs_map are reported on stdout and skipped.

    *Keyword arguments:*

      - enrichment_files -- One string, or a list of strings, representing enrichment file paths
        (glob patterns are expanded).
      - threshold -- P-value threshold for motif acceptance. Only motifs with a p-value
        not above it enter the network; p-values are recorded regardless.
    """
    if not isinstance(enrichment_files, list):
        enrichment_files = [enrichment_files]

    file_list = [filename
                 for pattern in enrichment_files
                 for filename in glob.glob(npath(pattern))]

    # reading networks
    for filename in file_list:
        # use last dir name as name for condition
        condition = os.path.dirname(filename).split("/")[-1]
        self.conditions.append(condition)

        network = {}

        # the context manager closes the file even if a row fails to parse
        with open(filename, "r") as f:
            # skip header (the default keeps an empty file from raising StopIteration)
            next(f, None)

            for line in f:
                values = line.strip("\n").split("\t")
                motif = values[0]

                if motif not in self.motifs_map:
                    print("motif not found: " + motif)
                    continue

                p_value = float(values[2])
                genes = values[9].split(",")

                if threshold >= p_value:
                    network[motif] = genes

                self.motifs_enrichment.setdefault(motif, {})[condition] = p_value

        self.networks[condition] = network
def read_mtf(self, mtf_filenames):
    """
    Reads TF annotation in mtf (internal format; check manual) format.

    Every non-blank line is read as ten tab-separated columns: id, name,
    version, gene names ('+'-separated), class, UniProt ids (';'-separated),
    data source, taxonomic group, species, and six comma-separated thresholds.
    The thresholds are keyed by the false positive rates 0.005, 0.001, 0.0005,
    0.0001, 0.00005 and 0.00001, in that order. The database name is the file
    name without directory and extension. One MotifAnnotation per line is
    passed to self.add.

    *Keyword arguments:*

      - mtf_filenames -- A string, or a list of strings, representing .mtf file paths
        (glob patterns are expanded).
    """
    if not isinstance(mtf_filenames, list):
        mtf_filenames = [mtf_filenames]

    file_list = [filename
                 for pattern in mtf_filenames
                 for filename in glob.glob(npath(pattern))]

    # constant for every line, so it is built only once
    fpr_list = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]

    # Iterating over the file name list
    for filename in file_list:

        database = os.path.splitext(os.path.basename(filename))[0]

        # the context manager closes the file even if a line is malformed
        with open(filename, "r") as mtf_file:

            # Reading file
            for line in mtf_file:
                stripped = line.strip()

                # blank lines (e.g. a trailing newline) carry no annotation
                if not stripped:
                    continue

                # Processing line
                line_list = stripped.split("\t")
                tf_id = line_list[0].strip()
                name = line_list[1].strip()
                version = line_list[2].strip()
                gene_names = line_list[3].strip().split("+")
                tf_class = line_list[4].strip()
                uniprot_ids = line_list[5].strip().split(";")
                data_source = line_list[6].strip()
                tax_group = line_list[7].strip()
                species = line_list[8].strip()
                threshold_list = line_list[9].strip().split(",")

                # indexing (rather than zip) keeps the IndexError for lines
                # that list fewer than six thresholds
                thresholds = {fpr: float(threshold_list[i]) for i, fpr in enumerate(fpr_list)}

                self.add(MotifAnnotation(tf_id, name, database, version, gene_names, tf_class, uniprot_ids,
                                         data_source, tax_group, species, thresholds))
def merge_output(bamfiles, dims, options, no_bw_files, chrom_sizes):
    """Produce one '<name>-s<sig>-rep<rep>.bw' file per entry of *bamfiles*.

    For index i, the signal number is 1 while i < dims[0] and 2 afterwards;
    the replicate number restarts at 0 for the second signal. The source files
    are '<name>-<j>-s<sig>-rep<rep>.bw' for every j in *no_bw_files*.

    If there are more entries in *no_bw_files* than in *bamfiles*, the existing
    source files are merged with the external tools bigWigMerge, sort and
    bedGraphToBigWig (the latter is given *chrom_sizes*), and the source and
    temporary files are removed afterwards. Otherwise each source file is
    renamed to the target with ``mv``.

    NOTE: the commands are executed as shell strings through os.system, so
    paths containing whitespace or shell metacharacters are not supported.
    """
    for i in range(len(bamfiles)):
        first_signal = i < dims[0]
        rep = i if first_signal else i - dims[0]
        sig = 1 if first_signal else 2

        suffix = '-s%s-rep%s' % (sig, rep)
        target = options.name + suffix + '.bw'
        temp_bed = npath(options.name + suffix + '_temp.bed')
        sorted_bed = temp_bed + '.sort'

        files = [options.name + '-' + str(j) + suffix + '.bw' for j in no_bw_files]

        if len(no_bw_files) > len(bamfiles):
            # A real list is needed: on Python 3 ``filter`` returns an iterator
            # that cannot be concatenated to a list nor iterated twice.
            files = [name for name in files if isfile(name)]

            os.system(" ".join(['bigWigMerge'] + files + [temp_bed]))

            os.system("LC_COLLATE=C sort -k1,1 -k2,2n " + temp_bed + ' > ' + sorted_bed)

            os.system(" ".join(['bedGraphToBigWig', sorted_bed, chrom_sizes, target]))

            for name in files:
                os.remove(name)
            os.remove(temp_bed)
            os.remove(sorted_bed)
        else:
            # Every source file is moved onto the same target name; a separate
            # loop variable avoids clobbering the outer index ``i``.
            targets = [target for _ in no_bw_files]
            for source, destination in zip(files, targets):
                os.system(" ".join(['mv', source, destination]))
def read_mtf(self, mtf_filenames):
    """
    Reads TF annotation in mtf (internal format; check manual) format.

    Every non-blank line is read as tab-separated columns: id, name, database,
    version (integer), gene names ('+'-separated), class, UniProt ids
    (';'-separated) and, if an eighth column is present, the data source
    (otherwise the data source is the empty string). One MotifAnnotation per
    line is passed to self.add.

    *Keyword arguments:*

      - mtf_filenames -- A string, or a list of strings, representing .mtf file paths
        (glob patterns are expanded).
    """
    if not isinstance(mtf_filenames, list):
        mtf_filenames = [mtf_filenames]

    file_list = [filename
                 for pattern in mtf_filenames
                 for filename in glob.glob(npath(pattern))]

    # Iterating over the file name list
    for filename in file_list:

        # the context manager closes the file even if a line is malformed
        with open(filename, "r") as mtf_file:

            # Reading file
            for line in mtf_file:
                stripped = line.strip()

                # blank lines (e.g. a trailing newline) carry no annotation
                if not stripped:
                    continue

                # Processing line
                line_list = stripped.split("\t")
                tf_id = line_list[0].strip()
                name = line_list[1].strip()
                database = line_list[2].strip()
                version = int(line_list[3].strip())
                gene_names = line_list[4].strip().split("+")
                tf_class = line_list[5].strip()
                uniprot_ids = line_list[6].strip().split(";")
                # the data source column is optional
                data_source = line_list[7].strip() if len(line_list) > 7 else ""

                self.add(MotifAnnotation(tf_id, name, database, version, gene_names, tf_class, uniprot_ids,
                                         data_source))
def write_network(self, targets, out_path, threshold=1):
    """
    If enrichment information has been loaded before (via read_enrichment), this function creates
    a cytoscape-compatible network into the output folder.

    Three kinds of files are written into out_path:

      - "pvalue_table_<threshold * 100>.txt", produced by write_enrichment.
      - "<network>_targets.txt" for every network in self.networks: one
        "tf<TAB>target<TAB>active|inactive" row per (tf, target) pair found in any network,
        marked active if the pair occurs in this network.
      - "<network>_genes.txt" for every network: one row per gene seen in any pair, labelled
        "tf_active" (TF in this network), "tf_inactive" (TF only in another network) or "target".

    Motifs of the gene -> motif mapping that are missing from a network are reported on stdout.

    *Keyword arguments:*

      - targets -- Gene targets. If empty/None, every target is kept; otherwise only pairs whose
        target is contained in it.
      - out_path -- Output path.
      - threshold -- Threshold for motif acceptance.
    """
    self.write_enrichment(out_path + "/pvalue_table_" + str(threshold * 100) + ".txt", threshold)

    out_path = npath(out_path)

    _, genes_motifs = self.get_mappings(key_type="gene_names")

    net_pairs = {}
    net_tfs = {}
    all_pairs = set()
    all_tfs = set()
    all_genes = set()
    filter_targets = bool(targets)

    # using genes to motif mapping to get network in all conditions
    for net_name, net in self.networks.items():
        pairs = set()
        tfs = set()
        net_pairs[net_name] = pairs
        net_tfs[net_name] = tfs

        for tf, motifs in genes_motifs.items():
            for m in motifs:
                if m not in net:
                    print("motif not in network: " + m + " " + str(tf) + " ")
                    continue

                for target in net[m]:
                    if not filter_targets or (target in targets):
                        pairs.add((tf, target))
                        tfs.add(tf)
                        all_genes.add(tf)
                        all_genes.add(target)

        all_pairs = all_pairs.union(pairs)
        all_tfs = all_tfs.union(tfs)

    # printing out network
    for net_name, pairs_aux in net_pairs.items():
        # context managers close the files even if a write fails
        with open(out_path + "/" + net_name + "_targets.txt", "w") as f:
            for pair in all_pairs:
                # check if pair is active in the network
                state = "active" if pair in pairs_aux else "inactive"
                f.write(pair[0] + "\t" + pair[1] + "\t" + state + "\n")

        with open(out_path + "/" + net_name + "_genes.txt", "w") as f:
            for gene in all_genes:
                # check if gene is tf active in network
                if gene in net_tfs[net_name]:
                    f.write(gene + "\ttf_active\n")
                elif gene in all_tfs:
                    f.write(gene + "\ttf_inactive\n")
                else:
                    f.write(gene + "\ttarget\n")
def handle_input():
    """Parse and validate the command line, and read the config file.

    Builds the option parser, parses ``sys.argv``, reads the config file given
    as the single positional argument with ``input_parser`` and checks that
    the referenced files exist and that list-valued options have the expected
    number of entries. Validation failures end the program via
    ``parser.error``; ``--version`` prints the version and exits.

    Side effects: creates the output directory (and, with ``--report``, the
    report folders) and sets the module globals FOLDER_REPORT,
    FOLDER_REPORT_PICS, FOLDER_REPORT_DATA, OUTPUTDIR and NAME.

    Returns the tuple ``(options, bamfiles, genome, chrom_sizes, dims, inputs)``.
    ``options.name`` is returned joined with the output directory.
    """
    global FOLDER_REPORT
    global FOLDER_REPORT_PICS
    global FOLDER_REPORT_DATA
    global OUTPUTDIR
    global NAME

    parser = HelpfulOptionParser(usage=__doc__)

    parser.add_option("-n", "--name", default=None, dest="name", type="string",
                      help="Experiment's name and prefix for all files that are created.")
    parser.add_option("-m", "--merge", default=False, dest="merge", action="store_true",
                      help="Merge peaks which have a distance less than the estimated mean fragment size "
                           "(recommended for histone data). [default: do not merge]")
    parser.add_option("--housekeeping-genes", default=None, dest="housekeeping_genes", type="str",
                      help="Define housekeeping genes (BED format) used for normalizing. [default: %default]")
    parser.add_option("--output-dir", dest="outputdir", default=None, type="string",
                      help="Store files in output directory. [default: %default]")
    parser.add_option("--report", dest="report", default=False, action="store_true",
                      help="Generate HTML report about experiment. [default: %default]")
    parser.add_option("--deadzones", dest="deadzones", default=None,
                      help="Define blacklisted genomic regions avoided for analysis (BED format). [default: %default]")
    parser.add_option("--no-correction", default=False, dest="no_correction", action="store_true",
                      help="Do not use multipe test correction for p-values (Benjamini/Hochberg). [default: %default]")
    parser.add_option("-p", "--pvalue", dest="pcutoff", default=0.1, type="float",
                      help="P-value cutoff for peak detection. Call only peaks with p-value lower than cutoff. "
                           "[default: %default]")
    parser.add_option("--exts", default=None, dest="exts", type="str", action='callback', callback=_callback_list,
                      help="Read's extension size for BAM files (comma separated list for each BAM file in config "
                           "file). If option is not chosen, estimate extension sizes. [default: %default]")
    parser.add_option("--factors-inputs", default=None, dest="factors_inputs", type="str", action="callback",
                      callback=_callback_list_float,
                      help="Normalization factors for input-DNA (comma separated list for each BAM file in config "
                           "file). If option is not chosen, estimate factors. [default: %default]")
    parser.add_option("--scaling-factors", default=None, dest="scaling_factors_ip", type="str", action='callback',
                      callback=_callback_list_float,
                      help="Scaling factor for each BAM file (not control input-DNA) as comma separated list for "
                           "each BAM file in config file. If option is not chosen, follow normalization strategy "
                           "(TMM or HK approach) [default: %default]")
    parser.add_option("--save-input", dest="save_input", default=False, action="store_true",
                      help="Save input-DNA file if available. [default: %default]")
    parser.add_option("--version", dest="version", default=False, action="store_true",
                      help="Show script's version.")

    group = OptionGroup(parser, "Advanced options")
    group.add_option("--regions", dest="regions", default=None, type="string",
                     help="Define regions (BED format) to restrict the analysis, that is, where to train the HMM and "
                          "search for DPs. It is faster, but less precise.")
    group.add_option("-b", "--binsize", dest="binsize", default=100, type="int",
                     help="Size of underlying bins for creating the signal. [default: %default]")
    group.add_option("-s", "--step", dest="stepsize", default=50, type="int",
                     help="Stepsize with which the window consecutively slides across the genome to create the "
                          "signal. [default: %default]")
    group.add_option("--debug", default=False, dest="debug", action="store_true",
                     help="Output debug information. Warning: space consuming! [default: %default]")
    group.add_option("--no-gc-content", dest="no_gc_content", default=False, action="store_true",
                     help="Do not normalize towards GC content. [default: %default]")
    group.add_option("--norm-regions", default=None, dest="norm_regions", type="str",
                     help="Restrict normalization to particular regions (BED format). [default: %default]")
    group.add_option("-f", "--foldchange", dest="foldchange", default=1.6, type="float",
                     help="Fold change parameter to define training set (t_1, see paper). [default: %default]")
    group.add_option("-t", "--threshold", dest="threshold", default=95, type="float",
                     help="Minimum signal support for differential peaks to define training set as percentage "
                          "(t_2, see paper). [default: %default]")
    group.add_option("--size", dest="size_ts", default=10000, type="int",
                     help="Number of bins the HMM's training set constists of. [default: %default]")
    group.add_option("--par", dest="par", default=1, type="int",
                     help="Percentile for p-value postprocessing filter. [default: %default]")
    group.add_option("--poisson", default=False, dest="poisson", action="store_true",
                     help="Use binomial distribution as emmission. [default: %default]")
    group.add_option("--single-strand", default=False, dest="singlestrand", action="store_true",
                     help="Allow single strand BAM file as input. [default: %default]")
    group.add_option("--m_threshold", default=80, dest="m_threshold", type="int",
                     help="Define the M threshold of percentile for training TMM. [default: %default]")
    group.add_option("--a_threshold", default=95, dest="a_threshold", type="int",
                     help="Define the A threshold of percentile for training TMM. [default: %default]")
    group.add_option("--rmdup", default=False, dest="rmdup", action="store_true",
                     help="Remove the duplicate reads [default: %default]")
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    # settings without a command line switch
    options.save_wig = False
    options.verbose = False
    options.hmm_free_para = False
    # There is no switch for input extension sizes either, so the value is
    # always the empty list (the former length check on it could never run).
    options.exts_inputs = []

    if options.version:
        print("")
        print(__version__)
        sys.exit()

    if len(args) != 1:
        parser.error("Please give config file")

    config_path = npath(args[0])

    if not isfile(config_path):
        parser.error("Config file %s does not exist!" % config_path)

    bamfiles, genome, chrom_sizes, inputs, dims = input_parser(config_path)

    # without a genome there is nothing to compute the GC content from
    if not genome:
        options.no_gc_content = True

    if options.exts and len(options.exts) != len(bamfiles):
        parser.error("Number of Extension Sizes must equal number of bamfiles")

    if options.scaling_factors_ip and len(options.scaling_factors_ip) != len(bamfiles):
        parser.error("Number of scaling factors for IP must equal number of bamfiles")

    for bamfile in bamfiles:
        if not isfile(bamfile):
            parser.error("BAM file %s does not exist!" % bamfile)

    if not inputs and options.factors_inputs:
        print("As no input-DNA, do not use input-DNA factors", file=sys.stderr)
        options.factors_inputs = None

    if options.factors_inputs and len(options.factors_inputs) != len(bamfiles):
        parser.error("factors for input-DNA must equal number of BAM files!")

    if inputs:
        for bamfile in inputs:
            if not isfile(bamfile):
                parser.error("BAM file %s does not exist!" % bamfile)

    if options.regions:
        if not isfile(options.regions):
            parser.error("Region file %s does not exist!" % options.regions)

    if genome and not isfile(genome):
        parser.error("Genome file %s does not exist!" % genome)

    if options.name is None:
        # Explicit format instead of slicing str(datetime.now()): that string
        # has no fractional part when the microseconds are zero, in which case
        # dropping "the last field" removed the seconds.
        options.name = "THOR-exp" + "-" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    if not all(which(tool) for tool in ("wigToBigWig", "bedGraphToBigWig", "bigWigMerge")):
        print("Warning: wigToBigWig, bigWigMerge or bedGraphToBigWig not found! Signal will not be stored!",
              file=sys.stderr)

    if options.outputdir:
        options.outputdir = npath(options.outputdir)
        if isdir(options.outputdir) and any(
                entry.startswith(options.name) for entry in os.listdir(options.outputdir)):
            parser.error("Output directory exists and contains files with names starting with your chosen experiment "
                         "name! Do nothing to prevent file overwriting!")
        if not exists(options.outputdir):
            # makedirs also creates missing parent directories
            os.makedirs(options.outputdir)
    else:
        options.outputdir = os.getcwd()

    options.name = join(options.outputdir, options.name)

    report_dirname = 'report_' + basename(options.name)
    # the trailing slashes are part of the published global values
    folder_report = join(options.outputdir, report_dirname + "/")
    folder_report_pics = join(options.outputdir, report_dirname, 'pics/')
    folder_report_data = join(options.outputdir, report_dirname, 'pics/data/')

    if isdir(join(options.outputdir, report_dirname)):
        parser.error("Folder '" + report_dirname + "' already exits in output directory!"
                     "Do nothing to prevent file overwriting! "
                     "Please rename report folder or change working directory of THOR with the option --output-dir")

    if options.report:
        # the report folder is known not to exist here, so creating the
        # deepest folder creates report/, report/pics/ and report/pics/data/
        os.makedirs(folder_report_data)

    # NOTE(review): SOURCE is whitespace-collapsed, so it cannot be told whether
    # these assignments were guarded by --report; they are made unconditionally
    # because OUTPUTDIR and NAME do not depend on the report — confirm.
    FOLDER_REPORT = folder_report
    FOLDER_REPORT_PICS = folder_report_pics
    FOLDER_REPORT_DATA = folder_report_data
    OUTPUTDIR = options.outputdir
    NAME = options.name

    if not inputs:
        print("Warning: Do not compute GC-content, as there is no input file", file=sys.stderr)

    if not genome:
        print("Warning: Do not compute GC-content, as there is no genome file", file=sys.stderr)

    if options.exts is None:
        options.exts = []

    return options, bamfiles, genome, chrom_sizes, dims, inputs
def write_network(self, targets, out_path, threshold=1):
    """
    If enrichment information has been loaded before (via read_enrichment), this function creates
    a cytoscape-compatible network into the output folder.

    Three kinds of files are written into out_path:

      - "pvalue_table_<threshold * 100>.txt", produced by write_enrichment.
      - "<network>_targets.txt" for every network in self.networks: one
        "tf<TAB>target<TAB>active|inactive" row per (tf, target) pair found in any network,
        marked active if the pair occurs in this network.
      - "<network>_genes.txt" for every network: one row per gene seen in any pair, labelled
        "tf_active" (TF in this network), "tf_inactive" (TF only in another network) or "target".

    Motifs of the gene -> motif mapping that are missing from a network are reported on stdout.

    *Keyword arguments:*

      - targets -- Gene targets. If empty/None, every target is kept; otherwise only pairs whose
        target is contained in it.
      - out_path -- Output path.
      - threshold -- Threshold for motif acceptance.
    """
    self.write_enrichment(out_path + "/pvalue_table_" + str(threshold * 100) + ".txt", threshold)

    out_path = npath(out_path)

    _, genes_motifs = self.get_mappings(key_type="gene_names")

    net_pairs = {}
    net_tfs = {}
    all_pairs = set()
    all_tfs = set()
    all_genes = set()
    filter_targets = bool(targets)

    # using genes to motif mapping to get network in all conditions
    for net_name, net in self.networks.items():
        pairs = set()
        tfs = set()
        net_pairs[net_name] = pairs
        net_tfs[net_name] = tfs

        for tf, motifs in genes_motifs.items():
            for m in motifs:
                if m not in net:
                    print("motif not in network: " + m + " " + str(tf) + " ")
                    continue

                for target in net[m]:
                    if not filter_targets or (target in targets):
                        pairs.add((tf, target))
                        tfs.add(tf)
                        all_genes.add(tf)
                        all_genes.add(target)

        all_pairs = all_pairs.union(pairs)
        all_tfs = all_tfs.union(tfs)

    # printing out network
    for net_name, pairs_aux in net_pairs.items():
        # context managers close the files even if a write fails
        with open(out_path + "/" + net_name + "_targets.txt", "w") as f:
            for pair in all_pairs:
                # check if pair is active in the network
                state = "active" if pair in pairs_aux else "inactive"
                f.write(pair[0] + "\t" + pair[1] + "\t" + state + "\n")

        with open(out_path + "/" + net_name + "_genes.txt", "w") as f:
            for gene in all_genes:
                # check if gene is tf active in network
                if gene in net_tfs[net_name]:
                    f.write(gene + "\ttf_active\n")
                elif gene in all_tfs:
                    f.write(gene + "\ttf_inactive\n")
                else:
                    f.write(gene + "\ttarget\n")
parser.add_argument('-f', '--input-format', choices=['jaspar-2014', 'jaspar-2016', 'hocomoco-pcm'], type=str, required=True, help='format of the input file') parser.add_argument('-o', '--output-folder', type=str, required=True, help='name of output Folder') args = parser.parse_args() # read the input file with open(npath(args.input_file), "r") as f: content = f.readlines() n_lines = len(content) output_folder = npath(args.output_folder) # make output directory path, if it doesn't exist os.makedirs(output_folder) ################################################################################################### # JASPAR 2014 ################################################################################################### if args.input_format == "jaspar-2014": for i in range(n_lines / 5):
def handle_input():
    """Parse and validate the command line, and read the config file.

    Builds the option parser, parses ``sys.argv``, reads the config file given
    as the single positional argument with ``input_parser`` and checks that
    the referenced files exist and that list-valued options have the expected
    number of entries. Validation failures end the program via
    ``parser.error``; ``--version`` prints the version and exits.

    Side effects: creates the output directory (and, with ``--report``, the
    report folders) and sets the module globals FOLDER_REPORT,
    FOLDER_REPORT_PICS, FOLDER_REPORT_DATA, OUTPUTDIR and NAME.

    Returns the tuple ``(options, bamfiles, genome, chrom_sizes, dims, inputs)``.
    ``options.name`` is returned joined with the output directory.
    """
    global FOLDER_REPORT
    global FOLDER_REPORT_PICS
    global FOLDER_REPORT_DATA
    global OUTPUTDIR
    global NAME

    parser = HelpfulOptionParser(usage=__doc__)

    parser.add_option("-n", "--name", default=None, dest="name", type="string",
                      help="Experiment's name and prefix for all files that are created.")
    parser.add_option("-m", "--merge", default=False, dest="merge", action="store_true",
                      help="Merge peaks which have a distance less than the estimated mean fragment size "
                           "(recommended for histone data). [default: do not merge]")
    parser.add_option("--housekeeping-genes", default=None, dest="housekeeping_genes", type="str",
                      help="Define housekeeping genes (BED format) used for normalizing. [default: %default]")
    parser.add_option("--output-dir", dest="outputdir", default=None, type="string",
                      help="Store files in output directory. [default: %default]")
    parser.add_option("--report", dest="report", default=False, action="store_true",
                      help="Generate HTML report about experiment. [default: %default]")
    parser.add_option("--deadzones", dest="deadzones", default=None,
                      help="Define blacklisted genomic regions avoided for analysis (BED format). [default: %default]")
    parser.add_option("--no-correction", default=False, dest="no_correction", action="store_true",
                      help="Do not use multipe test correction for p-values (Benjamini/Hochberg). [default: %default]")
    parser.add_option("-p", "--pvalue", dest="pcutoff", default=0.1, type="float",
                      help="P-value cutoff for peak detection. Call only peaks with p-value lower than cutoff. "
                           "[default: %default]")
    parser.add_option("--exts", default=None, dest="exts", type="str", action='callback', callback=_callback_list,
                      help="Read's extension size for BAM files (comma separated list for each BAM file in config "
                           "file). If option is not chosen, estimate extension sizes. [default: %default]")
    parser.add_option("--factors-inputs", default=None, dest="factors_inputs", type="str", action="callback",
                      callback=_callback_list_float,
                      help="Normalization factors for input-DNA (comma separated list for each BAM file in config "
                           "file). If option is not chosen, estimate factors. [default: %default]")
    parser.add_option("--scaling-factors", default=None, dest="scaling_factors_ip", type="str", action='callback',
                      callback=_callback_list_float,
                      help="Scaling factor for each BAM file (not control input-DNA) as comma separated list for "
                           "each BAM file in config file. If option is not chosen, follow normalization strategy "
                           "(TMM or HK approach) [default: %default]")
    parser.add_option("--save-input", dest="save_input", default=False, action="store_true",
                      help="Save input-DNA file if available. [default: %default]")
    parser.add_option("--version", dest="version", default=False, action="store_true",
                      help="Show script's version.")

    group = OptionGroup(parser, "Advanced options")
    group.add_option("--regions", dest="regions", default=None, type="string",
                     help="Define regions (BED format) to restrict the analysis, that is, where to train the HMM and "
                          "search for DPs. It is faster, but less precise.")
    group.add_option("-b", "--binsize", dest="binsize", default=100, type="int",
                     help="Size of underlying bins for creating the signal. [default: %default]")
    group.add_option("-s", "--step", dest="stepsize", default=50, type="int",
                     help="Stepsize with which the window consecutively slides across the genome to create the "
                          "signal. [default: %default]")
    group.add_option("--debug", default=False, dest="debug", action="store_true",
                     help="Output debug information. Warning: space consuming! [default: %default]")
    group.add_option("--no-gc-content", dest="no_gc_content", default=False, action="store_true",
                     help="Do not normalize towards GC content. [default: %default]")
    group.add_option("--norm-regions", default=None, dest="norm_regions", type="str",
                     help="Restrict normalization to particular regions (BED format). [default: %default]")
    group.add_option("-f", "--foldchange", dest="foldchange", default=1.6, type="float",
                     help="Fold change parameter to define training set (t_1, see paper). [default: %default]")
    group.add_option("-t", "--threshold", dest="threshold", default=95, type="float",
                     help="Minimum signal support for differential peaks to define training set as percentage "
                          "(t_2, see paper). [default: %default]")
    group.add_option("--size", dest="size_ts", default=10000, type="int",
                     help="Number of bins the HMM's training set constists of. [default: %default]")
    group.add_option("--par", dest="par", default=1, type="int",
                     help="Percentile for p-value postprocessing filter. [default: %default]")
    group.add_option("--poisson", default=False, dest="poisson", action="store_true",
                     help="Use binomial distribution as emmission. [default: %default]")
    group.add_option("--single-strand", default=False, dest="singlestrand", action="store_true",
                     help="Allow single strand BAM file as input. [default: %default]")
    group.add_option("--m_threshold", default=80, dest="m_threshold", type="int",
                     help="Define the M threshold of percentile for training TMM. [default: %default]")
    group.add_option("--a_threshold", default=95, dest="a_threshold", type="int",
                     help="Define the A threshold of percentile for training TMM. [default: %default]")
    group.add_option("--rmdup", default=False, dest="rmdup", action="store_true",
                     help="Remove the duplicate reads [default: %default]")
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    # settings without a command line switch
    options.save_wig = False
    options.verbose = False
    options.hmm_free_para = False
    # There is no switch for input extension sizes either, so the value is
    # always the empty list (the former length check on it could never run).
    options.exts_inputs = []

    if options.version:
        print("")
        print(__version__)
        sys.exit()

    if len(args) != 1:
        parser.error("Please give config file")

    config_path = npath(args[0])

    if not isfile(config_path):
        parser.error("Config file %s does not exist!" % config_path)

    bamfiles, genome, chrom_sizes, inputs, dims = input_parser(config_path)

    # without a genome there is nothing to compute the GC content from
    if not genome:
        options.no_gc_content = True

    if options.exts and len(options.exts) != len(bamfiles):
        parser.error("Number of Extension Sizes must equal number of bamfiles")

    if options.scaling_factors_ip and len(options.scaling_factors_ip) != len(bamfiles):
        parser.error("Number of scaling factors for IP must equal number of bamfiles")

    for bamfile in bamfiles:
        if not isfile(bamfile):
            parser.error("BAM file %s does not exist!" % bamfile)

    if not inputs and options.factors_inputs:
        print("As no input-DNA, do not use input-DNA factors", file=sys.stderr)
        options.factors_inputs = None

    if options.factors_inputs and len(options.factors_inputs) != len(bamfiles):
        parser.error("factors for input-DNA must equal number of BAM files!")

    if inputs:
        for bamfile in inputs:
            if not isfile(bamfile):
                parser.error("BAM file %s does not exist!" % bamfile)

    if options.regions:
        if not isfile(options.regions):
            parser.error("Region file %s does not exist!" % options.regions)

    if genome and not isfile(genome):
        parser.error("Genome file %s does not exist!" % genome)

    if options.name is None:
        # Explicit format instead of slicing str(datetime.now()): that string
        # has no fractional part when the microseconds are zero, in which case
        # dropping "the last field" removed the seconds.
        options.name = "THOR-exp" + "-" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    if not all(which(tool) for tool in ("wigToBigWig", "bedGraphToBigWig", "bigWigMerge")):
        print("Warning: wigToBigWig, bigWigMerge or bedGraphToBigWig not found! Signal will not be stored!",
              file=sys.stderr)

    if options.outputdir:
        options.outputdir = npath(options.outputdir)
        if isdir(options.outputdir) and any(
                entry.startswith(options.name) for entry in os.listdir(options.outputdir)):
            parser.error("Output directory exists and contains files with names starting with your chosen experiment "
                         "name! Do nothing to prevent file overwriting!")
        if not exists(options.outputdir):
            # makedirs also creates missing parent directories
            os.makedirs(options.outputdir)
    else:
        options.outputdir = os.getcwd()

    options.name = join(options.outputdir, options.name)

    report_dirname = 'report_' + basename(options.name)
    # the trailing slashes are part of the published global values
    folder_report = join(options.outputdir, report_dirname + "/")
    folder_report_pics = join(options.outputdir, report_dirname, 'pics/')
    folder_report_data = join(options.outputdir, report_dirname, 'pics/data/')

    if isdir(join(options.outputdir, report_dirname)):
        parser.error("Folder '" + report_dirname + "' already exits in output directory!"
                     "Do nothing to prevent file overwriting! "
                     "Please rename report folder or change working directory of THOR with the option --output-dir")

    if options.report:
        # the report folder is known not to exist here, so creating the
        # deepest folder creates report/, report/pics/ and report/pics/data/
        os.makedirs(folder_report_data)

    # NOTE(review): SOURCE is whitespace-collapsed, so it cannot be told whether
    # these assignments were guarded by --report; they are made unconditionally
    # because OUTPUTDIR and NAME do not depend on the report — confirm.
    FOLDER_REPORT = folder_report
    FOLDER_REPORT_PICS = folder_report_pics
    FOLDER_REPORT_DATA = folder_report_data
    OUTPUTDIR = options.outputdir
    NAME = options.name

    if not inputs:
        print("Warning: Do not compute GC-content, as there is no input file", file=sys.stderr)

    if not genome:
        print("Warning: Do not compute GC-content, as there is no genome file", file=sys.stderr)

    if options.exts is None:
        options.exts = []

    return options, bamfiles, genome, chrom_sizes, dims, inputs