Example #1
def input_parser(filepath):
    bamfiles_1 = get_data_block(filepath, "rep1")
    bamfiles_1 = [npath(f) for f in bamfiles_1]

    bamfiles_2 = get_data_block(filepath, "rep2")
    bamfiles_2 = [npath(f) for f in bamfiles_2]

    # genome is optional, so if we get an empty list
    # we set it to None, otherwise we normalise the path
    genome = get_data_block(filepath, "genome")
    genome = npath(genome) if genome else None

    # the chrom sizes are not optional, but right now it's undefined
    # what happens if the user doesn't specify them, or specifies more
    # than one. So we just relay whatever we got from the file.
    chrom_sizes = get_data_block(filepath, "chrom_sizes")
    chrom_sizes = npath(chrom_sizes) if chrom_sizes else chrom_sizes

    inputs1 = get_data_block(filepath, "inputs1")
    inputs1 = [npath(f) for f in inputs1]

    inputs2 = get_data_block(filepath, "inputs2")
    inputs2 = [npath(f) for f in inputs2]

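    # record how many signal files belong to each condition; callers use
    # dims to split the concatenated bamfiles list returned below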
    dims = [len(bamfiles_1), len(bamfiles_2)]
    
    if not inputs1 and not inputs2:
        inputs = None
    else:
        inputs = inputs1 + inputs2

    return bamfiles_1 + bamfiles_2, genome, chrom_sizes, inputs, dims
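
These examples funnel every user-supplied path through npath. Its definition is not part of the excerpts; a minimal sketch of the assumed behaviour (expand "~", return an absolute normalised path) is:

import os

# assumed behaviour of npath; an illustrative sketch, not the original definition
def npath(filename):
    return os.path.abspath(os.path.expanduser(filename))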
Example #2
    def read_enrichment(self, enrichment_files, threshold=1):
        """
        Reads current output of motif enrichment analysis to get gene targets.

        *Keyword arguments:*

          - enrichment_files -- One string, or a list of strings, representing enrichment file paths.
          - threshold -- P-value threshold for motif acceptance.
        """

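        # each entry may be a glob pattern; expand all patterns into a flat
        # list of matching file paths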
        if isinstance(enrichment_files, list):
            file_list = [
                filename for pattern in enrichment_files
                for filename in glob.glob(npath(pattern))
            ]
        else:
            file_list = glob.glob(npath(enrichment_files))

        # reading networks
        for filename in file_list:
            # use the last directory name as the condition name
            condition = os.path.basename(os.path.dirname(filename))
            self.conditions.append(condition)

            network = {}

            with open(filename, "r") as f:
                # skip header
                next(f)

                for line in f:
                    line = line.strip("\n")
                    values = line.split("\t")
                    motif = values[0]

                    if motif in self.motifs_map:
                        p_value = float(values[2])
                        genes = values[9].split(",")

                        if threshold >= p_value:
                            network[motif] = genes

                        if motif in self.motifs_enrichment:
                            self.motifs_enrichment[motif][condition] = p_value
                        else:
                            self.motifs_enrichment[motif] = {condition: p_value}
                    else:
                        print("motif not found: " + motif)

            self.networks[condition] = network
Example #3
    def write_enrichment(self, out_file, threshold=1):
        """
        Writes enrichment table for network generation.

        *Keyword arguments:*

          - out_file -- Output file name.
          - threshold -- P-value threshold for motif acceptance.
        """

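        # output layout: a header of tab-separated condition names, then one
        # row per motif ("motif|gene1|gene2..." plus one p-value per
        # condition, with 1 for conditions where the motif was not seen)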
        with open(npath(out_file), "w") as f:
            f.write("\t" + ("\t".join(self.conditions)) + "\n")

            for v in self.motifs_enrichment:
                values = self.motifs_enrichment[v]
                filter_p = False
                p_values = []

                for c in self.conditions:
                    if c in values:
                        pvalue = values[c]
                        p_values.append(str(pvalue))

                        if pvalue <= threshold:
                            filter_p = True
                    else:
                        p_values.append("1")

                if filter_p and (v in self.motifs_map) and self.motifs_map[v].gene_names:
                    genes = "|".join(self.motifs_map[v].gene_names)
                    f.write(v + "|" + genes + "\t" + ("\t".join(p_values)) + "\n")
Example #4
def merge_output(bamfiles, dims, options, no_bw_files, chrom_sizes):
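    # For each signal/replicate pair, merge the per-chunk bigWig files (one
    # per entry in no_bw_files) into a single bigWig via bigWigMerge and
    # bedGraphToBigWig; if there is nothing to merge, just rename the files.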
    for i in range(len(bamfiles)):
        rep = i if i < dims[0] else i - dims[0]
        sig = 1 if i < dims[0] else 2

        temp_bed = npath(options.name + '-s%s-rep%s_temp.bed' % (sig, rep))

        files = [options.name + '-' + str(j) + '-s%s-rep%s.bw' % (sig, rep) for j in no_bw_files]
        if len(no_bw_files) > len(bamfiles):
            files = [x for x in files if isfile(x)]
            t = ['bigWigMerge'] + files + [temp_bed]
            c = " ".join(t)
            os.system(c)

            os.system("LC_COLLATE=C sort -k1,1 -k2,2n " + temp_bed + ' > ' + temp_bed +'.sort')

            t = ['bedGraphToBigWig', temp_bed + '.sort', chrom_sizes, options.name + '-s%s-rep%s.bw' % (sig, rep)]
            c = " ".join(t)
            os.system(c)

            for f in files:
                os.remove(f)
            os.remove(temp_bed)
            os.remove(temp_bed + ".sort")
        else:
            ftarget = [options.name + '-s%s-rep%s.bw' % (sig, rep) for j in no_bw_files]
            for j in range(len(ftarget)):
                c = " ".join(['mv', files[j], ftarget[j]])
                os.system(c)
Example #5
    def read_mtf(self, mtf_filenames):
        """
        Reads TF annotation in mtf (internal format; check manual) format.

        *Keyword arguments:*

          - mtf_filenames -- A string, or a list of strings, representing .mtf file paths.
        """

        if not isinstance(mtf_filenames, list):
            mtf_filenames = [mtf_filenames]

        file_list = [
            filename for pattern in mtf_filenames
            for filename in glob.glob(npath(pattern))
        ]

        # Iterating over the file name list
        for filename in file_list:

            database = os.path.splitext(os.path.basename(filename))[0]

            # Opening MTF file
            mtf_file = open(filename, "r")

            # Reading file
            for line in mtf_file:
                # Processing line
                line_list = line.strip().split("\t")
                tf_id = line_list[0].strip()
                name = line_list[1].strip()
                version = line_list[2].strip()
                gene_names = line_list[3].strip().split("+")
                tf_class = line_list[4].strip()
                uniprot_ids = line_list[5].strip().split(";")
                data_source = line_list[6].strip()
                tax_group = line_list[7].strip()
                species = line_list[8].strip()
                threshold_list = line_list[9].strip().split(",")
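                # the last column holds six score thresholds, one per
                # pre-computed false-positive rate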
                fpr_list = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
                thresholds = {}
                for fpr, thr in zip(fpr_list, threshold_list):
                    thresholds[fpr] = float(thr)

                self.add(
                    MotifAnnotation(tf_id, name, database, version, gene_names,
                                    tf_class, uniprot_ids, data_source,
                                    tax_group, species, thresholds))

            # Termination
            mtf_file.close()
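
Taken together, the readers and writers above form a small pipeline. A minimal driving sketch follows; the MotifSet container name, the file locations and the 0.05 threshold are illustrative assumptions, not part of the excerpts:

# hypothetical usage; the MotifSet class name and file paths are assumptions
motif_set = MotifSet()
motif_set.read_mtf(["motifs/*.mtf"])  # load TF annotation
motif_set.read_enrichment("output/*/enrichment.txt", threshold=0.05)
motif_set.write_enrichment("pvalue_table_5.txt", threshold=0.05)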
Example #6
    def read_mtf(self, mtf_filenames):
        """
        Reads TF annotation in mtf (internal format; check manual) format.

        *Keyword arguments:*

          - mtf_filenames -- A string, or a list of strings, representing .mtf file paths.
        """

        if isinstance(mtf_filenames, list):
            file_list = [filename for pattern in mtf_filenames for filename in glob.glob(npath(pattern))]
        else:
            file_list = glob.glob(npath(mtf_filenames))

        # Iterating over the file name list
        for filename in file_list:

            # Opening MTF file
            mtf_file = open(filename, "r")

            # Reading file
            for line in mtf_file:
                # Processing line
                line_list = line.strip().split("\t")
                tf_id = line_list[0].strip()
                name = line_list[1].strip()
                database = line_list[2].strip()
                version = int(line_list[3].strip())
                gene_names = line_list[4].strip().split("+")
                tf_class = line_list[5].strip()
                uniprot_ids = line_list[6].strip().split(";")
                data_source = line_list[7].strip() if len(line_list) > 7 else ""

                self.add(MotifAnnotation(tf_id, name, database, version, gene_names, tf_class, uniprot_ids, data_source))

            # Termination
            mtf_file.close()
Example #7
    def write_network(self, targets, out_path, threshold=1):
        """
        If enrichment information has been loaded before (via read_enrichment), this function writes
        a Cytoscape-compatible network to the output folder.

        *Keyword arguments:*

          - targets -- Gene targets.
          - out_path -- Output path.
          - threshold -- Threshold for motif acceptance.
        """

        self.write_enrichment(
            out_path + "/pvalue_table_" + str(threshold * 100) + ".txt",
            threshold)

        out_path = npath(out_path)

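        # genes_motifs maps each gene (TF) name to the motifs annotated with it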
        _, genes_motifs = self.get_mappings(key_type="gene_names")

        net_pairs = {}
        net_tfs = {}
        all_pairs = set()
        all_tfs = set()
        all_genes = set()

        filter_targets = bool(targets)

        # using genes to motif mapping to get network in all conditions
        for net_name in self.networks:
            net = self.networks[net_name]
            pairs = set()
            tfs = set()
            net_pairs[net_name] = pairs
            net_tfs[net_name] = tfs
            for tf in genes_motifs:
                motifs = genes_motifs[tf]
                for m in motifs:
                    if m in net:
                        for target in net[m]:
                            if not filter_targets or (target in targets):
                                pairs.add((tf, target))
                                tfs.add(tf)
                                all_genes.add(tf)
                                all_genes.add(target)
                    else:
                        print("motif not in network: " + m + " " + str(tf))

            all_pairs = all_pairs.union(pairs)
            all_tfs = all_tfs.union(tfs)

        # printing out network
        for net_name, pairs_aux in net_pairs.items():
            with open(out_path + "/" + net_name + "_targets.txt", "w") as f:
                for pair in all_pairs:
                    # check if pair is active in the network
                    if pair in pairs_aux:
                        f.write(pair[0] + "\t" + pair[1] + "\tactive\n")
                    else:
                        f.write(pair[0] + "\t" + pair[1] + "\tinactive\n")

            with open(out_path + "/" + net_name + "_genes.txt", "w") as f:
                for gene in all_genes:
                    # check if the gene is a TF active in this network
                    if gene in net_tfs[net_name]:
                        f.write(gene + "\ttf_active\n")
                    elif gene in all_tfs:
                        f.write(gene + "\ttf_inactive\n")
                    else:
                        f.write(gene + "\ttarget\n")
Example #8
def handle_input():
    parser = HelpfulOptionParser(usage=__doc__)

    parser.add_option("-n", "--name", default=None, dest="name", type="string",
                      help="Experiment's name and prefix for all files that are created.")
    parser.add_option("-m", "--merge", default=False, dest="merge", action="store_true",
                      help="Merge peaks which have a distance less than the estimated mean fragment size "
                           "(recommended for histone data). [default: do not merge]")
    parser.add_option("--housekeeping-genes", default=None, dest="housekeeping_genes", type="str",
                      help="Define housekeeping genes (BED format) used for normalizing. [default: %default]")
    parser.add_option("--output-dir", dest="outputdir", default=None, type="string",
                      help="Store files in output directory. [default: %default]")
    parser.add_option("--report", dest="report", default=False, action="store_true",
                      help="Generate HTML report about experiment. [default: %default]")
    parser.add_option("--deadzones", dest="deadzones", default=None,
                      help="Define blacklisted genomic regions avoided for analysis (BED format). [default: %default]")
    parser.add_option("--no-correction", default=False, dest="no_correction", action="store_true",
                      help="Do not use multipe test correction for p-values (Benjamini/Hochberg). [default: %default]")
    parser.add_option("-p", "--pvalue", dest="pcutoff", default=0.1, type="float",
                      help="P-value cutoff for peak detection. Call only peaks with p-value lower than cutoff. "
                           "[default: %default]")
    parser.add_option("--exts", default=None, dest="exts", type="str", action='callback', callback=_callback_list,
                      help="Read's extension size for BAM files (comma separated list for each BAM file in config "
                           "file). If option is not chosen, estimate extension sizes. [default: %default]")
    parser.add_option("--factors-inputs", default=None, dest="factors_inputs", type="str", action="callback",
                      callback=_callback_list_float,
                      help="Normalization factors for input-DNA (comma separated list for each BAM file in config "
                           "file). If option is not chosen, estimate factors. [default: %default]")
    parser.add_option("--scaling-factors", default=None, dest="scaling_factors_ip", type="str", action='callback',
                      callback=_callback_list_float,
                      help="Scaling factor for each BAM file (not control input-DNA) as comma separated list for "
                           "each BAM file in config file. If option is not chosen, follow normalization strategy "
                           "(TMM or HK approach) [default: %default]")
    parser.add_option("--save-input", dest="save_input", default=False, action="store_true",
                      help="Save input-DNA file if available. [default: %default]")
    parser.add_option("--version", dest="version", default=False, action="store_true",
                      help="Show script's version.")

    group = OptionGroup(parser, "Advanced options")
    group.add_option("--regions", dest="regions", default=None, type="string",
                     help="Define regions (BED format) to restrict the analysis, that is, where to train the HMM and "
                          "search for DPs. It is faster, but less precise.")
    group.add_option("-b", "--binsize", dest="binsize", default=100, type="int",
                     help="Size of underlying bins for creating the signal. [default: %default]")
    group.add_option("-s", "--step", dest="stepsize", default=50, type="int",
                     help="Stepsize with which the window consecutively slides across the genome to create the "
                          "signal. [default: %default]")
    group.add_option("--debug", default=False, dest="debug", action="store_true",
                     help="Output debug information. Warning: space consuming! [default: %default]")
    group.add_option("--no-gc-content", dest="no_gc_content", default=False, action="store_true",
                     help="Do not normalize towards GC content. [default: %default]")
    group.add_option("--norm-regions", default=None, dest="norm_regions", type="str",
                     help="Restrict normalization to particular regions (BED format). [default: %default]")
    group.add_option("-f", "--foldchange", dest="foldchange", default=1.6, type="float",
                     help="Fold change parameter to define training set (t_1, see paper). [default: %default]")
    group.add_option("-t", "--threshold", dest="threshold", default=95, type="float",
                     help="Minimum signal support for differential peaks to define training set as percentage "
                          "(t_2, see paper). [default: %default]")
    group.add_option("--size", dest="size_ts", default=10000, type="int",
                     help="Number of bins the HMM's training set constists of. [default: %default]")
    group.add_option("--par", dest="par", default=1, type="int",
                     help="Percentile for p-value postprocessing filter. [default: %default]")
    group.add_option("--poisson", default=False, dest="poisson", action="store_true",
                     help="Use binomial distribution as emmission. [default: %default]")
    group.add_option("--single-strand", default=False, dest="singlestrand", action="store_true",
                     help="Allow single strand BAM file as input. [default: %default]")
    group.add_option("--m_threshold", default=80, dest="m_threshold", type="int",
                     help="Define the M threshold of percentile for training TMM. [default: %default]")
    group.add_option("--a_threshold", default=95, dest="a_threshold", type="int",
                     help="Define the A threshold of percentile for training TMM. [default: %default]")
    group.add_option("--rmdup", default=False, dest="rmdup", action="store_true",
                     help="Remove the duplicate reads [default: %default]")
    parser.add_option_group(group)

    (options, args) = parser.parse_args()
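    # defaults for options not exposed on the command line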
    options.save_wig = False
    options.exts_inputs = None
    options.verbose = False
    options.hmm_free_para = False

    if options.version:
        print("")
        print(__version__)
        sys.exit()

    if len(args) != 1:
        parser.error("Please give config file")

    config_path = npath(args[0])

    if not isfile(config_path):
        parser.error("Config file %s does not exist!" % config_path)

    bamfiles, genome, chrom_sizes, inputs, dims = input_parser(config_path)

    if not genome:
        options.no_gc_content = True

    if options.exts and len(options.exts) != len(bamfiles):
        parser.error("Number of extension sizes must equal the number of BAM files")

    if options.exts_inputs and len(options.exts_inputs) != len(inputs):
        parser.error("Number of input extension sizes must equal the number of input BAM files")

    if options.scaling_factors_ip and len(options.scaling_factors_ip) != len(bamfiles):
        parser.error("Number of scaling factors for IP must equal the number of BAM files")

    for bamfile in bamfiles:
        if not isfile(bamfile):
            parser.error("BAM file %s does not exist!" % bamfile)

    if not inputs and options.factors_inputs:
        print("No input-DNA given, input-DNA factors will be ignored", file=sys.stderr)
        options.factors_inputs = None

    if options.factors_inputs and len(options.factors_inputs) != len(bamfiles):
        parser.error("Number of input-DNA factors must equal the number of BAM files!")

    if inputs:
        for bamfile in inputs:
            if not isfile(bamfile):
                parser.error("BAM file %s does not exist!" % bamfile)

    if options.regions:
        if not isfile(options.regions):
            parser.error("Region file %s does not exist!" % options.regions)

    if genome and not isfile(genome):
        parser.error("Genome file %s does not exist!" % genome)

    if options.name is None:
        d = str(datetime.now()).replace("-", "_").replace(":", "_").replace(" ", "_").replace(".", "_").split("_")
        options.name = "THOR-exp" + "-" + "_".join(d[:len(d) - 1])

    if not which("wigToBigWig") or not which("bedGraphToBigWig") or not which("bigWigMerge"):
        print("Warning: wigToBigWig, bigWigMerge or bedGraphToBigWig not found! Signal will not be stored!",
              file=sys.stderr)

    if options.outputdir:
        options.outputdir = npath(options.outputdir)
        if isdir(options.outputdir) and any(f.startswith(options.name) for f in os.listdir(options.outputdir)):
            parser.error("Output directory exists and contains files with names starting with your chosen experiment "
                         "name! Do nothing to prevent file overwriting!")
        if not exists(options.outputdir):
            os.mkdir(options.outputdir)
    else:
        options.outputdir = os.getcwd()

    options.name = join(options.outputdir, options.name)

    if isdir(join(options.outputdir, 'report_' + basename(options.name))):
        parser.error("Folder 'report_" + basename(options.name) + "' already exists in the output directory! "
                     "Do nothing to prevent file overwriting! "
                     "Please rename the report folder or change the working directory of THOR with the option --output-dir")

    if options.report:
        os.mkdir(join(options.outputdir, 'report_'+basename(options.name)+"/"))
        os.mkdir(join(options.outputdir, 'report_'+basename(options.name), 'pics/'))
        os.mkdir(join(options.outputdir, 'report_'+basename(options.name), 'pics/data/'))

    global FOLDER_REPORT
    global FOLDER_REPORT_PICS
    global FOLDER_REPORT_DATA
    global OUTPUTDIR
    global NAME

    FOLDER_REPORT = join(options.outputdir, 'report_'+basename(options.name)+"/")
    FOLDER_REPORT_PICS = join(options.outputdir, 'report_'+basename(options.name), 'pics/')
    FOLDER_REPORT_DATA = join(options.outputdir, 'report_'+basename(options.name), 'pics/data/')
    OUTPUTDIR = options.outputdir
    NAME = options.name

    if not inputs:
        print("Warning: GC-content will not be computed, as there is no input-DNA file", file=sys.stderr)

    if not genome:
        print("Warning: GC-content will not be computed, as there is no genome file", file=sys.stderr)

    if options.exts is None:
        options.exts = []

    if options.exts_inputs is None:
        options.exts_inputs = []

    return options, bamfiles, genome, chrom_sizes, dims, inputs
Example #9
parser.add_argument('-f',
                    '--input-format',
                    choices=['jaspar-2014', 'jaspar-2016', 'hocomoco-pcm'],
                    type=str,
                    required=True,
                    help='format of the input file')
parser.add_argument('-o',
                    '--output-folder',
                    type=str,
                    required=True,
                    help='name of output folder')

args = parser.parse_args()

# read the input file
with open(npath(args.input_file), "r") as f:
    content = f.readlines()

n_lines = len(content)

output_folder = npath(args.output_folder)

# create the output directory path, if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

###################################################################################################
# JASPAR 2014
###################################################################################################

if args.input_format == "jaspar-2014":
    for i in range(n_lines // 5):