Пример #1
0
    def get_motif_list(self, pseudocounts=1.0, fpr=0.0001):
        """
        Build Motif objects for every annotated motif that has a PWM file.

        Every directory in self.motif_data.pwm_list is combined with every
        entry of self.motifs_map; a pair contributes a Motif only when the
        file "<motif name>.pwm" exists in that directory.

        Keyword arguments:
        pseudocounts -- Value handed to tools.log_odds and to Motif.
        fpr -- Key used to look up (and store) the matching threshold in
               each annotation's `thresholds` mapping.

        Return:
        List of Motif objects, in iteration order.
        """
        found = []

        for pwm_dir in self.motif_data.pwm_list:
            for name, annotation in self.motifs_map.items():
                pwm_path = os.path.join(pwm_dir, name + ".pwm")

                # annotations without a PWM file in this directory are skipped
                if not os.path.isfile(pwm_path):
                    continue

                stored = annotation.thresholds
                if fpr not in stored or not stored[fpr]:
                    # no usable stored value: derive the threshold and remember it
                    matrix = parsers.pfm(str(pwm_path))
                    # total number of "points" to add, not per-row
                    background = tools.flat_bg(len(matrix))
                    scores = tools.log_odds(matrix, background, pseudocounts, 2)
                    cutoff = tools.threshold_from_p(scores, background, fpr)
                    stored[fpr] = cutoff
                else:
                    cutoff = stored[fpr]

                found.append(Motif(pwm_path, pseudocounts, cutoff))

        return found
Пример #2
0
    def get_motif_list(self, pseudocounts=1.0, fpr=0.0001):
        """
        Collect a Motif for each annotated motif whose PWM file is on disk.

        The directories in self.motif_data.pwm_list are scanned in order and,
        within each one, the entries of self.motifs_map. The file looked for
        is "<motif name>.pwm".

        Keyword arguments:
        pseudocounts -- Passed to tools.log_odds and to the Motif constructor.
        fpr -- Key into each annotation's `thresholds` mapping; a missing or
               falsy entry is recomputed and written back.

        Return:
        List with one Motif per existing PWM file.
        """
        result = []

        for directory in self.motif_data.pwm_list:
            for motif_id, anno in self.motifs_map.items():
                candidate = os.path.join(directory, motif_id + ".pwm")

                # only motifs with a matching PWM file end up in the result
                if not os.path.isfile(candidate):
                    continue

                have_threshold = fpr in anno.thresholds and anno.thresholds[fpr]
                if have_threshold:
                    score_cutoff = anno.thresholds[fpr]
                else:
                    # recalculate the threshold for this fpr and cache it
                    freq = parsers.pfm(str(candidate))
                    # total number of "points" to add, not per-row
                    flat = tools.flat_bg(len(freq))
                    log_odds_matrix = tools.log_odds(freq, flat, pseudocounts, 2)
                    score_cutoff = tools.threshold_from_p(log_odds_matrix, flat, fpr)
                    anno.thresholds[fpr] = score_cutoff

                result.append(Motif(candidate, pseudocounts, score_cutoff))

        return result
Пример #3
0
    def __init__(self, input_file_name, pseudocounts, fpr, thresholds):
        """
        Initializes Motif.

        Keyword arguments:
        input_file_name -- Path of the PWM file. The name of the directory
                           that directly contains it is used as repository key.
        pseudocounts -- Value handed to tools.log_odds. Stored thresholds are
                        only consulted when this equals 1.0.
        fpr -- False positive rate used to look up or compute the thresholds.
        thresholds -- Object whose `dict` attribute is indexed as
                      dict[repository][motif name][fpr].

        Fields:
        name -- File base name without its last extension.
        pfm -- Position Frequency Matrix.
        bg -- Background frequencies.
        pssm -- Position Specific Scoring Matrix.
        pssm_rc -- Reverse complement of the PSSM.
        alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
        threshold -- Motif matching threshold.
        threshold_rc -- Threshold computed for the reverse-complement PSSM.
        len -- Length of the motif.
        max -- Maximum value found in the PSSM.
        consensus -- Consensus string derived from the PSSM.
        consensus_rc -- Consensus string derived from the reverse complement.
        is_palindrome -- True if consensus is biologically palindromic.
        """
        # Function-scope import so this block does not depend on a file-level one.
        from os.path import dirname

        # Initializing name
        self.name = ".".join(basename(input_file_name).split(".")[:-1])
        # Use os.path instead of splitting on "/": this also works with other
        # path separators and yields "" (instead of raising IndexError) when
        # the path has no directory component.
        repository = basename(dirname(input_file_name))

        # Creating PFM & PSSM
        self.pfm = parsers.pfm(str(input_file_name))
        # total number of "points" to add, not per-row
        self.bg = tools.flat_bg(len(self.pfm))
        self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
        self.pssm_rc = tools.reverse_complement(self.pssm)

        # how many bases this motif has
        self.len = len(self.pfm[0])

        # maximum value found in the whole PSSM
        self.max = max(max(row) for row in self.pssm)

        # we only support pure DNA or methylated DNA, for now.
        self.alphabet = ["Aa", "Cc", "Gg", "Tt"]
        if len(self.pfm) == 6:
            self.alphabet += ["m", "1"]

        # Evaluating threshold: stored values are only consulted for
        # pseudocounts == 1.0; otherwise the threshold is recomputed.
        threshold_found = False
        if pseudocounts == 1.0:
            try:
                self.threshold = thresholds.dict[repository][self.name][fpr]
                threshold_found = True
            except Exception:
                # Deliberate best-effort: any failure of the lookup (missing
                # key, missing table, ...) falls back to recomputation below.
                threshold_found = False

        if not threshold_found:
            # FIXME: this requires a modified version of MOODS. Not sure if we actually need it.
            # self.threshold = tools.threshold_from_p(self.pssm, self.bg, fpr, 2000.0)  # 10000.0 would take too long
            self.threshold = tools.threshold_from_p(self.pssm, self.bg, fpr)
            print(">>> recomputing threshold for %s: %f" %
                  (self.name, self.threshold))

        self.threshold_rc = tools.threshold_from_p(self.pssm_rc, self.bg, fpr)

        # Per position, take the first letter of the alphabet entry whose row
        # index argmax selects along axis 0.
        self.consensus = "".join(
            [self.alphabet[i][0] for i in argmax(self.pssm, axis=0)])
        self.consensus_rc = "".join(
            [self.alphabet[i][0] for i in argmax(self.pssm_rc, axis=0)])

        # Evaluating if motif is palindromic
        self.is_palindrome = self.consensus == self.consensus_rc
Пример #4
0
    def __init__(self, input_file_name, pseudocounts, threshold):
        """
        Set up a Motif from a PWM file and an already-known threshold.

        Keyword arguments:
        input_file_name -- Path of the PWM file to load.
        pseudocounts -- Value handed to tools.log_odds.
        threshold -- Motif matching threshold, stored unchanged.

        Fields:
        name -- File base name without its last extension.
        pfm -- Position Frequency Matrix.
        bg -- Background frequencies.
        pssm -- Position Specific Scoring Matrix.
        pssm_rc -- Reverse complement of the PSSM.
        alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
        threshold -- Motif matching threshold.
        len -- Length of the motif.
        max -- Maximum value found in the PSSM.
        consensus / consensus_rc -- Consensus strings of pssm / pssm_rc.
        is_palindrome -- True if consensus is biologically palindromic.
        """
        # name: everything before the last dot of the base name
        # (empty when the base name contains no dot)
        self.name = basename(input_file_name).rpartition(".")[0]

        # PFM, background and scoring matrices
        frequency_matrix = parsers.pfm(str(input_file_name))
        self.pfm = frequency_matrix
        # total number of "points" to add, not per-row
        self.bg = tools.flat_bg(len(frequency_matrix))
        scoring_matrix = tools.log_odds(frequency_matrix, self.bg, pseudocounts, 2)
        self.pssm = scoring_matrix
        self.pssm_rc = tools.reverse_complement(scoring_matrix)

        # number of bases in the motif
        self.len = len(frequency_matrix[0])

        # largest value anywhere in the PSSM
        self.max = max(max(row) for row in scoring_matrix)

        # only pure DNA or methylated DNA is supported for now;
        # a six-row PFM gets the two extra letters
        letters = ["Aa", "Cc", "Gg", "Tt"]
        if len(frequency_matrix) == 6:
            letters.extend(["m", "1"])
        self.alphabet = letters

        self.threshold = threshold

        def first_letters(matrix):
            # first character of the alphabet entry picked by argmax along axis 0
            return "".join(self.alphabet[idx][0] for idx in argmax(matrix, axis=0))

        self.consensus = first_letters(self.pssm)
        self.consensus_rc = first_letters(self.pssm_rc)

        # palindromic when both consensus strings coincide
        self.is_palindrome = self.consensus == self.consensus_rc
Пример #5
0
    def __init__(self, input_file_name, pseudocounts, threshold):
        """
        Create a Motif from a PWM file, using the threshold supplied by the caller.

        Keyword arguments:
        input_file_name -- Path of the PWM file to load.
        pseudocounts -- Value handed to tools.log_odds.
        threshold -- Motif matching threshold, stored unchanged.

        Fields:
        name -- File base name without its last extension.
        pfm -- Position Frequency Matrix.
        bg -- Background frequencies.
        pssm -- Position Specific Scoring Matrix.
        pssm_rc -- Reverse complement of the PSSM.
        alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
        threshold -- Motif matching threshold.
        len -- Length of the motif.
        max -- Maximum value found in the PSSM.
        consensus / consensus_rc -- Consensus strings of pssm / pssm_rc.
        is_palindrome -- True if consensus is biologically palindromic.
        """
        # Motif name: base name up to (not including) its last dot
        base = basename(input_file_name)
        stem, _, _ = base.rpartition(".")
        self.name = stem

        # Matrices derived from the PWM file
        self.pfm = parsers.pfm(str(input_file_name))
        row_count = len(self.pfm)
        # total number of "points" to add, not per-row
        self.bg = tools.flat_bg(row_count)
        self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
        self.pssm_rc = tools.reverse_complement(self.pssm)

        # motif length in bases
        self.len = len(self.pfm[0])

        # highest value over all PSSM rows
        row_maxima = [max(row) for row in self.pssm]
        self.max = max(row_maxima)

        # we only support pure DNA or methylated DNA, for now.
        dna_letters = ["Aa", "Cc", "Gg", "Tt"]
        if len(self.pfm) == 6:
            dna_letters += ["m", "1"]
        self.alphabet = dna_letters

        self.threshold = threshold

        # consensus: per position, first character of the alphabet entry
        # selected by argmax along axis 0
        forward = [self.alphabet[k][0] for k in argmax(self.pssm, axis=0)]
        reverse = [self.alphabet[k][0] for k in argmax(self.pssm_rc, axis=0)]
        self.consensus = "".join(forward)
        self.consensus_rc = "".join(reverse)

        # the motif counts as palindromic when both strands share a consensus
        self.is_palindrome = self.consensus == self.consensus_rc
Пример #6
0
        # Version: everything after the second dot-separated field of the PWM name
        version = ".".join(pwm_name.split(".")[2:])
        gene_names = hocomoco_anno[pwm_name][0]
        group = hocomoco_anno[pwm_name][1]
        # an empty group is written as "."
        if not group:
            group = "."
        uniprot = hocomoco_anno[pwm_name][2]
        data_source = hocomoco_anno[pwm_name][3]
        taxGroup = "vertebrates"
        # Species token: text between the first underscore and the following dot
        species = (pwm_name.split("_")[1]).split(".")[0]
        if species == "HUMAN":
            # fixed: the literal had been mangled to "H**o sapiens"
            species = "Homo sapiens"
        elif species == "MOUSE":
            species = "Mus musculus"

        # Creating PSSM
        pfm = parsers.pfm(inputFileName)
        bg = tools.flat_bg(len(pfm))  # total number of "points" to add, not per-row
        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
        threshold_list = []

        # Evaluating thresholds (one per entry of fprList, kept in order)
        for fpr in fprList:
            # Note: this requires a modified version of MOODS. Only use it if you know what you are doing
            # resVec.append(str(tools.threshold_from_p(pssm, bg, fpr, 10000.0)))
            threshold = tools.threshold_from_p(pssm, bg, fpr)
            threshold_list.append(str(threshold))
        # the thresholds are stored as one comma-separated string
        threshold = ",".join(threshold_list)

        resultMatrix.append([matrix_id, pwm_name, version, gene_names, group, uniprot, data_source, taxGroup,
                             species, threshold])
Пример #7
0
        # Version: everything after the second dot-separated field of the PWM name
        version = ".".join(pwm_name.split(".")[2:])
        gene_names = hocomoco_anno[pwm_name][0]
        group = hocomoco_anno[pwm_name][1]
        # an empty group is written as "."
        if not group:
            group = "."
        uniprot = hocomoco_anno[pwm_name][2]
        data_source = hocomoco_anno[pwm_name][3]
        taxGroup = "vertebrates"
        # Species token: text between the first underscore and the following dot
        species = (pwm_name.split("_")[1]).split(".")[0]
        if species == "HUMAN":
            # fixed: the literal had been mangled to "H**o sapiens"
            species = "Homo sapiens"
        elif species == "MOUSE":
            species = "Mus musculus"

        # Creating PSSM
        pfm = parsers.pfm(inputFileName)
        bg = tools.flat_bg(
            len(pfm))  # total number of "points" to add, not per-row
        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
        threshold_list = []

        # Evaluating thresholds (one per entry of fprList, kept in order)
        for fpr in fprList:
            # Note: this requires a modified version of MOODS. Only use it if you know what you are doing
            # resVec.append(str(tools.threshold_from_p(pssm, bg, fpr, 10000.0)))
            threshold = tools.threshold_from_p(pssm, bg, fpr)
            threshold_list.append(str(threshold))
        # the thresholds are stored as one comma-separated string
        threshold = ",".join(threshold_list)

        resultMatrix.append([
            matrix_id, pwm_name, version, gene_names, group, uniprot,
Пример #8
0
# Usage: <script> <inFolder> <outFileName>
# Writes a tab-separated table with one row per PWM file found under inFolder:
# the motif name followed by one threshold for each value in fprList.
inFolder = sys.argv[1]
outFileName = sys.argv[2]

# Parameters
fprList = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
pseudocounts = 1.0

# The context manager closes the output file even when a PWM fails to parse
# or a threshold computation raises.
with open(outFileName, "w") as outFile:
    # Header row
    outFile.write("\t".join(["MOTIF"] + [str(e) for e in fprList]) + "\n")

    # Iterating on all PWMs
    # NOTE: the pattern is built by plain concatenation, so inFolder has to end
    # with a path separator for files inside that folder to be matched.
    for pwmFileName in sorted(glob(inFolder + "*.pwm")):
        # Motif name: file base name without its last extension
        name = ".".join(basename(pwmFileName).split(".")[:-1])

        # Creating PSSM
        pfm = parsers.pfm(pwmFileName)
        bg = tools.flat_bg(
            len(pfm))  # total number of "points" to add, not per-row
        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)

        # Evaluating thresholds
        resVec = [name]
        for fpr in fprList:
            # Note: this requires a modified version of MOODS. Only use it if you know what you are doing
            # resVec.append(str(tools.threshold_from_p(pssm, bg, fpr, 10000.0)))
            resVec.append(str(tools.threshold_from_p(pssm, bg, fpr)))

        # Writing results
        outFile.write("\t".join(resVec) + "\n")