def __read_alphabet_meme(motif_file: str, ifstream, debug: bool) -> List[str]:
    """Read the alphabet declared in a MEME motif file.

    The stream is consumed line by line up to (and including) the first line
    starting with ``ALPHABET``. Only the DNA alphabet ``ACGT`` is accepted;
    whitespace around the ``=`` sign is tolerated, so both ``ALPHABET= ACGT``
    and ``ALPHABET=ACGT`` are parsed.

    ...

    Parameters
    ----------
    motif_file : str
        path to motif PWM (used only to build error messages)
    ifstream : _io.TextIOWrapper
        input stream
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        alphabet

    Notes
    -----
    Every failure is reported through ``exception_handler`` (EOFError when no
    ``ALPHABET`` line is found, ValueError when the alphabet is not DNA).
    NOTE(review): the handler is assumed to raise -- confirm.
    """
    # keep the matching line in a dedicated name so that an empty stream
    # cannot leave the loop variable unbound
    alphabet_line = ""
    for line in ifstream:
        if line.startswith("ALPHABET"):
            alphabet_line = line
            break
    else:
        errmsg = "Unexpected EOF reached, unable to parse {}.\n"
        exception_handler(EOFError, errmsg.format(motif_file), debug)
    if not alphabet_line.startswith("ALPHABET"):
        errmsg = "No line stores alphabet in {}.\n"
        exception_handler(ValueError, errmsg.format(motif_file), debug)
    # split on the first '=' instead of removing the literal "ALPHABET= "
    # prefix, which silently failed when the spacing was different
    keyword, separator, symbols = alphabet_line.strip().partition("=")
    symbols = symbols.strip()
    if keyword.strip() != "ALPHABET" or not separator or symbols != "ACGT":
        errmsg = "The motif is not built on DNA alphabet.\n"
        exception_handler(ValueError, errmsg, debug)
    alphabet = sorted(symbols)
    # internal sanity check against the project-wide DNA alphabet
    assert isListEqual(alphabet, DNA_ALPHABET)
    return alphabet
def __init__(self, count_matrix, width, alphabet, motif_id, motif_name):
    """Build a motif object after validating every argument.

    Parameters
    ----------
    count_matrix : pandas.DataFrame
        motif count matrix, must not be empty
    width : int
        motif width, must not be negative
    alphabet : list
        motif alphabet, must be equal to DNA_ALPHABET
    motif_id : str
        motif identifier, must not be empty
    motif_name : str
        motif name, must not be empty

    Raises
    ------
    NoDataFrameException
        if count_matrix is not a pandas.DataFrame
    NotValidMotifMatrixException
        if count_matrix is empty
    WrongMotifWidthException
        if width is not an int or is negative
    WrongMotifIDException
        if motif_id is not a non-empty str
    WrongMotifNameException
        if motif_name is not a non-empty str
    NotValidAlphabetException
        if alphabet is not a list equal to DNA_ALPHABET
    """
    # the type check must come first: `.empty` is only available on pandas
    # objects, so testing it on anything else raised AttributeError and the
    # dedicated exception below could never be reached
    if not isinstance(count_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "\n\nERROR: the given value is not a pandas.DataFrame instance")

    if count_matrix.empty:
        errmsg = "\n\nERROR: attempt to initialize the motif object with an empty count matrix"
        raise NotValidMotifMatrixException(errmsg)

    if not isinstance(width, int) or width < 0:
        errmsg = "\n\nERROR: attempt to initialize motif without a valid width"
        raise WrongMotifWidthException(errmsg)

    if not isinstance(motif_id, str) or not motif_id:
        raise WrongMotifIDException(
            "\n\nERROR: cannot initialize the motif with the given ID")

    if not isinstance(motif_name, str) or not motif_name:
        raise WrongMotifNameException(
            "\n\nERROR: cannot initialize the motif with the given name")

    if not isinstance(alphabet, list) or not isListEqual(
            alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: cannot initialize a motif object with a wrong alphabet"
        raise NotValidAlphabetException(errmsg)

    self._count_matrix = count_matrix
    self._width = width
    self._motif_id = motif_id
    self._motif_name = motif_name
    self._alphabet = alphabet
def setAlphabet(self, alphabet: List[str]) -> None:
    """Replace the motif alphabet.

    Parameters
    ----------
    alphabet : list
        new motif alphabet; must be a non-empty list equal to DNA_ALPHABET

    Raises
    ------
    TypeError
        if alphabet is not a list
    ValueError
        if alphabet is empty or is not the DNA alphabet
    """
    if not isinstance(alphabet, list):
        errmsg = "\n\nERROR: Expected list, got {}.\n"
        raise TypeError(errmsg.format(type(alphabet).__name__))
    if not alphabet:
        errmsg = "\n\nERROR: Empty motif alphabet.\n"
        raise ValueError(errmsg)
    if not isListEqual(alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: The motif is not built on DNA alphabet.\n"
        raise ValueError(errmsg)
    # the constructor stores the alphabet in `_alphabet`; writing to
    # `alphabet` left the real field untouched
    self._alphabet = alphabet
def setAlphabet(self, alphabet: List[str]) -> None:
    """Replace the motif alphabet.

    Parameters
    ----------
    alphabet : list
        new motif alphabet; must be a list equal to DNA_ALPHABET

    Raises
    ------
    NotValidAlphabetException
        if alphabet is not a list or is not the DNA alphabet
    """
    if not isinstance(alphabet, list):
        errmsg = "\n\nERROR: the given alphabet is not in a list"
        raise NotValidAlphabetException(errmsg)

    if not isListEqual(alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: the given alphabet is not a valid DNA alphabet"
        raise NotValidAlphabetException(errmsg)

    # the constructor stores the alphabet in `_alphabet`; writing to
    # `alphabet` left the real field untouched
    self._alphabet = alphabet
def setAlphabet(self, alphabet):
    """Replace the motif alphabet.

    Parameters
    ----------
    alphabet : list
        new motif alphabet; must be a list equal to DNA_ALPHABET

    Raises
    ------
    NotValidAlphabetException
        if alphabet is not a list or is not the DNA alphabet
    """
    if not isinstance(alphabet, list):
        raise NotValidAlphabetException(
            "\n\nERROR: the given alphabet is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "\n\nERROR: the given alphabet is not a valid DNA alphabet")

    # the constructor stores the alphabet in `_alphabet`; writing to
    # `alphabet` left the real field untouched
    self._alphabet = alphabet
def __init__(self,
             count_matrix: np.ndarray,
             width: int,
             alphabet: List[str],
             motif_id: str,
             motif_name: str,
             nucsmap: dict):
    """Build a motif object after validating every argument.

    Parameters
    ----------
    count_matrix : numpy.ndarray
        motif count matrix; must be non-empty and must not sum to zero
    width : int
        motif width, must be strictly positive
    alphabet : list
        motif alphabet, must be equal to DNA_ALPHABET
    motif_id : str
        motif identifier, must not be empty
    motif_name : str
        motif name, must not be empty
    nucsmap : dict
        nucleotide index map

    Raises
    ------
    TypeError
        if any argument has the wrong type
    NotValidMotifMatrixException
        if count_matrix has no elements or its values sum to zero
    ValueError
        if width is not positive, motif_id or motif_name is empty, or the
        alphabet is not the DNA alphabet
    """
    if not isinstance(count_matrix, np.ndarray):
        errmsg = "\n\nERROR: Expected numpy.ndarray, got {}.\n"
        raise TypeError(errmsg.format(type(count_matrix).__name__))
    # ndarray.sum() works for any dimensionality; the builtin sum(sum(...))
    # iterated in Python and failed on arrays that are not 2-D
    if count_matrix.size == 0 or count_matrix.sum() == 0:
        errmsg = "\n\nERROR: Empty motif count matrix.\n"
        raise NotValidMotifMatrixException(errmsg)
    if not isinstance(width, int):
        errmsg = "\n\nERROR: Expected int, got {}.\n"
        raise TypeError(errmsg.format(type(width).__name__))
    if width <= 0:
        errmsg = "\n\nERROR: Forbidden motif width {}.\n"
        raise ValueError(errmsg.format(width))
    if not isinstance(motif_id, str):
        errmsg = "\n\nERROR: Expected str, got {}.\n"
        raise TypeError(errmsg.format(type(motif_id).__name__))
    if not motif_id:
        errmsg = "\n\nERROR: Not valid motif ID.\n"
        raise ValueError(errmsg)
    if not isinstance(motif_name, str):
        errmsg = "\n\nERROR: Expected str, got {}.\n"
        raise TypeError(errmsg.format(type(motif_name).__name__))
    if not motif_name:
        errmsg = "\n\nERROR: Not valid motif name.\n"
        raise ValueError(errmsg)
    if not isinstance(alphabet, list):
        errmsg = "\n\nERROR: Expected list, got {}.\n"
        raise TypeError(errmsg.format(type(alphabet).__name__))
    if not isListEqual(alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: The motif is not built on DNA alphabet.\n"
        raise ValueError(errmsg)
    if not isinstance(nucsmap, dict):
        errmsg = "\n\nERROR: Expected dict, got {}.\n"
        raise TypeError(errmsg.format(type(nucsmap).__name__))

    self._count_matrix = count_matrix
    self._width = width
    self._motif_id = motif_id
    self._motif_name = motif_name
    self._alphabet = alphabet
    self._nucsmap = nucsmap
def scale_pwm(motif_matrix: np.ndarray,
              alphabet: List[str],
              motif_width: int,
              nucsmap: dict,
              debug: bool) -> Tuple[np.ndarray, int, int, int, np.double]:
    """Scale the motif log-odds matrix scores to integer values.

    Each score ``s`` becomes ``round((s - offset) * scale_factor)`` where
    ``offset`` is the floor of the smallest score and ``scale_factor`` is
    ``floor(RANGE / (max - offset))``. Integer scores make scoring potential
    motif occurrences faster and allow constant time p-value estimation.

    ...

    Parameters
    ----------
    motif_matrix : numpy.ndarray
        motif log-odds matrix, indexed as ``[nucsmap[nucleotide], position]``
    alphabet: list
        DNA motif alphabet
    motif_width: int
        motif width
    nucsmap: dict
        nucleotide index map (nucleotide -> row index of motif_matrix)
    debug : bool
        trace the full error stack

    Returns
    -------
    numpy.ndarray
        scaled motif score matrix (integer dtype, same shape as the input)
    int
        minimum value of the scaled score matrix
    int
        maximum value of the scaled score matrix
    int
        scaling factor
    numpy.double
        scaling offset

    Notes
    -----
    Invalid arguments are reported through ``exception_handler`` with
    TypeError or ValueError.
    """
    if not isinstance(motif_matrix, np.ndarray):
        errmsg = "Expected numpy.ndarray, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_matrix).__name__), debug)
    # ndarray.sum() replaces the builtin sum(sum(...)), which iterated in
    # Python and only worked on 2-D arrays
    if motif_matrix.size == 0 or motif_matrix.sum() == 0:
        errmsg = "The motif log-odds matrix is empty.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(alphabet, list):
        errmsg = "Expected list, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(alphabet).__name__), debug)
    if not isListEqual(alphabet, DNA_ALPHABET):
        errmsg = "The motif is not built on DNA alphabet.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(motif_width, int):
        errmsg = "Expected int, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_width).__name__), debug)
    if motif_width <= 0:
        errmsg = "Forbidden motif width.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(nucsmap, dict):
        errmsg = "Expected dict, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(nucsmap).__name__), debug)

    lower = motif_matrix.min()
    upper = motif_matrix.max()
    if lower == upper:
        # all values are equal: widen the interval to avoid dividing by zero
        lower = np.double(upper - 1)
    lower = np.floor(lower)
    offset = np.round(lower)
    # values scaled in [0, RANGE]
    scale_factor = np.floor(RANGE / (upper - lower))

    # scale, in one vectorized step, exactly the cells addressed by the
    # alphabet rows and the first `motif_width` columns; any other cell
    # stays at zero
    motif_matrixsc = np.zeros(motif_matrix.shape, dtype=np.double)
    rows = [nucsmap[nuc] for nuc in alphabet]
    cells = np.ix_(rows, np.arange(motif_width))
    motif_matrixsc[cells] = np.round(
        (motif_matrix[cells] - offset) * scale_factor)

    # make sure the values are integers
    motif_matrixsc = motif_matrixsc.astype(int)
    min_val = int(motif_matrixsc.min())  # scaled min
    max_val = int(motif_matrixsc.max())  # scaled max

    return motif_matrixsc, min_val, max_val, int(scale_factor), offset
def scale_pwm(motif_matrix: pd.DataFrame,
              alphabet: List[str],
              motif_width: int) -> Tuple[pd.DataFrame, int, int, int, np.double]:
    """Scale the log-odds values of the motif scoring matrix.

    Each score ``s`` becomes ``round((s - offset) * scale_factor)`` where
    ``offset`` is the floor of the smallest score and ``scale_factor`` is
    ``floor(RANGE / (max - offset))``. Integer scores make the scoring of
    each motif occurrence candidate faster and allow a constant time
    computation of the corresponding P-value.

    Parameters
    ----------
    motif_matrix : pd.DataFrame
        motif log-odds matrix, addressed as ``.loc[nucleotide, position]``
    alphabet: list
        DNA motif alphabet
    motif_width: int
        motif width, must be positive

    Returns
    -------
    pd.DataFrame
        scaled motif scoring matrix (same index and columns as the input)
    int
        minimum value of the scaled scoring matrix
    int
        maximum value of the scaled scoring matrix
    int
        scaling factor
    numpy.double
        scaling offset

    Raises
    ------
    NoDataFrameException
        if motif_matrix is not a pandas.DataFrame
    NotValidMotifMatrixException
        if motif_matrix is empty
    NotValidAlphabetException
        if alphabet is not a list or is not the DNA alphabet
    """
    if not isinstance(motif_matrix, pd.DataFrame):
        errmsg = "\n\nERROR: The given motif matrix must be an instance of pandas.DataFrame"
        raise NoDataFrameException(errmsg)

    if motif_matrix.empty:
        errmsg = "\n\nERROR: The given motif matrix is empty"
        raise NotValidMotifMatrixException(errmsg)

    if not isinstance(alphabet, list):
        errmsg = "\n\nERROR: The alphabet given is not in a list"
        raise NotValidAlphabetException(errmsg)

    if not isListEqual(alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: The alphabet given is not a valid DNA alphabet"
        raise NotValidAlphabetException(errmsg)

    assert motif_width > 0

    # DataFrame.min()/max() reduce per column; min()/max() then reduce the
    # per-column results to a single value
    lower = min(motif_matrix.min())
    upper = max(motif_matrix.max())

    motif_matrix_sc = pd.DataFrame(index=list(motif_matrix.index),
                                   columns=list(motif_matrix.columns),
                                   data=0)

    if lower == upper:
        # all values are equal: widen the interval to avoid dividing by zero
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset = np.round(lower)
    # values will be in [0, RANGE]
    scale_factor = np.floor(RANGE / (upper - lower))

    for nuc in alphabet:
        for j in range(motif_width):
            scaled_score = np.round(
                (motif_matrix.loc[nuc, j] - (offset)) * scale_factor)
            motif_matrix_sc.loc[nuc, j] = scaled_score

    # make sure the values are integers
    motif_matrix_sc[:] = motif_matrix_sc[:].astype(int)

    # extremes of the scaled matrix
    min_val = min(motif_matrix_sc.min())
    max_val = max(motif_matrix_sc.max())

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset
def read_MEME_motif(motif_file: str,
                    bg_file: str,
                    pseudocount: float,
                    no_reverse: bool,
                    verbose: bool) -> List[Motif]:
    """Read the motif PWMs stored in a MEME format file.

    A MEME file can contain one or more motifs; a Motif object is built for
    each of them. Every probability matrix is normalized (``norm_motif``),
    gets the pseudocount applied (``apply_pseudocount_meme``) and is paired
    with the background distribution.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background probability distribution, or 'UNIF' to use
        a uniform background
    pseudocount : float
        pseudocount to add to the PWM values
    no_reverse : bool
        strand handling flag, forwarded to ``pseudo_bg``
    verbose : bool
        print the time spent reading each motif

    Returns
    -------
    List[Motif]
        List of Motif objects storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if the file cannot be opened or parsed
    NotValidBGException
        if bg_file is neither 'UNIF' nor an existing path
    """
    motifs_found = 0  # number of fully read motifs
    motifID_lst = []
    motifName_lst = []
    motif_width_lst = []
    site_counts_lst = []
    motif_probs_lst = []
    # per-motif probabilities of each nucleotide, one value per position
    a_lst = []
    c_lst = []
    g_lst = []
    t_lst = []

    try:
        # the `with` statement closes the file; no explicit close() needed
        with open(motif_file, 'r') as in_mtf:
            infostart = False  # True while waiting for the matrix header
            datastart = False  # True while reading the matrix rows
            motif_width = None
            pos_read = 0

            for line in in_mtf:
                if line[0:8] == 'ALPHABET':
                    # the symbols follow the 10 characters of "ALPHABET= "
                    alphabet = sorted(set(line[10:-1]))
                    assert isListEqual(alphabet, DNA_ALPHABET)

                if line[0:5] == 'MOTIF':
                    if verbose:
                        start_rm = time.time()

                    # there are two ways to define the motif name line
                    # (refer to http://meme-suite.org/doc/meme-format.html):
                    #   1 - MOTIF motif_alternate_name
                    #   2 - MOTIF motif_identifier motif_alternate_name
                    motif_header = line.split()
                    assert len(motif_header) > 0
                    if len(motif_header) == 2:  # case (1)
                        motifID = motifName = motif_header[1]
                    else:  # case (2)
                        motifID, motifName = motif_header[1:3]

                    motifID_lst.append(motifID)
                    motifName_lst.append(motifName)
                    infostart = True
                    continue

                if infostart and len(line.strip()) != 0:
                    # skip the 26 characters of "letter-probability matrix:"
                    # and read the "key= value" pairs that follow
                    infosplit = line[26:].split()
                    alphalen = int(infosplit[1])
                    assert alphalen == len(alphabet)
                    motif_width = int(infosplit[3])
                    site_counts = int(infosplit[5])
                    infostart = False

                    # allocate space for the motif probability matrix
                    motif_probs = pd.DataFrame(index=alphabet,
                                               columns=range(motif_width),
                                               data=np.double(0))
                    motif_width_lst.append(motif_width)
                    site_counts_lst.append(site_counts)
                    motif_probs_lst.append(motif_probs)
                    datastart = True  # matrix rows begin at the next line

                    a = []
                    c = []
                    g = []
                    t = []
                    continue

                if datastart and pos_read < motif_width:
                    freqs = line.split()
                    a.append(np.double(freqs[0]))
                    c.append(np.double(freqs[1]))
                    g.append(np.double(freqs[2]))
                    t.append(np.double(freqs[3]))
                    pos_read += 1

                # all the rows of the current motif have been read
                if pos_read == motif_width:
                    a_lst.append(a)
                    c_lst.append(c)
                    g_lst.append(g)
                    t_lst.append(t)
                    motifs_found += 1

                    # reset the parser state for the next motif
                    pos_read = 0
                    motif_width = None
                    datastart = False

                    if verbose:
                        end_rm = time.time()
                        msg = ''.join([
                            "Read motif ", motifID, " in ",
                            str(end_rm - start_rm), "s"
                        ])
                        print(msg)
    except Exception as exc:
        # keep the original failure attached as the cause; KeyboardInterrupt
        # and SystemExit are deliberately left alone
        errmsg = ' '.join(["Unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    # read the background
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        errmsg = "\n\nERROR: unable to find the given background file"
        raise NotValidBGException(errmsg)

    bgs = pseudo_bg(bgs, no_reverse)

    motif_lst = []
    for i in range(motifs_found):
        mp = motif_probs_lst[i]
        mp.loc['A'] = a_lst[i]
        mp.loc['C'] = c_lst[i]
        mp.loc['G'] = g_lst[i]
        mp.loc['T'] = t_lst[i]

        mw = motif_width_lst[i]
        sc = site_counts_lst[i]

        mp = norm_motif(mp, mw, alphabet)
        mp = apply_pseudocount_meme(mp, pseudocount, sc, mw, bgs, alphabet)

        motif = Motif(mp, mw, alphabet, motifID_lst[i], motifName_lst[i])
        motif.setBg(bgs)
        motif_lst.append(motif)

    return motif_lst
def scale_pwm(motif_matrix, alphabet, motif_width):
    """
        Scale the motif matrix values to integers: each value v becomes
        round((v - offset) * scale_factor)
        ----
        Parameters:
            motif_matrix (pd.DataFrame) : motif matrix, addressed as
                                          .loc[nucleotide, position]
            alphabet (list) : motif alphabet, must be the DNA alphabet
            motif_width (int) : motif width, must be positive
        ----
        Returns:
            motif_matrix_sc (pd.DataFrame) : scaled motif matrix
            min_val (int) : lowest value in the scaled motif matrix
            max_val (int) : highest value in the scaled motif matrix
            scale_factor (int) : factor used to scale the values
            offset (float) : value subtracted before scaling (floor of the
                             lowest value of the input matrix)
        ----
        Raises:
            NoDataFrameException : motif_matrix is not a pandas.DataFrame
            NotValidMotifMatrixException : motif_matrix is empty
            NotValidAlphabetException : alphabet is not a list or is not the
                                        DNA alphabet
    """

    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "The given motif matrix must be an instance of pandas.DataFrame")

    if motif_matrix.empty:
        raise NotValidMotifMatrixException("The given motif matrix is empty")

    if not isinstance(alphabet, list):
        raise NotValidAlphabetException("The alphabet given is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "The alphabet given is not a valid DNA alphabet")

    assert motif_width > 0

    # DataFrame.min()/max() reduce per column; min()/max() then reduce the
    # per-column results to a single value
    min_val = min(motif_matrix.min())
    max_val = max(motif_matrix.max())
    motif_matrix_sc = pd.DataFrame(index=list(motif_matrix.index),
                                   columns=list(motif_matrix.columns),
                                   data=0)

    lower = min_val
    upper = max_val

    if lower == upper:
        # all values are equal: widen the interval to avoid dividing by zero
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset = np.round(np.floor(lower))
    # values will be in [0, RANGE]
    scale_factor = np.floor(RANGE / (upper - lower))

    for nuc in alphabet:
        for j in range(motif_width):
            scaled_score = np.round(
                (motif_matrix.loc[nuc, j] - (offset)) * scale_factor)
            motif_matrix_sc.loc[nuc, j] = scaled_score

    # make sure the values are integers
    motif_matrix_sc[:] = motif_matrix_sc[:].astype(int)

    # extremes of the scaled matrix
    min_val = min(motif_matrix_sc.min())
    max_val = max(motif_matrix_sc.max())

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset
def read_MEME_motif(motif_file, bg_file, pseudocount, no_reverse, verbose):
    """
        Read the motif file in MEME format and build a motif object for
        each motif found in it.
        Note that a MEME file can contain a variable number of motifs
        ----
        Params:
            motif_file (str) : path to the motif file
            bg_file (str) : path to the background file, or 'UNIF' to use
                            a uniform background
            pseudocount (np.double) : pseudocount to add to motif frequencies
            no_reverse (bool) : strand handling flag, forwarded to pseudo_bg
            verbose (bool) : print the time spent reading each motif
        ----
        Returns:
            motif_lst (list) : list of Motif objects, one per motif found
        ----
        Raises:
            FileReadingException : the file cannot be opened or parsed
            NotValidBGException : bg_file is neither 'UNIF' nor an
                                  existing path
    """

    motifs_found = 0  # number of fully read motifs
    motifID_lst = []  # list of the found motif IDs
    motifName_lst = []  # list of the found motif names
    motif_width_lst = []  # list of the found motif widths
    site_counts_lst = []  # list of the found motif site counts
    motif_probs_lst = []  # list of the found motif probability matrices
    a_lst = []  # list of the found As probabilities for each motif
    c_lst = []  # list of the found Cs probabilities for each motif
    g_lst = []  # list of the found Gs probabilities for each motif
    t_lst = []  # list of the found Ts probabilities for each motif

    try:
        # the `with` statement closes the file; no explicit close() needed
        with open(motif_file, 'r') as in_mtf:
            infostart = False  # True while waiting for the matrix header
            datastart = False  # True while reading the matrix rows
            motif_width = None
            pos_read = 0

            for line in in_mtf:
                if line[0:8] == 'ALPHABET':
                    # the symbols follow the 10 characters of "ALPHABET= "
                    alphabet = sorted(set(line[10:-1]))
                    assert isListEqual(alphabet, DNA_ALPHABET)

                if line[0:5] == 'MOTIF':
                    if verbose:
                        start_rm = time.time()

                    # the MEME format allows two motif name lines
                    # (refer to http://meme-suite.org/doc/meme-format.html):
                    #   1 - MOTIF motif_alternate_name
                    #   2 - MOTIF motif_identifier motif_alternate_name
                    motif_header = line.split()
                    if len(motif_header) == 2:  # case (1)
                        motifID = motifName = motif_header[1]
                    else:  # case (2)
                        motifID, motifName = motif_header[1:3]

                    motifID_lst.append(motifID)
                    motifName_lst.append(motifName)
                    infostart = True
                    continue

                if infostart and len(line.strip()) != 0:
                    # skip the 26 characters of "letter-probability matrix:"
                    # and read the "key= value" pairs that follow
                    infosplit = line[26:].split()
                    alphalen = int(infosplit[1])
                    assert alphalen == len(alphabet)
                    motif_width = int(infosplit[3])
                    site_counts = int(infosplit[5])
                    infostart = False

                    # allocate space for the motif probability matrix
                    motif_probs = pd.DataFrame(index=alphabet,
                                               columns=range(motif_width),
                                               data=np.double(0))
                    motif_width_lst.append(motif_width)
                    site_counts_lst.append(site_counts)
                    motif_probs_lst.append(motif_probs)
                    datastart = True  # matrix rows begin at the next line

                    a = []
                    c = []
                    g = []
                    t = []
                    continue

                if datastart and pos_read < motif_width:
                    freqs = line.split()
                    a.append(np.double(freqs[0]))
                    c.append(np.double(freqs[1]))
                    g.append(np.double(freqs[2]))
                    t.append(np.double(freqs[3]))
                    pos_read += 1

                # all the rows of the current motif have been read
                if pos_read == motif_width:
                    a_lst.append(a)
                    c_lst.append(c)
                    g_lst.append(g)
                    t_lst.append(t)
                    motifs_found += 1

                    # reset the parser state for the next motif
                    pos_read = 0
                    motif_width = None
                    datastart = False

                    if verbose:
                        end_rm = time.time()
                        msg = ''.join([
                            "Read motif ", motifID, " in ",
                            str(end_rm - start_rm), "s"
                        ])
                        print(msg)
    except Exception as exc:
        # keep the original failure attached as the cause; KeyboardInterrupt
        # and SystemExit are deliberately left alone
        errmsg = ' '.join(["Unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    # read the background
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        raise NotValidBGException(
            "\n\nERROR: unable to find the given background file")

    bgs = pseudo_bg(bgs, no_reverse)

    motif_lst = []  # list of found motifs
    for i in range(motifs_found):
        mp = motif_probs_lst[i]
        mp.loc['A'] = a_lst[i]
        mp.loc['C'] = c_lst[i]
        mp.loc['G'] = g_lst[i]
        mp.loc['T'] = t_lst[i]

        mw = motif_width_lst[i]
        sc = site_counts_lst[i]

        mp = norm_motif(mp, mw, alphabet)
        mp = apply_pseudocount_meme(mp, pseudocount, sc, mw, bgs, alphabet)

        motif = Motif(mp, mw, alphabet, motifID_lst[i], motifName_lst[i])
        motif.setBg(bgs)
        motif_lst.append(motif)

    return motif_lst