def pseudo_bg(bgs, no_reverse):
    """
    Process the background frequencies: optionally pass them through
    average_bg_with_rc, then through norm_bg
    ----
    Parameters:
        bgs (dict) : dictionary of the background frequencies
        no_reverse (bool) : if False, the background is first handed to
                            average_bg_with_rc (reverse complement
                            averaging); if True, the background is
                            used as given
    ----
    Returns:
        bgs_proc (dict) : value returned by norm_bg for the (possibly
                          averaged) background
    ----
    Raises:
        NotValidBGException : if bgs is not a dict
        ValueException : if no_reverse is not a bool
    """

    if not isinstance(bgs, dict):
        errmsg = "\n\nERROR: unable to add the pseudocount to the background"
        raise NotValidBGException(errmsg)

    if not isinstance(no_reverse, bool):
        errmsg = ' '.join(
            ["Boolean value required, got", str(type(no_reverse))])
        raise ValueException(errmsg)

    # NOTE(review): the pseudocount itself is presumably applied inside
    # norm_bg -- nothing in this body adds one; confirm against norm_bg
    background = bgs if no_reverse else average_bg_with_rc(bgs)

    return norm_bg(background)
def print_scoring_msg(no_reverse, motif):
    """Report on the terminal which motif strands are about to be scored.

    The forward motif ('+' followed by the motif ID) is always
    announced. The reverse complement ('-' followed by the motif ID) is
    announced only when `no_reverse` is falsy, and that second message
    is followed by an empty line.

    Parameters
    ----------
    no_reverse : bool
        if True only the forward DNA strand is announced
    motif : Motif
        Motif object whose ID is read through getMotifID()

    Raises
    ------
    ValueException
        if `motif` is not a Motif instance
    """

    if not isinstance(motif, Motif):
        errmsg: str = '\n\nERROR: The given motif is not an instance of Motif'
        raise ValueException(errmsg)

    motif_id: str = motif.getMotifID()

    # build both labels up front so nothing is printed if the ID cannot
    # be concatenated
    forward_label: str = '+' + motif_id
    reverse_label = None if no_reverse else '-' + motif_id

    print('\nScoring hits for motif', forward_label)

    if reverse_label is None:
        return  # forward strand only

    print('Scoring hits for motif', reverse_label, end="\n\n")
def compute_qvalues(pvalues: List[np.double]) -> List[np.double]:
    """Turn a list of P-values into q-values.

    The correction is delegated to multipletests with the "fdr_bh"
    (Benjamini-Hochberg) method. A progress message is printed before
    the computation starts.

    Parameters
    ----------
    pvalues : list
        list of P-values

    Returns
    -------
    list
        list of q-values, in the same order as the input P-values

    Raises
    ------
    ValueException
        if `pvalues` is not a list
    """

    if isinstance(pvalues, list):
        print("\nComputing q-values...\n")

        # the second item of the result tuple is taken as the q-values
        # (corrected P-values)
        correction: Tuple[np.ndarray, np.ndarray, np.double, float]
        correction = multipletests(pvalues, method="fdr_bh")

        return list(correction[1])

    raise ValueException("\n\nERROR: P-values must be in a list")
def setScale(self, scale: int) -> None:
    """Store the scale factor in the `_scale` attribute.

    Parameters
    ----------
    scale : int
        scale factor; must be strictly positive

    Raises
    ------
    ValueException
        if `scale` is not an int
    AssertionError
        if `scale` is not greater than zero
    """

    if isinstance(scale, int):
        assert scale > 0
        self._scale = scale
        return

    raise ValueException("\n\nERROR: the scale factor must be an int")
def setMax_val(self, max_val: int) -> None:
    """Store the maximum value in the `_max_val` attribute.

    Parameters
    ----------
    max_val : int
        value to store; must be strictly lower than positive infinity

    Raises
    ------
    ValueException
        if `max_val` is greater than or equal to `np.inf`
    """

    if max_val >= np.inf:
        # str.join accepts only strings: convert the offending value
        # explicitly, otherwise a TypeError would mask the intended
        # ValueException
        errmsg = ' '.join([
            "\n\nERROR: impossible to assign",
            str(max_val),
            "to Motif.max_val"
        ])
        raise ValueException(errmsg)

    self._max_val = max_val
def setMin_val(self, min_val):
    """Store the minimum value in the `_min_val` attribute.

    Parameters
    ----------
    min_val : int
        value to store; must be strictly greater than negative infinity

    Raises
    ------
    ValueException
        if `min_val` is lower than or equal to `-np.inf`
    """

    if min_val <= -np.inf:
        # str.join accepts only strings: convert the offending value
        # explicitly, otherwise a TypeError would mask the intended
        # ValueException
        errmsg = ' '.join([
            "\n\nERROR: impossible to assign",
            str(min_val),
            "to Motif.min_val"
        ])
        raise ValueException(errmsg)

    self._min_val = min_val
def pseudo_bg(bgs: Dict, no_reverse: bool) -> Dict:
    """Prepare the background probability distribution used to build
    the motif scoring matrix.

    When both strands are considered (`no_reverse` is False) the
    background is first passed to average_bg_with_rc. The resulting
    background is then passed to norm_bg and its output is returned.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    no_reverse : bool
        if True only the forward strand is considered and the
        background is used as given; if False the background is
        averaged with its reverse complement first

    Returns
    -------
    dict
        background distribution returned by norm_bg

    Raises
    ------
    NotValidBGException
        if `bgs` is not a dict
    ValueException
        if `no_reverse` is not a bool
    """

    if not isinstance(bgs, dict):
        raise NotValidBGException(
            "\n\nERROR: unable to add the pseudocount to the background")

    if not isinstance(no_reverse, bool):
        got: str = str(type(no_reverse))
        raise ValueException(' '.join(["Boolean value required, got", got]))

    # NOTE(review): the pseudocount itself is presumably applied inside
    # norm_bg -- nothing in this body adds one; confirm against norm_bg
    background: Dict = bgs if no_reverse else average_bg_with_rc(bgs)

    return norm_bg(background)
def print_scoring_msg(no_reverse, motif):
    """Report on the terminal which motif strands are about to be scored.

    The forward motif ('+' followed by the motif ID) is always
    announced; the reverse complement ('-' followed by the motif ID) is
    announced only when `no_reverse` is falsy. The messages are
    surrounded by empty lines.

    Parameters
    ----------
    no_reverse : bool
        if True only the forward DNA strand is announced
    motif : Motif
        Motif object whose ID is read through getMotifID()

    Raises
    ------
    ValueException
        if `motif` is not a Motif instance
    """

    if not isinstance(motif, Motif):
        errmsg = '\n\nERROR: The given motif is not an instance of Motif'
        raise ValueException(errmsg)

    motif_id = motif.getMotifID()

    # build both labels up front so nothing is printed if the ID cannot
    # be concatenated
    forward_label = '+' + motif_id
    reverse_label = None if no_reverse else '-' + motif_id

    print()  # leading empty line
    print('Scoring hits for motif', forward_label)

    if reverse_label is not None:
        print('Scoring hits for motif', reverse_label)

    # NOTE(review): the closing empty line is assumed to be printed for
    # both the forward-only and the two-strand case -- confirm
    print()  # trailing empty line
def compute_qvalues(pvalues):
    """
    Turn a list of P-values into q-values; the correction is delegated
    to multipletests with the "fdr_bh" (Benjamini-Hochberg) method
    ----
    Parameters:
        pvalues (list) : list of P-values
    ----
    Returns:
        qvalues (list) : list of computed q-values, in the same order
                         as the input P-values
    ----
    Raises:
        ValueException : if pvalues is not a list
    """

    if isinstance(pvalues, list):
        print("\nComputing q-values...\n")

        # the second item of the result tuple is taken as the q-values
        # (corrected P-values)
        return list(multipletests(pvalues, method="fdr_bh")[1])

    raise ValueException("\n\nERROR: P-values must be in a list")
def build_df(
        motif: Motif,
        seqnames: List[str],
        starts: List[int],
        stops: List[int],
        strands: List[str],
        scores: List[np.double],
        pvalues: List[np.double],
        qvalues: List[np.double],
        sequences: List[str],
        frequencies: List[int],
        references: List[str],
        threshold: float,
        qval_t: bool,
        no_qvalue: bool,
        recomb: bool
) -> pd.DataFrame:
    """Assemble the final results report as a pandas DataFrame.

    The input lists are parallel: position i of each list describes the
    same motif occurrence candidate. A candidate is reported only if
    its P-value (or its q-value, when `qval_t` is True) is strictly
    lower than `threshold`. Unless `recomb` is True, candidates whose
    frequency converts to the integer 0 are discarded before the
    threshold is applied. The report is sorted by increasing P-value
    and indexed from 1.

    Parameters
    ----------
    motif : Motif
        Motif object; its ID and name fill the first two columns
    seqnames : list
        sequence names
    starts : list
        starting coordinates
    stops : list
        stopping coordinates
    strands : list
        DNA strands
    scores : list
        scores
    pvalues : list
        P-values
    qvalues : list
        q-values
    sequences : list
        sequences
    frequencies : list
        values reported in the 'haplotype_frequency' column
    references : list
        flag values stating if the sequences contain genomic variants
    threshold : float
        threshold to apply on P-values or q-values in order to filter
        the motif occurrence candidates to report
    qval_t : bool
        if True the threshold is applied on q-values rather than on
        P-values
    no_qvalue : bool
        if True the q-values have not been computed and the 'q-value'
        column is omitted
    recomb : bool
        if True candidates with a zero frequency are reported too
        (sequences which can be built with the given genomic variants
        but do not appear in the available samples haplotypes)

    Returns
    -------
    pandas.DataFrame
        final results report

    Raises
    ------
    ValueException
        if `motif` is not a Motif, if any of the type-checked list
        arguments is not a list, or if any flag is not a bool
    AssertionError
        if the lists do not share the same length, or if the threshold
        is requested on q-values that have not been computed
    """

    errmsg: str = "\n\nERROR: unknown data-type for motif"

    # scores and frequencies are deliberately absent: they are not
    # type-checked, only their lengths are verified below
    list_args = (
        seqnames, starts, stops, strands, pvalues, qvalues, sequences,
        references
    )
    flag_args = (qval_t, no_qvalue, recomb)
    well_typed: bool = (
        isinstance(motif, Motif)
        and all(isinstance(arg, list) for arg in list_args)
        and all(isinstance(arg, bool) for arg in flag_args)
    )
    if not well_typed:
        raise ValueException(errmsg)

    # every column must describe the same number of candidates
    lst_len: int = len(seqnames)
    for column in (starts, stops, strands, scores, pvalues, sequences,
                   frequencies, references):
        assert len(column) == lst_len

    # check if we want also the q-values
    if not no_qvalue:
        assert len(qvalues) == lst_len

    # thresholding on q-values requires them to have been computed
    if qval_t:
        assert (not no_qvalue)
        assert len(qvalues) > 0

    # values compared against the threshold
    tested = qvalues if qval_t else pvalues

    # indexes of the surviving candidates; candidates not appearing in
    # any sample are ignored unless the user asked for them (recomb)
    kept: List[int] = [
        i for i in range(lst_len)
        if (recomb or int(frequencies[i]) != 0) and tested[i] < threshold
    ]

    def pick(column: list) -> list:
        # restrict a column to the surviving candidates
        return [column[i] for i in kept]

    df_len: int = len(kept)

    # columns in their final order; TF's ID and name are repeated on
    # every row
    columns = [
        ('motif_id', [motif.getMotifID()] * df_len),
        ('motif_alt_id', [motif.getMotifName()] * df_len),
        ('sequence_name', pick(seqnames)),
        ('start', pick(starts)),
        ('stop', pick(stops)),
        ('strand', pick(strands)),
        ('score', pick(scores)),
        ('p-value', pick(pvalues)),
    ]

    # add the q-values only if they have been computed
    if not no_qvalue:
        columns.append(('q-value', pick(qvalues)))

    columns.extend([
        ('matched_sequence', pick(sequences)),
        ('haplotype_frequency', pick(frequencies)),
        ('reference', pick(references)),
    ])

    df = pd.DataFrame()
    for label, values in columns:
        df[label] = values

    # sort entries by p-value
    df = df.sort_values(['p-value'], ascending=True)

    # reindex the data frame so that row labels run from 1 to df_len
    df.index = list(range(1, (df_len + 1)))

    return df
def get_references(self):
    """Return the content of the `_references` attribute.

    Raises
    ------
    ValueException
        if `_references` is empty (or otherwise falsy)
    """

    references = self._references

    if references:
        return references

    errmsg: str = "\n\nERROR: attempting to access an empty attribute"
    raise ValueException(errmsg)
def build_df(motif, seqnames, starts, stops, strands, scores, pvalues,
             qvalues, sequences, references, threshold, qval_t, no_qvalue):
    """
    Build a pandas DataFrame to summarize the results of GRAFIMO
    analysis. The input lists are parallel: position i of each list
    describes the same sequence. A sequence is reported only if its
    P-value (or its q-value, when qval_t is True) is strictly lower
    than the threshold. The rows are sorted by increasing P-value and
    indexed from 1
    ----
    Parameters:
        motif (Motif) : motif
        seqnames (list) : list of sequence names
        starts (list) : list of sequence starting positions
        stops (list) : list of sequence ending positions
        strands (list) : list of sequence strands
        scores (list) : list of sequence scores
        pvalues (list) : list of sequence score P-values
        qvalues (list) : list of sequence q-values
        sequences (list) : list of sequences
        references (list) : list of sequence flag values. If 'ref',
                            then the sequence belongs to the reference
                            genome, if 'non.ref', then the sequence
                            contains variants
        threshold (float) : threshold to apply on the P-values (default
                            behavior) or on the q-values
        qval_t (bool) : if set to True, the threshold will be applied
                        on the q-values, on the P-values otherwise
        no_qvalue (bool) : if set to True, the q-values have not been
                           computed and the 'q-value' column is omitted
    ----
    Returns:
        df (pd.DataFrame)
    ----
    Raises:
        ValueException : if motif is not a Motif, if any of the
                         type-checked list arguments is not a list, or
                         if any flag is not a bool
        AssertionError : if the lists do not share the same length, or
                         if the threshold is requested on q-values that
                         have not been computed
    """

    if not isinstance(motif, Motif):
        raise ValueException("\n\nERROR: unknown data-type for motif")

    # scores is deliberately absent: it is not type-checked, only its
    # length is verified below
    list_args = (seqnames, starts, stops, strands, pvalues, qvalues,
                 sequences, references)
    flag_args = (qval_t, no_qvalue)
    well_typed = (
        all(isinstance(arg, list) for arg in list_args)
        and all(isinstance(arg, bool) for arg in flag_args)
    )
    if not well_typed:
        raise ValueException("\n\nERROR: unknown data-type, cannot proceed")

    # all lists must have the same length
    lst_len = len(seqnames)
    for column in (starts, stops, strands, scores, pvalues, sequences,
                   references):
        assert len(column) == lst_len

    # check if we want also the q-values
    if not no_qvalue:
        assert len(qvalues) == lst_len

    # thresholding on q-values requires them to have been computed
    if qval_t:
        assert (not no_qvalue)
        assert len(qvalues) > 0

    # values compared against the threshold
    tested = qvalues if qval_t else pvalues

    # indexes of the sequences surviving the threshold
    kept = [i for i in range(lst_len) if tested[i] < threshold]

    def pick(column):
        # restrict a column to the surviving sequences
        return [column[i] for i in kept]

    df_len = len(kept)

    # final data frame structure:
    #
    # |motif_id|motif_alt_id|sequence_name|start|stop|strand|score|
    # |p-value|q-value|matched_sequence|reference|
    #
    # TF's ID and name are repeated on every row
    columns = [
        ('motif_id', [motif.getMotifID()] * df_len),
        ('motif_alt_id', [motif.getMotifName()] * df_len),
        ('sequence_name', pick(seqnames)),
        ('start', pick(starts)),
        ('stop', pick(stops)),
        ('strand', pick(strands)),
        ('score', pick(scores)),
        ('p-value', pick(pvalues)),
    ]

    # add the q-values only if they have been computed
    if not no_qvalue:
        columns.append(('q-value', pick(qvalues)))

    columns.extend([
        ('matched_sequence', pick(sequences)),
        ('reference', pick(references)),
    ])

    df = pd.DataFrame()
    for label, values in columns:
        df[label] = values

    # sort entries by p-value
    df = df.sort_values(['p-value'], ascending=True)

    # reindex the data frame so that row labels run from 1 to df_len
    df.index = list(range(1, (df_len + 1)))

    return df
def get_strands(self):
    """Return the content of the `_strands` attribute.

    Raises
    ------
    ValueException
        if `_strands` is empty (or otherwise falsy)
    """

    strands = self._strands

    if strands:
        return strands

    errmsg = "\n\nERROR: attempting to access an empty attribute"
    raise ValueException(errmsg)