def plot_motif(motif_name, figsize, ylab='bits', information_content=True):
    """
    Plot a single named motif from the loaded motif collection.

    Parameters
    ----------
    motif_name : name looked up through ``loaded_motifs.getPwm``
    figsize : forwarded unchanged to ``plot_pwm``
    ylab : string, optional
        Forwarded to ``plot_pwm`` as ``ylab``.
    information_content : boolean, optional
        Forwarded to ``plot_pwm`` as ``information_content``.

    Returns
    -------
    Whatever ``plot_pwm`` returns for the motif's rows.
    """
    motif = loaded_motifs.getPwm(motif_name)
    letter_heights = motif.getRows()
    return plot_pwm(
        letter_heights,
        figsize,
        ylab=ylab,
        information_content=information_content)
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
    """
    Score sequences against named motifs as log odds over a GC background.

    For every motif the score is ``get_pssm_scores`` of the log motif matrix
    minus ``get_pssm_scores`` of a log background matrix of the same width.

    Parameters
    ----------
    encoded_sequences : 4darray
        Its last two axes are swapped before scoring; after the swap the
        first axis is samples and the last axis is sequence length.
    motif_names : list of strings
        Names looked up through ``loaded_motifs.getPwm``.
    max_scores : int, optional
        If given, keep only this many highest scores per sample and motif.
    return_positions : boolean, optional
        Only used with ``max_scores``; also return where the kept scores
        occurred.
    GC_fraction : float, optional
        Background rows 2 and 3 get ``GC_fraction / 2`` and rows 1 and 4 get
        ``(1 - GC_fraction) / 2`` (presumably A, C, G, T row order - confirm
        against the motif loader).

    Returns
    -------
    (num_samples, num_motifs, seq_length) complete score array by default.
    If max_scores, (num_samples, num_motifs*max_scores) array holding the
    highest scores in descending order for each motif.
    If max_scores and return_positions, (num_samples, 2*num_motifs*max_scores)
    array: the max scores followed by their positions along the last axis.
    """
    encoded_sequences = np.transpose(encoded_sequences, (0, 1, 3, 2))
    num_samples, _, _, seq_length = encoded_sequences.shape
    scores = np.ones((num_samples, len(motif_names), seq_length))

    for motif_index, name in enumerate(motif_names):
        motif_matrix = loaded_motifs.getPwm(name).getRows().T
        motif_log = np.log(motif_matrix)
        # One background row per motif position, transposed to match the
        # layout of the motif matrix.
        background_rows = [[1 - GC_fraction, GC_fraction,
                            GC_fraction, 1 - GC_fraction]] * len(motif_matrix[0])
        background_log = np.log(0.5 * np.array(background_rows).T)
        motif_part = get_pssm_scores(encoded_sequences, motif_log)
        background_part = get_pssm_scores(encoded_sequences, background_log)
        scores[:, motif_index, :] = motif_part - background_part

    if max_scores is None:
        return scores

    # Sort ascending along the last axis, flip to descending, keep the top.
    top_scores = np.sort(scores)[:, :, ::-1][:, :, :max_scores]
    if not return_positions:
        return top_scores.reshape(
            (num_samples, len(motif_names) * max_scores))

    top_positions = scores.argsort()[:, :, ::-1][:, :, :max_scores]
    flat_scores = top_scores.reshape(
        (num_samples, len(motif_names) * max_scores))
    flat_positions = top_positions.reshape(
        (num_samples, len(motif_names) * max_scores))
    return np.concatenate((flat_scores, flat_positions), axis=1)
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
    """
    Compute motif log-odds scores against a GC-content background.

    Each motif's score is the ``get_pssm_scores`` result for its log matrix
    minus the ``get_pssm_scores`` result for a log background matrix with the
    same number of positions.

    Parameters
    ----------
    encoded_sequences : 4darray
        Used as given; the first axis is read as samples and the last axis
        as sequence length.
    motif_names : list of strings
        Names looked up through ``loaded_motifs.getPwm``.
    max_scores : int, optional
        Number of highest scores to keep per sample and motif.
    return_positions : boolean, optional
        With ``max_scores``, also return the positions of the kept scores.
    GC_fraction : float, optional
        Sets the background: ``GC_fraction / 2`` for rows 2 and 3,
        ``(1 - GC_fraction) / 2`` for rows 1 and 4 (presumably A, C, G, T
        row order - confirm against the motif loader).

    Returns
    -------
    (num_samples, num_motifs, seq_length) complete score array by default.
    If max_scores, (num_samples, num_motifs*max_scores) array of the highest
    scores, descending within each motif.
    If max_scores and return_positions, (num_samples, 2*num_motifs*max_scores)
    array with the max scores first and their positions after.
    """
    num_samples, _, _, seq_length = encoded_sequences.shape
    scores = np.ones((num_samples, len(motif_names), seq_length))

    for slot, name in enumerate(motif_names):
        weights = loaded_motifs.getPwm(name).getRows().T
        log_weights = np.log(weights)
        # Repeat the single background row once per motif position, then
        # transpose so it is laid out like the motif matrix.
        per_position = [[1 - GC_fraction, GC_fraction,
                         GC_fraction, 1 - GC_fraction]] * len(weights[0])
        log_background = np.log(0.5 * np.array(per_position).T)
        observed = get_pssm_scores(encoded_sequences, log_weights)
        expected = get_pssm_scores(encoded_sequences, log_background)
        scores[:, slot, :] = observed - expected

    if max_scores is None:
        return scores

    # Ascending sort flipped to descending, truncated to the best entries.
    best = np.sort(scores)[:, :, ::-1][:, :, :max_scores]
    if return_positions:
        where = scores.argsort()[:, :, ::-1][:, :, :max_scores]
        pieces = (
            best.reshape((num_samples, len(motif_names) * max_scores)),
            where.reshape((num_samples, len(motif_names) * max_scores)),
        )
        return np.concatenate(pieces, axis=1)
    return best.reshape((num_samples, len(motif_names) * max_scores))
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4,
                     pfm=None,
                     log_pfm=None,
                     include_rc=True):
    """
    Computes pfm log odds.

    Each motif's score is ``get_pssm_scores`` of its log matrix minus
    ``get_pssm_scores`` of a log GC-content background matrix with the same
    number of positions.

    Parameters
    ----------
    encoded_sequences : 4darray
        Its last two axes are swapped before scoring; after the swap the
        first axis is samples and the last axis is sequence length.
    motif_names : list of strings
        One output slot is produced per name. When neither ``pfm`` nor
        ``log_pfm`` is given, each name is looked up through
        ``loaded_motifs.getPwm``.
    max_scores : int, optional
        Number of highest scores to keep per sample and motif.
    return_positions : boolean, optional
        With ``max_scores``, also return the positions of the kept scores.
    GC_fraction : float, optional
        Sets the background: ``GC_fraction / 2`` for rows 2 and 3,
        ``(1 - GC_fraction) / 2`` for rows 1 and 4 (presumably A, C, G, T
        row order - confirm against the motif loader).
    pfm : matrix for the motif, optional
        Laid out like ``loaded_motifs.getPwm(name).getRows().T``; its
        elementwise log is scored. Used for every entry of ``motif_names``.
    log_pfm : log(pfm), optional
        This is the format that HOCOMOCO provides in their PFM download
        links. Used for every entry of ``motif_names`` and takes precedence
        over ``pfm`` when both are given.
    include_rc : boolean, optional
        Forwarded to ``get_pssm_scores``; indicates whether both the forward
        strand and the reverse complement of the motif should be used
        (default True).

    Returns
    -------
    (num_samples, num_motifs, seq_length) complete score array by default.
    If max_scores, (num_samples, num_motifs*max_scores) max score array.
    If max_scores and return_positions, (num_samples, 2*num_motifs*max_scores)
    array with max scores and their positions.
    """
    encoded_sequences = np.transpose(encoded_sequences, (0, 1, 3, 2))
    num_samples, _, _, seq_length = encoded_sequences.shape
    num_motifs = len(motif_names)
    scores = np.ones((num_samples, num_motifs, seq_length))

    # Resolve a caller-supplied matrix once. The parameters themselves are
    # never reassigned: overwriting them inside the loop made every motif
    # after the first reuse the first motif's matrix.
    if log_pfm is not None:
        supplied_log_pfm = log_pfm
    elif pfm is not None:
        supplied_log_pfm = np.log(pfm)
    else:
        supplied_log_pfm = None

    for j, motif_name in enumerate(motif_names):
        if supplied_log_pfm is None:
            motif_log_pfm = np.log(
                loaded_motifs.getPwm(motif_name).getRows().T)
        else:
            motif_log_pfm = supplied_log_pfm

        # Background matrix derived from the GC fraction: one row per motif
        # position, transposed to match the motif matrix layout.
        background_pfm = 0.5 * np.array(
            [[1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction]] *
            len(motif_log_pfm[0])).T
        background_log_pfm = np.log(background_pfm)

        scores[:, j, :] = get_pssm_scores(
            encoded_sequences, motif_log_pfm,
            include_rc=include_rc) - get_pssm_scores(
                encoded_sequences, background_log_pfm, include_rc=include_rc)

    if max_scores is None:
        return scores

    flat_shape = (num_samples, num_motifs * max_scores)
    # Ascending sort flipped to descending, truncated to the best entries.
    sorted_scores = np.sort(scores)[:, :, ::-1][:, :, :max_scores]
    if not return_positions:
        return sorted_scores.reshape(flat_shape)

    sorted_positions = scores.argsort()[:, :, ::-1][:, :, :max_scores]
    return np.concatenate(
        (sorted_scores.reshape(flat_shape),
         sorted_positions.reshape(flat_shape)),
        axis=1)