def list_from_file(
    cls,
    motifs_file,
    file_format,
    threshold=None,
    pseudocounts="jaspar",
    relative_threshold=None,
):
    """Return a list of PSSM patterns from a file in JASPAR, MEME, etc.

    Parameters
    ----------

    motifs_file
      Path to a motifs file, or file handle.

    file_format
      File format. one of "jaspar", "meme", "TRANSFAC".

    pseudocounts
      Either a dict {"A": 0.01, "T": ...} or "jaspar" for automatic
      pseudocounts from the Biopython.motifs.jaspar module (recommended),
      or None for no pseudocounts at all (not recommended!)

    threshold
      Locations of the sequence with a PSSM score above this value will be
      considered matches. For convenience, a relative_threshold can be
      given instead.

    relative_threshold
      Value between 0 and 1 from which the threshold will be auto-computed.
      0 means "match everything", 1 means "only match the one (or several)
      sequence(s) with the absolute highest possible score".

    Returns
    -------

    list
      One ``MotifPssmPattern`` per motif parsed from the file.
    """
    if isinstance(motifs_file, str):
        # Open the path the caller supplied. The previous implementation
        # always opened a hard-coded "./jaspar.txt" and ignored the argument.
        with open(motifs_file, "r") as f:
            motifs_list = motifs.parse(f, format=file_format)
    else:
        # Already a file handle: the caller owns it and closes it.
        motifs_list = motifs.parse(motifs_file, format=file_format)
    if pseudocounts is not None:
        for motif in motifs_list:
            cls.apply_pseudocounts(motif, pseudocounts)
    return [
        MotifPssmPattern(
            pssm,
            threshold=threshold,
            relative_threshold=relative_threshold,
        )
        for pssm in motifs_list
    ]
def process_data(data, data_type='counts', seq_type='dna'):
    """Convert motif input into a formatted frequency and information matrix.

    Parameters
    ----------
    data
        For ``'counts'``, whatever ``count_to_pfm`` accepts. For every other
        ``data_type``, the input handed to ``read_alignment`` or the path of
        the motif file to open.
    data_type
        One of ``'counts'``, ``'fasta'``, ``'stockholm'``, ``'alignace'``,
        ``'meme'``, ``'mast'``, ``'transfac'``, ``'pfm'``, ``'sites'`` or
        ``'jaspar'``.
    seq_type
        Forwarded to ``read_alignment`` for alignment input.

    Returns
    -------
    tuple
        ``(format_matrix(pfm), format_matrix(ic))``.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values.
    """
    if data_type == 'counts':
        pfm, total = count_to_pfm(data)
        ic = calc_relative_information(pfm, total)
    elif data_type in ['fasta', 'stockholm']:
        data, total = read_alignment(data, data_type, seq_type)
        pfm, _ = count_to_pfm(data)
        ic = calc_relative_information(pfm, total)
    elif data_type in [
            'alignace', 'meme', 'mast', 'transfac', 'pfm', 'sites', 'jaspar'
    ]:
        if data_type in ['jaspar', 'transfac']:
            with open(data, 'r') as handle:
                motif = motifs.parse(handle, data_type.upper())[0]
            pfm = dict(motif.counts.normalize())
            total = sum(list(motif.counts.values())[0])
        else:
            with open(data, 'r') as handle:
                motif = motifs.read(handle, data_type)
            try:
                # The keyword used to be misspelled ("psuedocounts"), so this
                # call always raised and the fallback below always ran.
                pfm = motif.counts.normalize(pseudocounts=1)
            except Exception:
                # Best-effort fallback kept from the original code.
                pfm = motif.counts.normalize()
            # NOTE(review): the other branches pass a number as `total`;
            # here the counts matrix itself is passed -- confirm intended.
            total = motif.counts
        ic = calc_relative_information(pfm, total)
    else:
        # Previously an unknown type fell through and failed on the unbound
        # local `pfm`; fail with an explicit message instead.
        raise ValueError('Unsupported data_type: {!r}'.format(data_type))
    return (format_matrix(pfm), format_matrix(ic))
def get_summary(job_id, meme_file, peaks):
    """Write a summary of a MEME run to ``summary.json`` and return it.

    The summary holds the requested peak count (``original_peaks``), that
    count capped at ``MAX_PEAKS_TO_KEEP`` (``peaks``), and a mapping
    ``'motif<N>' -> num_occurrences`` for every motif in ``meme_file``,
    ordered by ascending occurrence count.

    The JSON file is written to ``STATIC_PATH/<job_id>/summary.json`` and
    the summary is also printed.

    Raises
    ------
    ValueError
        If a record has no ``num_occurrences`` attribute: the ``'Unknown'``
        fallback cannot be converted to ``int``.
    """
    summary = {}
    # Number of occurrences per motif (filled in below)
    summary['motif_occurrences'] = {}
    # Number of peaks
    summary['original_peaks'] = peaks
    summary['peaks'] = min(MAX_PEAKS_TO_KEEP, peaks)
    # Close the MEME file as soon as it is parsed; it used to be left open.
    with open(meme_file) as handle:
        records = motifs.parse(handle, 'meme')
    num_occurrences = [
        int(getattr(record, 'num_occurrences', 'Unknown'))
        for record in records
    ]
    sorted_occurences = sorted(enumerate(num_occurrences), key=lambda x: x[1])
    summary['motif_occurrences'] = {
        'motif{}'.format(index + 1): value
        for index, value in sorted_occurences
    }
    fp = os.path.join(STATIC_PATH, job_id, 'summary.json')
    with open(fp, 'w') as f:
        json.dump(summary, f)
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(summary)
    return summary
def jaspar2pfm(jasparFile, outDir):
    """Split a JASPAR file into one ``.pfm`` file per motif.

    Each output file is named after the motif: ``:`` is replaced by ``_``,
    the name is upper-cased, and the file is written to ``outDir`` in
    JASPAR format.
    """
    with open(jasparFile) as source:
        for motif in motifs.parse(source, "jaspar"):
            safe_name = str(motif.name).replace(":", "_").upper()
            target = outDir + "/" + safe_name + ".pfm"
            with open(target, "w") as sink:
                sink.write(motif.format("jaspar"))
def main(argv):
    """Correlate a MEME motif's per-position counts with PhyloP scores.

    Parses ``-i/--meme``, ``-m/--motif`` (1-based motif number) and
    ``-c/--phylo`` from ``argv``, prints the Pearson correlation between the
    count of the most frequent nucleotide at each motif position and the
    PhyloP score (third tab-separated column of the phylo file), and saves
    ``<phylo>_motif<N>_scatter.png`` and ``<phylo>_motif<N>_trend.png``.
    """
    parser = argparse.ArgumentParser(description='Process meme files')
    parser.add_argument('-i', '--meme', metavar='<meme_out>',
                        help='Meme input file', required=True)
    parser.add_argument('-m', '--motif', metavar='<motif_no>',
                        help='Motif number', required=True, type=int)
    parser.add_argument('-c', '--phylo', metavar='<phylo_out>',
                        help='PhyloP conservation scores', required=True)
    parsed = parser.parse_args(argv)

    # Both input files used to be opened and never closed.
    with open(parsed.meme) as handle:
        records = motifs.parse(handle, 'meme')
    record = records[parsed.motif - 1]

    with open(parsed.phylo, 'r') as phylo_handle:
        phylo_scores = [
            float(line[2])
            for line in csv.reader(phylo_handle, delimiter='\t')
        ]

    print("Motif length {}".format(record.length))
    print("phylo length {}".format(len(phylo_scores)))

    profile = position_wise_profile(record.counts, record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    # Count of the most frequent base at each motif position.
    motif_scores = np.array([position[0][1] for position in max_occur])
    phylo_scores = np.array(phylo_scores)

    pr = pearsonr(motif_scores, phylo_scores)
    print('Pearson correlation: {}'.format(pr))

    fig, ax = plt.subplots()
    ax = sns.regplot(y=motif_scores, x=phylo_scores, scatter=True)
    ax.set(ylabel="Count of most freq nucleotide", xlabel="PhyloP scores",
           title='CTCF | pearsonr = {}, p-val={}'.format(pr[0], pr[1]))
    fig.savefig('{}_motif{}_scatter.png'.format(parsed.phylo, parsed.motif))

    x = np.linspace(1, len(phylo_scores) + 1, num=len(phylo_scores),
                    endpoint=False)
    f, (ax1, ax2) = plt.subplots(2, 1)
    x1 = sns.barplot(x, y=motif_scores, ax=ax1)
    x2 = sns.barplot(x, y=phylo_scores, ax=ax2)
    x1.set(ylabel='Counts of most freq nucleotide', xlabel='Position in motif')
    x2.set(ylabel='Phylop Score', xlabel='Position in motif')
    f.tight_layout()
    f.savefig('{}_motif{}_trend.png'.format(parsed.phylo, parsed.motif))
def get_jaspar_motif(tfName):
    """Return the first JASPAR motif whose name matches ``tfName``.

    The database path comes from ``Config.get("data", "pfm_db_jaspar")``.
    Names are compared case-insensitively. Returns ``None`` when no motif
    matches.
    """
    with open(Config.get("data", "pfm_db_jaspar")) as db:
        matches = (
            entry
            for entry in motifs.parse(db, "jaspar")
            if str(entry.name).upper() == str(tfName).upper()
        )
        # The default covers the "not found" case.
        return next(matches, None)
def fit(self, fasta_file=''):
    """Run MEME on ``fasta_file``, parse its output and fit the PWMs.

    After the command runs, ``<output_dir>/meme.txt`` is parsed with
    Biopython and these attributes are set: ``seq_names``, ``n_seqs``,
    ``motives_db`` (the first ``nmotifs`` motif objects) and
    ``motives_list``. A copy of ``motives_list`` is then passed to the
    parent class ``fit``.

    Raises
    ------
    NameError
        If ``fasta_file`` is empty.
    """
    if not fasta_file:
        raise NameError('Input fasta file not specified')

    cmd_params = self._make_param_string()
    self._command_exec(fasta_file, cmd_params)

    # Parse the MEME output file with Biopython. The context manager closes
    # the handle even if parsing raises.
    filename = os.path.join(self.output_dir, 'meme.txt')
    with open(filename) as handle:
        record = motifs.parse(handle, 'meme')

    # Store names of sequences given as input to fit()
    self.seq_names = record.sequences[:]
    self.n_seqs = len(record.sequences)

    # One motif object per requested motif
    self.motives_db = [record[i] for i in range(self.nmotifs)]

    # Store length, number of occurrences and e-value of each motif
    self._get_stats(self.nmotifs)

    # String representation of the motives
    self.motives_list = list(self._get_motives_list())

    # Create PWMs from a copy of the list
    motives_list = self.motives_list[:]
    super(Meme, self).fit(motives=motives_list)
def read_memefile(meme_file):
    """Summarise a MEME output file.

    Parameters
    ----------
    meme_file: str
        Location of MEME file

    Returns
    -------
    summary: dict
        A summary containing the following keys:
            - total_motifs: number of motif records in the file
            - motif_occurrences: dict of the number of times each motif
              occurs, keyed by 'motif1', 'motif2' etc. and ordered by
              ascending occurrence count
            - motif_records: the parsed Biopython record (list of motifs)
            - bg_frequencies: result of ``get_motif_bg_freq(meme_file)``

    Raises
    ------
    ValueError
        If a record has no ``num_occurrences`` attribute: the ``'Unknown'``
        fallback cannot be converted to ``int``.
    """
    summary = {}
    summary['motif_occurrences'] = {}
    # Close the file once parsed; it used to be left open.
    with open(os.path.abspath(meme_file)) as handle:
        records = motifs.parse(handle, 'meme')
    summary['total_motifs'] = len(records)
    num_occurrences = [
        int(getattr(record, 'num_occurrences', 'Unknown'))
        for record in records
    ]
    sorted_occurences = sorted(enumerate(num_occurrences), key=lambda x: x[1])
    summary['motif_occurrences'] = {
        'motif{}'.format(index + 1): value
        for index, value in sorted_occurences
    }
    summary['motif_records'] = records
    # Background frequencies are read separately, since Biopython does not
    # support them (per the original comment).
    summary['bg_frequencies'] = get_motif_bg_freq(meme_file)
    return summary
def main(motif_file, motif_outfile, d_th, pc, bp, ow, fpr, pe):
    """Compute a score threshold for every JASPAR motif and write them out.

    For each motif in ``motif_file`` a PSSM is built against the background
    frequencies ``bp`` (using pseudocounts ``pc``), and the threshold for
    the false-positive rate ``fpr`` is taken from the score distribution
    computed with precision ``10 ** pe``. The thresholds are then passed to
    ``motif.get_put_motifs`` together with ``motif_file``,
    ``motif_outfile``, ``d_th`` and ``ow``.
    """
    thresholds = []
    # NOTE(review): bp is read in the order A, C, T, G -- confirm callers
    # supply it in that order.
    background = {'A': bp[0], 'C': bp[1], 'T': bp[2], 'G': bp[3]}
    print(("Baseline nucleotide frequencies:\n\t" + str(background)))
    print(("Calculating thresholds (" + timeString() +
           "). This could take a while."))
    sys.stdout.flush()

    idx = 0
    print_exponent = 1

    # Calculate thresholds using biopython. The handle used to be left open.
    with open(motif_file) as fh:
        for m in motifs.parse(fh, "jaspar"):
            pwm = m.counts.normalize(pseudocounts=pc)  # dictionary-like
            pssm = pwm.log_odds(background)  # log-odds vs background
            # Precision argument of 4 was recommended by biopython's
            # documentation (slow step).
            # YYY - JA - 05/05/2017 - This could be sped up by using
            # TFM-PVALUE's C++ functions for determining thresholds. May
            # consider implementing, calculates precise p-values with no
            # errors much more quickly and can also generate p-values from
            # a score.
            distribution = pssm.distribution(background=background,
                                             precision=10 ** pe)
            m_th = distribution.threshold_fpr(fpr)
            thresholds.append(m_th)

            # Report progress at 10, 100, 1000, ... motifs.
            idx += 1
            if idx >= 10 ** print_exponent:
                print((str(idx) + " thresholds calculated... at " +
                       timeString()))
                print_exponent += 1
                sys.stdout.flush()

    print(("Total motifs read: " + str(len(thresholds))))
    print("Outputing thresholds")
    motif.get_put_motifs(motif_file, motif_outfile, d_th, ow, thresholds)
    print(("Done (" + timeString() + ")"))
def suspect_site_extractor(example_seq, compute_motifs, num_sites, motifs_path, extension=''):
    """Collect recombination, slippage and (optionally) motif site tables.

    Returns a dict keyed by ``'df_recombination'``, ``'df_slippage'`` and,
    when ``compute_motifs`` equals ``True``, ``'df_motifs'``. Each key has
    ``extension`` appended.
    """
    upper_seq = str(example_seq).upper()
    collected = {}

    recombination = find_recombination_sites(upper_seq, num_sites)
    print('finished finding RMD sites')
    slippage = find_slippage_sites(upper_seq, num_sites)
    print('finished finding SSR sites')

    collected['df_recombination' + extension] = recombination
    collected['df_slippage' + extension] = slippage

    # Motif search only when explicitly requested. The equality test against
    # True is kept as in the original.
    if compute_motifs == True:
        with open(motifs_path, "r") as motif_handle:
            parsed_motifs = motifs.parse(motif_handle, "minimal")
            motif_sites = find_motif_sites(example_seq, num_sites, parsed_motifs)
        print('finished finding motif sites')
        collected['df_motifs' + extension] = motif_sites

    return collected
def motifs_list(jasp_motifs_file):
    """Return all motifs in a JASPAR-format file as a plain list.

    The file is opened with a context manager so it is closed even if
    parsing raises.
    """
    with open(jasp_motifs_file, 'r') as jasp_motifs:
        return list(motifs.parse(jasp_motifs, "jaspar"))
def tffm_from_meme(meme_output, kind, name="TFFM"):
    """
    Construct a TFFM from the output of MEME on ChIP-seq data.

    :arg meme_output: File containing the output of MEME.
    :type meme_output: str
    :arg kind: Type of TFFM to construct between '1st-order' and
        'detailed'. Any other value builds a 0-order HMM.
    :type kind: str
    :arg name: Name of the TFFM (default: "TFFM")
    :type name: str

    :returns: The TFFM initialized from MEME results.
    :rtype: :class:`TFFM`

    :note: As the PFM is used to initialize the TFFM, a pseudocount of 1 is
        added to all the values in the PFM
    :note: The process exits through ``sys.exit`` when the MEME record does
        not use the unambiguous DNA alphabet.

    """
    # Close the MEME file once parsed; it used to be left open.
    with open(meme_output) as handle:
        record = motifs.parse(handle, 'MEME')
    if record.alphabet != IUPAC.unambiguous_dna:
        sys.exit("### Wrong alphabet used in MEME ###")
    # Only the first motif of the MEME output is used.
    motif = record[0]
    nb_seq, nb_res, first_letters = utils.get_sequences_info(record.datafile)
    if kind == TFFM_KIND.FIRST_ORDER:
        hmm = create_1storder_hmm(nb_seq, nb_res, first_letters, motif)
    elif kind == TFFM_KIND.DETAILED:
        hmm = create_detailed_hmm(nb_seq, nb_res, first_letters, motif)
    else:  # 0-order HMM here
        hmm = create_0order_hmm(nb_seq, nb_res, first_letters, motif)
    return TFFM(hmm.emissionDomain, hmm.distribution, hmm.cmodel, kind, name)
def __init__(self, fasta, motifs_input, bg=None):
    """Load JASPAR motifs and index the genome FASTA.

    ``motifs_input`` is parsed into ``self.all_motifs``. ``fasta`` is opened
    with pyfasta into ``self.genome_seq``, and ``bg`` is stored as-is in
    ``self.bg``.
    """
    self.all_motifs = []
    with open(motifs_input, "r") as motif_handle:
        parsed = motifs.parse(motif_handle, "jaspar")
        self.all_motifs = list(parsed)

    def first_token(header):
        # Long sequence headers: keep only the text before the first space.
        return header.split()[0]

    self.genome_seq = pyfasta.Fasta(fasta, key_fn=first_token)
    self.bg = bg
def read_jaspar_pwms(file='jaspar/pfm_vertebrates.txt', dir='d:/sequence'):
    """Load the JASPAR motifs in ``dir/file`` as log-odds matrices.

    Returns a dict keyed by the upper-cased motif name up to the first
    ``':'``. Each value is ``counts.normalize().log_odds()`` for that motif.
    When two motifs share a key, the later one wins.
    """
    pwms = {}
    path = os.path.join(dir, file)
    with open(path, 'r') as handle:
        for motif in motifs.parse(handle, 'jaspar'):
            key = motif.name.strip().split(':')[0].upper()
            pwms[key] = motif.counts.normalize().log_odds()
    return pwms
def process_meme_output(meme_out_folder, pfm_filename):
    """Convert ``<meme_out_folder>/meme.txt`` into a JASPAR file.

    The MEME result is parsed in the minimal format and every motif is
    written, in JASPAR format, to ``pfm_filename``.
    """
    meme_path = f'{meme_out_folder}/meme.txt'
    with open(meme_path) as meme_handle:
        parsed = motifs.parse(meme_handle, 'MINIMAL')
    # The output file is opened before serialising, as in the original.
    with open(pfm_filename, 'w+') as out:
        jaspar_text = motifs.write(parsed, 'jaspar')
        out.write(jaspar_text)
def read_pfm(jaspar_motifs_file, tf_name):
    """Return the first motif named exactly ``tf_name`` in a JASPAR file.

    Returns ``None`` if no motif has that name.
    """
    with open(jaspar_motifs_file) as handle:
        for candidate in motifs.parse(handle, "jaspar"):
            if candidate.name == tf_name:
                return candidate
    return None
def getmotiflist(pwmfilename, filetype="TRANSFAC", prefix=None):
    """Return the motifs in ``pwmfilename`` as a list.

    Parameters
    ----------
    pwmfilename
        Path of the motif file.
    filetype
        Format name handed to ``motifs.parse`` (default ``"TRANSFAC"``).
    prefix
        When given, only motifs whose stripped ``'ID'`` field starts with
        this prefix are kept. ``None`` keeps every motif.
    """
    with open(pwmfilename, 'r') as mhandle:
        motiflist = motifs.parse(mhandle, filetype)
    if prefix is None:
        # No filter: 'ID' is not looked up, so motifs without one are fine.
        return list(motiflist)
    return [
        motifpwm for motifpwm in motiflist
        if motifpwm['ID'].strip().startswith(prefix)
    ]
def pwm2pval(motifName, seq):
    """Print the PWM score of ``seq`` for a JASPAR motif and its p-value.

    The motif is looked up case-insensitively by name in the database at
    ``Config.get("data", "pfm_db_jaspar")``. Its log-odds matrix is shifted
    to be non-negative, scaled so the largest entry equals ``C_PRECISION``,
    and rounded to integers. A table of score counts is then built position
    by position and handed to ``score2pval`` with the scaled score of
    ``seq``.

    All results are printed; nothing is returned.
    NOTE(review): the name suggests the p-value should be returned --
    confirm with callers.

    Raises
    ------
    ValueError
        If no motif named ``motifName`` exists. The code used to continue
        with an unbound ``pwm`` and whatever motif was read last.
    """
    wanted = str(motifName).upper()
    motif = None
    with open(Config.get("data", "pfm_db_jaspar")) as handle:
        for m in motifs.parse(handle, "jaspar"):
            if str(m.name).upper() == wanted:
                motif = m
                break  # stop if motif is found
    if motif is None:
        raise ValueError(
            "Motif {} not found in the JASPAR database".format(motifName))

    ppm = motif.counts.normalize(pseudocounts=C_PSEUDOCOUNTS)
    pwm = ppm.log_odds(background=C_BACKGROUND)
    writePWM(motif)

    print("PWM:")
    print(pwm)

    nucleotides = ['A', 'C', 'G', 'T']
    motif_len = len(motif)

    # Scale raw PWM scores: one row per nucleotide, one column per position.
    scaled_pwm = np.zeros(shape=(4, motif_len))
    for i, nt in enumerate(nucleotides):
        scaled_pwm[i] = pwm[nt]

    # Subtract the minimum PWM score to make the matrix non-negative.
    scale_const = np.min(scaled_pwm)
    nonneg_pwm = scaled_pwm - scale_const

    # Scale so that the largest entry becomes C_PRECISION.
    scale_factor = (C_PRECISION / np.max(nonneg_pwm))
    scaled_pwm = nonneg_pwm * scale_factor

    # Round to nearest integer.
    scaled_pwm = np.rint(scaled_pwm).astype(int)

    print("Scaled PWM:")
    print(scaled_pwm)

    # Score distribution: entry [j, s] counts the nucleotide choices for
    # positions 0..j whose scaled scores sum to s.
    # `np.int` was removed from NumPy; the builtin int is the same dtype.
    score_distribution = np.zeros(
        shape=(motif_len, motif_len * C_PRECISION + 1), dtype=int)
    print(np.shape(score_distribution))

    # Initialise the first row with the four single-position scores.
    for i, nt in enumerate(nucleotides):
        score_distribution[0, scaled_pwm[i, 0]] += 1

    # Process the rest of the motif (j: 1 to length of motif).
    for j in range(1, motif_len):
        print("MOTIF pos: {}".format(j))
        for k in scaled_pwm[:, j]:
            for idx, count in enumerate(score_distribution[j - 1, :]):
                if count > 0:
                    score_distribution[j, idx + k] += count

    scaled_score = int(seq2score(seq, scaled_pwm))
    raw_score = seq2score(seq, pwm)
    print("scaled score: {}".format(scaled_score))
    print("raw score: {}".format(raw_score))

    pval = score2pval(scaled_score, score_distribution)
    print("pval: {}".format(pval))
def build_pssm(meme):
    """Return the PSSM of the first motif in a minimal-format MEME file.

    The motif's own background is used as its pseudocounts before the PSSM
    is read.
    """
    with open(meme) as handle:
        parsed = motifs.parse(handle, 'minimal')
    first_motif = parsed[0]
    first_motif.pseudocounts = first_motif.background
    return first_motif.pssm
def get_motif(pcm_file):
    """Return the first motif parsed from ``pcm_file``.

    The file is parsed with ``HOCOMOCO_PARSE_FORMAT``. Returns ``None`` when
    the parse result is ``None`` or empty. An ``IOError`` is logged through
    ``logger.error`` and also results in ``None``.
    """
    try:
        with open(pcm_file) as handle:
            parsed = motifs.parse(handle, HOCOMOCO_PARSE_FORMAT)
            if parsed is None or len(parsed) == 0:
                return None
            return parsed[0]
    except IOError as e:
        logger.error(ERROR_MSG_IO.format(pcm_file, e.strerror))
        return None
def create_pfm_from_meme(self, memexml, nameforpfm, tf_name, species_name):
    """Write the first MEME motif's counts to ``nameforpfm``.

    The output starts with a ``>tf_name<TAB>species_name`` header. It is
    followed by one ``<base> [ v1 v2 ... ]`` row for each of lines 2-5 of
    the counts matrix's text form, with every value rounded.
    """
    with open(memexml) as handle:
        parsed = motifs.parse(handle, "meme")
    # Text rendering of the counts; the first line is skipped below.
    counts_text = str(parsed[0].counts)
    with open(nameforpfm, 'w') as out:
        out.write('>{}\t{}\n'.format(tf_name, species_name))
        for row in counts_text.split('\n')[1:5]:
            base, raw_values = row.strip().split(':')
            rounded = [
                str(round(float(token)))
                for token in raw_values.split(' ')
                if token != ''
            ]
            out.write('{} [ {} ]\n'.format(base, ' '.join(rounded)))
def get_chipseq_ranges(chip_fimo, chip_all, motif_file):
    """Return a ``(start, end)`` window around every ChIP-seq position.

    The motif is chosen by popping one value from the ``'#pattern name'``
    column of the FIMO table and using it as a 1-based index into the MEME
    record. Each window extends ``motif length + 10`` on both sides of a
    value in the ``'Position'`` column of ``chip_all``.
    """
    all_hits = pd.read_csv(chip_all)
    fimo_hits = pd.read_table(chip_fimo)
    motif_id = set(fimo_hits['#pattern name']).pop()
    with open(motif_file, 'r') as handle:
        meme_record = motifs.parse(handle, 'MEME')
    mot_len = int(meme_record[motif_id - 1].length)
    return [
        (pos - mot_len - 10, pos + mot_len + 10)
        for pos in list(all_hits['Position'])
    ]
def __init__(self, ref_path, bg=None):
    """Load motifs and genome sequence from a reference at ``ref_path``.

    ``self.all_motifs`` stays empty when the reference manager reports no
    motif file; otherwise it holds every JASPAR motif from that file. The
    reference FASTA is opened with pyfasta into ``self.genome_seq``, and
    ``bg`` is stored as-is in ``self.bg``.
    """
    reference = ReferenceManager(ref_path)

    self.all_motifs = []
    motif_path = reference.motifs
    if motif_path is not None:
        with open(motif_path, "r") as motif_handle:
            self.all_motifs = list(motifs.parse(motif_handle, "jaspar"))

    def first_token(header):
        # Long sequence headers: keep only the text before the first space.
        return header.split()[0]

    self.genome_seq = pyfasta.Fasta(reference.fasta, key_fn=first_token)
    self.bg = bg
def build_motif_db(motif_file="jaspar_curated.pfm"):
    """Build a ``{motif name: (pssm, threshold)}`` lookup from a JASPAR file.

    Parameters
    ----------
    motif_file
        Path of the JASPAR file. Defaults to ``"jaspar_curated.pfm"``, the
        path that used to be hard-coded.

    Returns
    -------
    dict
        For each motif, its log-odds PSSM against a fixed background and
        the score threshold at a false-positive rate of 0.001.
    """
    background = {'A': 0.3, 'C': 0.2, 'T': 0.3, 'G': 0.2}
    pseudocounts = {'A': 0.6, 'C': 0.4, 'T': 0.6, 'G': 0.4}
    motif_dict = {}
    # The file used to be opened and never closed.
    with open(motif_file) as jf:
        for m in motifs.parse(jf, "jaspar"):
            pwm = m.counts.normalize(pseudocounts=pseudocounts)
            pssm = pwm.log_odds(background)
            distribution = pssm.distribution(background=background,
                                             precision=10 ** 3)
            threshold = distribution.threshold_fpr(0.001)
            motif_dict[m.name] = (pssm, threshold)
    return motif_dict
def load_meme(filename):
    """Parse a MEME output file and return its converted records.

    The records are sorted in place by ascending e-value, then passed to
    ``convert`` together with a gapped extended-IUPAC DNA alphabet.
    """
    gapped_alphabet = Gapped(ExtendedIUPACDNA(), '-')
    with open(filename, "r") as handle:
        parsed = motifs.parse(handle, "MEME")
        parsed.sort(key=lambda entry: entry.evalue)  # lowest e-value first
        converted = convert(parsed, gapped_alphabet)
    return converted
def MEMERIS(args):
    """Return the motif instance locations from a MEMERIS result directory.

    Reads ``results.txt`` in ``args.indir`` as MEME output and builds a
    DataFrame with ``start`` and ``end`` columns, one row per motif
    instance. The index is the sequence name mapped through
    ``extendSeqName`` using ``sequences.fa`` from the same directory.
    """
    from Bio import motifs

    fasta_path = os.path.join(args.indir, "sequences.fa")
    results_path = os.path.join(args.indir, 'results.txt')

    rows = []
    with open(results_path) as handle:
        for motif in motifs.parse(handle, 'MEME', strict=False):
            for hit in motif.instances:
                begin = hit.start - 1
                rows.append((hit.sequence_name, begin, begin + len(str(hit))))

    locations = pd.DataFrame.from_records(rows)
    locations.columns = ["sequence_id", "start", "end"]
    locations = locations.set_index("sequence_id")

    name_mapping = pd.Series(extendSeqName(list(locations.index), fasta_path))
    locations.index = name_mapping.loc[locations.index]
    return locations
def motif_search_calc(input_jaspar,lincRNA_seq,output_motif):
    """Search every lincRNA sequence for every JASPAR motif.

    For each PSSM hit scoring above 7.0, the hit position divided by the
    length field of the sequence id is appended to ``all_positions``, and a
    two-line report is written to ``output_motif``.

    Returns ``all_positions``.

    NOTE(review): ``all_positions`` is not defined in this function; it is
    presumably a module-level list -- confirm.
    NOTE(review): ``output_motif`` is opened in "w" mode for every hit, so
    the file appears to keep only the last hit -- confirm whether all hits
    should be reported.
    """
    with open (input_jaspar,"r") as fm:
        for m in motifs.parse(fm,"jaspar"):
            # The FASTA file is reopened for every motif so each motif sees
            # all the sequences.
            with open(lincRNA_seq,"r") as lincs:
                for lincseq in SeqIO.parse(lincs,'fasta',alphabet=IUPAC.unambiguous_dna):
                    for pos, score in m.pssm.search(lincseq.seq, threshold=7.0):
                        # The sequence id is split on "|"; the second-to-last
                        # field is used as the total length below.
                        d=lincseq.id.split("|")
                        linc_id = d[1].strip("\" ")  # computed but unused
                        # abs() because positions may be negative --
                        # presumably reverse-strand hits; confirm.
                        position = abs(pos)
                        # Relative position. NOTE(review): "/" is integer
                        # division on Python 2 -- confirm intended.
                        final_position = (int(position)/int(d[-2]))
                        all_positions.append(final_position)
                        with open(output_motif,"w") as f:
                            f.write("For lincRNA" + "\t" + str(d[0]) + "\t" + str(d[1]) + "\t" + "total length:" + "\t" + str(d[-2]) + "\n")
                            f.write("Motif" + "\t" + str(m.name) + "\t" + "binds at position" + "\t" + str(pos) + "\t" + " with score: " + "\t" + str(score) + "\n")
    return all_positions
def plot_pwm(filename, output=None):
    """Plot the sequence logo of the motif in a JASPAR file as a PDF.

    Caution: the file should contain only ONE motif. Every motif in the
    file is written to the same ``output`` path.

    Parameters
    ----------
    filename
        Path of the JASPAR file.
    output
        Name of the PDF file. When omitted it is derived from the input
        name: its basename up to the first ``.``, plus ``.pdf``.

    Returns
    -------
    int
        Always ``0``. Plotting failures are reported on stdout only.
    """
    if output is None:
        output = filename.split("/")[-1].split(".")[0] + ".pdf"
    # Uses WebLogo from BioPython to plot the matrix in pdf format.
    # Requires an internet connection.
    with open(filename) as fh:
        for m in motifs.parse(fh, "jaspar"):
            try:
                m.weblogo(output, format="PDF")
            except Exception:
                # A bare `except` used to be here, which also swallowed
                # KeyboardInterrupt and SystemExit.
                print("ERROR trying to plot the sequence logo using the "
                      "online website WebLogo "
                      "(http://weblogo.berkeley.edu/). A possible cause is "
                      "the absence of a working internet connection, "
                      "otherwise the pwm file %s may be corrupted. If you "
                      "wish to plot the sequence logo, consider using the "
                      "alternate website STAMP "
                      "(http://www.benoslab.pitt.edu/stamp/)" % filename)
    return 0
def motif_identifier(tf, meme_folder):
    """Return the last character of the lowest-e-value motif's name.

    Reads ``<meme_folder>/meme.txt``. Returns ``'1'`` if parsing raises
    ``ValueError``. ``tf`` is accepted but not used.

    NOTE(review): only the final character of the name is returned, so a
    name ending in a two-digit number would be truncated -- confirm.
    """
    meme_path = meme_folder + '/meme.txt'
    with open(meme_path, 'r') as handle:
        try:
            parsed = motifs.parse(handle, 'MEME')
        except ValueError:
            return '1'

    # One (instance motif name, record e-value) pair per motif instance.
    scored_names = []
    for entry in parsed:
        evalue = entry.evalue
        scored_names.extend(
            (instance.motif_name, evalue) for instance in entry.instances
        )

    ranked = sorted(scored_names, key=lambda pair: pair[1])
    best_name = ranked[0][0]
    return best_name[-1]
def find_pwm_hits(narrow_peak, reference, pfm, output, treat_cov):
    """
    Score every peak against a position frequency matrix and recenter it.

    Args:
        narrow_peak (str) - path to the narrowPeak file output by MACS2
        reference (str) - file path to the reference genome
        pfm (str) - file path to the position frequency matrix
        output (str) - prefix for the output files
            (``_centeredpeaks.bed`` and ``_centeredpeaks.fasta``)
        treat_cov - forwarded unchanged to ``recenter_peak``
    """
    with open(narrow_peak, "r") as peak_handle, open(reference, "r") as ref_handle:
        # Reference genome as {record id: record}
        genome = {
            rec.id: rec
            for rec in SeqIO.parse(ref_handle, "fasta",
                                   alphabet=IUPAC.unambiguous_dna)
        }

        # First motif of the JASPAR file, turned into a log-odds PSSM
        with open(pfm, "r") as pfm_handle:
            matrix = motifs.parse(pfm_handle, "jaspar")[0]
            pssm = matrix.counts.normalize(pseudocounts=.5).log_odds()

        bed_path = output + "_centeredpeaks.bed"
        fasta_path = output + "_centeredpeaks.fasta"
        with open(bed_path, "w") as out_bed, open(fasta_path, "w") as out_fasta:
            # One recentered peak per input line
            for raw_line in peak_handle:
                fields = raw_line.strip().split("\t")
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
                window = genome[chrom].seq[start:end]
                # Best-scoring hits first
                hits = sorted(
                    ((pos, score) for pos, score in pssm.search(window)),
                    key=lambda hit: hit[1],
                    reverse=True,
                )
                recenter_peak(out_bed, out_fasta, genome, chrom, start,
                              end, 100, hits, matrix, treat_cov)
def motif_search(input_jaspar, lincRNA_seq):
    """Record the positions where any JASPAR motif hits each lincRNA.

    For every motif in ``input_jaspar`` and every sequence in
    ``lincRNA_seq``, the absolute position of each PSSM hit is appended to
    ``motif_positions[linc_id]``. ``linc_id`` is the first ``|``-separated
    field of the sequence id, stripped of quotes and spaces.

    Prints and returns ``motif_positions``.

    NOTE(review): ``motif_positions`` is not defined in this function; it
    is presumably a module-level dict -- confirm.
    """
    with open(input_jaspar, "r") as fm, open(lincRNA_seq, "r") as lincs:
        # Read the sequences once. Re-parsing the same open handle for each
        # motif yielded nothing after the first motif, because the handle
        # was already at end-of-file.
        sequences = list(
            SeqIO.parse(lincs, 'fasta', alphabet=IUPAC.unambiguous_dna))
        for m in motifs.parse(fm, "jaspar"):
            pssm = m.pssm
            for lincseq in sequences:
                linc_id = lincseq.id.split("|")[0].strip("\" ")
                for pos, score in pssm.search(lincseq.seq):
                    motif_positions.setdefault(linc_id, []).append(
                        int(abs(pos)))
    print(motif_positions)
    return motif_positions
def get_summary(job_id, meme_file, peaks):
    """Write a summary of a MEME run to ``summary.json`` and return it.

    The summary holds the requested peak count (``original_peaks``), that
    count capped at ``MAX_PEAKS_TO_KEEP`` (``peaks``), and a mapping
    ``'motif<N>' -> num_occurrences`` for every motif in ``meme_file``,
    ordered by ascending occurrence count.

    The JSON file is written to ``STATIC_PATH/<job_id>/summary.json`` and
    the summary is also printed.

    Raises
    ------
    ValueError
        If a record has no ``num_occurrences`` attribute: the ``'Unknown'``
        fallback cannot be converted to ``int``.
    """
    summary = {}
    # Number of occurrences per motif (filled in below)
    summary['motif_occurrences'] = {}
    # Number of peaks
    summary['original_peaks'] = peaks
    summary['peaks'] = min(MAX_PEAKS_TO_KEEP, peaks)
    # Close the MEME file as soon as it is parsed; it used to be left open.
    with open(meme_file) as handle:
        records = motifs.parse(handle, 'meme')
    num_occurrences = [
        int(getattr(record, 'num_occurrences', 'Unknown'))
        for record in records
    ]
    sorted_occurences = sorted(enumerate(num_occurrences), key=lambda x: x[1])
    summary['motif_occurrences'] = {
        'motif{}'.format(index + 1): value
        for index, value in sorted_occurences
    }
    fp = os.path.join(STATIC_PATH, job_id, 'summary.json')
    with open(fp, 'w') as f:
        json.dump(summary, f)
    # A single parenthesised argument prints the same on Python 2 and 3.
    print(summary)
    return summary
def build_pssm(meme):
    """Return the PSSM of the first MEME motif plus score statistics.

    The motif's own background is used as its pseudocounts.

    Returns
    -------
    tuple
        ``(pssm, mean, std, max_value, cutoff_1, cutoff_2, cutoff_3,
        cutoff_4)``. The cutoffs are ``mean - std``, ``mean - 2*std``,
        ``mean + std`` and ``mean + 2*std``; a negative cutoff is replaced
        by ``0``.
    """
    with open(meme) as f:
        record = motifs.parse(f, 'minimal')
    motif = record[0]
    motif.pseudocounts = motif.background
    # Read the PSSM once and reuse it instead of re-reading `motif.pssm`
    # for every statistic.
    pssm = motif.pssm
    mean = pssm.mean()
    std = pssm.std()
    max_value = pssm.max
    # max(x, 0) replaces the former `[x, 0][x < 0]` trick.
    cutoff_1 = max(mean - std, 0)
    cutoff_2 = max(mean - 2 * std, 0)
    cutoff_3 = max(mean + std, 0)
    cutoff_4 = max(mean + 2 * std, 0)
    return pssm, mean, std, max_value, cutoff_1, cutoff_2, cutoff_3, cutoff_4
def get_motifscores(df):
    """Sum motif hit values upstream of, inside and downstream of an intron.

    For every motif in the ATtRACT file whose name appears as a column
    prefix of the pickled ``Xs_sel`` frame, three columns are added per row
    of ``df``: ``<name>__score_motifs_up``, ``<name>__score_motifs_alt`` and
    ``<name>__score_motifs_down``.

    Returns the result restricted to the columns of ``Xs_sel``.
    """
    with open('../additional/ATtRACT/pwm_transposed.txt', 'r') as f:
        records = parse(f, 'jaspar')
    Xs_sel = pd.read_pickle(
        '/net/mraid08/export/genie/Runs/Martin/ATtRACT/irmotifs_scores.pkl')
    # Unique motif names: the part of each column name before the first '__'.
    mtfs = []
    for i in Xs_sel.columns:
        if i.split('__')[0] not in mtfs:
            mtfs.append(i.split('__')[0])

    def find_motifs_ir(varid):
        # Reads `mm` from the enclosing scope: it is the motif currently
        # selected by the loop further down, so this helper must only be
        # called from inside that loop.
        motifs = pd.Series()
        # Each search result is stored as value `seq` at index `pos` --
        # presumably (position, score) pairs; confirm.
        for pos, seq in mm.counts.log_odds().search(Seq(df.sequence[varid], \
                alphabet=IUPAC.IUPACUnambiguousDNA()), threshold=0, both=False):
            motifs.loc[pos] = seq
        # Split by index relative to the intron start and end of this row.
        motifs_up = motifs[motifs.index < df.intronstart[varid]]
        motifs_alt = motifs[(motifs.index > df.intronstart[varid]) & (motifs.index < df.intronend[varid])]
        motifs_down = motifs[motifs.index > df.intronend[varid]]
        return list([motifs_up.sum(), motifs_alt.sum(), motifs_down.sum()])

    junirmotifs = pd.DataFrame(index=df.index)
    # `database` is loaded and indexed but not used by the active code below.
    database = pd.read_table('../additional/ATtRACT/ATtRACT_db.txt')
    database.drop_duplicates('Matrix_id', inplace=True)
    database.set_index('Matrix_id', inplace=True)
    for mm in records:
        # if (mm.name in database[database.Organism=='Homo_sapiens'].index):
        if (mm.name in mtfs):
            # Re-class the counts matrix so that log_odds() can be called on
            # it -- presumably the file holds probabilities, not counts; confirm.
            mm.counts.__class__ = matrix.PositionWeightMatrix
            # Temporary column holding the [up, alt, down] list per row.
            junirmotifs['motifs'] = df.index.map(lambda x: find_motifs_ir(x))
            junirmotifs[[str(mm.name) + '__score_motifs_up',str(mm.name) + '__score_motifs_alt',\
                         str(mm.name) + '__score_motifs_down']]=junirmotifs.motifs.apply(lambda x: pd.Series(x))
            junirmotifs.drop('motifs', axis=1, inplace=True)
    return junirmotifs[Xs_sel.columns]
def motif_format_checker(motif_infile):
    """Exit the program unless ``motif_infile`` is a valid JASPAR file.

    Returns ``None`` immediately when ``motif_infile`` is ``None``.
    Otherwise the process exits through ``sys.exit`` when:

    - the file cannot be opened or parsed as JASPAR,
    - a header line (starting with ``>``) contains a tab, or
    - the number of header lines differs from the number of parsed motifs.
    """
    if motif_infile is None:
        return None

    # Imported after the guard so the "no file" case does not need Biopython.
    from Bio import motifs

    try:
        # The handle used to be left open.
        with open(motif_infile) as handle:
            all_motifs = motifs.parse(handle, "jaspar")
    except Exception:
        # A bare `except` used to be here, which also swallowed
        # KeyboardInterrupt and SystemExit.
        sys.exit("Motif file is not in JASPAR format.")

    nmotif = 0
    with open(motif_infile) as motif_in:
        for i, line in enumerate(motif_in):
            if line.startswith('>'):
                nmotif += 1
                if len(line.split('\t')) > 1:
                    sys.exit(
                        "Motif name cannot contain tabs('\t') at line {} in {}.".format(i+1, motif_infile)
                    )

    if nmotif != len(all_motifs):
        sys.exit("Motif file is not in JASPAR format.")
def get_motifscores(df):
    """Sum motif hit values before, between and after the two acceptors.

    For every motif in the ATtRACT file whose name appears as a column
    prefix of the pickled ``Xs_sel`` frame, three columns are added per row
    of ``df``: ``<name>__score_motifs_up``, ``<name>__score_motifs_alt`` and
    ``<name>__score_motifs_down``.

    Returns the result restricted to the columns of ``Xs_sel``.
    """
    with open('../additional/ATtRACT/pwm_transposed.txt', 'r') as f:
        records = parse(f, 'jaspar')
    Xs_sel = pd.read_pickle(
        './dataframes/ml/three/Xs_three_motifs_rna_sel_.pkl')
    # Unique motif names: the part of each column name before the first '__'.
    mtfs = []
    for i in Xs_sel.columns:
        if i.split('__')[0] not in mtfs:
            mtfs.append(i.split('__')[0])

    def find_motifs(varid):
        # Reads `mm` from the enclosing scope: it is the motif currently
        # selected by the loop further down, so this helper must only be
        # called from inside that loop.
        motifs = pd.Series()
        # Each search result is stored as value `seq` at index `pos` --
        # presumably (position, score) pairs; confirm.
        for pos, seq in mm.counts.log_odds().search(Seq(df.sequence[varid], \
                alphabet=IUPAC.IUPACUnambiguousDNA()), threshold=0, both=False):
            motifs.loc[pos] = seq
        # Split by index relative to the two acceptor positions of this row.
        motifs_up = motifs[motifs.index < df.acceptor1[varid]]
        motifs_alt = motifs[(motifs.index > df.acceptor1[varid]) & (motifs.index < df.acceptor2[varid])]
        motifs_down = motifs[motifs.index > df.acceptor2[varid]]
        return list([motifs_up.sum(), motifs_alt.sum(), motifs_down.sum()])

    motifscores = pd.DataFrame(index=df.index)
    # `database` is loaded and indexed but not used by the code below.
    database = pd.read_table('../additional/ATtRACT/ATtRACT_db.txt')
    database.drop_duplicates('Matrix_id', inplace=True)
    database.set_index('Matrix_id', inplace=True)
    for mm in records:
        if (mm.name in mtfs):
            # Re-class the counts matrix so that log_odds() can be called on
            # it -- presumably the file holds probabilities, not counts; confirm.
            mm.counts.__class__ = matrix.PositionWeightMatrix
            # Temporary column holding the [up, alt, down] list per row.
            motifscores['motifs'] = df.index.map(lambda x: find_motifs(x))
            motifscores[[str(mm.name) + '__score_motifs_up',str(mm.name) + '__score_motifs_alt',\
                         str(mm.name) + '__score_motifs_down']]=motifscores.motifs.apply(lambda x: pd.Series(x))
            motifscores.drop('motifs', axis=1, inplace=True)
    return motifscores[Xs_sel.columns]
def parseMastOut(mastOut):
    """Parse MAST output given as a string and return the parsed record.

    The text is wrapped in an in-memory ``StringIO`` buffer, which is
    closed after parsing.
    """
    from Bio import motifs

    text_buffer = StringIO.StringIO(mastOut)
    parsed = motifs.parse(text_buffer, "mast")
    text_buffer.close()
    return parsed
def main(argv):
    """Print the number of motifs in a MEME file and each one's consensus.

    ``argv[0]`` is the path of the MEME output file. One line is printed
    per motif with its 0-based index, consensus length and consensus
    sequence.
    """
    # The handle used to be left open.
    with open(argv[0], 'r') as handle:
        records = motifs.parse(handle, 'meme')
    # A single parenthesised argument prints the same on Python 2 and 3.
    print("Total motifs present: {}".format(len(records)))
    for i, record in enumerate(records):
        print("Motif {} \t Length: {} \t, Seq: {}".format(
            i, len(record.consensus), record.consensus))
# concatenate all files all_states = "all_states_all_lines.bed" os.system("cat *.bed > {0}".format(all_states)) # Get CD19 perypheral blood HMM state annotation roadmap_15statesHMM_CD19 = "E032_15_coreMarks_mnemonics.bed.gz" os.system("tar zxvf {0} {1}".format(roadmap_15statesHMM, roadmap_15statesHMM_CD19)) os.system("gzip -d {0}".format(roadmap_15statesHMM_CD19)) os.system("mv E032_15_coreMarks_mnemonics.bed ../data/E032_15_coreMarks_mnemonics.bed") # Footprinting # get all jaspar motifs "wget http://jaspar.genereg.net/html/DOWNLOAD/JASPAR_CORE/pfm/nonredundant/pfm_all.txt" jaspar = motifs.parse(open("data/external/pfm_all.txt", 'r'), "jaspar") # motif annotation "wget http://jaspar.genereg.net/html/DOWNLOAD/database/MATRIX.txt" annot = pd.read_table("data/external/MATRIX.txt", names=["index", "db", "id", 0, "TF"]) # get species annotation "wget http://jaspar.genereg.net/html/DOWNLOAD/database/MATRIX_SPECIES.txt" spec = pd.read_table("data/external/MATRIX_SPECIES.txt", names=["index", "species_id"]) # merge both annot = pd.merge(annot, spec, on=['index']) # get ids of only human motifs human_annot = annot[annot['species_id'] == "9606"] # filter out any not uniquely mappable gene name human_annot = human_annot[ (~human_annot['TF'].str.contains("\(")) &
def read_jaspar_pwms(file='jaspar/pfm_vertebrates.txt', dir='d:/sequence'):
    """Load the JASPAR motifs in ``dir/file`` as log-odds matrices.

    Returns a dict keyed by the upper-cased motif name up to the first
    ``':'``. Each value is ``counts.normalize().log_odds()`` for that motif.
    When two motifs share a key, the later one wins.
    """
    log_odds_by_name = {}
    with open(os.path.join(dir, file), 'r') as jaspar_handle:
        for entry in motifs.parse(jaspar_handle, 'jaspar'):
            short_name = entry.name.strip().split(':')[0]
            log_odds_by_name[short_name.upper()] = (
                entry.counts.normalize().log_odds()
            )
    return log_odds_by_name
def _read_score_column(path):
    """Return the second column of a tab-delimited file as a list of floats."""
    with open(path, "r") as handle:
        return [float(row[1]) for row in csv.reader(handle, delimiter="\t")]


def _latex_pvalue(value):
    """Format *value* to one significant digit, writing exponents as ``*10^{n}``."""
    text = "%.1g" % value
    if "e" in text:
        text = (text + "}").replace("e", "*10^{").replace("-0", "-")
    return text


def _ols_fitted_line(scores, motif_scores):
    """Return the OLS line ``intercept + slope * motif_scores`` fitted to *scores*."""
    params = sm.OLS(scores, sm.add_constant(motif_scores)).fit().params
    # When the fit yields a single parameter there is no slope to apply; the
    # motif scores themselves are plotted in that case.
    if len(params) < 2:
        return motif_scores
    return motif_scores * params[1] + params[0]


def _style_axis_ticks(axis):
    """Apply the tick styling shared by all data panels (uses module-level tickpad/ticklength)."""
    axis.get_xaxis().tick_bottom()
    axis.get_yaxis().tick_left()
    axis.get_yaxis().set_tick_params(direction="out")
    axis.get_xaxis().set_tick_params(direction="out")
    axis.tick_params(axis="y", which="major", pad=tickpad)
    axis.tick_params(axis="x", which="major", pad=tickpad)
    axis.tick_params("both", length=ticklength, width=2, which="major")


def create_plot(
    meme_file,
    motif_number,
    flanking_sites,
    sample_phylop_file,
    control_phylop_file,
    sample_gerp_file,
    control_gerp_file,
    peak_file,
    fimo_file,
    annotate,
):
    """Draw the combined conservation figure for one MEME motif and save it as PNG.

    Two figures are written to the current working directory, one for the
    forward logo (``motif<N>Combined_plots.png``) and one for the reverse
    complement logo (``motif<N>Combined_plots_rc.png``). Each contains the
    motif logo, a stem plot of PhyloP scores, a PhyloP-vs-base-score scatter
    plot, optionally the same scatter for GERP, a histogram of motif
    distances, and optionally a metadata table.

    Parameters
    ----------
    meme_file
        Path of the MEME output. ``logo<N>.png`` and ``logo_rc<N>.png`` are
        read from the same directory.
    motif_number
        1-based index of the motif inside the MEME record.
    flanking_sites
        Number of flanking positions on each side of the motif in the score
        files. The motif region is sliced as ``[fs:-fs]``, which is empty for
        0, so a positive value is expected.
    sample_phylop_file, control_phylop_file
        Tab-delimited files; the second column holds one float score per
        position. Both must have the same number of rows.
    sample_gerp_file, control_gerp_file
        Same layout for GERP scores. The GERP panels are drawn only when both
        are given.
    peak_file, fimo_file
        Passed to ``get_motif_distances`` to obtain the distances shown in
        the enrichment histogram.
    annotate
        Path of a JSON file with the keys ``title``, ``gene_name``,
        ``dataset`` and ``assembly``; ``""``, ``" "`` or a falsy value
        disables the metadata table.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the sample and control PhyloP files differ in length.

    Layout constants (``a``, ``__scale__``, ``dpi``, ``fontsize``, ...) and
    the helpers ``position_wise_profile``, ``find_max_occurence``,
    ``perform_t_test`` and ``get_motif_distances`` are taken from module scope.
    """
    with open(meme_file) as handle:
        records = motifs.parse(handle, "meme")
    record = records[motif_number - 1]
    # NOTE(review): the "Unknown" fallback cannot be divided by below when the
    # scatter tick labels are built -- confirm every record carries num_occurrences.
    num_occurrences = getattr(record, "num_occurrences", "Unknown")

    annotate_dict = None
    if annotate == "" or annotate == " ":
        annotate = None
    elif annotate:
        with open(annotate) as annotate_handle:
            annotate_dict = json.load(annotate_handle)

    has_gerp = bool(sample_gerp_file and control_gerp_file)

    # Every score file is read inside its own context manager so that no
    # handle stays open (the original closed only the last one it opened).
    sample_phylo_scores = _read_score_column(sample_phylop_file)
    control_phylo_scores = _read_score_column(control_phylop_file)
    if has_gerp:
        sample_gerp_scores = _read_score_column(sample_gerp_file)
        control_gerp_scores = _read_score_column(control_gerp_file)

    assert len(sample_phylo_scores) == len(control_phylo_scores)

    profile = position_wise_profile(getattr(record, score_type), record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    # motif_scores: score of the most frequent base at each motif position
    motif_scores = np.asarray([position[0][1] for position in max_occur])

    sample_phylo_scores = np.asarray(sample_phylo_scores)
    control_phylo_scores = np.asarray(control_phylo_scores)
    if has_gerp:
        sample_gerp_scores = np.asarray(sample_gerp_scores)
        control_gerp_scores = np.asarray(control_gerp_scores)

    fs = flanking_sites

    # Split every score track into its flanking part and its motif part.
    flanking_sample_phylo_scores = np.concatenate((sample_phylo_scores[0:fs], sample_phylo_scores[-fs:]))
    motif_sample_phylo_scores = sample_phylo_scores[fs:-fs]
    motif_control_phylo_scores = control_phylo_scores[fs:-fs]
    if has_gerp:
        flanking_sample_gerp_scores = np.concatenate((sample_gerp_scores[0:fs], sample_gerp_scores[-fs:]))
        motif_sample_gerp_scores = sample_gerp_scores[fs:-fs]
        motif_control_gerp_scores = control_gerp_scores[fs:-fs]

    pr_p = pearsonr(motif_scores, motif_sample_phylo_scores)
    if has_gerp:
        pr_g = pearsonr(motif_scores, motif_sample_gerp_scores)

    # H_0: mean score of motif sites equals mean score of flanking sites
    # H_1: mean score of motif sites > mean score of flanking sites
    # perform_t_test returns a two-tailed p-value (per the original note:
    # independent t-test, unequal sample sizes, equal variances); it is
    # converted to the one-tailed value according to the sign of T.
    T_deltaphylop, p_deltaphylop = perform_t_test(motif_sample_phylo_scores, flanking_sample_phylo_scores)
    delta_phylop = np.mean(motif_sample_phylo_scores) - np.mean(flanking_sample_phylo_scores)
    if T_deltaphylop < 0:
        p_deltaphylop = 1 - p_deltaphylop * 0.5
    else:
        p_deltaphylop = p_deltaphylop * 0.5

    if has_gerp:
        T_deltagerp, p_deltagerp = perform_t_test(motif_sample_gerp_scores, flanking_sample_gerp_scores)
        delta_gerp = np.mean(motif_sample_gerp_scores) - np.mean(flanking_sample_gerp_scores)
        if T_deltagerp < 0:
            p_deltagerp = 1 - p_deltagerp * 0.5
        else:
            p_deltagerp = p_deltagerp * 0.5

    # Ordinary least squares fits of conservation score against motif score
    y_reg_phylop_sample = _ols_fitted_line(motif_sample_phylo_scores, motif_scores)
    y_reg_phylop_control = _ols_fitted_line(motif_control_phylo_scores, motif_scores)
    if has_gerp:
        y_reg_gerp_sample = _ols_fitted_line(motif_sample_gerp_scores, motif_scores)
        y_reg_gerp_control = _ols_fitted_line(motif_control_gerp_scores, motif_scores)

    motif_length = record.length
    meme_dir = os.path.dirname(meme_file)

    # X pixel coordinate of every base (flanks included). The first position
    # was found by trial and error; each next one is a + 1.9 further right.
    X = [40 + 15]
    for j in range(1, len(record) + 2 * fs):
        X.append(X[j - 1] + a + 1.9)

    y_phylop_pixels = [__scale__ * x for x in sample_phylo_scores]

    # FIXME (kept from the original): the reverse-complement figure is
    # produced by running the whole plotting code a second time with the
    # score track reversed.
    logo_name = ["logo{}.png".format(motif_number), "logo_rc{}.png".format(motif_number)]
    for ln in logo_name:
        if "rc" in ln:
            y_phylop_pixels.reverse()
        logo = plt.imread(os.path.join(meme_dir, ln))
        height_px = logo.shape[0]  # Should be 212

        # Width reserved right of the logo: 4 logo heights, plus 2 for the
        # GERP panel and 2 for the metadata table when present.
        panel_units = 4 + (2 if has_gerp else 0) + (2 if annotate else 0)
        total_px = X[-1] + panel_units * height_px + 140
        right = (panel_units * height_px + 10 + 140 - 0.2 * height_px) / total_px

        figsize = (total_px / 100, (2 * height_px) / 100 + 0.6)
        gs = gridspec.GridSpec(2, 1)
        gs.update(top=1.0, bottom=0.14, left=0.08, right=1 - right)
        fig = plt.figure(figsize=figsize, dpi=dpi, facecolor="w", edgecolor="k")

        # ---- logo -----------------------------------------------------
        logo_plot = plt.Subplot(fig, gs[0])
        ## TODO (kept from the original): check these scale factors
        if motif_length > 45:
            XSCALE_FACTOR = motif_length / 1.9
            z = 2
        elif motif_length > 40:
            XSCALE_FACTOR = motif_length / 2.25
            z = 2.5
        elif motif_length > 36:
            XSCALE_FACTOR = motif_length / 1.95
            z = 2
        elif motif_length > 21:
            XSCALE_FACTOR = motif_length / 5
            z = 3
        else:
            XSCALE_FACTOR = 4.5
            z = 3
        logo_plot.imshow(
            logo, extent=[40 + 15 + z * (a + 1.9), logo.shape[1] + 15 + XSCALE_FACTOR * (a + 1.9), 0, logo.shape[0]]
        )
        logo_plot.set_axis_off()
        fig.add_subplot(logo_plot)

        # ---- PhyloP stem plot (flank / motif / flank) -------------------
        stem_plot = plt.Subplot(fig, gs[1], sharex=logo_plot)

        def draw_flank(xs, ys):
            # Flanking positions are drawn in the flanking colour.
            markerline, stemlines, baseline = stem_plot.stem(
                xs,
                ys,
                markerfmt="_",
                linefmt="-",
                markerfacecolor=flankingstemcolor,
                color=greycolor,
            )
            setp(stemlines, "color", flankingstemcolor)
            setp(markerline, "markerfacecolor", flankingstemcolor)
            setp(markerline, "color", flankingstemcolor)
            setp(stemlines, "linewidth", linewidth)
            setp(markerline, "markersize", markersize)
            setp(baseline, "linewidth", linewidth - 0.5)
            setp(markerline, "markeredgewidth", markeredgewidth)

        draw_flank(X[:fs], y_phylop_pixels[:fs])

        markerline, stemlines, baseline = stem_plot.stem(
            X[fs:-fs], y_phylop_pixels[fs:-fs], markerfmt="g_", linefmt="g-", basefmt="r-"
        )
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(markerline, "markeredgewidth", markeredgewidth)
        setp(baseline, "linewidth", linewidth - 0.5)

        draw_flank(X[-fs:], y_phylop_pixels[-fs:])

        # Tick positions: two unlabeled ticks per flank, and a labeled tick
        # on every fifth motif position (labels are 1-based).
        left_idx = np.linspace(-fs, -1, 2)
        motif_idx = np.arange(0, len(X) - 2 * fs, 5)
        right_idx = np.linspace(motif_length, motif_length + fs - 1, 2)
        indices_str = (
            [""] * len(left_idx)
            + ["${}$".format(int(i) + 1) for i in motif_idx]
            + [""] * len(right_idx)
        )
        indices = np.concatenate((left_idx, motif_idx, right_idx))
        xticks = [X[int(i) + fs] for i in indices]

        stem_plot.yaxis.set_major_locator(plt.MaxNLocator(3))
        stem_plot.set_xlabel(r"$\mathrm{Base}\ \mathrm{Position}$", fontsize=fontsize, fontweight="bold")
        stem_plot.set_xlim([1.2 * a, X[-1] + linewidth * 1.8])
        # The upper limit is at least 0.01, mirroring the lower bound. The
        # original passed 0.01 to np.max as its `axis` argument.
        stem_plot.set_ylim(
            [min(np.min(y_phylop_pixels), -0.01) - 0.03, max(np.max(y_phylop_pixels), 0.01)]
        )
        stem_plot.set_xticks(xticks)
        stem_plot.set_xticklabels(indices_str, fontsize=fontsize)
        stem_plot.spines["top"].set_visible(False)
        stem_plot.spines["right"].set_visible(False)
        stem_plot.yaxis.set_ticks_position("left")
        stem_plot.xaxis.set_ticks_position("bottom")
        stem_plot.spines["left"].set_position("zero")
        _style_axis_ticks(stem_plot)
        stem_plot.set_ylabel(r"$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize)
        fig.add_subplot(stem_plot)

        # ---- grid for the panels right of the logo ----------------------
        # Column 0 is PhyloP; then GERP (if any), the histogram, and the
        # metadata table (if any), in that order.
        n_cols = 2 + (1 if has_gerp else 0) + (1 if annotate else 0)
        gs1 = gridspec.GridSpec(2, n_cols, height_ratios=[1, 4], width_ratios=[1] * n_cols)
        next_col = 1
        if has_gerp:
            gerp_header_subplot_gs = gs1[0, next_col]
            gerp_subplot_gs = gs1[1, next_col]
            next_col += 1
        histogram_header_subplot_gs = gs1[0, next_col]
        histogram_subplot_gs = gs1[1, next_col]
        next_col += 1
        if annotate:
            ann_header_subplot_gs = gs1[0, next_col]
            ann_subplot_gs = gs1[1, next_col]
        gs1.update(bottom=0.14, right=0.95, left=1 - right * 0.85, wspace=0.5)

        # Tick positions/labels shared by both scatter plots.
        score_ticks = np.linspace(1.02 * min(motif_scores), 1.02 * max(motif_scores), num=5, endpoint=True)
        score_tick_labels = ["$%.2f$" % (x / num_occurrences) for x in score_ticks]
        point_sizes = [pointsize] * len(motif_scores)

        # ---- PhyloP legend ----------------------------------------------
        phylop_plots_leg = plt.Subplot(fig, gs1[0, 0], autoscale_on=True)
        textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{Phylop}=%.2f$($p=%s$)\\~\\" % (
            pr_p[0],
            _latex_pvalue(pr_p[1]),
            delta_phylop,
            _latex_pvalue(p_deltaphylop),
        )
        txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
        phylop_plots_leg.set_frame_on(False)
        phylop_plots_leg.set_xticks([])
        phylop_plots_leg.set_yticks([])
        phylop_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
        fig.add_subplot(phylop_plots_leg)

        # ---- PhyloP scatter ---------------------------------------------
        phylop_scatter_plot = plt.Subplot(fig, gs1[1, 0], autoscale_on=True)
        fit_fn = np.poly1d(np.polyfit(motif_scores, motif_sample_phylo_scores, 1))
        phylop_scatter_plot.scatter(motif_scores, motif_sample_phylo_scores, color="g", s=point_sizes)
        phylop_scatter_plot.plot(
            motif_scores,
            y_reg_phylop_sample,
            "g",
            motif_scores,
            fit_fn(motif_scores),
            color="g",
            linewidth=plot_linewidth,
        )
        phylop_scatter_plot.scatter(motif_scores, motif_control_phylo_scores, color=greycolor, s=point_sizes)
        phylop_scatter_plot.plot(motif_scores, y_reg_phylop_control, color=greycolor, linewidth=plot_linewidth)
        phylop_scatter_plot.set_xticks(score_ticks)
        phylop_scatter_plot.set_xticklabels(score_tick_labels)
        phylop_scatter_plot.yaxis.set_major_locator(plt.MaxNLocator(4))
        phylop_scatter_plot.set_xlabel(r"$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
        phylop_scatter_plot.set_ylabel(r"$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize, fontweight="bold")
        _style_axis_ticks(phylop_scatter_plot)
        fig.add_subplot(phylop_scatter_plot)

        # ---- GERP legend and scatter (only with GERP data) -----------------
        # The legend subplot is created inside this guard: its grid cell
        # exists only when GERP files were supplied.
        if has_gerp:
            gerp_plots_leg = plt.Subplot(fig, gerp_header_subplot_gs, autoscale_on=True)
            gerp_plots_leg.set_frame_on(False)
            gerp_plots_leg.set_xticks([])
            gerp_plots_leg.set_yticks([])
            # p-value of the GERP correlation (pr_g), matching the R value
            # shown next to it; the original printed the PhyloP p-value here.
            textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{{Gerp}}=%.2f$($p=%s$)\\~\\" % (
                pr_g[0],
                _latex_pvalue(pr_g[1]),
                delta_gerp,
                _latex_pvalue(p_deltagerp),
            )
            txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
            gerp_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
            fig.add_subplot(gerp_plots_leg)

            gerp_scatter_plot = plt.Subplot(fig, gerp_subplot_gs, autoscale_on=True)
            gerp_scatter_plot.scatter(motif_scores, motif_sample_gerp_scores, color="g", s=point_sizes)
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_sample, color="g", linewidth=plot_linewidth)
            gerp_scatter_plot.scatter(motif_scores, motif_control_gerp_scores, color=greycolor, s=point_sizes)
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_control, color=greycolor, linewidth=plot_linewidth)
            gerp_scatter_plot.set_xticks(score_ticks)
            gerp_scatter_plot.set_xticklabels(score_tick_labels)
            gerp_scatter_plot.yaxis.set_major_locator(plt.MaxNLocator(4))
            gerp_scatter_plot.set_xlabel(r"$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
            gerp_scatter_plot.set_ylabel(r"$\mathrm{GERP}\ \mathrm{Score}$", fontsize=fontsize, fontweight="bold")
            _style_axis_ticks(gerp_scatter_plot)
            fig.add_subplot(gerp_scatter_plot)

        # ---- enrichment legend and histogram -----------------------------
        enrichment_plot4 = plt.Subplot(fig, histogram_header_subplot_gs, autoscale_on=True)
        enrichment_plot4.set_frame_on(False)
        enrichment_plot4.set_xticks([])
        enrichment_plot4.set_yticks([])
        all_distances = get_motif_distances(peak_file, fimo_file)
        # Lists instead of filter(): filter objects have no len() on Python 3.
        motifs_within_100 = [d for d in all_distances if -100 <= d <= 100]
        motifs_within_100_200 = [d for d in all_distances if 100 < d < 200 or -200 < d < -100]
        # float() forces true division regardless of interpreter version.
        if len(motifs_within_100_200) > 0:
            enrichment = len(motifs_within_100) / float(len(motifs_within_100_200))
        else:
            enrichment = 1
        number_of_sites = len(motifs_within_100) + len(motifs_within_100_200)
        probability = 200 / float(ENRICHMENT_SEQ_LENGTH - motif_length)
        enrichment_pval = _latex_pvalue(binom.sf(len(motifs_within_100), number_of_sites, probability))
        textstr = r"\noindent$Enrichment={0:.2f}$\\~\\$(p={1})$".format(enrichment, enrichment_pval)
        txtx = 0.1 * len(textstr) / 100.0
        enrichment_plot4.text(txtx, txty, textstr, fontsize=legend_fontsize)
        fig.add_subplot(enrichment_plot4)

        enrichment_plot = plt.Subplot(fig, histogram_subplot_gs, autoscale_on=True)
        enrichment_plot.hist(all_distances, histogram_nbins, color="white", alpha=0.8, range=[-200, 200])
        distance_ticks = [-200, -100, 0, 100, 200]
        enrichment_plot.set_xticks(distance_ticks)
        enrichment_plot.yaxis.set_major_locator(plt.MaxNLocator(3))
        enrichment_plot.set_xticklabels(["${}$".format(x) for x in distance_ticks])
        _style_axis_ticks(enrichment_plot)
        enrichment_plot.axvline(x=-100, linewidth=3, color="red", linestyle="-.")
        enrichment_plot.axvline(x=100, linewidth=3, color="red", linestyle="-.")
        fig.add_subplot(enrichment_plot)

        # NOTE(review): the original first built a path under the FIMO
        # directory and immediately overwrote it with the bare file name, so
        # the figure has always been written to the current working
        # directory. That behavior is kept; confirm it is intended.
        if "rc" not in ln:
            out_file = "motif{}Combined_plots.png".format(motif_number)
        else:
            out_file = "motif{}Combined_plots_rc.png".format(motif_number)

        # ---- metadata table ---------------------------------------------
        if annotate:
            keys = ["title", "gene_name", "dataset", "assembly"]
            data = [[r"$" + key.replace("_", " ").upper() + "$", r"$" + annotate_dict[key] + "$"] for key in keys]
            ann_header = plt.Subplot(fig, ann_header_subplot_gs, autoscale_on=True)
            ann_header.set_frame_on(False)
            ann_header.set_xticks([])
            ann_header.set_yticks([])
            fig.add_subplot(ann_header)
            textstr = r"$Metadata$"
            txtx = 1.7 * len(textstr) / 100.0
            ann_header.text(txtx, txty, textstr, fontsize=legend_fontsize)
            ann_plot = plt.Subplot(fig, ann_subplot_gs, autoscale_on=True)
            ann_plot.set_xticks([])
            ann_plot.set_yticks([])
            ann_plot.set_frame_on(False)
            table = ann_plot.table(cellText=data, loc="center")
            table.scale(1, 2)
            fontproperties = FontProperties(size=legend_fontsize * 8)
            for (row, col), cell in table.get_celld().items():
                if row > 0 and col > 0:
                    cell.set_text_props(fontproperties=fontproperties)
            table.set_fontsize(legend_fontsize * 8)
            fig.add_subplot(ann_plot)

        # The figure size was fixed when the figure was created; `figsize`
        # is not a savefig option, so it is no longer passed here.
        fig.savefig(out_file, dpi=dpi)
if line.strip(): fixed_pfm_file.write(line) fixed_pfm_file.close() # Output is printed to stdout to enable pipes to other process, etc. The output gives the effect of each possible mutation # for each jaspar pfm. The file is tab delimited, with a header to make for easy reading into R or other downstream analyses. # Each line of the output has the following fields: # # 1) name -- the name of the pfm in JASPAR # 2) pos -- the relative position within the matrix. The value is from -1 to 1, where 0 is the center of the motif. # 3-8) NN -- the change in pssm score associated with each possible mutation at that position in the motif. # print "name\tpos\tIC\tDegCons\tAG\tCT\tAC\tAT\tCG\tGT" with open("pfm_all.fixed.txt") as handle: for m in motifs.parse(handle, "jaspar"): # # Get the counts and the consensus motif for the pfm # counts = m.counts cons = m.consensus deg_cons = m.degenerate_consensus # # convert to pssm, adding a pseudocount of 0.1 to each base. # pssm = m.counts.normalize(pseudocounts=0.1).log_odds() cons_score = pssm.calculate(cons) cons_list = list(cons) cons_str = str(cons)
# Command-line arguments, in order:
#   1. input newick file   2. MEME motif file   3. output file   4. mRNA name
inputnewick_file = sys.argv[1]
inputmotifs = sys.argv[2]
outputfile = sys.argv[3]
gene = sys.argv[4]

# Read in the motif information; the with-block closes the file even when
# parsing raises.
with open(inputmotifs) as handle:
    results = motifs.parse(handle, "meme")

# create list to store motifs
motif_branch_lengths_list = []

# read in tree
tree = Phylo.read(inputnewick_file, "newick")

# create and print cutoff value
cutoff = (60 / (math.log10(0.01) + 100))
print("cutoff = " + str(cutoff))

# function to calculate branch length of Drosophila with motifs
def get_motifs(meme_file):
    """Return the number of motifs recorded in the MEME output *meme_file*."""
    # The context manager closes the file; the original left it open.
    with open(meme_file, 'r') as handle:
        records = motifs.parse(handle, 'meme')
    return len(records)
from uuid import uuid4
from celery import Celery
from peaks_processor_celery import run_conservation_analysis, run_motif_analysis, run_analysis
# NOTE(review): the flask.ext.* import namespace was removed in Flask 1.0;
# confirm the deployed Flask version before upgrading.
from flask.ext.sqlalchemy import SQLAlchemy
import shutil
import json
from celery import group
from config_processor import read_config
from encode_peak_file_downloader import get_encode_peakfiles, get_metadata_for_peakfile
import subprocess
from bed_operations.format_peakfile import convert_to_scorefile
from query import (
    get_async_id,
    encode_job_exists,
    insert_encode_job,
    update_job_status,
    insert_new_job,
    get_encode_metadata,
    get_filename,
    get_job_status,
    job_exists,
    encode_job_status,
    get_encode_jobid,
    is_job_type_encode,
    get_encode_from_jobid,
    get_all_encode_results,
)
from database import SqlAlchemyTask
import operator
from Bio import motifs

# JASPAR motifs, parsed once at import time. The file is opened in a
# context manager so the handle is released after parsing.
with open('../data/pfm_vertebrates.txt') as _jaspar_handle:
    jaspar_motifs = motifs.parse(_jaspar_handle, 'jaspar')

# Settings are read from the 'Server' and 'StaticPaths' configuration sections.
server_config = read_config('Server')
path_config = read_config('StaticPaths')

app = Flask(__name__)
app.config['CELERY_BROKER_URL'] = server_config['celery_broker_url']
app.config['CELERY_RESULT_BACKEND'] = server_config['celery_result_backend']
app.config['SQLALCHEMY_DATABASE_URI'] = server_config['sqlalchemy_database_uri']
app.config['CELERYD_MAX_TASKS_PER_CHILD'] = server_config['celery_max_tasks_per_child']
app.config['CELERY_IMPORTS'] = ('app',)
app.config['CELERYD_TASK_TIME_LIMIT'] = 1000000
app.url_map.strict_slashes = False
import Bio sys.path.insert(0,'/mnt/lustre/home/cusanovich/Programs/lib/python2.6/site-packages/Bio') from Bio import motifs from Bio.Seq import Seq from Bio.Alphabet import IUPAC import numpy from datetime import datetime fasta = pybedtools.BedTool('/mnt/lustre/data/share/HumanGenome/allhg18_norandom.fasta') matrices = open('/data/share/TRANSFAC/2011.3_nb/dat/matrix.dat','r') bed = open('/mnt/lustre/home/cusanovich/centipede/jack_centipede_sorted.bed','r') outbed = open('/mnt/lustre/home/cusanovich/centipede/jack_centipede_sorted_pwms_timedright2.bed','w') #liner = ['chr1','847521','847534','M01066','0.9675','-'] d = pybedtools.BedTool("""chr1 840146 840165""", from_string=True) genome = d.sequence(fi=fasta) test = motifs.parse(matrices,"TRANSFAC") motifers = [] for i in range(len(test)): test[i].pseudocounts = 0.5 motifers.append(test[i]['AC']) matrices.close() x=1 pssms = {} for line in bed: liner = line.strip().split() motifed = motifers.index(liner[3]) #For some reason, Jack and Roger's bed files seem to be off in coordinates?!?!?! testbed = liner[0] + ':' + str(int(liner[1])-2) + '-' + str(int(liner[2])+2) testseq = Seq(genome.seq(testbed,fasta),IUPAC.unambiguous_dna) if liner[5] == '-':
## fetch_motifs method: fetch the motifs that match some criteria:
## any of the metadata fields, min_ic (minimum information content),
## minimum length of the matrix, minimum number of sites used to build it
#motfs = jdb.fetch_motifs( collection='CORE', tax_group=['vertebrates','insects'], min_ic=12)
#for motif in motfs:
    #print 'do something with the motif'

# Every print below takes a single argument, so the output is the same
# whether print is the Python 2 statement or the Python 3 function.
print(motifs.jaspar.calculate_pseudocounts(arnt))  # create new calculated pseudocounts
print(arnt.pseudocounts)  # usually zeros
arnt.pseudocounts = motifs.jaspar.calculate_pseudocounts(arnt)
print(arnt.pseudocounts)
print(arnt.counts)  #, arnt.pssm()

#MEME
# input: DNA or protein sequences; output: the requested number of motifs
with open('meme.txt','r') as handle:
    motfs = motifs.parse(handle,'meme')  # motif meme format

# motfs is an object of the Bio.motifs.meme.Record class, a list of Motif objects
# attributes
print("{} {} {} {}".format(motfs.version, motfs.datafile, motfs.command, motfs.alphabet))
print(motfs.sequences)  # list of names
print(len(motfs))  # number of motifs

for motif in motfs:
    # attributes
    print("{} {} {} {}".format(motif.name, motif.num_occurrences, motif.length, motif.evalue))
    print(motif.consensus)
    print(motif.degenerate_consensus)
    print("{} {} {}".format(len(motif.instances), motif.instances[0], motif.instances[0].start))
    print(motif.instances[0].pvalue)

print(motfs[0])  # by index
print(motfs['Motif 2'])  # by name