def test_memeprofile(self):
    """Test entropy calculations.

    The expected per-position values below are the ones this test has always
    compared against; they appear to have been read off the relative-entropy
    diagram that MEME prints for the first motif (y axis from 0.0 to 2.3 bits,
    23.5 bits in total, 25 positions).  The comparison is deliberately loose
    (``atol=0.6``).
    """
    expected_ic = np.array([1.2, 0.9, 0.7, 0.9, 0.5,
                            0.9, 0.5, 1.2, 1.2, 0.5,
                            0.2, 1.2, 1.2, 1.2, 0.9,
                            1.4, 1.2, 1.2, 0.5, 0.9,
                            1.2, 1.4, 0.7, 1.8, 1.6])

    # Parsing the file and fetching the first record is kept as a smoke check
    # even though the record itself is not inspected further.
    parsed = read_memefile(self.meme_file)
    first_record = parsed['motif_records'][0]

    observed_ic = get_motif_ic(self.meme_file, 0)
    assert np.allclose(expected_ic, observed_ic, atol=0.6)
def test_meme(self):
    """Test meme runner.

    Runs MEME on ``self.meme_fasta`` into a freshly emptied output directory
    and checks the exit code, the number of motifs found and the consensus
    sequence of the first three motifs.
    """
    out_dir = 'tests/data/generated_out/meme_analysis'
    # Start from a clean slate so stale results cannot satisfy the assertions.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)

    meme_args = self.pipeline.get_meme_default_params
    # The default parameters carry a ' -p <cpu count>' flag; it is stripped
    # here (presumably so the test does not depend on a parallel MEME build).
    cpu_flag = ' -p {}'.format(get_cpu_count())
    output = self.pipeline.run_meme(fasta_in=self.meme_fasta,
                                    out_dir=out_dir,
                                    strargs=meme_args.replace(cpu_flag, ''))
    #TODO Check if meme.txt is same and created
    #TODO This check is too stringent, specially if logos are being produced.
    #MEME installation leads to hard coded paths

    # Parenthesised form prints identically on Python 2 and is valid Python 3.
    print(output)
    assert output['exitcode'] == 0

    meme_record = read_memefile(out_dir + '/meme.txt')
    assert meme_record['total_motifs'] == 5

    expected_consensus = ['CAGAACGCTGCTGCCAACCCGACCT', 'AGCAGA', 'CAGTTT']
    for index, consensus in enumerate(expected_consensus):
        assert meme_record['motif_records'][index].consensus == consensus
COUNT_TYPE = 'counts' client = MongoClient() db = client.moca_encode_tf db.encode_tf_stats.remove() for d in os.listdir(__root_dir__): results = db.tf_metadata.find({'@id': '/experiments/{}/'.format(d)}) meme_file = os.path.join(__root_dir__, d, 'moca_output', 'meme_out', 'meme.txt') centrimo_dir = os.path.join(__root_dir__, d, 'moca_output', 'centrimo_out') if not os.path.isfile(meme_file): print 'Skipping {}'.format(d) continue meme_info = read_memefile(meme_file) total_sequences = get_total_sequences(meme_file) for i in range(0, meme_info['total_motifs']): record = meme_info['motif_records'][i] max_occur = get_max_occuring_bases(record, max_count=1, count_type=COUNT_TYPE) motif_freq = [] for position in max_occur: motif_freq.append(position[0][1]) motif_freq = np.asarray(motif_freq) fimo_sample = os.path.join(os.path.dirname(meme_file),
def create_plot(meme_file, plot_title, output_dir=None, centrimo_dir=None,
                motif_number=1, flank_length=5, sample_score_files=None,
                control_score_files=None, reg_plot_titles=None,
                annotate=None, save=True):
    """Create the summary figures for one MEME motif.

    For every (sample scores, control scores, title) triple two figures are
    built: one on top of the forward logo (``logo<N>.png``) and one on top of
    the reverse-complement logo (``logo_rc<N>.png``).  Each figure holds the
    logo, a stem plot of the sample scores, a bar plot of motif occurrences,
    an OLS legend and a scatter plot; a centrimo enrichment panel and an
    annotation panel are added when requested.

    Parameters
    ----------
    meme_file: string
        Path to meme.txt
    plot_title: string
        Figure title; the motif number is appended to it
    output_dir: string
        Directory the figures are written to. Defaults to a ``moca_plots``
        directory next to the directory containing ``meme_file``
    centrimo_dir: string
        Path to centrimo's output directory. When given, an enrichment panel
        is drawn from its ``centrimo.txt`` and ``site_counts.txt``
    motif_number: int
        1-based number of motif in the motif file
    flank_length: int
        Number of flanking positions on each side of the motif
    sample_score_files: list
        Path to conservation scores files for sample
    control_score_files: list
        Path to conservation score files for control
    reg_plot_titles: list
        One title per sample/control pair; used for legends, axis labels and
        output file names. Pairs without a title are not plotted
    annotate:
        Passed through to the annotation panel; ``None``, ``""`` and ``' '``
        all mean "no annotation"
    save: bool
        Write each figure to ``output_dir`` when true

    Returns
    -------
    figures: list
        The figure objects, in creation order. Note that all pyplot figures
        are closed after each one is built.

    Raises
    ------
    MocaException
        If no sample or control score files are given, or their counts differ
    """
    # ``None`` defaults avoid sharing one list object between calls.
    sample_score_files = [] if sample_score_files is None else sample_score_files
    control_score_files = [] if control_score_files is None else control_score_files
    reg_plot_titles = [] if reg_plot_titles is None else reg_plot_titles

    meme_record = read_memefile(meme_file)
    total_sequences = get_total_sequences(meme_file)
    record = meme_record['motif_records'][motif_number-1]
    num_occurrences = getattr(record, 'num_occurrences', 'Unknown')
    all_meme_occurrences = [getattr(motif_record, 'num_occurrences', 'Unknown')
                            for motif_record in meme_record['motif_records']]

    meme_dir = os.path.abspath(os.path.dirname(meme_file))
    if not output_dir:
        output_dir = os.path.join(os.path.join(meme_dir, '..'), 'moca_plots')
    safe_makedir(output_dir)

    if len(sample_score_files) == 0:
        raise MocaException('Found no sample score files')
    elif len(control_score_files) == 0:
        raise MocaException('Found no control score filees')
    elif len(sample_score_files) != len(control_score_files):
        raise MocaException('Found unequal size of sample and control score files')

    # Column 0 of the lower grid always holds the OLS legend / scatter plot.
    # Every optional panel (annotation, centrimo) claims one more column.
    subplot_ncols = 1
    if annotate == "" or annotate == ' ':
        annotate = None
    if annotate:
        # The annotation panel is drawn in the last column, so it needs a
        # column of its own; without one it would overlay another panel.
        subplot_ncols += 1

    max_occur = get_max_occuring_bases(record, max_count=1, count_type=COUNT_TYPE)
    motif_freq = np.asarray([position[0][1] for position in max_occur])

    sample_conservation_scores = [np.loadtxt(path) for path in sample_score_files]
    control_conservation_scores = [np.loadtxt(path) for path in control_score_files]

    motif_length = record.length
    motif_evalue = record.evalue

    X_values = [40+15] ## this is by trial and error, the position for the first base logo
    ## Generate all other X coordinates
    for j in range(1, len(record)+2*flank_length):
        X_values.append(X_values[j-1]+OFFSET+1.9)

    if centrimo_dir:
        subplot_ncols += 1
        centrimo_dir = os.path.abspath(centrimo_dir)
        centrimo_txt = os.path.join(centrimo_dir, 'centrimo.txt')
        centrimo_stats = os.path.join(centrimo_dir, 'site_counts.txt')

    plot_title += r' \# {}'.format(motif_number)

    ##FIXME This is a big dirty hack to generate plots for the reverse complement logo too
    logo_names = ['logo{}.png'.format(motif_number),
                  'logo_rc{}.png'.format(motif_number)]
    figures = []
    score_triples = zip(sample_conservation_scores,
                        control_conservation_scores,
                        reg_plot_titles)
    for sample_score, control_score, subplot_legend_title in score_triples:
        for logo_filename in logo_names:
            is_reverse_complement = 'rc' in logo_filename
            setup_matplotlib()
            if is_reverse_complement:
                # The forward logo is handled first, so this flips the sample
                # scores exactly once per pair.
                # NOTE(review): control_score and motif_freq are not reversed
                # for the reverse-complement figure - confirm this is intended.
                sample_score = sample_score[::-1]

            matplot_dict = init_figure(meme_dir=meme_dir,
                                       X_values=X_values,
                                       motif=motif_number,
                                       subplot_ncols=subplot_ncols,
                                       annotate=annotate)
            f = matplot_dict['figure']
            gs = matplot_dict['gs']
            figsize = matplot_dict['figsize']
            right_margin = matplot_dict['right_margin']

            title = r'\textbf{' + '\\underline{'+'{}'.format(plot_title)+'}}'
            f.suptitle(title, fontsize=LEGEND_FONTSIZE)

            logo_plot = create_logo_plot({'figure': f, 'gridspec': gs[0]},
                                         meme_dir, logo_filename, motif_length)

            subgrid = gridspec.GridSpec(2, subplot_ncols,
                                        height_ratios=[1, 2],
                                        width_ratios=[1]*subplot_ncols)
            subgrid.update(bottom=0.14, right=0.9,
                           left=1-right_margin*0.85, wspace=0.58)

            _, _, X_right = create_stemplot({'figure': f,
                                             'gridspec': gs[1],
                                             'shareX': logo_plot},
                                            X_values, sample_score,
                                            motif_length,
                                            flank_length=flank_length,
                                            legend_title=subplot_legend_title)

            create_bar_plot(logo_plot, X_right, matplot_dict['height_px'],
                            total_sequences, all_meme_occurrences,
                            motif_number, motif_evalue)

            create_ols_legend_plot({'figure': f, 'gridspec': subgrid[0, 0]},
                                   motif_freq, sample_score, control_score,
                                   flank_length,
                                   legend_title=subplot_legend_title)

            create_scatter_plot({'figure': f, 'gridspec': subgrid[1, 0]},
                                motif_freq, sample_score, control_score,
                                flank_length, num_occurrences,
                                y_label=subplot_legend_title)

            if centrimo_dir:
                create_enrichment_plot({'figure': f,
                                        'gridspec_header': subgrid[0, 1],
                                        'gridspec_body': subgrid[1, 1]},
                                       motif_number, centrimo_txt,
                                       centrimo_stats)

            if not is_reverse_complement:
                out_file = os.path.join(output_dir, 'moca_{}_{}.png'.format(subplot_legend_title, motif_number))
            else:
                out_file = os.path.join(output_dir, 'moca_{}_{}_rc.png'.format(subplot_legend_title, motif_number))

            if annotate:
                create_annnotation_plot({'figure': f,
                                         'gridspec_header': subgrid[0, -1],
                                         'gridspec_body': subgrid[1, -1]},
                                        annotate)

            if save:
                f.savefig(out_file, figsize=figsize, dpi=DPI)
            figures.append(f)
            plt.close('all')
    return figures
def find_motifs(bedfile, oc, configuration, slop_length, flank_motif, n_motif,
                cores, genome_build, show_progress):
    """Run meme to locate motifs and create conservation stacked plots.

    The peaks in ``bedfile`` are split into a training and a test set. MEME is
    run on the training sequences and CENTRIMO on the test sequences. For
    every motif MEME reports, FIMO is run on the test sequences and on a
    shuffled copy of the training sequences, conservation scores are saved
    for both, and ``create_plot`` is called.

    Parameters
    ----------
    bedfile: string
        Path to the peak file in BED format
    oc: string
        Output directory. Defaults to ``moca_output`` under the current
        working directory when empty
    configuration:
        Passed to ``pipeline.Pipeline``
    slop_length: int
        Flank length used when slopping the train/test peaks
    flank_motif: int
        Flank length around each FIMO hit used for scoring and plotting
    n_motif: int
        Number of motifs expected; only used to size the progress bar
    cores: int
        Number of cores for MEME. ``1`` removes the ``-p`` flag from the
        default MEME parameters, any other non-zero value replaces its
        argument. A falsy value leaves the defaults untouched
    genome_build: string
        Genome build name looked up through the pipeline's genome data
    show_progress: bool
        Display a progress bar while running

    Notes
    -----
    Exits the process with status 1 if MEME writes anything to stderr.
    """
    moca_out_dir = oc if oc else os.path.join(os.getcwd(), 'moca_output')

    moca_pipeline = pipeline.Pipeline(configuration)
    genome_data = moca_pipeline.get_genome_data(genome_build)
    genome_fasta = genome_data['fasta']
    genome_table = genome_data['genome_table']

    # Keep only the conservation tracks configured for this genome build,
    # remembering their order so score files and plot titles stay aligned.
    wigfiles = {}
    wig_keys = []
    for key in conservation_wig_keys:
        try:
            wigfiles[key] = genome_data['{}_wig'.format(key)]
        except KeyError:
            continue
        wig_keys.append(key)

    safe_makedir(moca_out_dir)
    bedfile_fn, _ = filename_extension(bedfile)

    if show_progress:
        msg_list = ['Extracting Fasta', 'Running MEME', 'Running CENTRIMO']
        msg_list_e = ['Generating random Fasta', 'Running fimo random', 'Running fimo main'] + ['Extracting Scores']*len(wig_keys) + ['Creating PLot']
        msg_list = msg_list + msg_list_e*n_motif
        progress_bar = ProgressBar(msg_list)

    query_train_fasta = os.path.join(moca_out_dir,
                                     bedfile_fn + '_train_flank_{}.fasta'.format(slop_length))
    query_test_fasta = os.path.join(moca_out_dir,
                                    bedfile_fn + '_test_flank_{}.fasta'.format(slop_length))

    if show_progress:
        progress_bar.show_progress('Extracting Fasta')
    bed_o = bedoperations.Bedfile(bedfile, genome_table, moca_out_dir)
    bed_train, bed_test = bed_o.split_train_test_bed(train_peaks_count=500,
                                                     test_peaks_count=500)
    bed_train_slopped = bed_o.slop_bed(bed_train, flank_length=slop_length)
    bed_test_slopped = bed_o.slop_bed(bed_test, flank_length=slop_length)
    bed_o.extract_fasta(bed_train_slopped, fasta_in=genome_fasta,
                        fasta_out=query_train_fasta)
    bed_o.extract_fasta(bed_test_slopped, fasta_in=genome_fasta,
                        fasta_out=query_test_fasta)

    meme_out_dir = os.path.join(moca_out_dir, 'meme_out')
    memechip_out_dir = meme_out_dir

    # re.sub returns a new string, so its result must be assigned. The pattern
    # matches only a standalone '-p <value>' flag, not flags that merely start
    # with '-p'.
    meme_params = moca_pipeline.get_meme_default_params
    if cores == 1:
        meme_params = re.sub(r'(?:^|\s+)-p\s+\S+', '', meme_params)
    elif cores:
        meme_params = re.sub(r'(?<!\S)-p\s+\S+', '-p {}'.format(cores),
                             meme_params)

    if show_progress:
        progress_bar.show_progress('Running MEME')
    meme_run_out = moca_pipeline.run_meme(fasta_in=query_train_fasta,
                                          out_dir=meme_out_dir,
                                          strargs=meme_params)
    if meme_run_out['stderr'] != '':
        sys.stderr.write('Error running MEME: {}\n'.format(meme_run_out['stderr']))
        sys.exit(1)

    meme_file = os.path.join(meme_out_dir, 'meme.txt')
    meme_summary = read_memefile(meme_file)

    if show_progress:
        progress_bar.show_progress('Running CENTRIMO')
    centrimo_main_dir = os.path.join(moca_out_dir, 'centrimo_out')
    moca_pipeline.run_centrimo(meme_file=meme_file,
                               fasta_in=query_test_fasta,
                               out_dir=centrimo_main_dir)

    for motif in range(1, meme_summary['total_motifs']+1):
        fimo_rand_dir = os.path.join(memechip_out_dir, 'fimo_random_{}'.format(motif))
        fimo_main_dir = os.path.join(memechip_out_dir, 'fimo_out_{}'.format(motif))
        # The shuffled fasta is written here before FIMO runs, so the
        # directory has to exist up front.
        safe_makedir(fimo_rand_dir)
        random_fasta = os.path.join(fimo_rand_dir, 'random_{}.fa'.format(motif))

        if show_progress:
            progress_bar.show_progress('Generating Random Fasta: {}'.format(motif))
        moca_pipeline.run_fasta_shuffler(fasta_in=query_train_fasta,
                                         fasta_out=random_fasta)

        # Random (control) sequences
        if show_progress:
            progress_bar.show_progress('Running FIMO Random')
        moca_pipeline.run_fimo(motif_file=meme_file,
                               motif_num=motif,
                               sequence_file=random_fasta,
                               out_dir=fimo_rand_dir)

        # Main (test) sequences
        if show_progress:
            progress_bar.show_progress('Running FIMO Main')
        moca_pipeline.run_fimo(motif_file=meme_file,
                               motif_num=motif,
                               sequence_file=query_test_fasta,
                               out_dir=fimo_main_dir)

        fimo_rand_file = os.path.join(fimo_rand_dir, 'fimo.txt')
        fimo_main_file = os.path.join(fimo_main_dir, 'fimo.txt')

        main_intervals = get_start_stop_intervals(fimo_main_file,
                                                  flank_length=flank_motif)
        random_intervals = get_start_stop_intervals(fimo_rand_file,
                                                    flank_length=flank_motif)

        sample_score_files = []
        control_score_files = []
        # Only the tracks that were found are scored; indexing wigfiles with a
        # missing key would raise KeyError.
        for key in wig_keys:
            wigfile = wigfiles[key]
            if show_progress:
                progress_bar.show_progress('Extracting Scores')
            sample_score_file = moca_pipeline.save_conservation_scores(main_intervals,
                                                                       wigfile,
                                                                       fimo_main_dir,
                                                                       out_prefix=key)
            control_score_file = moca_pipeline.save_conservation_scores(random_intervals,
                                                                        wigfile,
                                                                        fimo_rand_dir,
                                                                        out_prefix=key)
            sample_score_files.append(sample_score_file)
            control_score_files.append(control_score_file)

        if show_progress:
            progress_bar.show_progress('Creating Plot')
        create_plot(meme_file,
                    bedfile_fn,
                    output_dir=moca_out_dir,
                    centrimo_dir=centrimo_main_dir,
                    motif_number=motif,
                    flank_length=flank_motif,
                    sample_score_files=sample_score_files,
                    control_score_files=control_score_files,
                    reg_plot_titles=[key.capitalize() for key in wig_keys],
                    annotate=None)

    if show_progress:
        progress_bar.close()