def run_experiment(shoals=30, fishes=2, replicas_top=0, replicas_bottom=0,
                   follow_refugia_force=0.2):
    # run headless simulations and plot the resulting frequency histogram
    frequencies = headless_simulations(
        shoals, fishes=fishes, replicas_top=replicas_top,
        replicas_bottom=replicas_bottom,
        follow_refugia_force=follow_refugia_force)
    fig, ax = plt.subplots(1, 1, constrained_layout=True)
    plot_histogram(ax, frequencies,
                   f'{replicas_bottom}: {replicas_top}, group size {fishes}')
    fig.show()
    return frequencies
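# Hedged usage sketch (not part of the source): compare a control run with one
# that adds replica fish at the top refugium. The parameter values below are
# illustrative only; run_experiment() itself assumes headless_simulations,
# plot_histogram, and matplotlib.pyplot (as plt) are available in this module.
control_frequencies = run_experiment(shoals=30, fishes=2)
replica_frequencies = run_experiment(shoals=30, fishes=2, replicas_top=3)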
def show_hist_colors(x, colors):
    """ histogram """
    for i, c in enumerate(colors):
        fig = plot_histogram(x, bins=None, color=c, y_log_scale=False,
                             x_log_scale=False, context='None', title='',
                             plot_path=None, name='hist', show_plot=False)
        # positioning
        if i >= 3:
            i, j = i % 3, 600
        else:
            i, j = i, 0
        fig.canvas.manager.window.setGeometry(i * 600, j, 600, 500)
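# Hedged usage sketch (not part of the source): the sample values and colors
# below are illustrative, and a Qt-based Matplotlib backend is assumed because
# show_hist_colors() places windows via fig.canvas.manager.window.setGeometry.
show_hist_colors([0.1, 0.4, 0.4, 0.7, 1.2, 1.3], colors=['b', 'g', 'r', 'c'])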
def analyze_damaged_files(self):
    """ analyze damaged files """
    # histograms of energy, sample count, and damage score
    if len(self.file_energy_list):
        plot_histogram(self.file_energy_list,
                       bins=np.logspace(np.log10(0.0001), np.log10(10000), 50),
                       y_log_scale=True, x_log_scale=True, context='None',
                       title='Energy', plot_path=self.plot_paths['z_score'],
                       name='energy_hist_n-{}'.format(self.dataset_cfg['n_examples']))
    if len(self.file_num_sample_list):
        plot_histogram(self.file_num_sample_list, bins=20, y_log_scale=True,
                       context='None', title='Num Samples',
                       plot_path=self.plot_paths['z_score'],
                       name='num_sample_hist_n-{}'.format(self.dataset_cfg['n_examples']))
    if len(self.damaged_score_list):
        plot_histogram(self.damaged_score_list, bins=50, y_log_scale=True,
                       context='None', title='Damaged Score',
                       plot_path=self.plot_paths['z_score'],
                       name='z_score_hist_n-{}'.format(self.dataset_cfg['n_examples']))

    # summary of filtered files
    print("\n--Analyze damaged files of ", self.__class__.__name__)
    print("too short files num: {}".format(len(self.short_file_list)))
    print("too weak files num: {}".format(len(self.weak_file_list)))
    print("damaged files num: {}".format(len(self.damaged_file_list)))

    # all audio files speakers
    if self.__class__.__name__ == 'SpeechCommandsDataset':

        # extract speaker info from the file names
        all_speakers_files = [
            re.sub(r'(\./)|(\w+/)|(\w+--)|(_nohash_[0-9]+.wav)', '', i)
            for i in list(np.concatenate(np.concatenate(
                np.array(self.set_audio_files, dtype='object'))))]

        # get all unique speakers
        all_speakers = np.unique(all_speakers_files)

        # print info
        print("number of audio files: ", len(all_speakers_files))
        print("speakers: ", all_speakers)
        print("number of speakers: ", len(all_speakers))

    # save damaged files
    for wav, score in self.damaged_file_list:
        copyfile(wav, self.plot_paths['damaged_files'] + wav.split('/')[-1])

    # print file lists to info files
    with open(self.info_file_damaged, 'w') as f:
        [print(i, file=f) for i in self.damaged_file_list]
    with open(self.info_file_short, 'w') as f:
        [print(i, file=f) for i in self.short_file_list]
    with open(self.info_file_weak, 'w') as f:
        [print(i, file=f) for i in self.weak_file_list]
    with open(self.info_file_strong, 'w') as f:
        [print(i, file=f) for i in self.strong_file_list]

    # broken file info
    plot_damaged_file_score(self.damaged_score_list,
                            plot_path=self.plot_paths['z_score'],
                            name='z_score_n-{}'.format(self.dataset_cfg['n_examples']),
                            enable_plot=True)
import functions as f
import os
import constant
import joblib
import plots

# f.merge_results()
# df = f.read_dataset(constant.CURR_PATH)
# pruned_df = f.prune_dataset(df)
# f.save_pruned(pruned_df, header=True)
# f.train_model()

plots.plot_vnfs_inst()
plots.plot_histogram(constant.PLOT_PATH)

# f.count_bins(constant.PRUNED_PATH)
# rf = joblib.load('rf_mod-e.sav')
# print(rf.best_params_)
def get_taus(alignment_set, bas_reader, reference_information, output_directory,
             movie_length, movie_limited_threshold,
             first_adapter_template_position, second_adapter_template_position,
             template_min_start, template_max_start, coarse_grain_binsize,
             subsample_to, is_legacy):
    """Perform tau analysis, by reference

    Generate the following plots:
    1) template start histogram
    2) start time histogram
    3) template-start position by start-time
    4) survival by template-position
    5) termination rate by template-position
    6) termination rates w/ 1st, 2nd, and RC-circle tau fits

    Save a summary CSV storing tau values, by reference
    """
    framerate = alignment_set.resourceReaders()[0].readGroupTable.FrameRate[0]

    # set up results file by writing header
    results_file = os.path.join(output_directory, 'results_summary.csv')
    with open(results_file, 'wb') as file:
        fieldnames = [
            'referenceName', 'nTotalAlignments', 'nConsideredAlignments',
            'nCohortAlignments', 'percentMovieLimited', 'tau1', 'tau2', 'tauRC'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

    for ref in reference_information:
        logging.info(
            ('Processing alignments from reference ' + str(ref['FullName'])))
        # row in recarray w/ 'ID', 'FullName', and 'SMRTBellSize'
        if is_legacy:
            aln_to_ref_indices = np.flatnonzero(
                alignment_set.index['RefGroupID'] == int(ref['ID']))
        else:
            aln_to_ref_indices = np.flatnonzero(
                alignment_set.index['tId'] == int(ref['ID']))

        if len(aln_to_ref_indices) > 0:  # check that there are alignments
            max_start_time = 5
            (template_position_info, pol_rates) = get_template_positions(
                alignment_set, aln_to_ref_indices, bas_reader, ref,
                movie_length, float(movie_limited_threshold),
                template_min_start, template_max_start, max_start_time,
                framerate, coarse_grain_binsize, subsample_to, is_legacy)
            pickle.dump(
                template_position_info,
                open((output_directory + 'info_' + str(ref['FullName']) + '.pkl'), 'wb'))
            pickle.dump(
                pol_rates,
                open((output_directory + 'rates_' + str(ref['FullName']) + '.pkl'), 'wb'))

            # remove alignments that started late
            clean_termination_info = clean_template_positions(
                template_position_info, ref, template_min_start,
                template_max_start, max_start_time)

            # generate template start histogram
            plot_descriptors = ['templateStartHistogram']
            pl.plot_histogram(
                data=template_position_info['tStart'],
                output_directory=output_directory,
                title=ref['FullName'],
                xlabel='Template start (bp)',
                ylabel='Counts',
                bins=[0, np.max(template_position_info['tStart']), 10],  # bases binwidth
                descriptors=plot_descriptors)

            plot_descriptors = ['templateStartJustifiedHistogram']
            pl.plot_histogram(
                data=template_position_info['tStartJustified'],
                output_directory=output_directory,
                title=ref['FullName'],
                xlabel='Template start (bp)',
                ylabel='Counts',
                bins=[0, np.max(template_position_info['tStartJustified']), 10],  # bases binwidth
                descriptors=plot_descriptors)

            if movie_length:
                # generate start time histogram
                plot_descriptors = ['startTimeHistogram']
                pl.plot_histogram(
                    data=template_position_info['startTime'],
                    output_directory=output_directory,
                    title=ref['FullName'],
                    xlabel='Start time (min)',
                    ylabel='Counts',
                    bins=[0, movie_length, 1],  # minute binwidth
                    descriptors=plot_descriptors)

                # generate tStart vs. start time plots, if time info exists
                plot_descriptors = ['tStart_vs_startTime_unjustified']
                pl.plot_line_plot(
                    x=template_position_info['startTime'],
                    y=template_position_info['tStart'],
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='markers',
                    xlabel='Start time (min)',
                    ylabel='Template start position (bp)',
                    yaxis_range=None,
                    descriptors=plot_descriptors)

                plot_descriptors = ['tStart_vs_startTime_justified']
                pl.plot_line_plot(
                    x=template_position_info['startTime'],
                    y=template_position_info['tStartJustified'],
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='markers',
                    xlabel='Start time (min)',
                    ylabel='Template start position (bp)',
                    yaxis_range=None,
                    descriptors=plot_descriptors)

            if not is_legacy and pol_rates:
                # plot pol rates
                xyz = []
                for tpos in pol_rates:
                    rate, count = pol_rates[tpos]
                    xyz.append((tpos, rate, count))
                xyz.sort(key=lambda tup: tup[0])  # sort for line plot
                tpos, rate, count = zip(*xyz)

                plot_descriptors = [
                    'polrate_by_template_position', 'tStartRange',
                    str(template_min_start), str(template_max_start)
                ]
                pl.plot_line_plot(
                    x=tpos, y=rate,
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='lines',
                    xlabel='Template position (bp)',
                    ylabel='Mean pol rate (bases/sec)',
                    yaxis_range=[0, 4],
                    descriptors=plot_descriptors)

                plot_descriptors = [
                    'numalns_by_template_position', 'tStartRange',
                    str(template_min_start), str(template_max_start)
                ]
                pl.plot_line_plot(
                    x=tpos, y=count,
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='lines',
                    xlabel='Template position (bp)',
                    ylabel='Number of alignments',
                    yaxis_range=[0, len(aln_to_ref_indices)],
                    descriptors=plot_descriptors)

            minimum_alignment_number = 500
            if len(clean_termination_info) > minimum_alignment_number:
                # produce survival curve
                (template_start_position, template_end_position,
                 survival) = calculate_survival(clean_termination_info)

                # plot survival curve
                plot_descriptors = [
                    'survival_by_template_position', 'tStartRange',
                    str(template_min_start), str(template_max_start)
                ]
                pl.plot_line_plot(
                    x=template_end_position, y=survival,
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='lines',
                    xlabel='Template position (bp)',
                    ylabel='Fraction survivors left',
                    yaxis_range=[0, 1],
                    descriptors=plot_descriptors)

                # produce termination rates
                (template_position, termination_rate) = calculate_termination_rate(
                    template_end_position, survival, coarse_grain_binsize)

                # plot termination rates
                plot_descriptors = [
                    'termination_rates', 'tStartRange',
                    str(template_min_start), str(template_max_start)
                ]
                pl.plot_line_plot(
                    x=template_position, y=termination_rate,
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='lines',
                    xlabel='Template position (bp)',
                    ylabel='Termination rate (per bp)',
                    yaxis_range=None,
                    descriptors=plot_descriptors)

                # plot taus
                plot_descriptors = [
                    'termination_taus', 'tStartRange',
                    str(template_min_start), str(template_max_start)
                ]
                pl.plot_line_plot(
                    x=template_position[termination_rate != 0],
                    y=np.divide(1, termination_rate[termination_rate != 0],
                                dtype=float),
                    output_directory=output_directory,
                    title=ref['FullName'],
                    datapoint_mode='lines',
                    xlabel='Template position (bp)',
                    ylabel='Tau (bp)',
                    yaxis_range=None,
                    descriptors=plot_descriptors)

                # fit termination rates to get taus
                if first_adapter_template_position is None:
                    first_adapter_template_position = np.divide(
                        ref['SMRTBellSize'], 2, dtype=int)
                if second_adapter_template_position is None:
                    second_adapter_template_position = ref['SMRTBellSize']
                tau_1, tau_2, tau_rc = fit_taus(
                    template_position, termination_rate,
                    int(first_adapter_template_position),
                    int(second_adapter_template_position), movie_length)
                logging.info(('\n\nTau1 is ' + str(tau_1) + ' bases\n'
                              'Tau2 is ' + str(tau_2) + ' bases\n'
                              'TauRC is ' + str(tau_rc) + ' bases\n'))

                # plot termination rate with fits
                plot_descriptors = ['fitted_termination_rates']
                pl.plot_fitted_line_plot(
                    x=template_position, y=termination_rate,
                    t_1=[np.min(template_position),
                         first_adapter_template_position,
                         np.divide(1., tau_1, dtype=float)],
                    t_2=[first_adapter_template_position,
                         second_adapter_template_position,
                         np.divide(1., tau_2, dtype=float)],
                    t_rc=[second_adapter_template_position,
                          np.max(template_position),
                          np.divide(1., tau_rc, dtype=float)],
                    output_directory=output_directory,
                    title=ref['FullName'],
                    xlabel='Template position (bp)',
                    ylabel='Termination rate (per bp)',
                    descriptors=plot_descriptors)

                # plot termination taus with fits
                plot_descriptors = ['fitted_termination_taus']
                pl.plot_fitted_line_plot(
                    x=template_position[termination_rate != 0],
                    y=np.divide(1., termination_rate[termination_rate != 0],
                                dtype=float),
                    t_1=[np.min(template_position),
                         first_adapter_template_position, tau_1],
                    t_2=[first_adapter_template_position,
                         second_adapter_template_position, tau_2],
                    t_rc=[second_adapter_template_position,
                          np.max(template_position), tau_rc],
                    output_directory=output_directory,
                    title=ref['FullName'],
                    xlabel='Template position (bp)',
                    ylabel='Termination tau (bp)',
                    descriptors=plot_descriptors)

                taus = (tau_1, tau_2, tau_rc)
                save_taus(alignment_set, aln_to_ref_indices,
                          template_position_info, clean_termination_info, ref,
                          taus, results_file, fieldnames)
            else:
                logging.info(('Less than ' + str(minimum_alignment_number) +
                              ' alignments exist against reference ' +
                              str(ref['FullName']) + '. Analysis skipped.'))
        else:
            logging.info(('No alignments produced on reference ' +
                          str(ref['FullName']) + '.'))
    return None
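# Small worked example (illustrative values, not from the source): the tau
# plots above use the reciprocal relation tau = 1 / termination_rate,
# evaluated only where the termination rate is non-zero.
import numpy as np  # np is assumed to be the same numpy used above

example_rates = np.array([0.0, 0.002, 0.004])
example_taus = np.divide(1.0, example_rates[example_rates != 0], dtype=float)
# example_taus -> array([500., 250.])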
def main():
    global show_hist
    global show_pdfs
    global show_post_plots
    global threshold

    separator = "----------------------------------------------------------------------------------------------------------"

    x_seabass = ctrl.create_sample_population(30, 3, 400)
    x_salmon = ctrl.create_sample_population(32, 2, 600)

    # Create the dataset objects
    sea_bass = DataObject(x_seabass)
    salmon = DataObject(x_salmon)
    total = DataObject(ctrl.concatenate_data_sets(x_seabass, x_salmon))

    # Check p-sums (sum of probability * bin width)
    print(separator)
    print("Sums probabilities")
    sum_sea_bass = np.sum(sea_bass.get_probabilities()[0:] * sea_bass.get_bin_width())
    sum_salmon = np.sum(salmon.get_probabilities()[0:] * salmon.get_bin_width())
    print("Seabass: %f" % sum_sea_bass)
    print("Salmon: %f" % sum_salmon)
    print(separator)

    # ToDo: calculate decision vector
    # ToDo: calculate accuracy
    # ToDo: test samples against trained model

    print(separator)
    print("Probabilities")
    p_sea_bass = sea_bass.get_probabilities()
    p_salmon = salmon.get_probabilities()
    print("Seabass: %s" % str(p_sea_bass))
    print("Salmon: %s" % str(p_salmon))
    print(separator)

    # Informative posteriors
    print(separator)
    print("Informative posteriors")
    p_sea_bass = sea_bass.get_probabilities()
    p_salmon = salmon.get_probabilities()
    post_sea_bass = ctrl.calc_posterior(p_sea_bass, p_salmon,
                                        sea_bass.get_data_set(),
                                        salmon.get_data_set())
    post_salmon = ctrl.calc_posterior(p_salmon, p_sea_bass,
                                      salmon.get_data_set(),
                                      sea_bass.get_data_set())
    print("Seabass: %s" % str(post_sea_bass))
    print("Salmon: %s" % str(post_salmon))
    print(separator)

    # Non-informative posteriors
    print(separator)
    print("Non-Informative posteriors")
    post_sea_bass_non_inf = ctrl.calc_posterior_non_inf(p_sea_bass, p_salmon)
    post_salmon_non_inf = ctrl.calc_posterior_non_inf(p_salmon, p_sea_bass)
    print("Seabass: %s" % str(post_sea_bass_non_inf))
    print("Salmon: %s" % str(post_salmon_non_inf))
    print(separator)

    # Create ground-truth vector
    ground_truth_vec = ctrl.create_ground_truth_vector(x_salmon, x_seabass)

    # Train model
    print(separator)
    print("Training of Model")
    result_training = ctrl.train_model(post_salmon_non_inf, total.get_data_set(),
                                       salmon.get_bin_width(),
                                       salmon.get_bin_center(),
                                       ground_truth_vec)
    print("Accuracy: %s" % str(result_training))
    print(separator)

    # Plots
    if show_post_plots:
        bin_center_total = total.get_bin_center()
        plots.plot_posteriors(post_sea_bass, post_salmon, bin_center_total,
                              "Seabass", "Salmon")
        plots.plot_posteriors_non_inf(post_sea_bass_non_inf, post_salmon_non_inf,
                                      bin_center_total, "Seabass", "Salmon")
    if show_pdfs:
        plots.plot_pdf(total.get_bin_center(), sea_bass.get_probabilities(),
                       salmon.get_probabilities(), "Seabass", "Salmon")
    if show_hist:
        plots.plot_histogram(sea_bass.get_frequencies(), sea_bass.get_bins())
        plots.plot_histogram(salmon.get_frequencies(), salmon.get_bins())
        plots.plot_histogram(total.get_frequencies(), total.get_bins())
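# Hedged entry-point sketch (not part of the source): main() reads the
# module-level flags show_hist, show_pdfs, show_post_plots, and threshold,
# which are assumed to be defined elsewhere in this module.
if __name__ == '__main__':
    main()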