def compare_mea_speeds():
    """Benchmark the three MEA implementations and plot the results.

    For square random probability matrices of increasing size, times
    ``mea_slow``, ``mea_slower`` and ``maximum_expected_accuracy_alignment``
    ("fast") on the same input, then draws two panels: total time vs
    number of non-zero points, and time-per-point vs number of points.
    Displays the figure via ``plt.show()``; returns None.
    """
    plt.figure(figsize=(14, 4))
    panel1 = plt.axes([0.07, 0.11, .4, .8])
    panel2 = plt.axes([0.55, 0.11, .4, .8])

    fast_times = []
    slow_times = []
    slower_times = []
    num_points = []
    for size in range(10, 200, 10):
        prob_matrix, shortest_future_col_per_row = create_random_prob_matrix(
            col=size, row=size)
        points = np.count_nonzero(prob_matrix)
        num_points.append(points)
        # Time each implementation on the identical matrix.  Ordering
        # (slow, slower, fast) matches the printed output users expect.
        # `elapsed` is used instead of `time` to avoid shadowing the module.
        implementations = ((mea_slow, slow_times),
                           (mea_slower, slower_times),
                           (maximum_expected_accuracy_alignment, fast_times))
        for implementation, bucket in implementations:
            _, elapsed = time_it(implementation, prob_matrix,
                                 shortest_future_col_per_row)
            bucket.append(elapsed)
            print(elapsed, elapsed / points)

    # Panel 1: absolute time vs number of points.
    handle1, = panel1.plot(num_points, fast_times, color='black')
    handle2, = panel1.plot(num_points, slow_times, color='blue')
    handle3, = panel1.plot(num_points, slower_times, color='red')
    # y-limit deliberately excludes fast_times so the slow curves fill the
    # panel (the fast implementation's times are comparatively tiny).
    panel1.set_xlim(0, max(num_points))
    panel1.set_ylim(0, max(slower_times + slow_times))
    panel1.set_xlabel('Number of Points')
    panel1.set_ylabel('Time (s)')
    panel1.legend([handle1, handle2, handle3], ["Fast", "Slow", "Slower"],
                  loc='upper left')
    panel1.set_title('Time (s) vs Number of points ')

    # Panel 2: time normalized by the number of non-zero points.
    fast_ratio = [t / n for t, n in zip(fast_times, num_points)]
    slow_ratio = [t / n for t, n in zip(slow_times, num_points)]
    slower_ratio = [t / n for t, n in zip(slower_times, num_points)]
    handle1, = panel2.plot(num_points, fast_ratio, color='black')
    handle2, = panel2.plot(num_points, slow_ratio, color='blue')
    handle3, = panel2.plot(num_points, slower_ratio, color='red')
    panel2.set_xlim(0, max(num_points))
    panel2.set_ylim(0, max(slower_ratio))
    panel2.set_xlabel('Number of Points')
    panel2.set_ylabel('Time/ number of points (s/point)')
    panel2.legend([handle1, handle2, handle3], ["Fast", "Slow", "Slower"],
                  loc='upper left')
    panel2.set_title('Time(s)/Data Points vs Number of points ')
    plt.show()
def test_time_it(self):
    """time_it executes a callable and rejects non-callable arguments."""
    with captured_output() as (_, _):
        def add(first, second):
            return first + second

        # A callable runs cleanly and returns (result, elapsed).
        _, _ = time_it(add, 1, 2)
        # A non-callable must trip time_it's assertion.
        with self.assertRaises(AssertionError):
            time_it(1, 1, 2)
def test_generate_buildAlignments4(self):
    """Compare the two kmer-table builders on the same alignment files.

    Runs ``multiprocess_make_kmer_assignment_tables`` and
    ``generate_top_n_kmers_from_sa_output`` over ``self.alignments_path``,
    then checks the new builder writes one output line per table row and
    is faster than the old one.
    """
    kmers = get_kmers(6, alphabet="ATGC")
    data_files = [self.alignments_path]
    # `old_time`/`new_time` avoid shadowing the `time` module.
    data, old_time = time_it(multiprocess_make_kmer_assignment_tables,
                             data_files, kmers, {"t", "c"}, 0.0, False,
                             True, 10, 8)
    with tempfile.TemporaryDirectory() as temdir:
        output_file = os.path.join(temdir, "built_alignment.tsv")
        data2, new_time = time_it(generate_top_n_kmers_from_sa_output,
                                  data_files, temdir, output_file, 10,
                                  "ACGT", 6, 0.0, 8, False, True, False,
                                  True)
        # Count lines with a context manager; the original
        # len(list(open(...))) leaked the file handle.
        with open(output_file) as fh:
            num_lines = sum(1 for _ in fh)
        print(new_time, old_time)
        self.assertEqual(len(data.index), num_lines)
        self.assertLess(new_time, old_time)
def test_binary_search_exact_match(self):
    """binary_search locates the index of values known to be present.

    For lists of increasing size, picks 100 random in-range integers and
    asserts the returned index points at the searched value.
    """
    with captured_output() as (_, _):
        for size in range(100, 1000, 10):
            test_list = list(range(size))
            for _ in range(100):
                find_number = np.random.randint(0, size)
                # The elapsed time was previously accumulated into an
                # unused list; it is discarded here.
                index, _ = time_it(binary_search, test_list, find_number)
                self.assertEqual(test_list[index], find_number)
def test_binary_search_no_match(self):
    """binary_search without exact-match returns the nearest-left index.

    Searches for half-integer targets that cannot be in the integer list
    and asserts the returned index brackets the target correctly,
    including at both ends of the list.
    """
    with captured_output() as (_, _):
        for size in range(10, 100, 10):
            test_list = list(range(size))
            for _ in range(100):
                # x.5 targets are guaranteed absent from the integer list.
                find_number = np.random.randint(0, size) + 0.5
                # The elapsed time was previously accumulated into an
                # unused list; it is discarded here.
                index, _ = time_it(binary_search, test_list, find_number,
                                   False)
                if index == size - 1:
                    # Target falls past the last element.
                    self.assertTrue(test_list[index] < find_number)
                elif index == 0:
                    # Target falls before the second element.
                    self.assertTrue(find_number < test_list[index + 1])
                else:
                    self.assertTrue(
                        test_list[index] < find_number < test_list[index + 1])
args.positions_file) step_size = 10000 step_number = 0 with open(args.positions_file, "r") as fh: while True: out_name = args.output_file + str(step_number) + "tmp" execute = "samtools view -b -o {} {}".format(out_name, args.bam) positions = " " counter = 0 for line in fh: split_line = line.split() chromosome = split_line[0] position = split_line[1] positions += chromosome + ":" + position + "-" + position + " " counter += 1 if counter > 10000: break if positions == " " or step_number > 10: break execute += positions check_call(execute.split()) check_call( f"samtools rmdup -s {out_name} {args.output_file+str(step_number)}" .split()) os.remove(out_name) step_number += 1 if __name__ == '__main__': print(time_it(main)[1])
print("Per-genomic-site confusion matrix", file=log_file) print(cmh.confusion_matrix(), file=log_file) all_data.append(chr_strand_data) print("All Chromosomes both strands:", file=log_file) print("Per-call confusion matrix", file=log_file) print(print_confusion_matrix(tps, fps, fns, tns), file=log_file) plot_confusion_matrix(tps, fps, fns, tns, normalize=True, output_path=os.path.join(output_dir, "all_calls_confusion_matrix.png"), title="All calls CpG " "Normalized Confusion Matrix") all_data = pd.concat(all_data) label_data = all_data.loc[:, ['C_label', "E_label"]] prediction_data = all_data.loc[:, ['C', "E"]] label_data.rename(columns={'C_label': 'C', "E_label": "E"}, inplace=True) cmh = ClassificationMetrics(label_data, prediction_data) cmh.plot_roc("E", os.path.join(output_dir, "per_genomic_site_all_chromosomes_roc.png")) cmh.plot_precision_recall("E", os.path.join(output_dir, "per_genomic_site_all_chromosomes" "precision_recall.png")) print("Per-genomic-site confusion matrix", file=log_file) print(cmh.confusion_matrix(), file=log_file) if __name__ == '__main__': print(time_it(main))
def main():
    """Count reference kmers passing the configured bed filters and pickle them.

    Builds a FilterBed from the optional percentage/coverage CLI arguments,
    applies it while counting kmers from the reference over the methyl bed,
    prints the counts, and writes them to ``kmer_counts.pkl`` in the output
    directory.
    """
    args = parse_args()
    filter_bed = FilterBed()
    active_filters = []
    if args.filter_by_percentage is not None:
        percentage_params = [float(value) for value in args.filter_by_percentage]
        filter_bed.set_filter_by_percentage(*percentage_params)
        active_filters.append(filter_bed.filter_by_percentage_min_min_max_max)
    if args.filter_by_coverage is not None:
        coverage_params = [float(value) for value in args.filter_by_coverage]
        filter_bed.set_filter_by_coverage(*coverage_params)
        active_filters.append(filter_bed.filter_by_coverage_min_max)
    filter_bed.chain_logic(*active_filters)
    kmers = get_kmer_counts_from_reference_given_bed(
        args.reference,
        args.methyl_bed,
        k=args.kmer_length,
        param_filter=filter_bed.function,
        check_base=args.check_base)
    print(kmers)
    pickle_path = os.path.join(args.output, "kmer_counts.pkl")
    with open(pickle_path, 'wb') as fh:
        pickle.dump(kmers, fh)


if __name__ == '__main__':
    elapsed = time_it(main)[1]
    print(elapsed, "seconds")