def bootstrap(args, alignment, tree, reps=100):
    # Build one argument set and one resampled alignment per bootstrap replicate.
    bs_args = range_args(args, reps)
    bs_alignments = sample(alignment, reps)
    # Partition every bootstrap alignment in parallel.
    partition_jobs = [(partition, (args, bsa))
                      for args, bsa in zip(bs_args, bs_alignments)]
    part_results = mapPool(reps, partition_jobs)
    # Infer a phylogeny per replicate, then map bootstrap support onto the input tree.
    phylo_jobs = [(phylogeny, (args, pr[0], pr[1]))
                  for args, pr in zip(bs_args, part_results)]
    bs_trees = mapPool(reps, phylo_jobs)
    bs_tree = map_support(tree, bs_trees)
    return bs_tree
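# `mapPool` is defined elsewhere in this project; the sketch below is only an
# assumption about its contract, inferred from the call sites in this file:
# it takes a worker count plus a list of (function, args) job tuples, fans the
# jobs out over a process pool, and returns their results in job order.
# `_mapPool_sketch` and `_run_job` are illustrative names, not project code;
# `daemonic` is accepted for interface parity but ignored in this sketch.
def _run_job(job):
    # Unpack a (function, args) tuple and call the function.
    func, args = job
    return func(*args)


def _mapPool_sketch(threads, jobs, daemonic=False, chunksize=1):
    import multiprocessing

    # Small or single-threaded workloads can run in-process.
    if threads <= 1 or len(jobs) <= 1:
        return [_run_job(job) for job in jobs]
    with multiprocessing.Pool(processes=threads) as pool:
        return pool.map(_run_job, jobs, chunksize=chunksize)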
def partition_sites(seqs, args):
    # Transpose the alignment: one list of characters per column (site).
    sites = [[s[i] for s in seqs] for i in range(len(seqs[0]))]
    jobs = [(partition, (site, args.alphabet)) for site in sites]
    partitions = mapPool(args.threads, jobs, daemonic=True, chunksize=10000)
    # Collapse identical site partitions into unique patterns, and replace each
    # site's partition with the index of its pattern.
    patterns = list(set(partitions))
    partitions = [patterns.index(part) for part in partitions]
    return partitions, patterns
def enumerate_minimal_covers(clique_matrix, threads=1):
    m, n = clique_matrix.shape
    elements = get_duals(clique_matrix, n)
    elements = reduce_elements(elements, m, n)
    elements = order_minimals(elements)
    islands = split_disconnected(elements)
    parts = [(minimal_covers, island) for island in islands]
    partial_covers, partial_chains = list(zip(*mapPool(threads, parts)))
    covers = merge_disconnected(partial_covers)
    return covers
def calculate_rates(patterns, pattern_counts, nMinusOne, num_invariants,
                    invariant_index, partitions, args):
    # Parallelize, since this step can be very long.
    jobs = [(score_conflict,
             (pat, patterns, pattern_counts, nMinusOne, num_invariants))
            for pat in patterns]
    pattern_conflicts = mapPool(args.threads, jobs, daemonic=True, chunksize=100)
    # Invariant sites have no conflict by definition, and the calculation above
    # does not account for them.
    pattern_conflicts[invariant_index] = 0
    pattern_rates = [1. - c for c in pattern_conflicts]
    # Expand pattern_rates into the places where each pattern occurs in partitions.
    rates = [pattern_rates[i] for i in partitions]
    return pattern_rates, rates
def multiple_alignment(args, fastas):
    basedir = os.getcwd()
    alignment = basedir + '/2_alignment/' + args.output + '.fasta'
    os.chdir('2_alignment')
    if not os.path.isfile(alignment) or args.force:
        if args.force:
            unaligned_fastas = fastas
        else:
            unaligned_fastas = [fasta for fasta in fastas
                                if not os.path.isfile(trim_name(fasta))]
        if unaligned_fastas:
            # Split the remaining fastas into four roughly equal batches.
            chunk_size = int(len(unaligned_fastas) / 4) + 1
            chunks = [unaligned_fastas[i:i + chunk_size]
                      for i in [n * chunk_size for n in range(4)]]
            # Re-run this script with each batch of fastas as its arguments.
            jobs = [(submit_alignment_batch,
                     ['{} {} {}'.format(sys.executable, __file__,
                                        ' '.join(chunk))])
                    for chunk in chunks]
            IDs = mapPool(4, jobs)
            outfiles = ['mafft_' + str(ID) + '.out' for ID in IDs]
            errfiles = ['mafft_' + str(ID) + '.err' for ID in IDs]
        else:
            outfiles = []
            errfiles = []
        # Intermediate files from the alignment process.
        aligned = [align_name(fasta) for fasta in fastas]
        # The output files from the alignment process.
        aligned_trimmed = [trim_name(fasta) for fasta in fastas]
        concatenate_fasta(aligned_trimmed, alignment)
        cleanup(logs=outfiles + errfiles,
                trash=fastas + aligned + aligned_trimmed)
    os.chdir(basedir)
    return alignment
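# Worked example of the batching above (illustrative numbers, not project data):
# with 10 unaligned fastas, chunk_size = int(10 / 4) + 1 = 3, so the slices are
# [0:3], [3:6], [6:9], [9:12], giving batch sizes of 3, 3, 3, and 1. The "+ 1"
# guarantees every fasta lands in one of the four batches even when the count
# is not divisible by four.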
def estimate_rates(count_blocks, rates, threads=1):
    estimations = [(estimate_rate, count_part + (rates,))
                   for count_part in count_blocks]
    estimates = mapPool(threads, estimations)
    header = ['start', 'end', 'length', 'E'] + [str(rate) for rate in rates]
    return pd.DataFrame(estimates, columns=header)
    ustates = set()
    for seq in seqlist:
        if len(seq) < 1:
            return False
        seq = seq.upper()
        ustates = ustates.union(set(seq))
        if len(ustates) >= 4:
            return True
    return False


def cleanup(logs=[], trash=[]):
    try:
        os.mkdir('logs')
    except OSError:
        pass
    for log in logs:
        os.rename(log, 'logs/' + log)
    for f in trash:
        try:
            os.remove(f)
        except OSError:
            pass


if __name__ == '__main__':
    fastas = sys.argv[1:]
    calls = [(align_trim, [fasta]) for fasta in fastas]
    nones = mapPool(20, calls, daemonic=True, chunksize=50)
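# Usage note (inferred from the __main__ block above, not a documented CLI):
# when multiple_alignment() re-invokes this script as
#   python <this_script>.py sample1.fasta sample2.fasta ...
# each listed fasta is passed to align_trim in a worker pool, and the parent
# process later concatenates the trimmed alignments into the final output.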
def main(tree, reference, outgroup, output, segment_files, seqs_files,
         step=7000, window_size=35000, nthreads=1, centromeres=None,
         tracks=None):
    if output:
        outfile = output + '.BRAG.stats'
        print('Writing messages to {}'.format(outfile))
        log = open(outfile, 'w')
    else:
        output = 'rearrangement_analysis'
        print('Writing messages to stdout\nWriting results to {}*'.format(output))
        log = sys.stdout
    clock = timer()

    log.write('Reading input. . .\n')
    tree = Tree(tree)
    root(tree, outgroup)
    reference_genome_file = infer_reference(seqs_files)
    table_jobs = [(segment_tables,
                   (reference, segment_file, seqs_file, reference_genome_file))
                  for segment_file, seqs_file in zip(segment_files, seqs_files)]
    tables = mapPool(nthreads, table_jobs)
    order = tree_order(reference, tree)
    tables.sort(key=lambda t: order.index(t[0]))  # sort by query into tree order
    queries, rscaffolds, qscaffolds, os_tabs = list(zip(*tables))
    rscaffolds = rscaffolds[0]  # all have the same reference
    N = rscaffolds.iloc[-1].abs_pos  # position of the end == reference genome size
    coverages = [np.sum(os_tab.rend - (os_tab.rstart - 1)) / float(N)
                 for os_tab in os_tabs]
    coverage_stats = describe(coverages)
    log.write('{} genomes aligned to {}.\n'.format(coverage_stats.nobs, reference))
    log.write('Minimum coverage:\t{}\n'.format(coverage_stats.minmax[0]))
    log.write('Mean coverage:\t{}\n'.format(coverage_stats.mean))
    log.write('Maximum coverage:\t{}\n'.format(coverage_stats.minmax[1]))
    log.write('SD coverage:\t{}\n'.format(coverage_stats.variance**0.5))
    log.write('Cumulative coverage:\t{}\n'.format(cumulative_coverage(os_tabs, N)))
    log.write(clock.report() + '\n\n')

    log.write('Plotting coverage of alignments and histograms of OS and qbreak lengths. . .\n')
    degrading_coverage(coverages, os_tabs, N, output + '_coverage_survival_curve')
    hist_jobs = [(OS_length_hist, (reference, query, os_tab))
                 for query, rscaffolds, qscaffolds, os_tab in tables]
    mapPool(nthreads, hist_jobs)
    log.write(clock.report() + '\n')

    certain_out = output + '_certain'
    uncertain_out = output + '_uncertain'
    log.write('\nEstimating break rates. . .\n\n')
    if not (os.path.isfile(uncertain_out + '_rates.tab')
            and os.path.isfile(certain_out + '_rates.tab')
            and os.path.isfile(uncertain_out + '.log')
            and os.path.isfile(certain_out + '.log')):
        adj_jobs = [(map_breakpoints, [os_tab]) for os_tab in os_tabs]
        uncertain_adj_coords = mapPool(nthreads, adj_jobs)
        certain_adj_coords = [[coord for coord in coords if coord[2]]
                              for coords in uncertain_adj_coords]
        uncertain_adj_coords = list(zip(queries, uncertain_adj_coords))
        certain_adj_coords = list(zip(queries, certain_adj_coords))
        br.set_reference(tree & reference, N)

    log.write('Calculating Uncertain (True or False qbreaks) Break Rates:\n')
    if not (os.path.isfile(uncertain_out + '_rates.tab')
            and os.path.isfile(uncertain_out + '.log')):
        uncertain_estimates = br.break_rate(uncertain_adj_coords,
                                            output=uncertain_out,
                                            threads=nthreads)
    else:
        uncertain_estimates = pd.read_csv(uncertain_out + '_rates.tab', sep='\t')
    log.write(open(uncertain_out + '.log', 'r').read())

    log.write('\nCalculating Certain (True qbreaks only) Break Rates:\n')
    if not (os.path.isfile(certain_out + '_rates.tab')
            and os.path.isfile(certain_out + '.log')):
        certain_estimates = br.break_rate(certain_adj_coords,
                                          output=certain_out,
                                          threads=nthreads)
    else:
        certain_estimates = pd.read_csv(certain_out + '_rates.tab', sep='\t')
    log.write(open(certain_out + '.log', 'r').read())

    if not os.path.isfile(uncertain_out + '_rate_windows.txt'):
        uncertain_rate_windows = rate_windows(uncertain_estimates, N, step=step,
                                              window_size=window_size)
        uncertain_rate_windows.to_csv(uncertain_out + '_rate_windows.txt',
                                      sep='\t', index=False)
    else:
        uncertain_rate_windows = pd.read_csv(uncertain_out + '_rate_windows.txt',
                                             sep='\t', header=0)
    if not os.path.isfile(certain_out + '_rate_windows.txt'):
        certain_rate_windows = rate_windows(certain_estimates, N, step=step,
                                            window_size=window_size)
        certain_rate_windows.to_csv(certain_out + '_rate_windows.txt',
                                    sep='\t', index=False)
    else:
        certain_rate_windows = pd.read_csv(certain_out + '_rate_windows.txt',
                                           sep='\t', header=0)
    log.write('\n' + clock.report() + '\n\n')

    log.write('Processing centromeres & extra data tracks, if applicable. . .\n')
    # Mask Centromeres
    if centromeres:
        centromeres = [list(map(int, line.split('#')[0].strip().split()))
                       for line in open(centromeres, 'r')
                       if line.split('#')[0].strip()]
        abs_centromeres = [(rscaffolds.iloc[scaf_idx].abs_pos + start,
                            rscaffolds.iloc[scaf_idx].abs_pos + stop)
                           for scaf_idx, start, stop in centromeres]
        certain_rate_windows = mask(certain_rate_windows, abs_centromeres)
        uncertain_rate_windows = mask(uncertain_rate_windows, abs_centromeres)
    else:
        abs_centromeres = []
    # Mask scaffold edges
    scaffold_boundaries = [(x, x) for x in rscaffolds.abs_pos]
    certain_rate_windows = mask(certain_rate_windows, scaffold_boundaries,
                                inclusive=False)
    uncertain_rate_windows = mask(uncertain_rate_windows, scaffold_boundaries,
                                  inclusive=False)
    # Add in extra data tracks and mask
    if tracks:
        tracks = pd.read_csv(tracks, sep='\t')
        tracks.sort_values('start', inplace=True)
        tracks = mask(tracks, scaffold_boundaries, inclusive=False)
    else:
        tracks = []
    track_labels = [label for label in list(tracks)
                    if label not in ['start', 'end']]
    log.write(clock.report() + '\n')

    # Plot Figures
    log.write('\n')
    log.write('Plotting break rates calculated with "True" qbreaks ("certain", lower bound estimate)\n')
    log.write('against break rates calculated with "True" and "False" qbreaks ("uncertain", upper bound estimate).\n')
    log.write('Output: ' + output + '_uncertainty\n')
    indexer = ((certain_rate_windows['E'] != uncertain_rate_windows['E'])
               & (uncertain_rate_windows['E'] != 0))
    model = correlation_scatter(certain_rate_windows['E'].loc[indexer],
                                uncertain_rate_windows['E'].loc[indexer],
                                output + '_uncertainty')
    same = (certain_rate_windows['E'] == uncertain_rate_windows['E']).sum()
    num_windows = len(certain_rate_windows)
    log.write('{}/{} ({:.2f}%) windows have identical break rates\n'.format(
        same, num_windows, same / num_windows * 100))
    uncertain_over = (certain_rate_windows['E'] < uncertain_rate_windows['E']).sum()
    report = '{}/{} ({:.2f}%) of non-identical windows have (True) < (True | False)\n'
    report = report.format(uncertain_over, num_windows - same,
                           uncertain_over / (num_windows - same) * 100)
    log.write(report)
    mean_ratio = (certain_rate_windows['E'].loc[indexer]
                  / uncertain_rate_windows['E'].loc[indexer]).mean()
    log.write('Mean ratio of (True)/(True | False) when True != False: {}\n'.format(
        mean_ratio))
    log.write('(True)/(True | False) = {:.4f}*(True | False) + {:.4f}; p={:.3E} R2={:.3E}\n'
              .format(model.slope, model.intercept, model.p_val, model.r2))
    log.write(clock.report() + '\n')

    if track_labels:
        log.write('\n')
        log.write('Performing linear regression between extra data tracks and break rate.\n')
        log.write('Output: ' + output + '_tracks-x-breakrate\n')
        track_results = track_correlation(certain_rate_windows, tracks,
                                          track_labels,
                                          output + '_tracks-x-breakrate')
        log.write(track_results.summary().as_text() + '\n')
        log.write(clock.report() + '\n')

    log.write('\n')
    log.write('Plotting break rates and extra tracks along the reference genome.\n')
    log.write('Output: ' + output + '_brMap\n')
    plot_break_rate(N, queries, os_tabs, certain_estimates, uncertain_estimates,
                    certain_rate_windows, uncertain_rate_windows, tracks,
                    track_labels, rscaffolds, abs_centromeres, step,
                    output + '_brMap')
    log.write('BRAG Finished!\t{}\n'.format(clock.report()))