def main(align_path, output_path, flank_size, direction, seed, randomise,
         step, dry_run, force_overwrite):
    """Export tab delimited counts table from alignment centred on SNP
    position.

    Output file is written to the same path with just the file suffix
    changed from fasta to txt."""
    args = locals()
    if not seed:
        seed = str(time.time())

    align_path = abspath(align_path)
    output_path = abspath(output_path)

    counts_filename = get_counts_filename(align_path, output_path)
    runlog_path = counts_filename.replace(".txt", ".log")
    LOGGER.log_file_path = runlog_path

    if not dry_run:
        if not force_overwrite and (os.path.exists(counts_filename) or
                                    os.path.exists(runlog_path)):
            msg = "Either %s or %s already exist. Force overwrite of existing"\
                " files with -F."
            raise ValueError(msg % (counts_filename, runlog_path))

        makedirs(output_path)

        LOGGER.log_message(str(args), label='vars')
        LOGGER.input_file(align_path, label="align_path")
        LOGGER.log_message(str(seed), label="random_number_seed")

    start_time = time.time()

    # run the program
    counts_table = align_to_counts(align_path, output_path, flank_size,
                                   direction, step, seed, randomise, dry_run)

    if not dry_run:
        counts_table.write(counts_filename, sep='\t')
        LOGGER.output_file(counts_filename)

    # determine runtime
    duration = time.time() - start_time
    if not dry_run:
        LOGGER.log_message("%.2f" % (duration / 60.),
                           label="run duration (minutes)")
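# Example invocation of this command from the shell, using the same flags the
# test suite exercises below (paths are illustrative):
#
#     aln_to_counts -adata/sample_AtoC.fasta -oresults -f1 \
#         --direction=AtoC -S111 -F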
def align_to_counts(align_path, output_path, flank_size, direction, step,
                    seed, randomise, dry_run):
    '''returns counts table from alignment of sequences centred on a SNP'''
    if not dry_run:
        makedirs(output_path)

    print("Deriving counts from sequence file")
    step = int(step)
    direction = tuple(direction.split('to'))
    chosen_base = direction[0]
    orig_seqs = load_from_fasta(os.path.abspath(align_path))
    seqs = orig_seqs.array_seqs
    seqs = just_nucs(seqs)
    if not randomise:
        orig, ctl = profile.get_profiles(seqs, chosen_base=chosen_base,
                                         step=step, flank_size=flank_size,
                                         seed=seed)
    else:
        LOGGER.log_message("A randomised selection of starting base "
                           "locations used for observed counts.")
        # we are setting a randomised set of locations as our observed SNPs
        ctl = profile.get_control(seqs, chosen_base=chosen_base, step=step,
                                  flank_size=flank_size, seed=seed)
        orig = profile.get_control(seqs, chosen_base=chosen_base, step=step,
                                   flank_size=flank_size, seed=seed)

    # convert profiles to a motif count table
    orig_counts = motif_count.profile_to_seq_counts(orig,
                                                    flank_size=flank_size)
    ctl_counts = motif_count.profile_to_seq_counts(ctl,
                                                   flank_size=flank_size)
    counts_table = motif_count.get_count_table(orig_counts, ctl_counts,
                                               flank_size * 2)
    counts_table = counts_table.sorted(columns='mut')
    return counts_table
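# Illustrative sketch (not part of the module): calling align_to_counts
# directly rather than via the CLI. The FASTA path and parameter values are
# hypothetical; flank_size=2 yields flank_size * 2 = 4 flanking positions,
# the number required by the full nbr() analysis below.
#
#     counts = align_to_counts("data/sample_AtoC.fasta", "results",
#                              flank_size=2, direction="AtoC", step=1,
#                              seed="11", randomise=False, dry_run=True)
#     print(counts)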
def test_aln_to_counts(self):
    """exercising aln_to_counts"""
    if os.path.exists(self.dirname):
        shutil.rmtree(self.dirname)
    makedirs(self.dirname)
    runner = CliRunner()
    # the sample data path is relative to the test directory, so this
    # invocation should succeed and write the counts and log files
    r = runner.invoke(aln_to_counts_main,
                      ["-adata/sample_AtoC.fasta",
                       "-o%s" % self.dirname,
                       "-f1",
                       "--direction=AtoC",
                       "-S111",
                       "-F"])
    dirlist = os.listdir(self.dirname)
    self.assertEqual(r.exit_code, 0)
    self.assertEqual(set(dirlist),
                     set(["sample_AtoC.txt", "sample_AtoC.log"]))
    counts = LoadTable(os.path.join(self.dirname, "sample_AtoC.txt"),
                       sep="\t")
    # two columns with pos, two groups giving shape=2*16
    self.assertEqual(counts.shape[0], 32)
    shutil.rmtree(self.dirname)
def grid(fig_config, figpath, format, no_type3):
    """draws an arbitrary shaped grid of mutation motifs based on fig_config"""
    # we read in the config file and determine number of rows and columns,
    # paths, headings, etc ..
    # then create the figure and axes and call the mutation_motif drawing code
    args = locals()

    if no_type3:
        util.exclude_type3_fonts()

    if not figpath:
        dirname = os.path.dirname(fig_config.name)
        figpath = os.path.join(dirname, "drawn_array.%s" % format)
        log_file_path = os.path.join(dirname, "drawn_array.log")
    else:
        figpath = util.abspath(figpath)
        log_file_path = "%s.log" % ".".join(figpath.split(".")[:-1])

    util.makedirs(os.path.dirname(figpath))
    LOGGER.log_file_path = log_file_path
    LOGGER.log_message(str(args), label='vars')

    ncols, nrows, figsize, col_labels, row_labels, paths, axis_cfg = \
        read_plot_array_config(fig_config)

    print("ncols:", ncols)
    print("nrows:", nrows)
    print("figsize:", figsize)
    print("col_labels:", col_labels)
    print("row_labels:", row_labels)
    print("paths:", paths)
    print("axis_cfg:", axis_cfg)

    # TODO: convert below into Cogent3 Plotly
    layout = UnionDict(shapes=[])
    adaptive_y = 0
    plottable = {}
    for coord in paths:
        data = util.load_loglin_stats(paths[coord])
        positions = list(data)
        positions.sort()
        heights, characters, indices = get_plot_data(data, positions)
        adaptive_y = max(adaptive_y, logo.est_ylim(heights))
        plottable[coord] = dict(char_heights=heights,
                                characters=characters,
                                position_indices=indices)

    ylim = axis_cfg.get("ylim", adaptive_y)
    for coord in plottable:
        kwargs = plottable[coord]
        kwargs["ax"] = coord
        kwargs["ylim"] = ylim
        r = logo.draw_multi_position_cogent3(**kwargs)
        for key in r:
            if key == "shapes":
                layout.shapes.extend(r.shapes)
            else:
                layout[key] = r[key]

    for i in range(0, ncols):
        xaxis = "xaxis" + str(i + 1 if i != 0 else "")
        layout[xaxis]["domain"] = [
            0.0 + (i * (1 / ncols)),
            (i * (1 / ncols)) + (1 / ncols)
        ]

    print(layout)

    MARGININCHES = 0
    PPI = 100
    fig = Drawable(layout=layout,
                   width=(figsize[0] - MARGININCHES) * PPI,
                   height=(figsize[1] - MARGININCHES) * PPI)

    # export
    fig.write(path=figpath)
    click.secho("Wrote Cogent3 %s" % figpath, fg="green")
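# Note: this Plotly-based variant assumes cogent3's plotting primitives are
# imported at module level, e.g. (a sketch, import paths from cogent3's
# public API):
#
#     from cogent3.util.union_dict import UnionDict
#     from cogent3.draw.drawable import Drawable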
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    args = locals()

    table = LoadTable(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')
        LOGGER.input_file(countsfile)

    # if there's a strand symmetry argument then we don't need a second file
    if strand_symmetry:
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)

    if not strand_symmetry:
        group_label = 'group'

        # be sure there's two files
        counts_table2 = LoadTable(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column(
            'group', lambda x: '1', columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine
        header = ['group'] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = LoadTable(header=header, rows=raw1 + raw2)

    if verbose:
        print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        columns = [c for c in counts_table.header if c != 'start']
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            grp_labels = {'1': countsfile, '2': countsfile2}
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        if p < 1e-6:
            prob = "%.2e" % p
        else:
            prob = "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = ["RE=%.6f" % total_re, "Dev=%.2f" % dev,
                        "df=%d" % df, "p=%s" % p]
        stats = " : ".join(significance)
        print("Start base=%s %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p, formula=formula,
                                    stats=collated.to_json())

    table = LoadTable(header=['start_base'] + list(collated.columns) +
                      ['prob'],
                      rows=results, digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        LOGGER.log_message(str(significance), label="significance")
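# Note: chisqprob above is assumed to be the chi-squared survival function,
# e.g. bound at module level as a sketch:
#
#     from scipy.stats.distributions import chi2
#     chisqprob = chi2.sf  # P(X >= dev) for X ~ chi-squared with df degrees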
def main(counts_pattern, output_path, strand_symmetric, split_dir, dry_run,
         force_overwrite):
    """export tab delimited combined counts table by appending the 12
    mutation direction tables, adding a new column ``direction``."""
    args = locals()

    output_path = abspath(output_path)
    if strand_symmetric and split_dir:
        split_dir = abspath(split_dir)
    else:
        split_dir = None

    # check the glob pattern produces the correct number of files
    counts_files = glob.glob(counts_pattern)
    check_found_filenames(counts_files)

    counts_filename = os.path.join(output_path, 'combined_counts.txt')
    runlog_path = os.path.join(output_path, "combined_counts.log")

    if not dry_run:
        if not force_overwrite and (os.path.exists(counts_filename) or
                                    os.path.exists(runlog_path)):
            msg = "Either %s or %s already exist. Force overwrite of "\
                "existing files with -F."
            raise ValueError(msg % (counts_filename, runlog_path))

        makedirs(output_path)
        if split_dir:
            makedirs(split_dir)

        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')
        for fn in counts_files:
            LOGGER.input_file(fn, label="count_file")

    start_time = time.time()

    # run the program
    all_counts = []
    header = None
    num_rows = 0
    basenames = []
    for fn in counts_files:
        basenames.append(os.path.basename(fn))
        mutation = direction.findall(fn)[0]
        table = LoadTable(fn, sep='\t')
        if header is None:
            header = list(table.header)
            header.append('direction')
            num_rows = table.shape[0]

        data = table.tolist()
        new = []
        for row in data:
            row.append(mutation)
            new.append(row)

        all_counts += new

    table = LoadTable(header=header, rows=all_counts)

    if strand_symmetric:
        table = make_strand_symmetric_table(table)

    if split_dir:
        group_subtables = get_subtables(table, group_label='direction')

    if not dry_run:
        table.write(counts_filename, sep='\t')
        LOGGER.output_file(counts_filename)

        if split_dir:
            for group, subtable in group_subtables:
                # we first assume that group is part of the filenames!
                fn = [bn for bn in basenames if group in bn]
                if len(fn) == 1:
                    fn = fn[0]
                else:
                    fn = "%s.txt" % group

                counts_filename = os.path.join(split_dir, fn)
                subtable.write(counts_filename, sep='\t')
                LOGGER.output_file(counts_filename)

    # determine runtime
    duration = time.time() - start_time
    if not dry_run:
        LOGGER.log_message("%.2f" % (duration / 60.),
                           label="run duration (minutes)")

    print("Done!")
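# Note: `direction` above is assumed to be a module-level regular expression
# that pulls the mutation direction out of each counts filename, e.g. a
# sketch:
#
#     import re
#     direction = re.compile("[ACGT]to[ACGT]")
#     # direction.findall("sample_AtoC.txt") -> ["AtoC"]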
def grid(fig_config, figpath, format, no_type3):
    """draws an arbitrary shaped grid of mutation motifs based on fig_config"""
    # we read in the config file and determine number of rows and columns,
    # paths, headings, etc ..
    # then create the figure and axes and call the mutation_motif drawing code
    args = locals()

    if no_type3:
        util.exclude_type3_fonts()

    if not figpath:
        dirname = os.path.dirname(fig_config.name)
        figpath = os.path.join(dirname, "drawn_array.%s" % format)
        log_file_path = os.path.join(dirname, "drawn_array.log")
    else:
        figpath = util.abspath(figpath)
        log_file_path = "%s.log" % ".".join(figpath.split(".")[:-1])

    util.makedirs(os.path.dirname(figpath))
    LOGGER.log_file_path = log_file_path
    LOGGER.log_message(str(args), label='vars')

    ncols, nrows, figsize, col_labels, row_labels, paths, axis_cfg = \
        read_plot_array_config(fig_config)

    fig, axes = pyplot.subplots(nrows=nrows, ncols=ncols, figsize=figsize,
                                sharex=True, sharey=True)
    figwidth = fig.get_figwidth()

    # pyplot.subplots returns a bare Axes when nrows == ncols == 1, and a
    # 1-D array when only one of them is 1; normalise to a 2-D array so
    # axes[row, col] indexing works below
    try:
        axes[0]
    except TypeError:
        axes = numpy.array([[axes]])

    if len(axes.shape) == 1:
        # required for indexing of appropriate axis
        axes = numpy.vstack(axes)
        if nrows == 1:
            axes = axes.T

    adaptive_y = 0
    plottable = {}
    for coord in paths:
        data = util.load_loglin_stats(paths[coord])
        positions = list(data)
        positions.sort()
        heights, characters, indices = get_plot_data(data, positions)
        adaptive_y = max(adaptive_y, logo.est_ylim(heights))
        plottable[coord] = dict(char_heights=heights,
                                characters=characters,
                                position_indices=indices,
                                figwidth=figwidth,
                                verbose=False)

    ylim = axis_cfg.get("ylim", adaptive_y)
    for coord in plottable:
        kwargs = plottable[coord]
        kwargs["ax"] = axes[coord]
        kwargs["ylim"] = ylim
        fig = logo.draw_multi_position(**kwargs)

    xformat = FuncFormatter(format_float(1e-3, float_places=2))

    for col in range(ncols):
        top_ax = axes[0, col]
        top_ax.set_title(col_labels[col],
                         fontsize=axis_cfg["xlabel_fontsize"],
                         weight="bold", y=1.1)
        btm_ax = axes[-1, col]
        for xticklabel in btm_ax.get_xticklabels():
            xticklabel.set_fontsize(axis_cfg["xtick_fontsize"])
            xticklabel.set_rotation(0)
        btm_ax.set_xlabel("Position", fontsize=axis_cfg["xlabel_fontsize"],
                          weight="bold")
        btm_ax.xaxis.labelpad = axis_cfg['xlabel_pad']

    for row in range(nrows):
        lft_ax = axes[row, 0]
        for yticklabel in lft_ax.get_yticklabels():
            yticklabel.set_fontsize(axis_cfg["ytick_fontsize"])
            yticklabel.set_rotation(0)
        lft_ax.yaxis.set_major_formatter(xformat)
        lft_ax.yaxis.labelpad = axis_cfg['ylabel_pad']
        lft_ax.set_ylabel(row_labels[row], rotation=0,
                          fontsize=axis_cfg['ylabel_fontsize'],
                          weight="bold")

    fig.tight_layout()
    fig.savefig(figpath)
    click.secho("Wrote %s" % figpath, fg="green")
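# Note: format_float is assumed to be a closure suitable for matplotlib's
# FuncFormatter, e.g. a minimal sketch that switches to scientific notation
# below the given cutoff (the real helper may differ in detail):
#
#     def format_float(cutoff, float_places=2, sci_places=1):
#         def call(value, pos=None):
#             if value == 0:
#                 return "0"
#             if abs(value) < cutoff:
#                 return "%.*e" % (sci_places, value)
#             return "%.*f" % (float_places, value)
#         return call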
def run(input_path, output_path, direction, prefix, chrom_class, gc_class,
        freq_class, adjust_strand, limit, force_overwrite, dry_run, verbose):
    if not dry_run:
        makedirs(output_path)

    correct_freq = {
        'All': everything,
        'Common': MakeFreqCompare(MAF, ge=True, get_freq=min,
                                  verbose=verbose),
        'Rare': MakeFreqCompare(MAF, ge=False, get_freq=min,
                                verbose=verbose)
    }[freq_class]

    correct_comp = {
        'All': everything,
        'Hi': MakeFreqCompare(0.5, ge=True, get_freq=get_gc_freq,
                              verbose=verbose),
        'Lo': MakeFreqCompare(0.4, ge=False, get_freq=get_gc_freq,
                              verbose=verbose)
    }[gc_class]

    correct_chrom = {
        'All': everything,
        'A': is_autosome,
        'X': is_xchrom
    }[chrom_class]

    seen = set()
    chroms = set()

    if not os.path.exists(input_path):
        raise IOError("no files matching %s" % input_path)

    name_components = dict(freq_class='freq_' + freq_class,
                           chrom_class='chrom_' + chrom_class,
                           gc_class='GC_' + gc_class,
                           direction=direction,
                           prefix=prefix or '')
    outfilename = os.path.join(
        output_path,
        '%(prefix)s%(freq_class)s-%(chrom_class)s-%(gc_class)s-'
        '%(direction)s.fasta.gz' % name_components)
    runlog_path = os.path.join(
        output_path,
        '%(prefix)s%(freq_class)s-%(chrom_class)s-%(gc_class)s-'
        '%(direction)s.log' % name_components)
    LOGGER.log_file_path = runlog_path

    if not force_overwrite and (os.path.exists(outfilename) or
                                os.path.exists(runlog_path)):
        msg = "Either %s or %s already exist. Force overwrite of existing"\
            " files with -F."
        raise ValueError(msg % (outfilename, runlog_path))

    LOGGER.input_file(input_path)

    direction = tuple(direction.split('to'))
    with open_(input_path) as infile:
        with open_(outfilename, 'w') as outfile:
            num = 0
            for record in filtered_records(infile, direction, seen, chroms,
                                           correct_chrom=correct_chrom,
                                           correct_freq=correct_freq,
                                           correct_comp=correct_comp,
                                           stranded=adjust_strand,
                                           verbose=False):
                outfile.write(record)
                num += 1
                if limit and num >= limit:
                    break

    LOGGER.output_file(outfilename)
    msg = "Wrote %d records to %s" % (num, outfilename)
    print(msg)
    LOGGER.log_message(msg + "\n", label="completed")
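# Worked example of the filename template above: with prefix='',
# freq_class='Common', chrom_class='A', gc_class='All' and direction='AtoG',
# outfilename becomes
# '<output_path>/freq_Common-chrom_A-GC_All-AtoG.fasta.gz'
# and runlog_path the matching '.log'.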
def nbr(countsfile, outpath, countsfile2, first_order, strand_symmetry,
        group_label, group_ref, plot_cfg, no_type3, format, verbose,
        dry_run):
    '''log-linear analysis of neighbouring base influence on point mutation

    Writes estimated statistics, figures and a run log to the specified
    directory outpath.

    See documentation for count table format requirements.
    '''
    if no_type3:
        util.exclude_type3_fonts()

    args = locals()

    outpath = util.abspath(outpath)

    if not dry_run:
        util.makedirs(outpath)
        runlog_path = os.path.join(outpath, "analysis.log")
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')

    counts_filename = util.abspath(countsfile)
    counts_table = util.load_table_from_delimited_file(counts_filename,
                                                       sep='\t')

    LOGGER.input_file(counts_filename, label="countsfile1_path")

    positions = [c for c in counts_table.header if c.startswith('pos')]
    if not first_order and len(positions) != 4:
        raise ValueError("Requires four positions for analysis")

    group_label = group_label or None
    group_ref = group_ref or None
    if strand_symmetry:
        group_label = 'strand'
        group_ref = group_ref or '+'
        if group_label not in counts_table.header:
            print("ERROR: no column named 'strand', exiting.")
            exit(-1)

    if countsfile2:
        print("Performing 2 group analysis")
        group_label = group_label or 'group'
        group_ref = group_ref or '1'
        counts_table1 = counts_table.with_new_column(
            group_label, lambda x: '1', columns=counts_table.header[0])

        fn2 = util.abspath(countsfile2)
        counts_table2 = util.load_table_from_delimited_file(fn2, sep='\t')

        LOGGER.input_file(fn2, label="countsfile2_path")

        counts_table2 = counts_table2.with_new_column(
            group_label, lambda x: '2', columns=counts_table2.header[0])

        # now combine
        header = [group_label] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

        if not dry_run:
            outfile = os.path.join(outpath, 'group_counts_table.txt')
            counts_table.write(outfile, sep='\t')
            LOGGER.output_file(outfile, label="group_counts")

    if dry_run or verbose:
        print()
        print(counts_table)
        print()

    plot_config = util.get_plot_configs(cfg_path=plot_cfg)
    msg = single_group(counts_table, outpath, group_label, group_ref,
                       positions, plot_config, first_order, dry_run)
    print(msg)
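# Illustrative sketch: a dry-run invocation mirroring the CliRunner pattern
# used in the test suite. The flag names here are assumptions, not verified
# against the click option declarations for nbr.
#
#     r = CliRunner().invoke(nbr, ["-1results/sample_AtoC.txt",
#                                  "-oresults/nbr", "--dry_run"])
#     print(r.output)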