def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    args = locals()

    table = LoadTable(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')
        LOGGER.input_file(countsfile)

    # if strand symmetry is requested, a second counts file is not needed
    if strand_symmetry:
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)

    if not strand_symmetry:
        group_label = 'group'

        # make sure the second counts file was provided
        counts_table2 = LoadTable(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column(
            'group', lambda x: '1', columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine
        header = ['group'] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = LoadTable(header=header, rows=raw1 + raw2)

    if verbose:
        print(counts_table)

    # spectra table columns are ordered [count, start, end, group];
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        columns = [c for c in counts_table.header if c != 'start']
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            grp_labels = {'1': countsfile, '2': countsfile2}
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        if p < 1e-6:
            prob = "%.2e" % p
        else:
            prob = "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = ["RE=%.6f" % total_re, "Dev=%.2f" % dev,
                        "df=%d" % df, "p=%s" % p]
        stats = " : ".join(significance)
        print("Start base=%s %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p, formula=formula,
                                    stats=collated.to_json())

    table = LoadTable(header=['start_base'] + list(collated.columns) +
                      ['prob'],
                      rows=results, digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        LOGGER.log_message(str(significance), label="significance")
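
# ``chisqprob`` was removed from modern scipy releases. A minimal fallback,
# assuming the intent is the right-tail chi-square probability of the
# deviance (a hedged sketch, not part of the original module):
try:
    chisqprob
except NameError:
    from scipy.stats import chi2

    def chisqprob(deviance, df):
        # probability of a deviance at least this large under a chi-square
        # distribution with ``df`` degrees of freedom
        return chi2.sf(deviance, df)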
def main(counts_pattern, output_path, strand_symmetric, split_dir, dry_run,
         force_overwrite):
    """export tab delimited combined counts table by appending the 12
    mutation direction tables, adding a new column ``direction``."""
    args = locals()

    output_path = abspath(output_path)
    if strand_symmetric and split_dir:
        split_dir = abspath(split_dir)
    else:
        split_dir = None

    # check the glob pattern produces the correct number of files
    counts_files = glob.glob(counts_pattern)
    check_found_filenames(counts_files)

    counts_filename = os.path.join(output_path, 'combined_counts.txt')
    runlog_path = os.path.join(output_path, "combined_counts.log")

    if not dry_run:
        if not force_overwrite and (os.path.exists(counts_filename) or
                                    os.path.exists(runlog_path)):
            msg = "Either %s or %s already exists. Force overwrite of "\
                  "existing files with -F."
            raise ValueError(msg % (counts_filename, runlog_path))

        makedirs(output_path)
        if split_dir:
            makedirs(split_dir)

        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')
        for fn in counts_files:
            LOGGER.input_file(fn, label="count_file")

    start_time = time.time()

    # run the program
    all_counts = []
    header = None
    num_rows = 0
    basenames = []
    for fn in counts_files:
        basenames.append(os.path.basename(fn))
        mutation = direction.findall(fn)[0]
        table = LoadTable(fn, sep='\t')
        if header is None:
            header = list(table.header)
            header.append('direction')
            num_rows = table.shape[0]

        data = table.tolist()
        new = []
        for row in data:
            row.append(mutation)
            new.append(row)

        all_counts += new

    table = LoadTable(header=header, rows=all_counts)

    if strand_symmetric:
        table = make_strand_symmetric_table(table)

    if split_dir:
        group_subtables = get_subtables(table, group_label='direction')

    if not dry_run:
        table.write(counts_filename, sep='\t')
        LOGGER.output_file(counts_filename)

        if split_dir:
            for group, subtable in group_subtables:
                # we first assume that group is part of the filenames!
                fn = [bn for bn in basenames if group in bn]
                if len(fn) == 1:
                    fn = fn[0]
                else:
                    fn = "%s.txt" % group

                counts_filename = os.path.join(split_dir, fn)
                subtable.write(counts_filename, sep='\t')
                LOGGER.output_file(counts_filename)

    # determine runtime
    duration = time.time() - start_time
    if not dry_run:
        LOGGER.log_message("%.2f" % (duration / 60.),
                           label="run duration (minutes)")
    print("Done!")
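
# ``direction`` is assumed to be a module-level regex that extracts the
# mutation direction from each counts filename. A hedged sketch that would
# satisfy ``direction.findall(fn)[0]`` for names like "counts-CtoT.txt";
# the pattern in the original module may differ:
try:
    direction
except NameError:
    import re

    # matches e.g. "AtoG" or "CtoT" embedded anywhere in a path
    direction = re.compile('[ACGT]to[ACGT]')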
def nbr(countsfile, outpath, countsfile2, first_order, strand_symmetry,
        group_label, group_ref, plot_cfg, no_type3, format, verbose,
        dry_run):
    '''log-linear analysis of neighbouring base influence on point mutation

    Writes estimated statistics, figures and a run log to the specified
    directory outpath.

    See documentation for count table format requirements.
    '''
    if no_type3:
        util.exclude_type3_fonts()

    args = locals()

    outpath = util.abspath(outpath)

    if not dry_run:
        util.makedirs(outpath)
        runlog_path = os.path.join(outpath, "analysis.log")
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')

    counts_filename = util.abspath(countsfile)
    counts_table = util.load_table_from_delimited_file(counts_filename,
                                                       sep='\t')

    LOGGER.input_file(counts_filename, label="countsfile1_path")

    positions = [c for c in counts_table.header if c.startswith('pos')]
    if not first_order and len(positions) != 4:
        raise ValueError("Requires four positions for analysis")

    group_label = group_label or None
    group_ref = group_ref or None
    if strand_symmetry:
        group_label = 'strand'
        group_ref = group_ref or '+'
        if group_label not in counts_table.header:
            print("ERROR: no column named 'strand', exiting.")
            exit(-1)

    if countsfile2:
        print("Performing 2 group analysis")
        group_label = group_label or 'group'
        group_ref = group_ref or '1'
        counts_table1 = counts_table.with_new_column(
            group_label, lambda x: '1', columns=counts_table.header[0])

        fn2 = util.abspath(countsfile2)
        counts_table2 = util.load_table_from_delimited_file(fn2, sep='\t')

        LOGGER.input_file(fn2, label="countsfile2_path")

        counts_table2 = counts_table2.with_new_column(
            group_label, lambda x: '2', columns=counts_table2.header[0])
        # now combine
        header = [group_label] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = LoadTable(header=header, rows=raw1 + raw2)

        if not dry_run:
            outfile = os.path.join(outpath, 'group_counts_table.txt')
            counts_table.write(outfile, sep='\t')
            LOGGER.output_file(outfile, label="group_counts")

    if dry_run or verbose:
        print()
        print(counts_table)
        print()

    plot_config = util.get_plot_configs(cfg_path=plot_cfg)

    msg = single_group(counts_table, outpath, group_label, group_ref,
                       positions, plot_config, first_order, dry_run)
    print(msg)
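
def _demo_two_group_combination():
    # Hedged sketch (never called by the pipeline): illustrates the
    # two-group combination pattern used in ``nbr`` on a toy table. The
    # column names and counts here are invented for illustration only.
    t1 = LoadTable(header=['pos0', 'count'], rows=[['A', 10], ['C', 12]])
    t2 = LoadTable(header=['pos0', 'count'], rows=[['A', 9], ['C', 15]])
    t1 = t1.with_new_column('group', lambda x: '1', columns=t1.header[0])
    t2 = t2.with_new_column('group', lambda x: '2', columns=t2.header[0])
    # move the new group column to the front, then stack the two tables
    header = ['group'] + t2.header[:-1]
    return LoadTable(header=header,
                     rows=t1.tolist(header) + t2.tolist(header))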
def collate(base_path, output_path, exclude_paths, overwrite):
    """collates all classifier performance stats and writes to a single
    tsv file"""
    LOGGER.log_args()
    outpath = os.path.join(output_path, "collated.tsv.gz")
    logfile_path = os.path.join(output_path, "collated.log")
    if os.path.exists(outpath) and not overwrite:
        click.secho(f"Skipping. {outpath} exists. "
                    "Use overwrite to force.", fg='green')
        exit(0)

    stat_fns = exec_command(f'find {base_path} -name'
                            ' "*performance.json*"')
    stat_fns = stat_fns.splitlines()
    if not stat_fns:
        msg = f'No files matching "*performance.json*" in {base_path}'
        click.secho(msg, fg='red')
        return

    LOGGER.log_file_path = logfile_path
    records = []
    keys = set()
    exclude_paths = [] if exclude_paths is None else exclude_paths.split(',')
    num_skipped = 0
    for fn in tqdm(stat_fns, ncols=80):
        if skip_path(exclude_paths, fn):
            num_skipped += 1
            LOGGER.log_message(fn, label="SKIPPED FILE")
            continue

        LOGGER.input_file(fn)
        data = load_json(fn)
        labels = data['classification_report']['labels']
        fscores = data['classification_report']['f-score']
        row = {
            "stat_path": fn,
            "classifier_path": data["classifier_path"],
            "auc": data["auc"],
            "algorithm": data["classifier_label"],
            "mean_precision": data["mean_precision"],
            f"fscore({labels[0]})": fscores[0],
            f"fscore({labels[1]})": fscores[1],
            'balanced_accuracy': data['balanced_accuracy'],
        }
        row.update(data["feature_params"])
        keys.update(row.keys())
        records.append(row)

    columns = sorted(keys)
    rows = list(map(lambda r: [r.get(c, None) for c in columns], records))
    table = LoadTable(header=columns, rows=rows)
    table = table.sorted(reverse="auc")
    table = table.with_new_column(
        "name",
        lambda x: model_name_from_features(*x),
        columns=["flank_size", "feature_dim", "usegc", "proximal"])
    table = table.with_new_column("size", sample_size_from_path,
                                  columns="classifier_path")
    table.write(outpath)
    LOGGER.output_file(outpath)

    # make summary statistics via grouping by factors
    factors = [
        "algorithm", "name", "flank_size", "feature_dim", "proximal",
        "usegc", "size"
    ]
    summary = summary_stat_table(table, factors=factors)
    outpath = os.path.join(output_path, "summary_statistics.tsv.gz")
    summary.write(outpath)
    LOGGER.output_file(outpath)
    if num_skipped:
        click.secho("Skipped %d files that matched exclude_paths" %
                    num_skipped, fg='red')
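
def _skip_path_sketch(exclude_paths, path):
    # Hedged sketch of what ``skip_path`` presumably does: report whether
    # any comma-separated exclude term occurs in the candidate path. The
    # real helper may use globbing or regexes instead.
    return any(term in path for term in exclude_paths)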
def single_group(counts_table, outpath, group_label, group_ref, positions,
                 plot_config, first_order, dry_run):
    # Collect statistical analysis results
    summary = []
    max_results = {}

    # Single position analysis
    print("Doing single position analysis")
    single_results = single_position_effects(counts_table, positions,
                                             group_label=group_label)
    summary += make_summary(single_results)

    max_results[1] = max(single_results[p]['rel_entropy']
                         for p in single_results)
    if not dry_run:
        outfilename = os.path.join(outpath, "1.json")
        util.dump_loglin_stats(single_results, outfilename)
        LOGGER.output_file(outfilename, label="analysis1")

    fig = get_single_position_fig(
        single_results, positions,
        plot_config.get('1-way plot', 'figsize'),
        group_label=group_label, group_ref=group_ref,
        figwidth=plot_config.get('1-way plot', 'figwidth'),
        xlabel_fontsize=plot_config.get('1-way plot', 'xlabel_fontsize'),
        ylabel_fontsize=plot_config.get('1-way plot', 'ylabel_fontsize'),
        xtick_fontsize=plot_config.get('1-way plot', 'xtick_fontsize'),
        ytick_fontsize=plot_config.get('1-way plot', 'ytick_fontsize'))

    format_offset(fig, int(plot_config.get('1-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "1.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    fig.clf()  # refresh for next section

    if first_order:
        msg = "Done! Check %s for your results" % outpath
        summary = LoadTable(header=['Position', 'RE', 'Deviance', 'df',
                                    'prob', 'formula'],
                            rows=summary, digits=2, space=2)
        if not dry_run:
            outfilename = os.path.join(outpath, "summary.txt")
            summary.write(outfilename, sep='\t')
            LOGGER.output_file(outfilename, label="summary")
        return msg

    print("Doing two positions analysis")
    results = get_two_position_effects(counts_table, positions,
                                       group_label=group_label)
    summary += make_summary(results)
    max_results[2] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "2.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis2")

    fig = get_two_position_fig(results, positions,
                               plot_config.get('2-way plot', 'figsize'),
                               group_label=group_label, group_ref=group_ref,
                               xtick_fontsize=plot_config.get(
                                   '2-way plot', 'xtick_fontsize'),
                               ytick_fontsize=plot_config.get(
                                   '2-way plot', 'ytick_fontsize'))
    fig.set_figwidth(plot_config.get('2-way plot', 'figwidth'))
    x_fsz = plot_config.get('2-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('2-way plot', 'ylabel_fontsize')
    fig.text(0.5, plot_config.get('2-way plot', 'xlabel_pad'), 'Position',
             ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get('2-way plot', 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('2-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "2.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    fig.clf()  # refresh for next section

    print("Doing three positions analysis")
    results = get_three_position_effects(counts_table, positions,
                                         group_label=group_label)
    summary += make_summary(results)
    max_results[3] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "3.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis3")

    fig = get_three_position_fig(results, positions,
                                 plot_config.get('3-way plot', 'figsize'),
                                 group_label=group_label,
                                 group_ref=group_ref,
                                 xtick_fontsize=plot_config.get(
                                     '3-way plot', 'xtick_fontsize'),
                                 ytick_fontsize=plot_config.get(
                                     '3-way plot', 'ytick_fontsize'))
    fig.set_figwidth(plot_config.get('3-way plot', 'figwidth'))
    x_fsz = plot_config.get('3-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('3-way plot', 'ylabel_fontsize')
    fig.text(0.5, plot_config.get('3-way plot', 'xlabel_pad'), 'Position',
             ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get('3-way plot', 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('3-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "3.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    fig.clf()  # refresh for next section

    print("Doing four positions analysis")
    results = get_four_position_effects(counts_table, positions,
                                        group_label=group_label)
    summary += make_summary(results)
    max_results[4] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "4.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis4")

    fig = get_four_position_fig(results, positions,
                                plot_config.get('4-way plot', 'figsize'),
                                group_label=group_label,
                                group_ref=group_ref)
    fig.set_figwidth(plot_config.get('4-way plot', 'figwidth'))
    ax = fig.gca()
    x_fsz = plot_config.get('4-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('4-way plot', 'ylabel_fontsize')
    ax.set_xlabel('Position', fontsize=x_fsz)
    ax.set_ylabel('RE', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('4-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "4.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    fig.clf()  # refresh for next section

    # now generate summary plot
    bar_width = 0.5
    index = numpy.arange(4)
    y_lim = max(max_results.values())
    y_fmt = util.FixedOrderFormatter(numpy.floor(numpy.log10(y_lim)))

    fig = pyplot.figure(figsize=plot_config.get('summary plot', 'figsize'))
    ax = fig.gca()
    ax.yaxis.set_major_formatter(y_fmt)

    bar = pyplot.bar(index, [max_results[i] for i in range(1, 5)],
                     bar_width)
    pyplot.xticks(index + (bar_width / 2.), list(range(1, 5)),
                  fontsize=plot_config.get('summary plot',
                                           'xtick_fontsize'))
    x_sz = plot_config.get('summary plot', 'xlabel_fontsize')
    y_sz = plot_config.get('summary plot', 'ylabel_fontsize')
    ax.set_xlabel("Effect Order", fontsize=x_sz)
    ax.set_ylabel("RE$_{max}$", fontsize=y_sz)
    x_sz = plot_config.get('summary plot', 'xtick_fontsize')
    y_sz = plot_config.get('summary plot', 'ytick_fontsize')
    ax.tick_params(axis='x', labelsize=x_sz, pad=x_sz // 2, length=0)
    ax.tick_params(axis='y', labelsize=y_sz, pad=y_sz // 2)
    format_offset(fig, int(plot_config.get('summary plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.pdf")
        pyplot.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    summary = LoadTable(header=['Position', 'RE', 'Deviance', 'df', 'prob',
                                'formula'],
                        rows=summary, digits=2, space=2)
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.txt")
        summary.write(outfilename, sep='\t')
        LOGGER.output_file(outfilename, label="summary")

    print(summary)
    pyplot.close('all')
    msg = "Done! Check %s for your results" % outpath
    return msg
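
# ``util.FixedOrderFormatter`` is constructed above with a power of ten.
# A hedged sketch of such a formatter (the classic matplotlib recipe; the
# override hook shown is the matplotlib >= 3.1 spelling, and the original
# implementation may differ):
from matplotlib.ticker import ScalarFormatter


class _FixedOrderFormatterSketch(ScalarFormatter):
    """pins the axis offset to a fixed power of ten instead of letting
    matplotlib choose one per plot"""

    def __init__(self, order_of_magnitude=0, **kwargs):
        self._fixed_order = order_of_magnitude
        super().__init__(useOffset=True, useMathText=True, **kwargs)

    def _set_order_of_magnitude(self):
        # matplotlib calls this to pick the exponent; fix it instead
        self.orderOfMagnitude = self._fixed_order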