def test_make_table_from_dataframe(self):
    """a pandas DataFrame is converted into a table, other types are rejected"""
    frame = DataFrame(data=[[0, 1], [3, 7]], columns=["a", "b"])
    table = make_table(data_frame=frame)
    expected = {"a": [0, 3], "b": [1, 7]}
    for name, values in expected.items():
        assert_equal(table.columns[name], values)

    # the data_frame argument must be a DataFrame
    with self.assertRaises(TypeError):
        make_table(data_frame="abcde")
def test_str_tex_format(self):
    """to_string(format="tex") produces a latex tabular with optional captions"""

    def build(**extra):
        # fresh header / data lists per table so the cases stay independent
        return make_table(
            header=["a", "b"],
            data=[["val1", "val2"], ["val3", "val4"]],
            **extra,
        )

    def render(table, **options):
        return table.to_string(format="tex", **options).splitlines()

    # neither title nor legend, so no caption
    plain = build().to_string(format="tex")
    self.assertNotIn("caption", plain)

    # with a title
    titled = build(title="a title")
    self.assertEqual(render(titled)[-2], r"\caption{a title}")
    labelled = render(titled, label="tab:first")
    self.assertEqual(labelled[-3], r"\caption{a title}")
    self.assertEqual(labelled[-2], r"\label{tab:first}")

    # with a legend, no title
    legend_only = build(legend="a legend")
    # because it's treated as a title by default
    self.assertEqual(render(legend_only)[-2], r"\caption{a legend}")
    # unless you say not to
    self.assertEqual(
        render(legend_only, concat_title_legend=False)[-2], r"\caption*{a legend}"
    )

    # with both a title and a legend
    both = build(title="a title.", legend="a legend")
    self.assertEqual(render(both)[-2], r"\caption{a title. a legend}")
    separate = render(both, concat_title_legend=False)
    self.assertEqual(separate[2], r"\caption{a title.}")
    self.assertEqual(separate[-2], r"\caption*{a legend}")
    separate_labelled = render(both, concat_title_legend=False, label="table")
    self.assertEqual(separate_labelled[2], r"\caption{a title.}")
    self.assertEqual(separate_labelled[3], r"\label{table}")
def test_make_table(self):
    """make_table accepts a dict of dicts and can index rows via row_ids"""
    names = ["NineBande", "edge.1", "DogFaced", "Human"]
    parents = dict.fromkeys(names, "root")
    parents["Human"] = "edge.0"
    data = {"edge.parent": parents}
    for column, value in (("x", 1.0), ("length", 4.0), ("y", 3.0), ("z", 6.0)):
        data[column] = dict.fromkeys(names, value)
    data["edge.names"] = {name: name for name in names}

    unindexed = make_table(data=data)
    self.assertEqual(unindexed.shape, (4, 6))
    # if index column not specified
    with self.assertRaises(IndexError):
        _ = unindexed["Human", "edge.parent"]

    # applies row_ids as an index
    indexed = make_table(data=data, row_ids="edge.names")
    # index col is the first one, and the data can be indexed
    self.assertEqual(indexed.columns.order[0], "edge.names")
    self.assertEqual(indexed["Human", "edge.parent"], "edge.0")
def spectra_table(table, group_label):
    """returns a table of summed counts per direction and group category

    The result has columns count, start, direction and group_label, where
    start is the first character of the direction. Position columns are
    dropped.
    """
    assert 'direction' in table.header
    if 'mut' in table.header:
        # remove redundant category (counts of M == R)
        table = table.filtered("mut=='M'")

    # so we have a table with counts per direction
    table = table.get_columns(['count', 'direction', group_label])
    categories = table.distinct_values(group_label)

    def summed_count(direction, category):
        query = "direction=='%s' and %s=='%s'" % (direction, group_label, category)
        return table.filtered(query).summed('count')

    rows = [
        [summed_count(direction, category), direction[0], direction, category]
        for direction in table.distinct_values('direction')
        for category in categories
    ]
    return make_table(header=['count', 'start', 'direction', group_label],
                      rows=rows)
def get_combined_counts(table, positions):
    """returns a sorted table of 'M' and 'R' counts for every base state

    Arguments:
        - table: counts table, passed through to reduced_one_position /
          reduced_multiple_positions
        - positions: a single position name (str) or a series of names

    The header is ['mut', 'base', 'count'] for a single position, or
    ['mut', 'base1', ..., 'baseN', 'count'] for N positions. Rows are sorted
    by every column except count.
    """
    bases = 'ACGT'
    # isinstance rather than type() ==, so str subclasses (e.g. numpy.str_
    # column names) take the single position path instead of being unpacked
    # character by character below
    if isinstance(positions, str):
        counts = reduced_one_position(table, positions)
        states = bases
        header = ['mut', 'base', 'count']
    else:
        counts = reduced_multiple_positions(table, *positions)
        states = product(bases, repeat=len(positions))
        header = (['mut']
                  + ['base%d' % (i + 1) for i in range(len(positions))]
                  + ['count'])

    mut_counts = counts['M']
    unmut_counts = counts['R']

    combined = []
    for state in states:
        # state is a single character or a tuple of characters; list() gives
        # one cell per position in both cases
        combined.append(['R'] + list(state) + [unmut_counts[state]])
        combined.append(['M'] + list(state) + [mut_counts[state]])

    counts_table = make_table(header=header, rows=combined)
    return counts_table.sorted(columns=header[:-1])
def test_strandsym_table(self):
    """makes strand symmetric table"""
    minus_input = [
        [1, "T", "T", "T", "T", "M", "TtoG"],
        [1, "G", "A", "A", "C", "M", "TtoG"],
        [1, "A", "G", "A", "A", "M", "TtoG"],
        [1, "G", "A", "A", "G", "M", "TtoG"],
        [1, "A", "C", "A", "A", "M", "TtoG"],
        [1, "G", "A", "C", "A", "M", "TtoG"],
    ]
    # rows from the fixture are expected back unchanged on the + strand
    plus_expected = [row[:] + ["+"] for row in self.data]
    # the extra rows are expected reverse complemented, relabelled as AtoC
    # and assigned to the - strand
    minus_expected = [
        [row[0]]
        + [DNA.complement(base) for base in reversed(row[1:-2])]
        + ["M", "AtoC", "-"]
        for row in minus_input
    ]

    table = make_table(header=self.header, rows=self.data + minus_input)
    got = make_strand_symmetric_table(table)
    self.assertEqual(got.tolist(), plus_expected + minus_expected)
def make_strand_symmetric_table(table):
    '''returns a table with an extra strand column

    For each (plus, minus) pair in MUTATION_COMPLEMENTS, rows whose direction
    is plus are kept and labelled '+'. Rows whose direction is minus are
    reverse complemented, relabelled with the plus direction and labelled '-'.
    '''
    direction_index = [i for i, name in enumerate(table.header)
                       if name == 'direction'][0]
    rows = []
    for plus, minus in MUTATION_COMPLEMENTS.items():
        plus_rows = table.filtered('direction=="%s"' % plus).tolist()
        rows.extend(add_strand_column(plus_rows, '+'))

        minus_table = table.filtered('direction=="%s"' % minus)
        if minus_table.shape[0] != 0:
            minus_rows = _reverse_complement(minus_table).tolist()
            for row in minus_rows:
                row[direction_index] = plus
            rows.extend(add_strand_column(minus_rows, '-'))

    return make_table(header=table.header[:] + ['strand'], rows=rows)
def test_str_md_format(self):
    """markdown output escapes a pipe character inside a cell"""
    table = make_table(
        header=["a", "b"],
        data=[["val1", "val2"], ["has | symbol", "val4"]],
    )
    markdown = table.to_string(format="md")
    self.assertIn(r"has \| symbol", markdown)
def test_valid_setitem(self):
    """tabular_result accepts a Table and a DictArray and serialises to json"""
    result = tabular_result("null")
    result["result"] = make_table(data={"A": [0, 1]})
    result["result2"] = DictArray({"A": [0, 1]})
    self.assertIsInstance(result.to_json(), str)
def test_load_mixed_static(self):
    """load data, mixed data type columns remain as string"""
    t = make_table(header=["A", "B"], data=[[1, 1], ["a", 2]])
    # the first positional argument of TemporaryDirectory is ``suffix``, so
    # the parent directory has to be given by keyword
    with TemporaryDirectory(dir=".") as dirname:
        path = pathlib.Path(dirname) / "table.txt"
        t.write(str(path), sep="\t")
        # if static types, then mixed columns become strings
        r = load_table(path, sep="\t", static_column_types=True)
        self.assertIn("str", r.columns["A"].dtype.name)
def load_table_from_delimited_file(path, sep='\t'):
    '''returns a Table object after a quicker loading

    Arguments:
        - path: location of the delimited file, opened via open_
        - sep: the field delimiter

    The first line is the header and must include a 'count' column
    (ValueError otherwise). That column is converted to int, all other
    fields are left as strings. Blank lines are skipped.
    '''
    with open_(path, 'rt') as infile:
        header = infile.readline().strip().split(sep)
        count_index = header.index('count')
        records = []
        for line in infile:
            line = line.strip()
            if not line:
                # a blank line (e.g. extra newline at the end of the file)
                # holds no record and would fail the int conversion
                continue
            fields = line.split(sep)
            fields[count_index] = int(fields[count_index])
            records.append(fields)
    return make_table(header=header, rows=records)
def missing_species_names(names):
    """returns a Table of missing species names, or None

    A name counts as missing when Species.get_species_name gives the string
    "None" for it.
    """
    missing = [
        [name] for name in names if Species.get_species_name(name) == "None"
    ]
    if not missing:
        return None
    return make_table(header=["MISSING SPECIES"], data=missing)
def test_reverse_complement(self):
    """_reverse_complement gives the expected rows for the fixture data"""
    expected = [
        [1670, "A", "A", "A", "A", "M", "AtoC"],
        [557, "G", "T", "T", "C", "M", "AtoC"],
        [1479, "T", "T", "C", "T", "M", "AtoC"],
        [925, "C", "T", "T", "C", "M", "AtoC"],
        [1919, "T", "T", "G", "T", "M", "AtoC"],
        [442, "T", "G", "T", "C", "M", "AtoC"],
    ]
    source = make_table(header=self.header, rows=self.data)
    self.assertEqual(_reverse_complement(source).tolist(), expected)
def _reverse_complement(table):
    '''returns a table with sequences reverse complemented

    Only columns whose name starts with 'pos' are changed. Returns None if
    the table has no rows.
    '''
    pos_columns = [index for index, name in enumerate(table.header)
                   if name.startswith('pos')]
    rows = table.tolist()
    if not rows:
        return None

    for row in rows:
        # we use the cogent3 DnaSeq object to do reverse complementing
        motif = DNA.make_seq(''.join(row[index] for index in pos_columns))
        revcomp = list(motif.rc())
        for offset, index in enumerate(pos_columns):
            row[index] = revcomp[offset]

    return make_table(header=table.header, rows=rows)
def get_grouped_combined_counts(table, position, group_label):
    """wraps motif_count.get_combined_counts for groups

    Arguments:
        - table: counts table that includes a group_label column
        - position: passed through to motif_count.get_combined_counts
        - group_label: name of the column holding the group categories

    Returns a table with group_label as the first column followed by the
    combined counts columns, sorted by group_label and then 'mut'.
    """
    group_cats = table.distinct_values(group_label)
    all_data = []
    header = None
    for category in group_cats:
        subtable = table.filtered(lambda x: x == category,
                                  columns=group_label)
        counts = motif_count.get_combined_counts(subtable, position)
        if header is None:
            header = [group_label] + list(counts.header)

        # the callback ignores its argument, every row gets this category
        counts = counts.with_new_column(group_label, lambda x: category,
                                        columns=counts.header[0])
        all_data.extend(counts.tolist(header))

    counts = make_table(header=header, rows=all_data)
    # sorted() returns a new table, it does not sort in place, so the
    # result must be the value that is returned
    return counts.sorted(columns=[group_label, 'mut'])
def test_deserialise_tabular_table(self):
    """a Table survives a to_json / deserialise_object round trip"""
    from cogent3 import make_table

    rows = [
        [1, "abc", 11],
        [2, "bca", 22],
        [3, "cab", 33],
        [4, "abc", 44],
        [5, "bca", 55],
    ]
    original = make_table(header=["id", "foo", "bar"], rows=rows)
    restored = deserialise_object(original.to_json())
    self.assertEqual(restored.to_dict(), original.to_dict())
def test_summed(self):
    """test the table summed method"""
    uniform = Table(header=self.t5_header, rows=self.t5_rows)
    self.assertEqual(uniform.summed(), [4, 4, 4, 4])
    self.assertEqual(uniform.summed(col_sum=False), [4, 4, 8])

    by_index = Table(header=self.t2_header, rows=self.t2_rows)
    self.assertEqual(by_index.summed(indices=2), 165)

    # column B mixes an empty string with numbers
    mix = make_table(header=["A", "B"], rows=[[0, ""], [1, 2], [3, 4]])
    cases = [
        (("B",), dict(strict=False), 6),
        ((0,), dict(col_sum=False, strict=False), 0),
        ((1,), dict(col_sum=False), 3),
        ((), dict(strict=False), [4, 6]),
        ((), dict(col_sum=False, strict=False), [0, 3, 7]),
    ]
    for args, kwargs, expected in cases:
        self.assertEqual(mix.summed(*args, **kwargs), expected)

    with self.assertRaises(RuntimeError):
        _ = mix.summed([0, 2], col_sum=False, strict=False)

    with self.assertRaises(TypeError):
        _ = mix.summed(strict=True)
def test_count_unique(self):
    """count_unique tallies single column values and column combinations"""
    table = make_table(
        data={
            "Project_Code": [
                "Ovary-AdenoCA",
                "Liver-HCC",
                "Panc-AdenoCA",
                "Panc-AdenoCA",
            ],
            "Donor_ID": ["DO46416", "DO45049", "DO51493", "DO32860"],
            "Variant_Classification": ["IGR", "Intron", "Intron", "Intron"],
        }
    )

    pair_counts = table.count_unique(["Project_Code", "Variant_Classification"])
    self.assertEqual(pair_counts[("Panc-AdenoCA", "Intron")], 2)
    # a combination that never occurs
    self.assertEqual(pair_counts[("Liver-HCC", "IGR")], 0)

    single_counts = table.count_unique("Variant_Classification")
    self.assertEqual(single_counts["Intron"], 3)
    self.assertEqual(single_counts["IGR"], 1)
def dump_genes(ensembl_account, species, outpath, coord_names, release, limit):
    """Dump meta data table for genes from one species in release ENSEMBL_ACCOUNT
    and exits."""
    ensembl_account = _get_account(ensembl_account)

    # guard: only one species per invocation
    if len(species) > 1:
        click.secho("dump_genes handles single species only", fg="red")
        sys.exit(-1)

    # guard: the species name must be known
    missing_species = missing_species_names(species)
    if missing_species:
        report = "\n".join(
            [
                "The following species names don't match an Ensembl record. Check spelling!",
                str(missing_species),
                "\nAvailable species are at this server are:",
                str(display_available_dbs(ensembl_account)),
            ]
        )
        click.secho(report, fg="red")
        sys.exit(-1)

    chroms = load_coord_names(coord_names) if coord_names else None
    genome = Genome(species[0], release=release, account=ensembl_account)
    records = [
        [gene.stableid, gene.biotype, gene.location, gene.description]
        for gene in _get_ref_genes(genome, chroms, limit)
    ]

    if not records:
        click.secho("No genes matching criteria", fg="blue")
        return

    table = make_table(
        header=["stableid", "biotype", "location", "description"], rows=records
    )
    table.write(outpath)
    click.secho("Wrote %d genes to %s" % (table.shape[0], outpath), fg="green")
def status(configpath):
    """checks download/install status using checkpoint files and config"""
    # only local_path and species_dbs are needed here
    _, _, local_path, species_dbs = read_config(configpath)
    dbnames = reduce_dirnames(os.listdir(local_path), species_dbs)
    rows = [
        [
            db.name,
            is_downloaded(local_path, db.name),
            is_installed(local_path, db.name),
        ]
        for db in dbnames
    ]
    summary = make_table(
        header=["dbname", "Downloaded", "Installed"],
        rows=rows,
        title="Status of download and install",
        legend=f"config={configpath.name}; local_path={local_path}",
    )
    print(summary)
def get_count_table(observed, control, k=None):
    """return table of motif counts

    Each motif position is a separate column. Every motif that is a key of
    observed or control is considered. A motif absent from one of them
    counts as 0 there, and motifs with 0 in both are left out.

    Arguments:
        - observed: the observed counts as {seq: count}
        - control: the control counts as {seq: count}
        - k: size of the motif

    Each retained motif contributes two rows, the control count labelled 'R'
    and the observed count labelled 'M'.

    Raises ValueError if the motifs are not all the same length, or if k is
    given and does not match that length.
    """
    lengths = {len(motif) for motif in observed} | {len(motif) for motif in control}
    if len(lengths) != 1:
        raise ValueError("Motifs not all same length: %s" % str(lengths))

    length = next(iter(lengths))
    if k and length != k:
        raise ValueError("k[%d] doesn't match motif length [%d]" %
                         (k, length))
    elif k is None:
        k = length

    rows = []
    for state in sorted(set(observed) | set(control)):
        state = ''.join(state)
        # states come from the union of keys, so a motif can be missing from
        # one mapping; .get avoids a KeyError for plain dicts and gives the
        # same 0 a Counter would
        control_counts = control.get(state, 0)
        observed_counts = observed.get(state, 0)
        if control_counts == observed_counts == 0:
            # we skip unobserved states
            continue

        rows.append([control_counts] + list(state) + ['R'])
        rows.append([observed_counts] + list(state) + ['M'])

    header = ['count'] + ["pos%d" % i for i in range(k)] + ['mut']
    return make_table(header=header, rows=rows)
def display_available_dbs(account, release=None):
    """displays the available Ensembl databases at the nominated host

    Arguments:
        - account: passed to get_db_name to identify the host
        - release: passed to get_db_name, None by default

    Returns a table of core and compara database names with columns Release,
    Db Name, Species and Common Name, sorted by release and then name.
    """
    db_list = get_db_name(account=account, db_type="core", release=release)
    db_list += get_db_name(account=account, db_type="compara", release=release)
    rows = []
    for db_name in db_list:
        species_name = db_name.species
        # reset per database: without this, a database that has no species
        # would reuse the common name of the previous database, or fail as
        # unbound if it is the first one
        common_name = None
        if species_name:
            common_name = Species.get_common_name(species_name, level="ignore")

        if "compara" in db_name.name:
            # compara databases span species, so neither name applies
            species_name = common_name = "-"

        rows.append([db_name.release, db_name.name, species_name, common_name])

    table = make_table(
        header=["Release", "Db Name", "Species", "Common Name"], data=rows, space=2
    )
    table = table.sorted(["Release", "Db Name"])
    table.legend = (
        "Values of 'None' indicate cogent does not have a value for that database name."
    )
    return table
def _parse_db_display(output, columns):
    """finds the table display and accumulates the content

    The header is the first line of output containing columns[0]. Data rows
    start two lines below it and stop at the first line beginning with a run
    of dashes. Returns None if no header line is found.
    """
    lines = output.splitlines()
    header_at = next(
        (index for index, text in enumerate(lines) if columns[0] in text), None
    )
    if header_at is None:
        return None

    num_columns = len(columns)
    rows = []
    # skip the header line and the line directly beneath it
    for text in lines[header_at + 2:]:
        text = text.strip()
        if text.startswith("----------"):
            break
        rows.append(text.split()[:num_columns])

    return make_table(header=columns, data=rows)
def get_one2one_orthologs(
    compara, ref_genes, outpath, not_strict, force_overwrite, test
):
    """writes one-to-one orthologs of protein coding genes to outpath

    For each id in ref_genes, the "ortholog_one2one" related genes are
    fetched from compara. The CDS of every member (stop codon trimmed) is
    written as gzipped fasta to "<gene>.fa.gz" in outpath, or printed instead
    when test is true. A "metadata.tsv" table of the collected members is
    written to outpath when at least one gene was counted as written.

    Unless not_strict is true, a gene is skipped when the species of its
    orthologs differ from compara.species. An existing output file is counted
    as written and skipped unless force_overwrite is true.
    """
    # Counter so the comparison with each ortholog set ignores order
    species = Counter(compara.species)
    written = 0
    records = []
    with click.progressbar(ref_genes, label="Finding 1to1 orthologs") as ids:
        for gene in ids:
            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene)
            if os.path.exists(outfile_name) and not force_overwrite:
                # already done in a previous run
                written += 1
                continue

            syntenic = list(
                compara.get_related_genes(
                    stableid=gene, relationship="ortholog_one2one"
                )
            )

            # only an unambiguous, single result is usable
            if len(syntenic) != 1:
                continue

            syntenic = syntenic[0]

            # NOTE(review): when not_strict is true a None value is not
            # filtered out here and would fail at syntenic.members below
            if not not_strict and (
                syntenic is None or Counter(syntenic.get_species_set()) != species
            ):
                # skipping, not all species had a 1to1 ortholog for this gene
                continue

            seqs = []
            for m in syntenic.members:
                records.append([gene, m.stableid, m.location, m.description])
                name = Species.get_common_name(m.genome.species)
                cds = m.canonical_transcript.cds.trim_stop_codon(allow_partial=True)
                # sequences are labelled by the species common name
                cds.name = name
                seqs.append([name, cds])

            seqs = make_unaligned_seqs(data=seqs)
            if test:
                print()
                print(gene)
                print(seqs.to_fasta())
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(seqs.to_fasta() + "\n")
                LOGGER.output_file(outfile_name)

            written += 1

    if test:
        msg = "Would have written %d files to %s" % (written, outpath)
    else:
        msg = "Wrote %d files to %s" % (written, outpath)

    click.echo(msg)

    # NOTE(review): the metadata file is written in test mode too, confirm
    # that is intended
    if written > 0:
        metadata = make_table(
            header=["refid", "stableid", "location", "description"], rows=records
        )
        metadata.write(os.path.join(outpath, "metadata.tsv"))

    return
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite, dry_run, verbose):
    # Compares mutation spectra between two groups, separately for each
    # starting base. The groups are either the strands within countsfile
    # (strand_symmetry) or countsfile versus countsfile2. Unless dry_run,
    # writes spectra_analysis.log, spectra_analysis.json and
    # spectra_summary.txt into outpath.
    # NOTE(review): force_overwrite is not used in this function.

    # captured first so only the call arguments are logged
    args = locals()

    table = load_table(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')
        LOGGER.input_file(countsfile)
        # if there's a strand symmetry argument then we don't need a second file

    if strand_symmetry:
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)

    if not strand_symmetry:
        group_label = 'group'

        # be sure there's two files
        assert countsfile2, f"must provide second counts file"
        counts_table2 = load_table(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        # label each file's rows with a constant group value, '1' or '2'
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column('group', lambda x: '1',
                                              columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine
        header = ['group'] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

    if verbose:
        print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        columns = [c for c in counts_table.header if c != 'start']
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        r = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            # report the file names rather than the '1' / '2' group labels
            grp_labels = {'1': countsfile, '2': countsfile2}
            grp_index = list(collated.columns).index('group')
            for row in r:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        # scientific notation for very small probabilities
        if p < 1e-6:
            prob = "%.2e" % p
        else:
            prob = "%.6f" % p

        for row in r:
            row.insert(0, start_base)
            row.append(prob)

        results += r

        significance = [
            "RE=%.6f" % total_re,
            "Dev=%.2f" % dev,
            "df=%d" % df,
            "p=%s" % p
        ]

        stats = " : ".join(significance)
        print("Start base=%s  %s" % (start_base, stats))
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p,
                                    formula=formula, stats=collated.to_json())

    # NOTE(review): collated and significance below are those of the last
    # start base processed, and are undefined if there were no start bases
    table = make_table(header=['start_base'] + list(collated.columns) +
                       ['prob'],
                       rows=results, digits=5).sorted(columns='ret')
    json_path = None

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        LOGGER.log_message(str(significance), label="significance")
def test_to_plotly(self):
    """to_plotly gives a Drawable whose figure passes the "table" checks"""
    drawable = make_table(header=["a", "b"], data=[[0, 1]], index="a").to_plotly()
    self.assertIsInstance(drawable, Drawable)
    self._check_drawable_attrs(drawable.figure, "table")
def single_group(counts_table, outpath, group_label, group_ref, positions, plot_config, first_order, dry_run):
    """runs the one to four position analyses and produces their figures

    For each effect order the statistics are computed, appended to the
    summary and plotted. Unless dry_run is true, "<order>.json" and
    "<order>.pdf" are written to outpath, followed by "summary.pdf" and
    "summary.txt". When first_order is true only the single position
    analysis is done.

    Returns a message pointing the user at outpath.
    """
    # Collect statistical analysis results
    summary = []

    # largest relative entropy seen for each effect order
    max_results = {}
    # Single position analysis
    print("Doing single position analysis")
    single_results = single_position_effects(counts_table, positions,
                                             group_label=group_label)
    summary += make_summary(single_results)

    max_results[1] = max(single_results[p]['rel_entropy']
                         for p in single_results)
    if not dry_run:
        outfilename = os.path.join(outpath, "1.json")
        util.dump_loglin_stats(single_results, outfilename)
        LOGGER.output_file(outfilename, label="analysis1")

    fig = get_single_position_fig(
        single_results, positions,
        plot_config.get('1-way plot', 'figsize'),
        group_label=group_label,
        group_ref=group_ref,
        figwidth=plot_config.get('1-way plot', 'figwidth'),
        xlabel_fontsize=plot_config.get('1-way plot',
                                        'xlabel_fontsize'),
        ylabel_fontsize=plot_config.get('1-way plot',
                                        'ylabel_fontsize'),
        xtick_fontsize=plot_config.get('1-way plot',
                                       'xtick_fontsize'),
        ytick_fontsize=plot_config.get('1-way plot',
                                       'ytick_fontsize'))

    format_offset(fig, int(plot_config.get('1-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "1.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
    fig.clf()  # refresh for next section

    if first_order:
        # stop here, the summary only holds the single position results
        msg = "Done! Check %s for your results" % outpath
        summary = make_table(header=['Position', 'RE', 'Deviance', 'df',
                                     'prob', 'formula'],
                             rows=summary, digits=2, space=2)
        if not dry_run:
            outfilename = os.path.join(outpath, "summary.txt")
            summary.write(outfilename, sep='\t')
            LOGGER.output_file(outfilename, label="summary")

        return msg

    print("Doing two positions analysis")
    results = get_two_position_effects(counts_table, positions,
                                       group_label=group_label)
    summary += make_summary(results)

    max_results[2] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "2.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis2")

    fig = get_two_position_fig(results, positions,
                               plot_config.get('2-way plot', 'figsize'),
                               group_label=group_label, group_ref=group_ref,
                               xtick_fontsize=plot_config.get(
                                   '2-way plot', 'xtick_fontsize'),
                               ytick_fontsize=plot_config.get('2-way plot', 'ytick_fontsize'))
    fig.set_figwidth(plot_config.get('2-way plot', 'figwidth'))
    x_fsz = plot_config.get('2-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('2-way plot', 'ylabel_fontsize')
    # axis labels are placed as figure level text
    fig.text(0.5, plot_config.get('2-way plot', 'xlabel_pad'), 'Position',
             ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get('2-way plot', 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('2-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "2.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
    fig.clf()  # refresh for next section

    print("Doing three positions analysis")
    results = get_three_position_effects(counts_table, positions,
                                         group_label=group_label)
    summary += make_summary(results)

    max_results[3] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "3.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis3")

    fig = get_three_position_fig(results, positions,
                                 plot_config.get('3-way plot', 'figsize'),
                                 group_label=group_label, group_ref=group_ref,
                                 xtick_fontsize=plot_config.get(
                                     '3-way plot', 'xtick_fontsize'),
                                 ytick_fontsize=plot_config.get('3-way plot', 'ytick_fontsize'))
    fig.set_figwidth(plot_config.get('3-way plot', 'figwidth'))
    x_fsz = plot_config.get('3-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('3-way plot', 'ylabel_fontsize')
    fig.text(0.5, plot_config.get('3-way plot', 'xlabel_pad'), 'Position',
             ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get('3-way plot', 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('3-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "3.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
    fig.clf()  # refresh for next section

    print("Doing four positions analysis")
    results = get_four_position_effects(counts_table, positions,
                                        group_label=group_label)
    summary += make_summary(results)

    max_results[4] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        outfilename = os.path.join(outpath, "4.json")
        util.dump_loglin_stats(results, outfilename)
        LOGGER.output_file(outfilename, label="analysis4")

    fig = get_four_position_fig(results, positions,
                                plot_config.get('4-way plot', 'figsize'),
                                group_label=group_label, group_ref=group_ref)
    fig.set_figwidth(plot_config.get('4-way plot', 'figwidth'))
    # here the axis labels are set on the axes rather than as figure text
    ax = fig.gca()
    x_fsz = plot_config.get('4-way plot', 'xlabel_fontsize')
    y_fsz = plot_config.get('4-way plot', 'ylabel_fontsize')
    ax.set_xlabel('Position', fontsize=x_fsz)
    ax.set_ylabel('RE', fontsize=y_fsz)
    format_offset(fig, int(plot_config.get('4-way plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "4.pdf")
        fig.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)
    fig.clf()  # refresh for next section

    # now generate summary plot
    bar_width = 0.5
    index = numpy.arange(4)
    y_lim = max(max_results.values())
    # formatter is given the order of magnitude of the largest value
    y_fmt = util.FixedOrderFormatter(numpy.floor(numpy.log10(y_lim)))

    fig = pyplot.figure(figsize=plot_config.get('summary plot', 'figsize'))
    ax = fig.gca()
    ax.yaxis.set_major_formatter(y_fmt)

    # one bar per effect order, 1 to 4
    bar = pyplot.bar(index, [max_results[i] for i in range(1, 5)], bar_width)
    pyplot.xticks(index + (bar_width / 2.), list(range(1, 5)),
                  fontsize=plot_config.get('summary plot', 'xtick_fontsize'))
    x_sz = plot_config.get('summary plot', 'xlabel_fontsize')
    y_sz = plot_config.get('summary plot', 'ylabel_fontsize')
    ax.set_xlabel("Effect Order", fontsize=x_sz)
    ax.set_ylabel("RE$_{max}$", fontsize=y_sz)

    # x_sz / y_sz are reused, now holding the tick label sizes
    x_sz = plot_config.get('summary plot', 'xtick_fontsize')
    y_sz = plot_config.get('summary plot', 'ytick_fontsize')
    ax.tick_params(axis='x', labelsize=x_sz, pad=x_sz // 2, length=0)
    ax.tick_params(axis='y', labelsize=y_sz, pad=y_sz // 2)
    format_offset(fig, int(plot_config.get('summary plot',
                                           'ytick_fontsize') * .8))
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.pdf")
        pyplot.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    summary = make_table(header=['Position', 'RE', 'Deviance', 'df',
                                 'prob', 'formula'],
                         rows=summary, digits=2, space=2)
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.txt")
        summary.write(outfilename, sep='\t')
        LOGGER.output_file(outfilename, label="summary")

    print(summary)
    pyplot.close('all')
    msg = "Done! Check %s for your results" % outpath
    return msg
def nbr(countsfile, outpath, countsfile2, first_order, strand_symmetry, group_label, group_ref, plot_cfg, no_type3, format, verbose, dry_run):
    '''log-linear analysis of neighbouring base influence on point mutation

    Writes estimated statistics, figures and a run log to the specified
    directory outpath.

    See documentation for count table format requirements.
    '''
    if no_type3:
        util.exclude_type3_fonts()

    # captured before any local is created so only the arguments are logged
    args = locals()

    outpath = util.abspath(outpath)

    if not dry_run:
        util.makedirs(outpath)
        runlog_path = os.path.join(outpath, "analysis.log")
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')

    counts_filename = util.abspath(countsfile)
    counts_table = util.load_table_from_delimited_file(counts_filename,
                                                       sep='\t')

    LOGGER.input_file(counts_filename, label="countsfile1_path")

    # position columns are identified by their 'pos' prefix
    positions = [c for c in counts_table.header if c.startswith('pos')]
    if not first_order and len(positions) != 4:
        raise ValueError("Requires four positions for analysis")

    # normalise falsy values (e.g. empty strings) to None
    group_label = group_label or None
    group_ref = group_ref or None
    if strand_symmetry:
        group_label = 'strand'
        group_ref = group_ref or '+'
        if group_label not in counts_table.header:
            print("ERROR: no column named 'strand', exiting.")
            # NOTE(review): exit is the interactive helper added by the site
            # module, sys.exit is the usual choice in a program
            exit(-1)

    if countsfile2:
        print("Performing 2 group analysis")
        group_label = group_label or 'group'
        group_ref = group_ref or '1'
        # rows of the first file are labelled '1', those of the second '2'
        counts_table1 = counts_table.with_new_column(group_label,
                                                     lambda x: '1',
                                                     columns=counts_table.header[0])

        fn2 = util.abspath(countsfile2)
        counts_table2 = util.load_table_from_delimited_file(fn2, sep='\t')

        LOGGER.input_file(fn2, label="countsfile2_path")

        counts_table2 = counts_table2.with_new_column(group_label,
                                                      lambda x: '2',
                                                      columns=counts_table2.header[0])
        # now combine
        header = [group_label] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

        if not dry_run:
            outfile = os.path.join(outpath, 'group_counts_table.txt')
            counts_table.write(outfile, sep='\t')
            LOGGER.output_file(outfile, label="group_counts")

    if dry_run or verbose:
        print()
        print(counts_table)
        print()

    plot_config = util.get_plot_configs(cfg_path=plot_cfg)

    msg = single_group(counts_table, outpath, group_label, group_ref,
                       positions, plot_config, first_order,
                       dry_run)
    print(msg)
def get_syntenic_alignments_introns(
    compara,
    ref_genes,
    outpath,
    method_clade_id,
    mask_features,
    outdir,
    force_overwrite,
    test,
):
    """writes Ensembl `method` syntenic alignments to ref_genes

    For each id in ref_genes, the syntenic regions of the gene's canonical
    transcript are fetched from compara. The alignments of regions that cover
    exactly compara.species are joined, with a single column of N between
    them, and written as gzipped fasta to "<stableid>.fa.gz" in outpath, or
    printed when test is true. A "metadata.tsv" table of gene id and location
    is written to outpath when at least one gene was counted as written.

    NOTE(review): outdir is not used in this function.
    """
    # Counter so the comparison with each region's species set ignores order
    species = Counter(compara.species)
    common_names = list(map(Species.get_common_name, compara.species))
    # a one column alignment of N for every species, used as a separator
    filler = make_aligned_seqs(
        data=[(n, "N") for n in common_names], moltype=DNA, array_align=False
    )

    written = 0
    records = []
    with click.progressbar(ref_genes, label="Finding 1to1 intron orthologs") as ids:
        for gene_id in ids:
            valid_locations = True
            # species -> union of the member locations over all regions
            locations = {}
            gene = _get_gene_from_compara(compara, gene_id)
            if not gene:
                LOGGER.log_message("stableid '%s' not found" % gene_id)
                continue

            if gene.canonical_transcript.introns is None:
                LOGGER.log_message("stableid '%s' has no introns" % gene_id)
                continue

            outfile_name = os.path.join(outpath, "%s.fa.gz" % gene.stableid)
            if os.path.exists(outfile_name) and not force_overwrite:
                # already done in a previous run
                written += 1
                continue

            regions = list(
                compara.get_syntenic_regions(
                    region=gene.canonical_transcript,
                    method_clade_id=str(method_clade_id),
                )
            )
            alignments = []
            for index, region in enumerate(regions):
                if region is None:
                    msg = "stableid '%s' has no syntenic regions" % gene_id
                    LOGGER.log_message(msg)
                    continue

                try:
                    got = Counter(region.get_species_set())
                except (AttributeError, AssertionError):
                    got = None
                    # this is a PyCogent bug
                    error = sys.exc_info()
                    # reduce the repr of the exception class to its bare name
                    err_type = str(error[0]).split(".")[-1][:-2]
                    err_msg = str(error[1])
                    msg = "gene_stable_id=%s; err_type=%s; msg=%s" % (
                        gene.stableid,
                        err_type,
                        err_msg,
                    )
                    click.secho("ERROR:" + msg, fg="red")
                    LOGGER.log_message(msg, label="ERROR")
                    continue

                # the region must cover exactly the expected species
                if got != species:
                    msg = [
                        "stableid '%s'" % gene_id,
                        "species set %s" % got,
                        "does not match expected %s" % species,
                    ]
                    LOGGER.log_message(" ".join(msg))
                    continue

                if mask_features:
                    aln = region.get_alignment(feature_types=["gene", "repeat", "cpg"])
                    aln = with_masked_features(aln, reverse=gene.location.strand == -1)
                else:
                    aln = region.get_alignment()

                if aln is None:
                    msg = "stableid '%s' has no syntenic alignment" % gene_id
                    LOGGER.log_message(msg)
                    continue

                aln = renamed_seqs(aln)
                if aln is not None:
                    alignments.append(aln)

                # accumulate the per species location across regions
                for m in region.members:
                    if m.location is None:
                        valid_locations = False
                        break

                    if m.genome.species not in locations:
                        union = m.location
                    elif m.genome.species in locations:
                        try:
                            union = locations[m.genome.species].union(m.location)
                        except AttributeError:
                            raise AttributeError("%s" % str([gene_id, m.genome]))

                    # a None union marks the location data as inconsistent
                    if union is None:
                        valid_locations = False
                        break

                    locations[m.genome.species] = union

            if not alignments:
                msg = "stableid '%s' has no alignments" % gene_id
                LOGGER.log_message(msg)
                continue

            if not valid_locations:
                msg = [
                    "stableid '%s' has" % gene_id,
                    "inconsistent location data for gene",
                    "based syntenic block %s" % locations,
                ]
                LOGGER.log_message(" ".join(msg), label="WARN")
                continue

            assert len(locations) == len(species), locations
            for sp, loc in locations.items():
                records.append([gene_id, loc])

            # we put a column of Ns between syntenic regions so that subsequent
            # sampling for tuple aligned columns does not construct artificial
            # motifs
            align = None
            for aln in alignments:
                if align is None:
                    align = aln
                    continue

                align += filler + aln

            if test:
                print(repr(align))
            else:
                with gzip.open(outfile_name, "wt") as outfile:
                    outfile.write(align.to_fasta())
                LOGGER.output_file(outfile_name)

            written += 1

    click.secho("Wrote %d files to %s" % (written, outpath), fg="green")
    if written > 0:
        metadata = make_table(header=["refid", "location"], rows=records)
        metadata.write(os.path.join(outpath, "metadata.tsv"))

    return