def plot_rmsd(nucs, structure, cols):
    """Plot per-chromosome RMSD curves for one or more files.

    Every file in ``nucs`` is opened read-only with ``HDFFile`` and passed to
    the module-level ``rmsd(f, structure)``, which yields
    ``(chromosome, positions, rmsds)`` triples.  The curves are grouped by
    chromosome; each chromosome gets its own axis (in sorted chromosome
    order) and each file contributes one line to that axis.

    Args:
        nucs: Iterable of files accepted by ``HDFFile``.
        structure: Structure identifier forwarded to ``rmsd``.
        cols: Number of columns in the subplot grid.  The number of rows is
            ``ceil_div(len(chromosomes), cols)``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt
    from .util import ceil_div
    from collections import defaultdict

    # chromosome -> list of (positions, rmsds), one entry per input file.
    rmsdss = defaultdict(list)
    for nuc in nucs:
        with HDFFile(nuc, "r") as f:
            for chromosome, pos, rmsds in rmsd(f, structure):
                rmsdss[chromosome].append((pos, rmsds))

    # squeeze=False makes matplotlib always return a 2-D (rows, cols) array.
    # Without it a single-row or single-column grid comes back 1-D (and a 1x1
    # grid as a bare Axes), which broke the row-wise iteration below.
    fig, axs = plt.subplots(
        ceil_div(len(rmsdss), cols), cols,
        sharex=True, sharey=True, squeeze=False,
    )

    # axs.flat walks the grid row by row.
    for ax, (chromosome, data) in zip(axs.flat, sorted(rmsdss.items())):
        for poss, rmsds in data:
            ax.plot(poss, rmsds)
        ax.set_ylabel('\n'.join(("RMSD", chromosome)))

    # The x axis is shared, so only the bottom row carries the label.
    for ax in axs[-1]:
        ax.set_xlabel("Genome Position (bp)")
    plt.show()
def plot_stats(nucs, structure, param, violation_padding):
    """Plot summary statistics of each file against a calculation parameter.

    For every file in ``nucs`` the attribute ``param`` is read from
    ``structures/<structure>/calculation`` and the module-level functions
    ``scale`` and ``violations`` (looked up by name through ``globals()``)
    are evaluated.  One error-bar plot per statistic is drawn, with the
    parameter value on the x axis, the mean of the statistic on the y axis
    and its standard deviation as the error bar.

    Args:
        nucs: Sequence of files accepted by ``HDFFile`` (``len`` is taken).
        structure: Structure identifier inside each file.
        param: Name of the calculation attribute used as the x axis.
        violation_padding: Forwarded to ``violations`` as ``padding``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt

    stat_names = {"scale": {}, "violations": {'padding': violation_padding}}
    # statistic name -> (n_files, 3) array of [param value, mean, std] rows.
    stats = defaultdict(lambda: np.empty((len(nucs), 3), dtype='float'))
    fig, axs = plt.subplots(len(stat_names), 1)

    # Open each file once and evaluate every statistic on it.
    for i, nuc in enumerate(nucs):
        with HDFFile(nuc, "r") as f:
            param_value = f['structures'][structure]['calculation'].attrs[
                param]
            # "particle_sizes" is stored as a sequence; use its last entry,
            # as the CSV-writing ``stats`` function does.  The check is on
            # the parameter *name*, not on the value that was read.
            if param == "particle_sizes":
                param_value = param_value[-1]
            for stat, kwargs in stat_names.items():
                stat_values = globals()[stat](f, structure, **kwargs)
                stats[stat][i] = [
                    param_value, np.mean(stat_values), np.std(stat_values)
                ]

    for ax, (stat_name, data) in zip(axs, stats.items()):
        ax.set_ylabel(stat_name)
        ax.set_xlabel(param)
        # Reorder whole rows by parameter value.  Sorting each column on its
        # own would detach the means and deviations from their parameter.
        data = data[np.argsort(data[:, 0], kind='stable')]
        ax.errorbar(data[:, 0], data[:, 1], yerr=data[:, 2])
    fig.tight_layout()
    plt.show()
def linkage(nuc, structure):
    """Return the single-linkage clustering of the coordinates in a file.

    All datasets under ``structures/<structure>/coords`` are joined along
    axis 1, every entry along the first axis is flattened into one row, and
    the rows are clustered with ``hierarchy.linkage`` using the
    ``'single'`` method and the module-level ``distance`` as metric.

    Args:
        nuc: File accepted by ``HDFFile`` (opened read-only).
        structure: Structure identifier inside the file.

    Returns:
        Whatever ``hierarchy.linkage`` returns for the flattened rows.
    """
    with HDFFile(nuc, "r") as handle:
        coord_group = handle['structures'][structure]['coords']
        stacked = np.concatenate(list(coord_group.values()), axis=1)

    # One row per first-axis entry; all remaining axes are collapsed.
    n_rows = stacked.shape[0]
    flattened = stacked.reshape(n_rows, -1)
    return hierarchy.linkage(flattened, method='single', metric=distance)
def align(nuc, target, structure, mirror=True):
    """Align the coordinates stored in a file to a reference, in place.

    The datasets under ``structures/<structure>/coords`` are joined along
    axis 1.  The reference is either the element-wise median over axis 0
    (``target == 'median'``) or the entry at index ``int(target)``.  One
    transform per first-axis entry is obtained from the module-level
    ``least_squares(ref, entry, mirror=mirror)``, and each dataset is then
    overwritten with its entries matrix-multiplied by those transforms.

    Args:
        nuc: File accepted by ``HDFFile`` (opened in ``"r+"`` mode).
        target: ``'median'`` or something convertible to an integer index.
        structure: Structure identifier inside the file.
        mirror: Forwarded to ``least_squares``.
    """
    with HDFFile(nuc, "r+") as handle:
        coord_group = handle['structures'][structure]['coords']
        stacked = np.concatenate(list(coord_group.values()), axis=1)

        reference = (
            np.median(stacked, axis=0)
            if target == 'median'
            else stacked[int(target)]
        )

        fit = partial(least_squares, reference, mirror=mirror)
        transforms = [fit(entry) for entry in stacked]

        # Apply the i-th transform to the i-th entry of every dataset and
        # write the result back with the dataset's own dtype.
        for dataset in coord_group.values():
            aligned = [
                op.matmul(xform, entry)
                for xform, entry in zip(transforms, dataset)
            ]
            dataset[:] = np.array(aligned, dtype=dataset.dtype)
def output_rmsd(nucs, structure, position):
    """Print the largest RMSD found at each position shared by all files.

    Every file in ``nucs`` is opened read-only and passed to the
    module-level ``rmsd(f, structure)``, which yields
    ``(chromosome, positions, rmsds)`` triples.  Only ``(chromosome,
    position)`` pairs present in every file are reported.  For each one the
    maximum RMSD across the files is printed as ``"<chromosome>:<position>
    <rmsd>"``.

    Args:
        nucs: Iterable of files accepted by ``HDFFile``.  When it is empty
            nothing is printed.
        structure: Structure identifier forwarded to ``rmsd``.
        position: Optional iterable of ``(chromosome, position)`` pairs.  If
            truthy, only those pairs that are shared by all files are
            printed, in the order given; otherwise every shared pair is
            printed in sorted order.
    """
    # One {(chromosome, position): rmsd} mapping per input file.
    nucs_rmsds = []
    for nuc in nucs:
        with HDFFile(nuc, "r") as f:
            nuc_rmsds = {}
            for chromo, positions, rmsds in rmsd(f, structure):
                for pos, value in zip(positions, rmsds):
                    nuc_rmsds[chromo, pos] = value
            nucs_rmsds.append(nuc_rmsds)

    # set.intersection() needs at least one operand; with no input files
    # there is nothing to report.
    if not nucs_rmsds:
        return

    conserved = set.intersection(*map(set, nucs_rmsds))

    if position:
        selected = (key for key in position if key in conserved)
    else:
        selected = sorted(conserved)

    for chromo, pos in selected:
        worst = max(per_file[chromo, pos] for per_file in nucs_rmsds)
        print("{}:{} {}".format(chromo, pos, worst))
def test_rmsd_cli(tmpdir):
    """Check the output of the ``rmsd`` CLI command on a fixture file.

    Two files are written from the module-level ``nucs`` fixtures
    (coordinates and particle positions of structure ``0``); the command is
    then invoked and its output compared with the expected lines.
    """
    from nuc_analyze.main import cli

    paths = [tmpdir.join("test1.nuc"), tmpdir.join("test2.nuc")]
    for path, fixture in zip(paths, nucs):
        with HDFFile(path, 'w') as handle:
            coord_items = sorted(fixture['structures']['0']['coords'].items())
            for chromo, coords in coord_items:
                name = 'structures/0/coords/{}'.format(chromo)
                handle.create_dataset(name, data=coords)

            particles = fixture['structures']['0']['particles']
            for chromo, particle in particles.items():
                name = 'structures/0/particles/{}/positions'.format(chromo)
                handle.create_dataset(name, data=particle['positions'])

    # NOTE(review): only the last file written in the loop above is passed
    # to the CLI -- confirm whether both files were meant to be given.
    result = CliRunner().invoke(cli, ["rmsd", str(path)])

    expected_lines = ["1:10 0.0", "1:200 0.5", "X:100 2.0"]
    assert result.exit_code == 0
    assert result.output == '\n'.join(expected_lines) + '\n'
def test_csv(tmpdir):
    """Check the CSV produced by the ``stats`` CLI command.

    A file is written from the module-level ``nuc`` fixture (coordinates,
    flattened restraints and calculation attributes of structure ``0``).
    The command is run with ``--param foo`` and the header row and first
    data row are compared with the expected values.
    """
    from nuc_analyze.main import cli
    from nuc_analyze.stats import flatten_dict
    from math import sqrt

    path = tmpdir.join("test.nuc")
    with HDFFile(path, 'w') as handle:
        for chromo, coords in nuc['structures']['0']['coords'].items():
            name = 'structures/0/coords/{}'.format(chromo)
            handle.create_dataset(name, data=coords)

        restraint_map = flatten_dict(nuc['structures']['0']['restraints'])
        for (chr_a, chr_b), restraints in restraint_map.items():
            name = 'structures/0/restraints/{}/{}'.format(chr_a, chr_b)
            handle.create_dataset(name, data=restraints)

        calc_group = handle.create_group('structures/0/calculation')
        calc_attrs = nuc['structures']['0']['calculation']['attrs']
        for key, value in calc_attrs.items():
            calc_group.attrs[key] = value

    scales = [0.0, sqrt(2) / 2]
    violations = [1, 4]
    expected_row = [
        'test.nuc',
        np.mean(scales),
        np.std(scales),
        np.mean(violations),
        np.std(violations),
        1,
    ]
    expected_header = (
        'filename,scale_mean,scale_std,violations_mean,violations_std,foo'
    )

    result = CliRunner().invoke(cli, ["stats", str(path), "--param", "foo"])
    assert result.exit_code == 0

    lines = iter(result.output.splitlines())
    header = next(lines)
    assert header == expected_header
    first_row = next(lines)
    assert first_row == ','.join(map(str, expected_row))
def stats(nucs, structure, param, violation_padding):
    """Write per-file summary statistics to standard output as CSV.

    The header is ``filename``, then ``<stat>_mean`` / ``<stat>_std`` for
    each statistic in sorted name order (``scale``, ``violations``), then
    the requested parameter names.  One row is written per file.  The
    statistics are computed by the module-level functions of the same name,
    looked up through ``globals()``.

    Args:
        nucs: Iterable of files accepted by ``HDFFile``; each must expose a
            ``name`` attribute, which is written to the ``filename`` column.
        structure: Structure identifier inside each file.
        param: Iterable of attribute names read from
            ``structures/<structure>/calculation``.  For ``particle_sizes``
            only the last element of the stored value is written.
        violation_padding: Forwarded to ``violations`` as ``padding``.
    """
    import csv
    from sys import stdout

    stat_names = {"scale": {}, "violations": {'padding': violation_padding}}
    ordered_stats = sorted(stat_names.items())

    stat_cols = []
    for name, _ in ordered_stats:
        stat_cols.append("{}_mean".format(name))
        stat_cols.append("{}_std".format(name))

    fieldnames = ["filename"]
    fieldnames.extend(stat_cols)
    fieldnames.extend(param)

    writer = csv.DictWriter(stdout, fieldnames)
    writer.writeheader()

    for nuc in nucs:
        with HDFFile(nuc, "r") as handle:
            attrs = handle['structures'][structure]['calculation'].attrs
            row = {key: attrs[key] for key in param}
            if "particle_sizes" in row:
                row["particle_sizes"] = row["particle_sizes"][-1]
            row["filename"] = str(nuc.name)

            for name, kwargs in ordered_stats:
                values = globals()[name](handle, structure, **kwargs)
                row["{}_mean".format(name)] = np.mean(values)
                row["{}_std".format(name)] = np.std(values)

            writer.writerow(row)