def count_converge_and_truth_overlap(run_dirs, ground_truth_fp):
  """For each run, count converged pathways that appear in the ground truth set.

  Parameters
  ----------
  run_dirs : list of str
    Output directories of nmf_pathway.py runs; each should contain 'obj.txt'
  ground_truth_fp : str
    File listing the filepaths of the ground-truth pathway files, one per line
    (only basenames are compared)

  Returns
  -------
  counts : list of int
    One entry per run that produced an 'obj.txt'; runs with a missing
    objective file are skipped with a warning on stderr
  """
  # basenames of ground-truth pathway files, for O(1) membership tests
  with open(ground_truth_fp, 'r') as fh:
    truth_basenames = set(os.path.basename(ln.rstrip()) for ln in fh)

  counts = []
  for run_dir in run_dirs:
    obj_path = os.path.join(run_dir, 'obj.txt')
    if not os.path.exists(obj_path):
      sys.stderr.write(
        "[warning] Missing objective output for run with output directory {}\n"
        .format(os.path.basename(run_dir)))
      continue
    converge = prmf.parse_pathway_obj(obj_path)
    if converge is None:
      # mirror the original guard: only runs with parsed output contribute
      continue
    overlap = sum(1 for converge_pathway in converge.values()
                  if os.path.basename(converge_pathway) in truth_basenames)
    counts.append(overlap)
  return counts
def main():
  """Summarize how often each pathway appears in converged nmf_pathway solutions.

  Scans <indir> for run\\d+ subdirectories, tallies pathway basenames across
  runs, writes 'convergence_hist.csv' and 'convergence_hist.png' to <outdir>.
  """
  parser = argparse.ArgumentParser(description="""
Create a histogram of converged pathways
""")
  parser.add_argument('--indir', help='Directory containing nmf_pathway.py outputs for multiple different runs, each with a different initialization', required=True)
  parser.add_argument('--outdir', help='Directory to place image file', required=True)
  args = parser.parse_args()

  # runs are stored as <indir>/run\d+/obj.txt
  run_regexp = re.compile(r'run\d+')
  run_dirs = [os.path.join(args.indir, fname)
              for fname in os.listdir(args.indir)
              if run_regexp.match(fname) is not None]
  N = len(run_dirs)

  # tally the number of runs each pathway (by basename) converged to
  pathway_to_count = {}
  for run_dir in run_dirs:
    pathways_dict = prmf.parse_pathway_obj(os.path.join(run_dir, 'obj.txt'))
    for fp in pathways_dict.values():
      bn = os.path.basename(fp)
      pathway_to_count[bn] = pathway_to_count.get(bn, 0) + 1

  # most frequently converged-to pathways first
  pathway_count_pairs = sorted(pathway_to_count.items(), key=lambda x: x[1], reverse=True)
  hist_csv_fp = os.path.join(args.outdir, 'convergence_hist.csv')
  with open(hist_csv_fp, 'w') as hist_csv_fh:
    for pathway, count in pathway_count_pairs:
      hist_csv_fh.write("{},{}\n".format(pathway, count))

  # convert counts to per-run frequencies
  pathway_to_freq = {}
  for pathway, count in pathway_to_count.items():
    pathway_to_freq[pathway] = count / N

  # histogram for converged pathways - {{
  # frequencies plotted as percentages, binned in 10% increments
  freqs_per = np.array(list(pathway_to_freq.values())) * 100
  plt.clf()
  n, bins, patches = plt.hist(freqs_per, bins=np.arange(0, 110, 10))
  hist_fp = os.path.join(args.outdir, 'convergence_hist.png')
  plt.xlabel('Pathway Frequency')
  plt.ylabel('Frequency')
  plt.title('Stability of Converged Pathways')
  plt.savefig(hist_fp)
def eval_prmf_runs(pathways_files, prmf_obj_files):
  """For each PRMF run, count latent factors whose converged pathway has an
  integer identifier below K_LATENT.

  Parameters
  ----------
  pathways_files : list
    Must have the same length as prmf_obj_files (used only for validation here)
  prmf_obj_files : list of str
    One 'obj.txt'-style filepath per run

  Returns
  -------
  vals : numpy.ndarray
    Shape (n_runs,); vals[i] is the count for run i

  Raises
  ------
  Exception
    If the two input lists differ in length
  """
  n_runs = len(prmf_obj_files)
  if len(pathways_files) != n_runs:
    raise Exception("len(pathways_files) = {} != {} = len(prmf_obj_files)".format(len(pathways_files), len(prmf_obj_files)))
  vals = np.zeros((n_runs,))
  for i, prmf_obj_file in enumerate(prmf_obj_files):
    latent_to_pathway = prmf.parse_pathway_obj(prmf_obj_file)
    for pathway_fp in latent_to_pathway.values():
      if parse_pathway_int(pathway_fp) < K_LATENT:
        vals[i] += 1
  return vals
def count_initialization_and_convergence_overlap(run_dirs):
  """
  For each run in <run_dirs>, count how many of the pathways used for
  initialization exist in the set of pathways that the method converged to

  Returns
  -------
  counts : list of int
    One entry per run that produced both 'obj.txt' and 'init_pathways.txt';
    incomplete runs are skipped with a warning on stderr
  k : int or None
    Number of initialization pathways of the first complete run
    (None when no run is complete)
  """
  counts = []
  k = None
  for run_dir in run_dirs:
    obj_path = os.path.join(run_dir, 'obj.txt')
    init_path = os.path.join(run_dir, 'init_pathways.txt')

    converge = None
    if os.path.exists(obj_path):
      converge = prmf.parse_pathway_obj(obj_path)
    else:
      sys.stderr.write(
        "[warning] Missing objective output for run with output directory {}\n"
        .format(os.path.basename(run_dir)))

    init = None
    if os.path.exists(init_path):
      init = prmf.parse_init(init_path)
    else:
      sys.stderr.write(
        "[warning] Missing initialization output for run with output directory {}\n"
        .format(os.path.basename(run_dir)))

    # only complete runs contribute to the histogram
    if init is None or converge is None:
      continue
    converge_set = set(converge.values())
    counts.append(sum(1 for init_pathway in init if init_pathway in converge_set))
    if k is None:
      k = len(init)
  return counts, k
# Script body: extend converged pathways with high-scoring PPI neighbors.
# NOTE(review): this chunk continues a `parser` (and presumably `--nodelist` /
# `--gene-by-latent` arguments) defined above this view — confirm against the
# full file.
parser.add_argument('--opt-outfile', required=True)
parser.add_argument('--ppi-network', help="PPI and pathway union graph stored as graphml", required=True)
parser.add_argument('--latent', default=None, help="If provided, only run script on this latent factor", type=int)
parser.add_argument('--outdir', required=True)
args = parser.parse_args()

nodelist = prmf.parse_nodelist(open(args.nodelist))
ppi_network = nx.read_graphml(args.ppi_network)
# genes x latent-factors matrix; column k scores genes for latent factor k
gene_by_latent = np.genfromtxt(args.gene_by_latent, delimiter=",")
# mapping of latent factor index -> converged pathway filepath
k_to_pathway_fp = prmf.parse_pathway_obj(args.opt_outfile)
if (args.latent is not None):
  # restrict analysis to the single requested latent factor
  k_to_pathway_fp = {args.latent: k_to_pathway_fp[args.latent]}

ofp = os.path.join(args.outdir, 'pathway_extension.out')
# NOTE(review): `ofh` is not closed in this chunk — presumably closed (or
# written further) below this view; verify.
ofh = open(ofp, 'w')
for k, fp in k_to_pathway_fp.items():
  pathway = nx.read_graphml(fp)
  vec = gene_by_latent[:, k]
  # score then filter neighbor nodes of the pathway in the PPI network
  # (helpers defined elsewhere in this file)
  node_to_score = score_pathway_neighbors(ppi_network, pathway, nodelist, vec)
  node_to_score = filter_pathway_neighbors(vec, node_to_score)
  # write the pathway's basename (sans extension) as a section header
  bn = os.path.basename(fp)
  bn, ext = os.path.splitext(bn)
  ofh.write(bn + '\n')
# Script body: visualize latent factors against their converged pathway graphs.
# NOTE(review): this chunk continues a `parser` defined above this view and is
# truncated after `fig_width` — the drawing code presumably follows.
parser.add_argument("--pathway-mat")
parser.add_argument("--pathway-obj")
parser.add_argument("--nodelist")
# NOTE(review): argparse `type=bool` is a known pitfall — bool("False") is
# True, so any non-empty value enables truncation; consider
# `action='store_true'` (would change the CLI, so flagged rather than fixed).
parser.add_argument("--truncate", default=False, type=bool, help="If True, truncate graph down to 50 nodes for visualization")
parser.add_argument("--mapping-file", help="Node identifier mapping")
parser.add_argument("--outdir")
args = parser.parse_args()

pathway_mat = np.genfromtxt(args.pathway_mat, delimiter=",")
if (len(pathway_mat.shape) == 1):
  # a single-column input parses as 1-D; promote to a (n, 1) matrix
  pathway_mat = pathway_mat.reshape(pathway_mat.shape[0], 1)
# latent factor index -> converged pathway filepath
latent_to_fp = prmf.parse_pathway_obj(args.pathway_obj)
latent_to_G = {}
for k, fp in latent_to_fp.items():
  latent_to_G[k] = nx.read_graphml(fp)
nodelist = prmf.parse_nodelist(open(args.nodelist))
mapping = parse_mapping_file(args.mapping_file)
# node identifier -> row index in pathway_mat / nodelist order
node_to_ind = {}
for i, node in enumerate(nodelist):
  node_to_ind[node] = i

for k, fp in latent_to_fp.items():
  G = latent_to_G[k]
  G = G.to_undirected()
  fig_width = 1200
def main():
  """Analyze stability of nmf_pathway runs: print per-(latent, pathway) convergence
  counts and write histograms comparing initialization, convergence, and (optionally)
  ground-truth pathway sets to <outdir>."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--indir', help='Directory containing nmf_pathway.py outputs for multiple different runs, each with a different initialization', required=True)
  parser.add_argument('--ground-truth', help='File containing filepaths to the pathway files used as the basis of a simulation')
  parser.add_argument('--outdir', help='Directory to place image files', required=True)
  args = parser.parse_args()

  # <indir>/run\d+/obj.txt
  run_regexp = re.compile(r'run\d+')
  run_dirs = []
  for fname in os.listdir(args.indir):
    if run_regexp.match(fname) is not None:
      run_dirs.append(os.path.join(args.indir, fname))
  N = len(run_dirs)

  # count number of converged pathways
  # TODO cli
  # NOTE(review): keys here are (latent_index, pathway_fp) tuples from items(),
  # so the same pathway under different latent indexes is counted separately —
  # possibly intended, but differs from basename-level counting elsewhere; verify.
  all_dict = {}
  for run_dir in run_dirs:
    pathways_dict = prmf.parse_pathway_obj(os.path.join(run_dir, 'obj.txt'))
    pathways = pathways_dict.items()
    for pathway in pathways:
      if pathway in all_dict:
        all_dict[pathway] += 1
      else:
        all_dict[pathway] = 1
  # most frequent first, one tab-separated line per (latent, pathway) pair
  pathway_count_pairs = sorted(all_dict.items(), key=lambda x: x[1], reverse=True)
  for pathway, count in pathway_count_pairs:
    print('{}\t{}'.format(pathway, count))

  # histogram for initialization/convergence - {{
  # make a histogram of counts
  # k = number of initialization pathways per run
  counts, k = count_initialization_and_convergence_overlap(run_dirs)
  plt.clf()
  # NOTE(review): k is None when no run had both obj.txt and init_pathways.txt,
  # which makes `k + 1` raise TypeError here — confirm whether that's acceptable.
  n_bins = k + 1
  # bins centered on the integers 0..k
  n, bins, patches = plt.hist(counts, bins=np.arange(n_bins + 1) - 0.5, color='red')
  initial_converged_fp = os.path.join(args.outdir, 'initialization_convergence_hist.png')
  plt.xlabel('Number persisting')
  plt.ylabel('Frequency')
  plt.title('Initial pathways persisting in converged set (N = {})'.format(N))
  plt.savefig(initial_converged_fp)
  # }} -

  # histogram for initialization/ground truth - {{
  if args.ground_truth is not None:
    counts = count_initialization_and_truth_overlap(run_dirs, args.ground_truth)
    plt.clf()
    n_bins = k + 1
    n, bins, patches = plt.hist(counts, bins=np.arange(n_bins + 1) - 0.5, color='red')
    initial_converged_fp = os.path.join(args.outdir, 'initialization_truth_hist.png')
    plt.xlabel('Number existing')
    plt.ylabel('Frequency')
    plt.title('Initial pathways existing in ground truth set (N = {})'.format(N))
    plt.savefig(initial_converged_fp)
  # }} -

  # histogram for ground truth/convergence - {{
  if args.ground_truth is not None:
    counts = count_converge_and_truth_overlap(run_dirs, args.ground_truth)
    plt.clf()
    n_bins = k + 1
    n, bins, patches = plt.hist(counts, bins=np.arange(n_bins + 1) - 0.5, color='red')
    initial_converged_fp = os.path.join(args.outdir, 'convergence_truth_hist.png')
    plt.xlabel('KEGG pathways recovered')
    plt.ylabel('Frequency')
    plt.title('Pathway recovery using simulated data (N = {})'.format(N))
    plt.savefig(initial_converged_fp)
# Script body: measure smoothness of V's latent factors over their converged
# pathway graphs and report statistics.
# NOTE(review): this chunk continues a `parser` defined above this view;
# `measure_smoothness` / `report_smoothness` are defined elsewhere in the file.
parser.add_argument("--indir", help="The output directory of nmf_pathway", required=True)
parser.add_argument("--nodelist", help="Node to index association used by the nmf_pathway run", required=True)
parser.add_argument("--outdir", help="Directory to place histogram and csv", required=True)
args = parser.parse_args()

# V is the gene-by-latent factor matrix written by nmf_pathway
V_fp = os.path.join(args.indir, 'V.csv')
V = np.genfromtxt(V_fp, delimiter=",")
n_genes, k_latent = V.shape

# latent factor index -> converged pathway filepath
obj_fp = os.path.join(args.indir, 'obj.txt')
latent_to_pathway_fp = prmf.parse_pathway_obj(obj_fp)
# one undirected pathway graph per latent factor, in latent-index order
Gs = []
for k in range(k_latent):
  pathway_fp = latent_to_pathway_fp[k]
  G = nx.read_graphml(pathway_fp).to_undirected()
  Gs.append(G)

nodelist = prmf.parse_nodelist(open(args.nodelist))
latent_edge_diffs = measure_smoothness(V, Gs, nodelist)
# report at three granularities: per-edge, per-component, and overall
report_smoothness(latent_edge_diffs, 'edge')
report_smoothness(latent_edge_diffs, 'comp')
report_smoothness(latent_edge_diffs, 'all')