Example #1
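# Note: these snippets assume the imports used by the original scripts:
# os, sys, re, argparse, numpy as np, networkx as nx, matplotlib.pyplot as plt, and prmf.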
def count_converge_and_truth_overlap(run_dirs, ground_truth_fp):
    counts = []

    truth_set = set()
    with open(ground_truth_fp, 'r') as fh:
        for line in fh:
            line = line.rstrip()
            truth_set.add(os.path.basename(line))

    for run_dir in run_dirs:
        converge = None

        obj_path = os.path.join(run_dir, 'obj.txt')
        if os.path.exists(obj_path):
            converge = prmf.parse_pathway_obj(obj_path)
        else:
            sys.stderr.write(
                "[warning] Missing objective output for run with output directory {}\n"
                .format(os.path.basename(run_dir)))

        if converge is not None:
            # then include results in histogram
            # count number of converged pathways which exist in the ground truth
            count = 0
            for converge_pathway in converge.values():
                if os.path.basename(converge_pathway) in truth_set:
                    count += 1
            counts.append(count)
    return counts
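A hypothetical invocation, for orientation only; the directory and ground-truth filename below are illustrative, following the <indir>/run<N>/obj.txt layout used in the later examples, with the ground-truth file listing one pathway filepath per line:

import glob
import os

# illustrative paths, not taken from the original scripts
run_dirs = sorted(glob.glob(os.path.join('sim_output', 'run*')))
counts = count_converge_and_truth_overlap(run_dirs, 'ground_truth_pathways.txt')
print('{} runs counted; mean overlap with ground truth: {:.2f}'.format(
    len(counts), sum(counts) / len(counts) if counts else float('nan')))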
Example #2
def main():
    parser = argparse.ArgumentParser(description="""
Create a histogram of converged pathways
""")
    parser.add_argument(
        '--indir',
        help=
        'Directory containing nmf_pathway.py outputs for multiple different runs, each with a different initialization',
        required=True)
    parser.add_argument('--outdir',
                        help='Directory to place image file',
                        required=True)
    args = parser.parse_args()

    # <indir>/run\d+/obj.txt
    run_regexp = re.compile(r'run\d+')
    run_dirs = []
    for fname in os.listdir(args.indir):
        if run_regexp.match(fname) is not None:
            run_dirs.append(os.path.join(args.indir, fname))

    N = len(run_dirs)

    # count number of converged pathways
    pathway_to_count = {}
    for run_dir in run_dirs:
        pathways_dict = prmf.parse_pathway_obj(os.path.join(
            run_dir, 'obj.txt'))
        pathways = pathways_dict.values()
        pathway_basenames = list(map(os.path.basename, pathways))
        for pathway in pathway_basenames:
            if pathway in pathway_to_count:
                pathway_to_count[pathway] += 1
            else:
                pathway_to_count[pathway] = 1
    pathway_count_pairs = sorted(pathway_to_count.items(),
                                 key=lambda x: x[1],
                                 reverse=True)
    hist_csv_fp = os.path.join(args.outdir, 'convergence_hist.csv')
    with open(hist_csv_fp, 'w') as hist_csv_fh:
        for pathway, count in pathway_count_pairs:
            hist_csv_fh.write("{},{}\n".format(pathway, count))

    pathway_to_freq = {}
    for pathway in pathway_to_count.keys():
        pathway_to_freq[pathway] = pathway_to_count[pathway] / N

    # histogram for converged pathways - {{
    freqs = pathway_to_freq.values()
    freqs_per = np.array(list(freqs)) * 100

    plt.clf()
    n, bins, patches = plt.hist(freqs_per, bins=np.arange(0, 110, 10))
    hist_fp = os.path.join(args.outdir, 'convergence_hist.png')
    plt.xlabel('Pathway Frequency (%)')
    plt.ylabel('Frequency')
    plt.title('Stability of Converged Pathways')
    plt.savefig(hist_fp)
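For reference, the per-pathway tally above can also be written with collections.Counter; a behavior-equivalent sketch (the helper name here is ours), assuming the same prmf.parse_pathway_obj output mapping latent factor index to pathway filepath:

import os
from collections import Counter

import prmf


def count_converged_pathways(run_dirs):
    # tally pathway basenames across each run's obj.txt
    counter = Counter()
    for run_dir in run_dirs:
        pathways_dict = prmf.parse_pathway_obj(os.path.join(run_dir, 'obj.txt'))
        counter.update(os.path.basename(fp) for fp in pathways_dict.values())
    return counter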
Example #3
def eval_prmf_runs(pathways_files, prmf_obj_files):
  if len(pathways_files) != len(prmf_obj_files):
    raise Exception("len(pathways_files) = {} != {} = len(prmf_obj_files)".format(
        len(pathways_files), len(prmf_obj_files)))

  # for each run, count the converged pathways whose integer id (parsed from the
  # pathway filepath) falls below K_LATENT
  vals = np.zeros((len(prmf_obj_files),))
  for i, prmf_obj_file in enumerate(prmf_obj_files):
    latent_to_pathway = prmf.parse_pathway_obj(prmf_obj_file)
    for pathway_fp in latent_to_pathway.values():
      pathway_int = parse_pathway_int(pathway_fp)
      if pathway_int < K_LATENT:
        vals[i] += 1
  return vals
Example #4
def count_initialization_and_convergence_overlap(run_dirs):
    """
    For each run in <run_dirs>, count how many of the pathways used for initialization
    exist in the set of pathways that the method converged to.
    """
    k = None
    counts = []
    for run_dir in run_dirs:
        converge = None
        init = None

        obj_path = os.path.join(run_dir, 'obj.txt')
        if os.path.exists(obj_path):
            converge = prmf.parse_pathway_obj(obj_path)
        else:
            sys.stderr.write(
                "[warning] Missing objective output for run with output directory {}\n"
                .format(os.path.basename(run_dir)))

        init_path = os.path.join(run_dir, 'init_pathways.txt')
        if os.path.exists(init_path):
            init = prmf.parse_init(init_path)
        else:
            sys.stderr.write(
                "[warning] Missing initialization output for run with output directory {}\n"
                .format(os.path.basename(run_dir)))

        if init is not None and converge is not None:
            # then include results in histogram
            # count number of init pathways which persist in the converged set
            count = 0
            converge_set = set(converge.values())
            for init_pathway in init:
                if init_pathway in converge_set:
                    count += 1
            counts.append(count)

            if k is None:
                k = len(init)
    return counts, k
Example #5
    parser.add_argument('--opt-outfile', required=True)
    parser.add_argument('--ppi-network',
                        help="PPI and pathway union graph stored as graphml",
                        required=True)
    parser.add_argument(
        '--latent',
        default=None,
        help="If provided, only run script on this latent factor",
        type=int)
    parser.add_argument('--outdir', required=True)
    args = parser.parse_args()

    nodelist = prmf.parse_nodelist(open(args.nodelist))
    ppi_network = nx.read_graphml(args.ppi_network)
    gene_by_latent = np.genfromtxt(args.gene_by_latent, delimiter=",")
    k_to_pathway_fp = prmf.parse_pathway_obj(args.opt_outfile)
    if (args.latent is not None):
        k_to_pathway_fp = {args.latent: k_to_pathway_fp[args.latent]}

    ofp = os.path.join(args.outdir, 'pathway_extension.out')
    ofh = open(ofp, 'w')
    for k, fp in k_to_pathway_fp.items():
        pathway = nx.read_graphml(fp)
        vec = gene_by_latent[:, k]
        node_to_score = score_pathway_neighbors(ppi_network, pathway, nodelist,
                                                vec)
        node_to_score = filter_pathway_neighbors(vec, node_to_score)

        bn = os.path.basename(fp)
        bn, ext = os.path.splitext(bn)
        ofh.write(bn + '\n')
Example #6
    parser.add_argument("--pathway-mat")
    parser.add_argument("--pathway-obj")
    parser.add_argument("--nodelist")
    parser.add_argument(
        "--truncate",
        action='store_true',
        help="If set, truncate graph down to 50 nodes for visualization")
    parser.add_argument("--mapping-file", help="Node identifier mapping")
    parser.add_argument("--outdir")
    args = parser.parse_args()

    pathway_mat = np.genfromtxt(args.pathway_mat, delimiter=",")
    if (len(pathway_mat.shape) == 1):
        pathway_mat = pathway_mat.reshape(pathway_mat.shape[0], 1)
    latent_to_fp = prmf.parse_pathway_obj(args.pathway_obj)
    latent_to_G = {}
    for k, fp in latent_to_fp.items():
        latent_to_G[k] = nx.read_graphml(fp)
    nodelist = prmf.parse_nodelist(open(args.nodelist))
    mapping = parse_mapping_file(args.mapping_file)

    node_to_ind = {}
    for i, node in enumerate(nodelist):
        node_to_ind[node] = i

    for k, fp in latent_to_fp.items():
        G = latent_to_G[k]
        G = G.to_undirected()

        fig_width = 1200
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--indir',
        help=
        'Directory containing nmf_pathway.py outputs for multiple different runs, each with a different initialization',
        required=True)
    parser.add_argument(
        '--ground-truth',
        help=
        'File containing filepaths to the pathway files used as the basis of a simulation'
    )
    parser.add_argument('--outdir',
                        help='Directory to place image files',
                        required=True)
    args = parser.parse_args()

    # <indir>/run\d+/obj.txt
    run_regexp = re.compile(r'run\d+')
    run_dirs = []
    for fname in os.listdir(args.indir):
        if run_regexp.match(fname) is not None:
            run_dirs.append(os.path.join(args.indir, fname))

    N = len(run_dirs)

    # count number of converged pathways
    # TODO cli
    all_dict = {}
    for run_dir in run_dirs:
        pathways_dict = prmf.parse_pathway_obj(os.path.join(
            run_dir, 'obj.txt'))
        pathways = pathways_dict.values()
        for pathway in pathways:
            if pathway in all_dict:
                all_dict[pathway] += 1
            else:
                all_dict[pathway] = 1
    pathway_count_pairs = sorted(all_dict.items(),
                                 key=lambda x: x[1],
                                 reverse=True)
    for pathway, count in pathway_count_pairs:
        print('{}\t{}'.format(pathway, count))

    # histogram for initialization/convergence - {{
    # make a histogram of counts
    counts, k = count_initialization_and_convergence_overlap(run_dirs)

    plt.clf()
    n_bins = k + 1
    n, bins, patches = plt.hist(counts,
                                bins=np.arange(n_bins + 1) - 0.5,
                                color='red')
    initial_converged_fp = os.path.join(args.outdir,
                                        'initialization_convergence_hist.png')
    plt.xlabel('Number persisting')
    plt.ylabel('Frequency')
    plt.title(
        'Initial pathways persisting in converged set (N = {})'.format(N))
    plt.savefig(initial_converged_fp)
    # }} -

    # histogram for initialization/ground truth - {{
    if args.ground_truth is not None:
        counts = count_initialization_and_truth_overlap(
            run_dirs, args.ground_truth)

        plt.clf()
        n_bins = k + 1
        n, bins, patches = plt.hist(counts,
                                    bins=np.arange(n_bins + 1) - 0.5,
                                    color='red')
        initialization_truth_fp = os.path.join(args.outdir,
                                               'initialization_truth_hist.png')
        plt.xlabel('Number existing')
        plt.ylabel('Frequency')
        plt.title(
            'Initial pathways existing in ground truth set (N = {})'.format(N))
        plt.savefig(initialization_truth_fp)
    # }} -

    # histogram for ground truth/convergence - {{
    if args.ground_truth is not None:
        counts = count_converge_and_truth_overlap(run_dirs, args.ground_truth)

        plt.clf()
        n_bins = k + 1
        n, bins, patches = plt.hist(counts,
                                    bins=np.arange(n_bins + 1) - 0.5,
                                    color='red')
        convergence_truth_fp = os.path.join(args.outdir,
                                            'convergence_truth_hist.png')
        plt.xlabel('KEGG pathways recovered')
        plt.ylabel('Frequency')
        plt.title('Pathway recovery using simulated data (N = {})'.format(N))
        plt.savefig(convergence_truth_fp)
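As an aside, the run-directory discovery at the top of main() could equivalently use glob; a small sketch (args.indir as in the script, with sorting added only for determinism):

import glob
import os
import re

run_regexp = re.compile(r'run\d+')
run_dirs = [path for path in sorted(glob.glob(os.path.join(args.indir, 'run*')))
            if run_regexp.match(os.path.basename(path))]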
Example #8
    parser.add_argument("--indir",
                        help="The output directory of nmf_pathway",
                        required=True)
    parser.add_argument(
        "--nodelist",
        help="Node to index association used by the nmf_pathway run",
        required=True)
    parser.add_argument("--outdir",
                        help="Directory to place histogram and csv",
                        required=True)
    args = parser.parse_args()

    V_fp = os.path.join(args.indir, 'V.csv')
    V = np.genfromtxt(V_fp, delimiter=",")
    n_genes, k_latent = V.shape

    obj_fp = os.path.join(args.indir, 'obj.txt')
    latent_to_pathway_fp = prmf.parse_pathway_obj(obj_fp)
    Gs = []
    for k in range(k_latent):
        pathway_fp = latent_to_pathway_fp[k]
        G = nx.read_graphml(pathway_fp).to_undirected()
        Gs.append(G)

    nodelist = prmf.parse_nodelist(open(args.nodelist))

    latent_edge_diffs = measure_smoothness(V, Gs, nodelist)
    report_smoothness(latent_edge_diffs, 'edge')
    report_smoothness(latent_edge_diffs, 'comp')
    report_smoothness(latent_edge_diffs, 'all')
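The measure_smoothness and report_smoothness helpers are not shown in these examples. Purely for orientation, a self-contained toy sketch of the indexing convention the snippets appear to rely on (rows of the gene-by-latent matrix follow the nodelist order, one column per latent factor); all names and values below are made up:

import networkx as nx
import numpy as np

# toy gene-by-latent matrix: 4 genes (rows, in nodelist order) x 2 latent factors
nodelist = ['A', 'B', 'C', 'D']
V = np.array([[0.9, 0.0],
              [0.8, 0.1],
              [0.0, 0.7],
              [0.1, 0.6]])
node_to_ind = {node: i for i, node in enumerate(nodelist)}

# toy pathway graph whose member genes we score against latent factor 0
G = nx.Graph([('A', 'B'), ('B', 'D')])
members = [n for n in G.nodes() if n in node_to_ind]
scores = V[[node_to_ind[n] for n in members], 0]
print(dict(zip(members, scores)))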