Example #1
def simulate_mixture(args):
  nodelist = prmf.parse_nodelist(args.nodelist)

  # first, sample subset of seed-lists
  seed_lists_sample, seed_list_sizes, chosen_seed_fps = sample_pathways(args, nodelist)

  # define multinomial parameters: the first len(seed_lists_sample) - 1
  # components share (1 - noise_pr) evenly; the final component receives the
  # noise probability noise_pr
  prs = [(1 - args.noise_pr)/(len(seed_lists_sample) - 1)] * (len(seed_lists_sample) - 1)
  prs.append(args.noise_pr)

  # then, from the sampled seed list and the background <nodelist>, sample gene lists
  gene_lists = []
  for i in range(args.n_gene_lists):
    # sample from multinomial to determine number of elements coming from each seed list
    sample_sizes = nprand.multinomial(args.gene_list_size, prs)

    # if we try to overdraw from a seed list, sample the remaining from background
    for j in range(len(sample_sizes)-1):
      seed_list_size = seed_list_sizes[j]
      sample_size = sample_sizes[j]
      diff = sample_size - seed_list_size
      if diff > 0:
        sample_sizes[j] -= diff
        sample_sizes[-1] += diff

    # then sample from seed lists uniformly at random
    gene_list = set()
    for j in range(len(seed_lists_sample)):
      seed_list = seed_lists_sample[j]
      sample_size = sample_sizes[j]
      # note: sampled with replacement, so duplicates collapse in the set below
      # and the final gene list may hold fewer than gene_list_size unique genes
      seed_list_inds = nprand.choice(len(seed_list), size=sample_size)
      for seed_list_ind in seed_list_inds:
        gene_list.add(seed_list[seed_list_ind])

    gene_lists.append(sorted(gene_list))

  # write gene lists to file
  write_lists(args, gene_lists)
  write_seeds(args, chosen_seed_fps)
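
A minimal way to exercise simulate_mixture outside its CLI is to hand it an argparse.Namespace; the attribute names below are inferred from the accesses in the function body, and sample_pathways, write_lists, and write_seeds are assumed to come from the surrounding module, so treat this as a sketch rather than the script's documented interface.

import argparse

# Hypothetical invocation; only the attributes read above are shown, and
# sample_pathways/write_lists/write_seeds may read additional ones.
args = argparse.Namespace(
    nodelist=open("nodelist.txt"),  # background gene universe, one id per line
    noise_pr=0.05,                  # probability mass for the noise component
    n_gene_lists=10,                # number of gene lists to simulate
    gene_list_size=50,              # target size of each gene list
)
simulate_mixture(args)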
Example #2
def simulate_whole(args):
  """
  In contrast to simulate_mixture, do not combine nodes from different pathways into a single gene list
  """
  nodelist = prmf.parse_nodelist(args.nodelist)

  # first, sample subset of seed-lists
  seed_lists_sample, seed_list_sizes, chosen_seed_fps = sample_pathways(args, nodelist)

  # then, from the sampled seed list and the background <nodelist>, sample gene lists
  gene_lists = []
  for i in range(args.n_gene_lists):
    # sample from a binomial to determine how many elements come from the seed list
    sample_size = nprand.binomial(args.gene_list_size, 1 - args.noise_pr)

    # then sample from seed list uniformly at random
    gene_list = set()
    seed_list = seed_lists_sample[i]  # one sampled pathway per gene list
    if sample_size < len(seed_list):
      seed_list_inds = nprand.choice(len(seed_list), size=sample_size, replace=False)
      for seed_list_ind in seed_list_inds:
        gene_list.add(seed_list[seed_list_ind])
    else:
      # TODO warn?
      for seed in seed_list:
        gene_list.add(seed)

    # sample remaining from background
    # TODO catch error if size > len(nodelist)
    nodelist_inds = nprand.choice(len(nodelist), size=(args.gene_list_size - sample_size), replace=False)
    for nodelist_ind in nodelist_inds:
      gene_list.add(nodelist[nodelist_ind])

    gene_lists.append(sorted(gene_list))

  # write gene lists to file
  write_lists(args, gene_lists)
  write_seeds(args, chosen_seed_fps)
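
The size split in simulate_whole is a single binomial draw per gene list: each of the gene_list_size slots comes from the pathway with probability 1 - noise_pr and from the background otherwise. A self-contained sketch of that draw:

import numpy.random as nprand

gene_list_size, noise_pr = 50, 0.1
# number of slots filled from the pathway; the remainder come from background
n_pathway = nprand.binomial(gene_list_size, 1 - noise_pr)
n_background = gene_list_size - n_pathway
print(n_pathway, n_background)  # e.g. 46 4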
Example #3
def main():
    parser = argparse.ArgumentParser(description="""
Evaluate NMF versus Pathway-Regularized Matrix Factorization by plotting PR curves on one figure.
""")
    parser.add_argument("--gene-by-latent-csvs",
                        nargs="+",
                        help=".csv files",
                        required=True)
    parser.add_argument("--labels",
                        nargs="+",
                        help="parallel to --gene-by-latent-csvs",
                        required=True)
    parser.add_argument("--nodelist",
                        type=argparse.FileType('r'),
                        required=True)
    parser.add_argument("--true-seeds",
                        type=argparse.FileType('r'),
                        required=True)
    parser.add_argument("--outdir", type=str, required=True)
    args = parser.parse_args()

    # parse inputs - {{
    colors = []  # TODO

    W_mats = [
        pd.read_csv(fp, sep=",", header='infer', index_col=0).values
        for fp in args.gene_by_latent_csvs
    ]
    label_strs = list(map(lambda x: x + "; AUC={:0.3f}", args.labels))
    nodelist = prmf.parse_nodelist(args.nodelist)

    true_seed_fps = []
    for line in args.true_seeds:
        line = line.rstrip()
        true_seed_fps.append(line)

    true_seed_lists = []
    for true_seed_fp in true_seed_fps:
        seed_list = prmf.parse_seedlist(true_seed_fp)
        true_seed_lists.append(seed_list)

    pathways_mat = prmf.nodelists_to_mat(true_seed_lists, nodelist)
    # }} - parse inputs

    # reorganize <matching> so we can find each method's latent factor that best matches the ground truth
    pathway_to_latent_maps = []
    for i in range(len(W_mats)):
        matching = prmf.match(W_mats[i], pathways_mat)
        pathway_to_latent_map = {}
        for match in matching:
            factor_id_match, pathway_id_match, auc = match
            factor_id = matching_id_to_ind(factor_id_match)
            pathway_id = matching_id_to_ind(pathway_id_match)
            pathway_to_latent_map[pathway_id] = (factor_id, auc)
        pathway_to_latent_maps.append(pathway_to_latent_map)

    # plot Precision-Recall curves
    match_ind = 0
    method_to_avg_precision_vals = {}
    for i in range(len(W_mats)):
        method_to_avg_precision_vals[i] = []
    for pathway_id in range(pathways_mat.shape[1]):
        plt.clf()
        y_true = pathways_mat[:, pathway_id]
        true_fraction = np.sum(y_true) / y_true.shape[0]
        for i in range(len(pathway_to_latent_maps)):
            pathway_to_latent_map = pathway_to_latent_maps[i]
            factor_id, auc = pathway_to_latent_map[pathway_id]
            y_score = W_mats[i][:, factor_id]
            precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
                y_true, y_score)
            method_to_avg_precision_vals[i].append(
                sklearn.metrics.average_precision_score(y_true, y_score))
            plt.plot(recall,
                     precision,
                     label=label_strs[i].format(auc),
                     linewidth=2.0)
            #plt.plot(recall, precision, color=colors[i], label=label_strs[i].format(auc), linewidth=2.0)
        plt.plot(np.linspace(0, 1, num=50),
                 np.repeat(true_fraction, 50),
                 label="Random; AUC={:0.3f}".format(true_fraction),
                 linewidth=2.0)
        plt.xlabel('Recall', fontsize='x-large')
        plt.ylabel('Precision', fontsize='x-large')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Precision-Recall of PRMF and Friends', fontsize='xx-large')
        plt.legend()
        ofp = os.path.join(args.outdir, "fig{}.png".format(match_ind))
        plt.savefig(ofp, bbox_inches='tight')
        match_ind += 1

    # report the average of average precision for each method
    with open(os.path.join(args.outdir, 'avg_precision.txt'), 'w') as ofh:
        for i in range(len(W_mats)):
            avg_of_avg = np.mean(method_to_avg_precision_vals[i])
            label = args.labels[i]
            ofh.write("{}\t{}\n".format(label, avg_of_avg))
Example #4
def main():
    parser = argparse.ArgumentParser(description="""
Python implementation of Pathway-Regularized NMF.

Solve an optimization problem of the form
  min ||X - UV^T|| + 
    gamma * sum_k min_i V[:,k]^T Ls[i] V[:,k] + 
    delta * sum_k sum_{i | i in G_k} 1 / V[i,k] + 
    ||U||_F^2

where Ls[i] is the Laplacian matrix associated with Gs[i],
G_k is the manifold associated with latent factor k,
X has shape (n_obs, n_features),
U has shape (n_obs, n_latent),
V has shape (n_features, n_latent)

References
----------
Cai 2008. Non-negative Matrix Factorization on Manifold
""",
                                     formatter_class=RawTextHelpFormatter)
    prmf_args.add_prmf_arguments(parser)
    args = parser.parse_args()
    OUTDIR = args.outdir

    # tradeoff, gamma, and delta
    tradeoff = args.tradeoff
    if tradeoff == -1:
        tradeoff = None

    # TODO update gamma default

    manifold_fps = []
    if args.manifolds is None and args.manifolds_file is None:
        sys.stderr.write(
            "Exactly one of --manifolds or --manifolds-file is required.\n")
        sys.exit(22)
    elif args.manifolds is None and args.manifolds_file is not None:
        with open(args.manifolds_file, 'r') as fh:
            for line in fh:
                line = line.rstrip()
                manifold_fps.append(line)
    elif args.manifolds is not None and args.manifolds_file is None:
        manifold_fps = args.manifolds
    else:
        sys.stderr.write(
            "Exactly one of --manifolds or --manifolds-file is required.\n")
        sys.exit(23)
    G_fp_pairs = parse_pathways(manifold_fps)
    fp_to_G = {}
    for G, fp in G_fp_pairs:
        fp_to_G[fp] = G
    Gs = list(map(lambda x: x[0], G_fp_pairs))

    # TODO warn if --node-attribute is not found

    if args.seed is not None:
        seed = int(args.seed)
        np.random.seed(seed)
        random.seed(seed)

    has_header = check_header(args.data, args.delimiter)
    has_row_names = check_row_names(args.data, args.delimiter, has_header)
    # load data; pd.read_csv defaults updated by CLI arguments
    nrows = None
    if args.m_samples is not None:
        nrows = args.m_samples
    header = 'infer'
    if not has_header:
        header = None
    index_col = None
    if has_row_names:
        index_col = 0
    X = pd.read_csv(args.data,
                    sep=args.delimiter,
                    header=header,
                    nrows=nrows,
                    index_col=index_col)
    samples = list(X.index)

    # transpose data if desired
    m, n = X.shape
    if args.high_dimensional:
        if m > n:
            X = X.transpose()
    else:
        if m < n:
            X = X.transpose()

    # finalize data prep for nmf_pathway:
    # parse nodelist if provided or infer it from X as a dataframe
    # convert data frame to numpy
    nodelist = None
    if args.nodelist is not None:
        nodelist = prmf.parse_nodelist(open(args.nodelist))
        X = X.to_numpy()
    else:
        if has_header:
            # use the header to construct a nodelist
            nodelist = list(X.columns)
            nodelist_set = set(nodelist)
            for G in Gs:
                for node in G:
                    if node not in nodelist_set:
                        nodelist.append(node)
                        nodelist_set.add(node)

            X = prmf.embed_arr(nodelist, list(X.columns), X.to_numpy())
        else:
            sys.stderr.write(
                "--nodelist is not provided and there is no header in <--data>\n"
            )
            sys.exit(25)

    # check node identifiers in G against nodelist
    # TODO rework this test for inferred nodelist
    nodelist_set = set(nodelist)
    G_index_to_frac = {}
    all_zero = True
    for i, G in enumerate(Gs):
        count = 0
        for node in G.nodes():  # nodes_iter() only exists in networkx 1.x
            if node in nodelist_set:
                count += 1
        frac = count / G.order()
        G_index_to_frac[i] = frac
        if count != 0:
            all_zero = False
    if all_zero:
        sys.stderr.write(
            "Invalid manifolds. Check that the node identifiers of the manifolds are present in the nodelist. Try setting --node-attribute if the node identifier is in a graphml attribute rather than the XML node attribute 'id'\n"
        )
        sys.exit(24)
    sys.stdout.write("Printing manifold node representation in nodelist:\n")
    for i, G_fp_pair in enumerate(G_fp_pairs):
        sys.stdout.write("{}: {:2.1f}%\n".format(G_fp_pair[1],
                                                 G_index_to_frac[i] * 100))

    U_fp = os.path.join(args.outdir, "U.csv")
    V_fp = os.path.join(args.outdir, "V.csv")
    obj_fp = os.path.join(args.outdir, "obj.txt")

    # cross validation
    # TODO use other folds
    X_test = None
    if args.cross_validation is not None:
        kf = KFold(n_splits=round(1 / args.cross_validation))
        for train_index, test_index in kf.split(X):
            X_train = X[train_index]
            X_test = X[test_index]

            X = X_train
            samples = [samples[i] for i in train_index]
            break

    # normalize data if desired
    # data at this stage is assumed to be observations x features
    # normalization is done for each feature value
    # e.g. the sample with the highest read count for gene X gets the value 1 in the gene X column
    if not args.no_normalize:
        X = quantile_transform(X)

    # --manifolds-init - {{
    pathway_init_fp = os.path.join(args.outdir, 'init_pathways.txt')
    U_init = None
    V_init = None
    init_fps = []
    if args.manifolds_init is not None:
        Gs_init = list(map(lambda fp: fp_to_G[fp], args.manifolds_init))
        if len(args.manifolds_init) < args.k_latent:
            # then extend Gs_init with a random sample from the pathway population
            non_init_fps = list(set(manifold_fps) - set(args.manifolds_init))
            chosen_fps = random.sample(
                non_init_fps, args.k_latent - len(args.manifolds_init))
            init_fps = copy.copy(args.manifolds_init)
            for chosen_fp in chosen_fps:
                Gs_init.append(fp_to_G[chosen_fp])
                init_fps.append(chosen_fp)
        elif len(args.manifolds_init) == args.k_latent:
            # no modification to Gs_init is needed
            init_fps = args.manifolds_init
        else:  # len(args.manifolds_init) > args.k_latent
            # then sample a subset of Gs_init without replacement so each
            # latent factor gets a distinct initialization pathway
            inds = np.random.choice(len(Gs_init), args.k_latent, replace=False)
            Gs_init_new = []
            for ind in inds:
                Gs_init_new.append(Gs_init[ind])
                init_fps.append(args.manifolds_init[ind])
            Gs_init = Gs_init_new
        vs = []
        us = []
        for G in Gs_init:
            v, pathway_ind = pathway_to_vec(X, G, nodelist)
            v_pathway_signal = v[pathway_ind]
            u, res = nmf_init_u(X, v)
            v_new, res = nmf_init_v(X, u)
            v_new[pathway_ind] = v_pathway_signal
            vs.append(v_new)
            us.append(u)
        V_init = np.concatenate(vs, axis=1)
        U_init = np.concatenate(us, axis=1)
        sys.stdout.write(
            "Using the following manifolds for initialization:\n{}\n".format(
                "\n".join(init_fps)))
        # also write these to their own file
        with open(pathway_init_fp, 'w') as pathway_init_fh:
            pathway_init_fh.write("\n".join(init_fps))
    # }} - --manifolds-init

    # TODO other arguments
    U, V, obj_data = nmf_pathway(X,
                                 Gs,
                                 nodelist=nodelist,
                                 gamma=args.gamma,
                                 tradeoff=tradeoff,
                                 k_latent=args.k_latent,
                                 U_init=U_init,
                                 V_init=V_init,
                                 verbose=args.verbose)
    U = pd.DataFrame(U,
                     index=samples,
                     columns=list(
                         map(lambda x: "LV{}".format(x),
                             range(args.k_latent))))
    V = pd.DataFrame(V,
                     index=nodelist,
                     columns=list(
                         map(lambda x: "LV{}".format(x),
                             range(args.k_latent))))
    U.to_csv(U_fp, sep=",", index=has_row_names, quoting=csv.QUOTE_NONNUMERIC)
    V.to_csv(V_fp, sep=",", index=True, quoting=csv.QUOTE_NONNUMERIC)

    # cross validation
    if args.cross_validation is not None:
        normalized_test_errors = prmf.measure_cv_performance(V, X_test)
        avg_normalized_test_error = np.mean(normalized_test_errors)
        error_fp = os.path.join(args.outdir, 'test_error.csv')
        np.savetxt(error_fp, normalized_test_errors, delimiter=",")
        obj_data['average_normalized_test_error'] = avg_normalized_test_error

    with open(obj_fp, 'w') as obj_fh:
        latent_to_pathway_data = obj_data.pop('latent_to_pathway_data', {})
        for k, v in obj_data.items():
            obj_fh.write("{} = {:0.5f}\n".format(k, v))

        # write which manifold file was used for each latent factor
        ks = sorted(latent_to_pathway_data.keys())
        for k in ks:
            lapl_inds = list(map(lambda x: x[0], latent_to_pathway_data[k]))
            # TODO pick first, assumes convergence
            lapl_ind = lapl_inds[0]
            G, fp = G_fp_pairs[lapl_ind]
            obj_fh.write("{} -> {}\n".format(k, fp))
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nodelist', required=True)
    parser.add_argument('--gene-by-latent', required=True)
    parser.add_argument('--opt-outfile', required=True)
    parser.add_argument('--ppi-network',
                        help="PPI and pathway union graph stored as graphml",
                        required=True)
    parser.add_argument(
        '--latent',
        default=None,
        help="If provided, only run script on this latent factor",
        type=int)
    parser.add_argument('--outdir', required=True)
    args = parser.parse_args()

    nodelist = prmf.parse_nodelist(open(args.nodelist))
    ppi_network = nx.read_graphml(args.ppi_network)
    gene_by_latent = np.genfromtxt(args.gene_by_latent, delimiter=",")
    k_to_pathway_fp = prmf.parse_pathway_obj(args.opt_outfile)
    if args.latent is not None:
        k_to_pathway_fp = {args.latent: k_to_pathway_fp[args.latent]}

    ofp = os.path.join(args.outdir, 'pathway_extension.out')
    ofh = open(ofp, 'w')
    for k, fp in k_to_pathway_fp.items():
        pathway = nx.read_graphml(fp)
        vec = gene_by_latent[:, k]
        node_to_score = score_pathway_neighbors(ppi_network, pathway, nodelist,
                                                vec)
        node_to_score = filter_pathway_neighbors(vec, node_to_score)
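
The snippet ends before node_to_score is consumed, and score_pathway_neighbors is defined elsewhere in the project. From the call site one can only guess its shape; a hypothetical sketch, under the assumption that it scores each PPI neighbor of the pathway by that gene's weight in the latent factor:

import networkx as nx

def score_pathway_neighbors_sketch(ppi_network, pathway, nodelist, vec):
    # hypothetical stand-in, not the project's implementation
    index = {node: i for i, node in enumerate(nodelist)}
    members = set(pathway.nodes())
    node_to_score = {}
    for member in members:
        if member not in ppi_network:
            continue
        for neighbor in ppi_network.neighbors(member):
            if neighbor not in members and neighbor in index:
                node_to_score[neighbor] = vec[index[neighbor]]
    return node_to_score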
Example #6
def main():
    parser = argparse.ArgumentParser(description="""
Diffuse node scores over a network. The diffused matrix has shape (n_nodes, n_gene_lists), i.e. (n_features, n_obs).
""")
    parser.add_argument("--network",
                        type=str,
                        help="graphml network file to run diffusion on",
                        required=True)
    parser.add_argument(
        "--nodelist",
        type=argparse.FileType("r"),
        required=True,
        help=
        "Association between gene identifier and matrix index provided as a whitespace delimited list"
    )
    parser.add_argument(
        "--gene-lists",
        nargs="+",
        help="one or more files with an node identifiers on each line")
    parser.add_argument(
        "--gene-csv",
        help=
        "One csv file with genes along columns and observations along rows; must contain column names but not row names"
    )
    parser.add_argument("--diffused",
                        "-d",
                        type=argparse.FileType("w"),
                        required=True,
                        help="Diffused matrix")
    parser.add_argument("--alpha",
                        "-a",
                        type=float,
                        default=0.7,
                        help="Diffusion rate parameter")
    parser.add_argument(
        "--tolerance",
        "-t",
        type=float,
        default=1e-5,
        help=
        "Tolerance threshold for diffusion; stop when the change in the diffused matrix drops below this threshold"
    )
    parser.add_argument(
        "--string-edge-type",
        default="combined_score",
        help=
        "\"experimental\" for edges supported by experimental evidence only; \"combined_score\" for the entire stringdb network; default=\"combined_score\""
    )
    parser.add_argument(
        "--diffused-format",
        type=str,
        default='csv',
        help=
        "Either \"ampl\" or \"csv\"; default=\"csv\""
    )
    args = parser.parse_args()

    # TODO ampl only right now
    # fail fast on --diffused-format
    #if args.diffused_format not in ['ampl', 'mm']:
    #  sys.stderr.write("invalid --diffused-format={}\n".format(args.diffused_format))
    #  sys.exit(22)

    if args.gene_lists is None and args.gene_csv is None:
        sys.stderr.write(
            "Exactly one of --gene-lists or --gene-csv is required\n")
        sys.exit(23)

    # TODO edge confidence threshold, edge_type in other script
    G_ppi = nx.read_graphml(args.network)
    nodelist = prmf.parse_nodelist(args.nodelist)

    # NOTE with dtype=bool any edge 'weight' attribute is discarded: the ij
    # entry for an edge (i, j) is simply 1
    adj = nx.to_scipy_sparse_matrix(G_ppi, nodelist=nodelist, dtype=bool)

    mat = None
    if args.gene_lists is not None:
        # parse gene lists
        gene_lists = []
        for gene_path in args.gene_lists:
            with open(gene_path) as fh:
                gene_lists.append(prmf.parse_ws_delim(fh))

        # verify gene lists present in ppi_db
        def get_row_vec_for_gene_list(gene_list):
            row_vec, missing = prmf.embed_ids(nodelist, gene_list)
            sys.stderr.write("missing {}/{} node identifiers: {}\n".format(
                len(missing), len(gene_list), ", ".join(missing)))
            return row_vec

        row_vecs = list(map(get_row_vec_for_gene_list, gene_lists))

        mat = sp.vstack(row_vecs)
    else:
        mat = sp.csc_matrix(np.genfromtxt(args.gene_csv, delimiter=","))

    # do diffusion
    smoothed_mat = prmf.diffusion(mat,
                                  adj,
                                  alpha=args.alpha,
                                  tol=args.tolerance)

    # write results
    if args.diffused_format == "ampl":
        # TODO does this work with 'wb'?
        prmf.ampl_write_sparse_arr(smoothed_mat, args.diffused, len(nodelist))
    else:
        # index rows by sample; works for both --gene-lists and --gene-csv input
        index = list(
            map(lambda x: "sample{}".format(x + 1),
                range(smoothed_mat.shape[0])))
        smoothed_mat_df = pd.DataFrame(smoothed_mat.todense(),
                                       index=index,
                                       columns=nodelist)
        smoothed_mat_df.to_csv(args.diffused,
                               sep=",",
                               index=True,
                               quoting=csv.QUOTE_NONNUMERIC)
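
prmf.diffusion itself is not shown here; network propagation of this kind is conventionally the random-walk-with-restart iteration F <- alpha * F @ W + (1 - alpha) * F0 on a normalized adjacency matrix, repeated until the update falls below tol. A dense toy sketch under that assumption (the real function operates on scipy sparse matrices):

import numpy as np

def diffusion_sketch(F0, adj, alpha=0.7, tol=1e-5):
    # row-normalize so each step redistributes a node's mass to its neighbors
    deg = adj.sum(axis=1, keepdims=True)
    W = adj / np.maximum(deg, 1)
    F = F0.astype(float)
    while True:
        F_next = alpha * F @ W + (1 - alpha) * F0
        if np.abs(F_next - F).max() < tol:
            return F_next
        F = F_next

adj = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)
F0 = np.array([[1.0, 0.0, 0.0]])  # one observation, signal seeded on node 0
print(diffusion_sketch(F0, adj))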