def simulate_mixture(args):
    """
    Simulate gene lists where each list mixes genes from several sampled
    pathways plus background noise, then write the lists and the chosen
    seed files.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``nodelist``, ``noise_pr``, ``n_gene_lists`` and
        ``gene_list_size`` here; the whole namespace is also forwarded to
        ``sample_pathways``, ``write_lists`` and ``write_seeds``.

    Notes
    -----
    The last entry of the sampled seed lists is treated as the background:
    it receives probability ``args.noise_pr`` and absorbs any overdraw from
    the other lists. Because lists may share genes, a simulated gene list
    can still contain fewer than ``args.gene_list_size`` unique genes.
    """
    nodelist = prmf.parse_nodelist(args.nodelist)

    # first, sample subset of seed-lists
    seed_lists_sample, seed_list_sizes, chosen_seed_fps = sample_pathways(
        args, nodelist)

    # define multinomial parameters: the non-noise mass is split evenly over
    # every list except the last one, which gets the noise probability
    n_pathways = len(seed_lists_sample) - 1
    prs = [(1 - args.noise_pr) / n_pathways] * n_pathways
    prs.append(args.noise_pr)

    # then, from the sampled seed list and the background <nodelist>, sample gene lists
    gene_lists = []
    for _ in range(args.n_gene_lists):
        # sample from multinomial to determine number of elements coming from each seed list
        sample_sizes = nprand.multinomial(args.gene_list_size, prs)

        # if we try to overdraw from a seed list, sample the remaining from background
        for j in range(len(sample_sizes) - 1):
            diff = sample_sizes[j] - seed_list_sizes[j]
            if diff > 0:
                sample_sizes[j] -= diff
                sample_sizes[-1] += diff

        # then sample from seed lists uniformly at random
        gene_list = set()
        for seed_list, sample_size in zip(seed_lists_sample, sample_sizes):
            # Draw WITHOUT replacement: the overdraw adjustment above only
            # makes sense when each gene can be picked at most once, and
            # drawing with replacement silently shrinks the gene list.
            # Cap at the list length so the draw can never be infeasible.
            n_draw = min(int(sample_size), len(seed_list))
            if n_draw == 0:
                continue
            seed_list_inds = nprand.choice(len(seed_list),
                                           size=n_draw,
                                           replace=False)
            for seed_list_ind in seed_list_inds:
                gene_list.add(seed_list[seed_list_ind])
        gene_lists.append(sorted(gene_list))

    # write gene lists to file
    write_lists(args, gene_lists)
    write_seeds(args, chosen_seed_fps)
def simulate_whole(args):
    """
    In contrast to simulate_mixture, do not combine nodes from different
    pathways into a single gene list: gene list ``i`` is drawn from sampled
    seed list ``i`` plus background nodes from the nodelist.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``nodelist``, ``noise_pr``, ``n_gene_lists`` and
        ``gene_list_size`` here; the whole namespace is also forwarded to
        ``sample_pathways``, ``write_lists`` and ``write_seeds``.

    Raises
    ------
    ValueError
        If more background nodes are requested than the nodelist contains.
    """
    nodelist = prmf.parse_nodelist(args.nodelist)

    # first, sample subset of seed-lists
    seed_lists_sample, _seed_list_sizes, chosen_seed_fps = sample_pathways(
        args, nodelist)

    # then, from the sampled seed list and the background <nodelist>, sample gene lists
    gene_lists = []
    for i in range(args.n_gene_lists):
        # sample from binomial to determine number of elements coming from the seed list
        sample_size = nprand.binomial(args.gene_list_size, 1 - args.noise_pr)

        # then sample from seed list uniformly at random
        gene_list = set()
        seed_list = seed_lists_sample[i]
        if sample_size < len(seed_list):
            seed_list_inds = nprand.choice(len(seed_list),
                                           size=sample_size,
                                           replace=False)
            for seed_list_ind in seed_list_inds:
                gene_list.add(seed_list[seed_list_ind])
        else:
            # Overdraw: take the whole seed list and shift the excess to the
            # background (as simulate_mixture does); otherwise the gene list
            # would come out shorter than requested.
            gene_list.update(seed_list)
            sample_size = len(seed_list)

        # sample remaining from background
        n_background = args.gene_list_size - sample_size
        if n_background > len(nodelist):
            raise ValueError(
                "cannot sample {} background nodes from a nodelist of size {}".
                format(n_background, len(nodelist)))
        nodelist_inds = nprand.choice(len(nodelist),
                                      size=n_background,
                                      replace=False)
        for nodelist_ind in nodelist_inds:
            gene_list.add(nodelist[nodelist_ind])
        gene_lists.append(sorted(gene_list))

    # write gene lists to file
    write_lists(args, gene_lists)
    write_seeds(args, chosen_seed_fps)
def main():
    """
    Plot one Precision-Recall figure per ground-truth pathway, comparing the
    best-matching latent factor of every method, and write each method's
    mean average precision to ``avg_precision.txt`` in ``--outdir``.
    """
    parser = argparse.ArgumentParser(description="""
Evaluate NMF versus Pathway-Regularized Matrix Factorization by plotting PR curves on one figure.
""")
    parser.add_argument("--gene-by-latent-csvs",
                        nargs="+",
                        help=".csv files",
                        required=True)
    parser.add_argument("--labels",
                        nargs="+",
                        help="parallel to --gene-by-latent-csvs",
                        required=True)
    parser.add_argument("--nodelist",
                        type=argparse.FileType('r'),
                        required=True)
    parser.add_argument("--true-seeds",
                        type=argparse.FileType('r'),
                        required=True)
    parser.add_argument("--outdir", type=str, required=True)
    args = parser.parse_args()

    # parse inputs - {{
    W_mats = [
        pd.read_csv(csv_fp, sep=",", header='infer', index_col=0).values
        for csv_fp in args.gene_by_latent_csvs
    ]
    # the AUC placeholder is filled in per pathway when the curve is drawn
    label_strs = [label + "; AUC={:0.3f}" for label in args.labels]
    nodelist = prmf.parse_nodelist(args.nodelist)
    # one seed-list file path per line
    true_seed_fps = [line.rstrip() for line in args.true_seeds]
    true_seed_lists = [prmf.parse_seedlist(fp) for fp in true_seed_fps]
    pathways_mat = prmf.nodelists_to_mat(true_seed_lists, nodelist)
    # }} - parse inputs

    # reorganize <matching> so we can find each method's latent factor that
    # best matches the ground truth
    pathway_to_latent_maps = []
    for W_mat in W_mats:
        latent_for_pathway = {}
        for factor_id_match, pathway_id_match, auc in prmf.match(
                W_mat, pathways_mat):
            factor_id = matching_id_to_ind(factor_id_match)
            pathway_id = matching_id_to_ind(pathway_id_match)
            latent_for_pathway[pathway_id] = (factor_id, auc)
        pathway_to_latent_maps.append(latent_for_pathway)

    # plot Precision-Recall curves, one figure per ground-truth pathway
    method_to_avg_precision_vals = {
        method_ind: []
        for method_ind in range(len(W_mats))
    }
    n_pathways = pathways_mat.shape[1]
    for pathway_id in range(n_pathways):
        plt.clf()
        y_true = pathways_mat[:, pathway_id]
        # fraction of positives, drawn as the constant-precision baseline
        true_fraction = np.sum(y_true) / y_true.shape[0]

        for method_ind, latent_for_pathway in enumerate(
                pathway_to_latent_maps):
            factor_id, auc = latent_for_pathway[pathway_id]
            y_score = W_mats[method_ind][:, factor_id]
            precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
                y_true, y_score)
            avg_precision = sklearn.metrics.average_precision_score(
                y_true, y_score)
            method_to_avg_precision_vals[method_ind].append(avg_precision)
            plt.plot(recall,
                     precision,
                     label=label_strs[method_ind].format(auc),
                     linewidth=2.0)

        plt.plot(np.linspace(0, 1, num=50),
                 np.repeat(true_fraction, 50),
                 label="Random; AUC={:0.3f}".format(true_fraction),
                 linewidth=2.0)
        plt.xlabel('Recall', fontsize='x-large')
        plt.ylabel('Precision', fontsize='x-large')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Precision-Recall of PRMF and Friends', fontsize='xx-large')
        plt.legend()

        # figures are numbered in pathway order, starting at 0
        ofp = os.path.join(args.outdir, "fig{}.png".format(pathway_id))
        plt.savefig(ofp, bbox_inches='tight')

    # report the average of average precision for each method
    report_fp = os.path.join(args.outdir, 'avg_precision.txt')
    with open(report_fp, 'w') as ofh:
        for method_ind, label in zip(range(len(W_mats)), args.labels):
            avg_of_avg = np.mean(method_to_avg_precision_vals[method_ind])
            ofh.write("{}\t{}\n".format(label, avg_of_avg))
def main():
    """
    Command-line entry point for Pathway-Regularized NMF.

    Loads the data matrix and the manifold (pathway) graphs, optionally holds
    out a test fold, normalizes the data, optionally builds an initialization
    from ``--manifolds-init``, runs ``nmf_pathway`` and writes ``U.csv``,
    ``V.csv``, ``obj.txt`` (and, when requested, ``init_pathways.txt`` and
    ``test_error.csv``) into ``--outdir``.

    Exits with status 22/23 when not exactly one of ``--manifolds`` /
    ``--manifolds-file`` is given, 24 when no manifold node appears in the
    nodelist, and 25 when no nodelist can be determined.
    """
    parser = argparse.ArgumentParser(description="""
Python implementation of Pathway-Regularized NMF.

Solve an optimization problem of the form
  min ||X - UV^T|| +
      gamma * sum_k min_i V[:,k]^T Ls[i] V[:,k] +
      delta * sum_k sum_{i | i in G_k} 1 / V[i,k] +
      ||U||_F^2

where Ls[i] is the Laplacian matrix associated with Gs[i],
G_k is the manifold associated with latent factor k
X has shape (n_obs, n_features),
U has shape (n_obs, n_latent),
V has shape (n_feature, n_latent)

References
----------
Cai 2008. Non-negative Matrix Factorization on Manifold
""", formatter_class=RawTextHelpFormatter)
    prmf_args.add_prmf_arguments(parser)
    args = parser.parse_args()

    # tradeoff, gamma, and delta
    # -1 is the CLI sentinel for "no tradeoff"
    tradeoff = args.tradeoff
    if tradeoff == -1:
        tradeoff = None
    # TODO update gamma default

    # exactly one of --manifolds / --manifolds-file must be provided
    manifold_fps = []
    if args.manifolds is None and args.manifolds_file is None:
        sys.stderr.write(
            "Exactly one of --manifolds or --manifolds-file is required.\n")
        sys.exit(22)
    elif args.manifolds is None and args.manifolds_file is not None:
        # one manifold file path per line
        with open(args.manifolds_file, 'r') as fh:
            for line in fh:
                manifold_fps.append(line.rstrip())
    elif args.manifolds is not None and args.manifolds_file is None:
        manifold_fps = args.manifolds
    else:
        sys.stderr.write(
            "Exactly one of --manifolds or --manifolds-file is required.\n")
        sys.exit(23)

    G_fp_pairs = parse_pathways(manifold_fps)
    fp_to_G = {fp: G for G, fp in G_fp_pairs}
    Gs = [pair[0] for pair in G_fp_pairs]
    # TODO warn if --node-attribute is not found

    if args.seed is not None:
        seed = int(args.seed)
        np.random.seed(seed)
        random.seed(seed)

    has_header = check_header(args.data, args.delimiter)
    has_row_names = check_row_names(args.data, args.delimiter, has_header)

    # load data
    # pd.read_csv defaults updated by CLI arguments
    # (the original assigned to a misspelled `n_rows`, so --m-samples was
    # silently ignored)
    nrows = None
    if args.m_samples is not None:
        nrows = args.m_samples
    header = 'infer' if has_header else None
    index_col = 0 if has_row_names else None
    X = pd.read_csv(args.data,
                    sep=args.delimiter,
                    header=header,
                    nrows=nrows,
                    index_col=index_col)

    # transpose data if desired
    m, n = X.shape
    if args.high_dimensional:
        if m > n:
            X = X.transpose()
    else:
        if m < n:
            X = X.transpose()
    # Row labels are taken AFTER the optional transpose so that they always
    # describe the rows of X (and therefore the rows of U).
    samples = list(X.index)

    # finalize data prep for nmf_pathway:
    # parse nodelist if provided or infer it from X as a dataframe
    # convert data frame to numpy
    nodelist = None
    if args.nodelist is not None:
        with open(args.nodelist) as nodelist_fh:
            nodelist = prmf.parse_nodelist(nodelist_fh)
        X = X.to_numpy()
    else:
        if has_header:
            # use the header to construct a nodelist, then append every
            # manifold node that the header does not already contain
            nodelist = list(X.columns)
            nodelist_set = set(nodelist)
            for G in Gs:
                for node in G:
                    if node not in nodelist_set:
                        nodelist.append(node)
                        nodelist_set.add(node)
            X = prmf.embed_arr(nodelist, list(X.columns), X.to_numpy())
        else:
            sys.stderr.write(
                "--nodelist is not provided and there is no header in <--data>\n"
            )
            sys.exit(25)

    # check node identifiers in G against nodelist
    # TODO rework this test for inferred nodelist
    nodelist_set = set(nodelist)
    G_index_to_frac = {}
    all_zero = True
    for i, G in enumerate(Gs):
        # iterating the graph yields its nodes on every networkx version
        # (nodes_iter() only exists in networkx 1.x)
        count = sum(1 for node in G if node in nodelist_set)
        n_nodes = G.order()
        G_index_to_frac[i] = count / n_nodes if n_nodes else 0.0
        if count != 0:
            all_zero = False
    if all_zero:
        sys.stderr.write(
            "Invalid manifolds. Check that the node identifiers of the manifolds are present in the nodelist. Try setting --node-attribute if the node identifier is in a graphml attribute rather than the XML node attribute 'id'\n"
        )
        sys.exit(24)
    sys.stdout.write("Printing manifold node representation in nodelist:\n")
    for i, G_fp_pair in enumerate(G_fp_pairs):
        sys.stdout.write("{}: {:2.1f}%\n".format(G_fp_pair[1],
                                                 G_index_to_frac[i] * 100))

    U_fp = os.path.join(args.outdir, "U.csv")
    V_fp = os.path.join(args.outdir, "V.csv")
    obj_fp = os.path.join(args.outdir, "obj.txt")

    # cross validation
    # TODO use other folds
    X_test = None
    if args.cross_validation is not None:
        kf = KFold(n_splits=round(1 / args.cross_validation))
        # only the first fold is used
        train_index, test_index = next(iter(kf.split(X)))
        X_test = X[test_index]
        X = X[train_index]
        samples = [samples[i] for i in train_index]

    # normalize data if desired
    # data at this stage is assumed to be observations x features
    # normalization is done for each feature value
    # e.g. the sample with the highest read count for gene X gets the value 1 in the gene X column
    if not args.no_normalize:
        X = quantile_transform(X)

    # --manifolds-init - {{
    pathway_init_fp = os.path.join(args.outdir, 'init_pathways.txt')
    U_init = None
    V_init = None
    init_fps = []
    if args.manifolds_init is not None:
        Gs_init = [fp_to_G[fp] for fp in args.manifolds_init]
        if len(args.manifolds_init) < args.k_latent:
            # then extend Gs_init with a random sample from the pathway population
            non_init_fps = list(set(manifold_fps) - set(args.manifolds_init))
            chosen_fps = random.sample(
                non_init_fps, args.k_latent - len(args.manifolds_init))
            init_fps = copy.copy(args.manifolds_init)
            for chosen_fp in chosen_fps:
                Gs_init.append(fp_to_G[chosen_fp])
                init_fps.append(chosen_fp)
        elif len(args.manifolds_init) == args.k_latent:
            # no modification to Gs_init is needed
            init_fps = args.manifolds_init
        else:
            # len(args.manifolds_init) > args.k_latent
            # then sample from Gs_init; without replacement so that no
            # manifold initializes more than one latent factor
            inds = np.random.choice(len(Gs_init),
                                    args.k_latent,
                                    replace=False)
            Gs_init_new = []
            for ind in inds:
                Gs_init_new.append(Gs_init[ind])
                init_fps.append(args.manifolds_init[ind])
            Gs_init = Gs_init_new

        vs = []
        us = []
        for G in Gs_init:
            v, pathway_ind = pathway_to_vec(X, G, nodelist)
            v_pathway_signal = v[pathway_ind]
            u, res = nmf_init_u(X, v)
            v_new, res = nmf_init_v(X, u)
            # restore the pathway entries after the refit of v
            v_new[pathway_ind] = v_pathway_signal
            vs.append(v_new)
            us.append(u)
        V_init = np.concatenate(vs, axis=1)
        U_init = np.concatenate(us, axis=1)
        sys.stdout.write(
            "Using the following manifolds for initialization:\n{}\n".format(
                "\n".join(init_fps)))
        # also write these to their own file
        with open(pathway_init_fp, 'w') as pathway_init_fh:
            pathway_init_fh.write("\n".join(init_fps))
    # }} - --manifolds-init

    # TODO other arguments
    U, V, obj_data = nmf_pathway(X,
                                 Gs,
                                 nodelist=nodelist,
                                 gamma=args.gamma,
                                 tradeoff=tradeoff,
                                 k_latent=args.k_latent,
                                 U_init=U_init,
                                 V_init=V_init,
                                 verbose=args.verbose)
    latent_names = ["LV{}".format(k) for k in range(args.k_latent)]
    U = pd.DataFrame(U, index=samples, columns=latent_names)
    V = pd.DataFrame(V, index=nodelist, columns=latent_names)
    U.to_csv(U_fp,
             sep=",",
             index=has_row_names,
             quoting=csv.QUOTE_NONNUMERIC)
    V.to_csv(V_fp, sep=",", index=True, quoting=csv.QUOTE_NONNUMERIC)

    # cross validation
    if args.cross_validation is not None:
        normalized_test_errors = prmf.measure_cv_performance(V, X_test)
        avg_normalized_test_error = np.mean(normalized_test_errors)
        error_fp = os.path.join(args.outdir, 'test_error.csv')
        np.savetxt(error_fp, normalized_test_errors, delimiter=",")
        obj_data['average_normalized_test_error'] = avg_normalized_test_error

    with open(obj_fp, 'w') as obj_fh:
        # this entry is not a scalar, so it is reported separately below
        latent_to_pathway_data = obj_data.pop('latent_to_pathway_data', {})
        for k, v in obj_data.items():
            obj_fh.write("{} = {:0.5f}\n".format(k, v))

        # write which manifold file was used for each latent factor
        for k in sorted(latent_to_pathway_data.keys()):
            lapl_inds = [entry[0] for entry in latent_to_pathway_data[k]]
            # TODO pick first, assumes convergence
            lapl_ind = lapl_inds[0]
            G, fp = G_fp_pairs[lapl_ind]
            obj_fh.write("{} -> {}\n".format(k, fp))
# Script fragment: for each latent factor, score candidate nodes near that
# factor's pathway in the PPI network and filter the scores.
# NOTE(review): no statement in view writes to or closes `ofh`; the rest of
# this script may lie outside this view -- confirm before restructuring.
parser = argparse.ArgumentParser()
parser.add_argument('--nodelist', required=True)
parser.add_argument('--gene-by-latent', required=True)
parser.add_argument('--opt-outfile', required=True)
parser.add_argument('--ppi-network',
                    help="PPI and pathway union graph stored as graphml",
                    required=True)
parser.add_argument(
    '--latent',
    default=None,
    help="If provided, only run script on this latent factor",
    type=int)
parser.add_argument('--outdir', required=True)
args = parser.parse_args()

# load the node ordering, the network and the gene x latent matrix
nodelist = prmf.parse_nodelist(open(args.nodelist))
ppi_network = nx.read_graphml(args.ppi_network)
gene_by_latent = np.genfromtxt(args.gene_by_latent, delimiter=",")

# mapping from latent factor index to the pathway file recorded for it
k_to_pathway_fp = prmf.parse_pathway_obj(args.opt_outfile)
if (args.latent is not None):
    # restrict the run to the single requested latent factor
    k_to_pathway_fp = {args.latent: k_to_pathway_fp[args.latent]}

ofp = os.path.join(args.outdir, 'pathway_extension.out')
ofh = open(ofp, 'w')
for k, fp in k_to_pathway_fp.items():
    pathway = nx.read_graphml(fp)
    # column k of the gene x latent matrix
    vec = gene_by_latent[:, k]
    node_to_score = score_pathway_neighbors(ppi_network, pathway, nodelist,
                                            vec)
    node_to_score = filter_pathway_neighbors(vec, node_to_score)
def main():
    """
    Command-line entry point: diffuse node scores over a network and write
    the diffused matrix to ``--diffused``.

    Node scores come either from ``--gene-lists`` (one file per observation)
    or from ``--gene-csv``; when both are given, ``--gene-lists`` is used.
    Exits with status 23 when neither is given.
    """
    parser = argparse.ArgumentParser(description="""
Diffuse node scores over a network. The diffused matrix is of shape (n_nodes, n_gene_lists) === (n_feature x n_obs).
""")
    parser.add_argument("--network",
                        type=str,
                        help="graphml network file to run diffusion on",
                        required=True)
    parser.add_argument(
        "--nodelist",
        type=argparse.FileType("r"),
        required=True,
        help=
        "Association between gene identifier and matrix index provided as a whitespace delimited list"
    )
    parser.add_argument(
        "--gene-lists",
        nargs="+",
        help="one or more files with an node identifiers on each line")
    parser.add_argument(
        "--gene-csv",
        help=
        "One csv file with genes along columns and observations along rows; must contain column names but not row names"
    )
    parser.add_argument("--diffused",
                        "-d",
                        type=argparse.FileType("w"),
                        required=True,
                        help="Diffused matrix")
    parser.add_argument("--alpha",
                        "-a",
                        type=float,
                        default=0.7,
                        help="Diffusion rate parameter")
    parser.add_argument(
        "--tolerance",
        "-t",
        type=float,
        default=10e-6,
        help=
        "Tolerance threshold for diffusion; stop when change in diffused matrix crosses below threshold"
    )
    parser.add_argument(
        "--string-edge-type",
        default="combined_score",
        help=
        "\"experimental\" for edges supported by experimental evidence only; \"combined_score\" for the entire stringdb network; default=\"combined_score\""
    )
    parser.add_argument(
        "--diffused-format",
        type=str,
        default='csv',
        help=
        "Either \"ampl\" or \"csv\"; default=\"csv\" which is short for MatrixMarket, a sparse matrix file format"
    )
    args = parser.parse_args()

    # TODO ampl only right now
    # TODO fail fast on an invalid --diffused-format

    if args.gene_lists is None and args.gene_csv is None:
        sys.stderr.write(
            "Exactly one of --gene-lists or --gene-csv is required\n")
        sys.exit(23)

    # TODO edge confidence threshold, edge_type in other script
    G_ppi = nx.read_graphml(args.network)
    nodelist = prmf.parse_nodelist(args.nodelist)
    # NOTE if G_ppi has 'weight' attribute on edges, its value is used; otherwise a value of
    # 1 is populated in the ij entry for an edge (i, j)
    adj = nx.to_scipy_sparse_matrix(G_ppi, nodelist=nodelist, dtype=bool)

    mat = None
    if args.gene_lists is not None:
        # parse gene lists
        gene_lists = []
        for gene_path in args.gene_lists:
            with open(gene_path) as fh:
                gene_lists.append(prmf.parse_ws_delim(fh))

        # verify gene lists present in ppi_db
        def get_row_vec_for_gene_list(gene_list):
            """Embed one gene list against the nodelist and report missing ids."""
            row_vec, missing = prmf.embed_ids(nodelist, gene_list)
            sys.stderr.write("missing {}/{} node identifiers: {}\n".format(
                len(missing), len(gene_list), ", ".join(missing)))
            return row_vec

        # materialize the rows: sp.vstack expects a sequence, not a lazy map
        row_vecs = [
            get_row_vec_for_gene_list(gene_list) for gene_list in gene_lists
        ]
        mat = sp.vstack(row_vecs)
    else:
        # NOTE(review): the --gene-csv help says the file contains column
        # names, but genfromtxt is called without skipping or using a header
        # row -- confirm how the header line is meant to be handled.
        mat = sp.csc_matrix(np.genfromtxt(args.gene_csv, delimiter=","))

    # do diffusion
    smoothed_mat = prmf.diffusion(mat,
                                  adj,
                                  alpha=args.alpha,
                                  tol=args.tolerance)

    # write results
    if args.diffused_format == "ampl":
        # TODO does this work with 'wb'?
        prmf.ampl_write_sparse_arr(smoothed_mat, args.diffused, len(nodelist))
    else:
        # Label rows by the matrix row count rather than len(args.gene_lists),
        # which is None (TypeError) when the input came from --gene-csv.
        n_samples = smoothed_mat.shape[0]
        index = ["sample{}".format(i + 1) for i in range(n_samples)]
        smoothed_mat_df = pd.DataFrame(smoothed_mat.todense(),
                                       index=index,
                                       columns=nodelist)
        smoothed_mat_df.to_csv(args.diffused,
                               sep=",",
                               index=True,
                               quoting=csv.QUOTE_NONNUMERIC)