def train(args, sample_populations):
    workdir = args.workdir

    features = read_features(workdir)
    class_labels = make_labels(features.sample_labels, sample_populations)

    print "Training ensemble 1"
    lr1 = LogisticRegressionEnsemble(args.n_models,
                                     args.method,
                                     args.batch_size,
                                     bagging=args.bagging)
    feature_importances = lr1.feature_importances(features.feature_matrix,
                                                  class_labels)
    snp_importances = features.rank_snps(feature_importances)
    write_snps(workdir, snp_importances, args.method, args.n_models, "1")

    # train a second, independent ensemble and write its ranking separately
    print "Training ensemble 2"
    lr2 = LogisticRegressionEnsemble(args.n_models,
                                     args.method,
                                     args.batch_size,
                                     bagging=args.bagging)
    feature_importances = lr2.feature_importances(features.feature_matrix,
                                                  class_labels)
    snp_importances = features.rank_snps(feature_importances)
    write_snps(workdir, snp_importances, args.method, args.n_models, "2")
def cluster_samples(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]

    # convert 1-based component numbers to 0-based column indices
    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    _, labels, inertia = k_means(selected, args.n_clusters, n_jobs=-2)

    clusters_flname = os.path.join(analysis_dir,
                                   "clusters_%s.tsv" % args.n_clusters)

    clusters = defaultdict(list)
    for name, cluster in zip(features.sample_labels, labels):
        clusters[cluster].append(name)

    with open(clusters_flname, "w") as fl:
        for cluster, samples in clusters.iteritems():
            fl.write(str(cluster))
            fl.write(",")
            fl.write(",".join(samples))
            fl.write("\n")
def sweep_clusters(args):
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]

    # convert 1-based component numbers to 0-based column indices
    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    inertia_values = []
    for k in args.n_clusters:
        print "Clustering with %s clusters" % k
        _, _, inertia = k_means(selected, k, n_jobs=-2)
        inertia_values.append(inertia)

    plt.plot(args.n_clusters, inertia_values, "k.-")
    plt.xlabel("Number of Clusters", fontsize=16)
    plt.ylabel("Inertia", fontsize=16)

    fig_flname = os.path.join(figures_dir, "cluster_inertia")
    for dim in args.components:
        fig_flname += "_%s" % dim
    fig_flname += ".png"

    plt.savefig(fig_flname, dpi=300)
def output_coordinates(args):
    workdir = args.workdir

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]
    selected = projected[:, map(lambda idx: idx - 1, args.selected_components)]

    features = read_features(workdir)

    with open(args.output_fl, "w") as fl:
        headers = ["sample", "population_index", "population_name"]
        headers.extend(map(str, args.selected_components))
        fl.write("\t".join(headers))
        fl.write("\n")

        for i in xrange(len(features.sample_labels)):
            sample = features.sample_labels[i]
            pop_idx = features.class_labels[i]
            pop_name = project_summary.population_names[pop_idx]
            line = [sample, str(pop_idx), pop_name]
            line.extend(map(str, selected[i, :]))
            fl.write("\t".join(line))
            fl.write("\n")
def output_loading_factors(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    data_model = read_features(workdir)

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    pca = model[MODEL_KEY]
    components = pca.components_
    selected = components[map(lambda idx: idx - 1, args.components), :]

    output_fl = os.path.join(analysis_dir, "pca_loading_factors.tsv")
    with open(output_fl, "w") as fl:
        header = ["chromosome", "position", "dummy_variable"]
        header.extend(map(str, args.components))
        fl.write("\t".join(header))
        fl.write("\n")

        for snp_label, feature_idx in data_model.snp_feature_map.iteritems():
            chrom, pos = snp_label
            # one row per dummy variable of the SNP
            for j, idx in enumerate(feature_idx):
                features = selected[:, idx]
                fl.write("%s\t%s\t%s\t" % (chrom, pos, j))
                fl.write("\t".join(map(str, features)))
                fl.write("\n")
def min_components_explained_variance(args):
    workdir = args.workdir

    features = read_features(workdir)

    n_components = args.init_n_components
    while True:
        print "Computing PCA with %s components" % n_components
        pca = PCA(n_components=n_components, whiten=True)
        pca.fit(features.feature_matrix)

        explained_variance_ratios = pca.explained_variance_ratio_
        sorted_ratios = np.sort(explained_variance_ratios)[::-1]
        cum_ratios = np.cumsum(sorted_ratios)
        total_explained_variance = cum_ratios[-1]

        if total_explained_variance >= args.explained_variance_threshold:
            break

        # not enough variance captured yet -- double the number of components
        n_components *= 2

    needed_components = 0
    achieved_ev_ratio = 0.0
    for i, ev_ratio in enumerate(cum_ratios):
        if ev_ratio >= args.explained_variance_threshold:
            needed_components = i + 1
            achieved_ev_ratio = ev_ratio
            break

    print "Explained-variance threshold of %s surpassed at %s with %s components" % \
        (args.explained_variance_threshold, achieved_ev_ratio, needed_components)
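# Illustrative aside (not part of the original module): because cum_ratios is
# non-decreasing, the scan above can equivalently be done with a single
# vectorized lookup. The function name is hypothetical and this is only a
# sketch of the same threshold search, assuming the threshold is actually
# reached by cum_ratios[-1].
def _example_min_components(cum_ratios, threshold):
    import numpy as np  # redundant here; the module already uses np

    # first index whose cumulative ratio reaches the threshold;
    # +1 converts the 0-based index into a component count
    needed_components = int(np.searchsorted(cum_ratios, threshold)) + 1
    achieved_ev_ratio = cum_ratios[needed_components - 1]
    return needed_components, achieved_ev_ratio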
def pca(args):
    workdir = args["workdir"]

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    n_pcs = args["n_pcs"]
    if n_pcs is None:
        print "Number of PCs must be specified for PCA"
        sys.exit(1)

    features = read_features(workdir)
    proj, explained_variance_ratios = features.svd(n_pcs)

    plt.clf()
    plt.plot(explained_variance_ratios, "b.-")
    plt.xlabel("PC", fontsize=16)
    plt.ylabel("Explained Variance Ratio", fontsize=16)
    plt.savefig(os.path.join(figures_dir, "pca_explained_variance_ratios.png"),
                dpi=200)

    # scatter plot for every pair of PCs
    for i in xrange(n_pcs - 1):
        for j in xrange(i + 1, n_pcs):
            plt.clf()
            plt.scatter(proj[:, i], proj[:, j])
            plt.xlabel("PC " + str(i), fontsize=16)
            plt.ylabel("PC " + str(j), fontsize=16)
            plt.savefig(os.path.join(figures_dir, "pca_%s_%s.png" % (i, j)),
                        dpi=200)
def snp_association_tests(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    n_iter = estimate_lr_iter(len(data_model.class_labels))

    # we set the intercept to the class ratios in the lr test function
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    for pc in args.components:
        flname = os.path.join(analysis_dir,
                              "snp_pc_%s_association_tests.tsv" % pc)
        with open(flname, "w") as fl:
            next_output = 1
            for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
                snp_label, feature_idx = pair
                chrom, pos = snp_label

                snp_features = data_model.feature_matrix[:, feature_idx]
                triplet = generate_training_set(snp_features,
                                                projections[:, pc - 1])
                n_copies, class_labels, imputed_projections = triplet
                imputed_projections = imputed_projections.reshape(-1, 1)

                # since we make multiple copies of the original samples,
                # we need to scale the log loss so that it is correct for
                # the original sample size
                try:
                    p_value = likelihood_ratio_test(imputed_projections,
                                                    class_labels,
                                                    lr,
                                                    g_scaling_factor=1.0 / n_copies)
                except ValueError:
                    # in case of underflow or overflow in a badly-behaving model
                    p_value = 1.0

                if i == next_output:
                    print i, "SNP", snp_label, "and PC", pc, "has p-value", p_value
                    next_output *= 2

                fl.write("\t".join([chrom, pos, str(p_value)]))
                fl.write("\n")
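# Illustrative sketch (not the repo's likelihood_ratio_test): one common way to
# compute a likelihood-ratio test for a logistic model, with a scaling factor
# applied to the G statistic to undo the inflation caused by duplicating
# samples. The name _example_likelihood_ratio_test is hypothetical; the actual
# helper used above may differ in its null model and intercept handling.
def _example_likelihood_ratio_test(features, class_labels, alt_model,
                                   g_scaling_factor=1.0):
    import numpy as np
    from scipy.stats import chi2
    from sklearn.metrics import log_loss

    n_samples = features.shape[0]

    # alternative model: logistic regression on the features
    alt_model.fit(features, class_labels)
    alt_nll = log_loss(class_labels,
                       alt_model.predict_proba(features),
                       normalize=False)

    # null model: intercept only, i.e. predict the observed class frequencies
    class_freqs = np.bincount(class_labels).astype(float) / n_samples
    null_probs = np.tile(class_freqs, (n_samples, 1))
    null_nll = log_loss(class_labels, null_probs, normalize=False)

    # G = 2 * (LL_alt - LL_null), scaled; df = number of slope parameters
    g_statistic = g_scaling_factor * 2.0 * (null_nll - alt_nll)
    return chi2.sf(g_statistic, df=features.shape[1])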
def train_model(args, sample_populations):
    workdir = args["workdir"]

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    n_trees = args["trees"]
    if n_trees is None:
        print "Number of trees must be specified for training"
        sys.exit(1)

    n_resamples = args["resamples"]
    if n_resamples is None:
        print "Number of additional samples must be specified for training"
        sys.exit(1)

    features = read_features(workdir)
    class_labels = make_labels(features.sample_labels, sample_populations)

    rf = ConstrainedBaggingRandomForest(n_trees,
                                        n_resamples,
                                        args["batch_size"])
    feature_importances, used_feature_counts, used_feature_sets = \
        rf.feature_importances(features.feature_matrix,
                               class_labels,
                               statistics=args["statistics"],
                               interactions=args["interactions"])
    snp_importances = features.rank_snps(feature_importances)
    write_rf_snps(workdir, snp_importances, n_trees, "model1")

    if args["statistics"]:
        dense = histogram_sparse_to_dense(used_feature_counts)
        flname = os.path.join(figures_dir,
                              "features_used_histogram_rf_%s_trees.png" % n_trees)
        plot_feature_histogram(flname, dense)

    if args["interactions"]:
        write_interactions(workdir, n_trees, used_feature_sets)

    # train a second, independent forest and write its ranking as "model2"
    rf = ConstrainedBaggingRandomForest(n_trees,
                                        n_resamples,
                                        args["batch_size"])
    feature_importances, _, _ = rf.feature_importances(features.feature_matrix,
                                                       class_labels,
                                                       statistics=False)
    snp_importances = features.rank_snps(feature_importances)
    write_rf_snps(workdir, snp_importances, n_trees, "model2")
def plot_projections(args):
    if len(args.pairs) % 2 != 0:
        print "Error: PCs must be provided in pairs"
        sys.exit(1)

    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]

    features = read_features(workdir)
    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    all_labels = set(features.class_labels)
    labels = np.array(features.class_labels, dtype=np.int32)

    populations = []
    for l in all_labels:
        pop = labels == l
        pop_name = project_summary.population_names[l]
        populations.append((pop, pop_name))

    for p1, p2 in pairwise(args.pairs):
        fig_flname = os.path.join(figures_dir,
                                  "pca_projection_%s_%s.png" % (str(p1), str(p2)))
        plt.clf()
        plt.grid(True)

        colors = ["m", "c", "k", "r", "g", "b"]
        markers = ["o"] * len(colors) + \
                  ["s"] * len(colors) + \
                  ["+"] * len(colors)

        for idx, (pop_idx, pop_name) in enumerate(populations):
            plt.scatter(projected[pop_idx, p1 - 1],
                        projected[pop_idx, p2 - 1],
                        color=colors[idx % len(colors)],
                        marker=markers[idx % len(markers)],
                        edgecolor="k",
                        alpha=0.7,
                        label=pop_name)

        plt.xlabel("Principal Component %s" % p1, fontsize=16)
        plt.ylabel("Principal Component %s" % p2, fontsize=16)
        if len(all_labels) > 1:
            plt.legend()

        plt.savefig(fig_flname, dpi=300)
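# Illustrative sketch (assumption): pairwise() above is assumed to group the
# flat list of PC numbers into consecutive, non-overlapping (p1, p2) pairs,
# e.g. [1, 2, 3, 4] -> (1, 2), (3, 4), which matches the check that
# len(args.pairs) is even. The helper name below is hypothetical; the repo's
# own pairwise() may be implemented differently.
def _example_pairwise(seq):
    it = iter(seq)
    # zip() pulls two items from the same iterator on each step,
    # yielding non-overlapping pairs
    return zip(it, it)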
def snp_linreg_association_tests(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    for pc in args.components:
        flname = os.path.join(analysis_dir,
                              "snp_pc_%s_linreg_assoc_tests.tsv" % pc)
        with open(flname, "w") as fl:
            next_output = 1
            for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
                snp_label, feature_idx = pair
                chrom, pos = snp_label

                snp_features = data_model.feature_matrix[:, feature_idx]

                results = snp_linreg_pvalues(snp_features,
                                             projections[:, pc - 1])
                snp_p_value, gt_ttest_pvalues, gt_normality_pvalues, gt_pred_ys = results

                if i == next_output:
                    print i, "SNP", snp_label, "and PC", pc, "has p-value", snp_p_value
                    next_output *= 2

                fl.write("\t".join([chrom, pos, str(snp_p_value)]))
                # three per-genotype columns: t-test p-value, normality p-value,
                # and predicted y
                for j in xrange(3):
                    fl.write("\t")
                    fl.write(str(gt_ttest_pvalues[j]))
                    fl.write("\t")
                    fl.write(str(gt_normality_pvalues[j]))
                    fl.write("\t")
                    fl.write(str(gt_pred_ys[j]))
                fl.write("\n")
def pop_association_tests(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    n_iter = estimate_lr_iter(len(data_model.sample_labels))

    # unlike the SNP tests, the intercept is fit directly here rather than
    # set to the class ratios (set_intercept=False below)
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=10 * n_iter,
                       fit_intercept=True)

    pvalues_fl = os.path.join(analysis_dir,
                              "population_pca_association_tests.tsv")
    class_labels = np.array(data_model.class_labels)
    with open(pvalues_fl, "w") as fl:
        for i in xrange(projections.shape[1]):
            features = projections[:, i].reshape(-1, 1)
            p_value = likelihood_ratio_test(features,
                                            class_labels,
                                            lr,
                                            set_intercept=False)

            lr.fit(features, class_labels)
            pred_labels = lr.predict(features)
            acc = 100. * accuracy_score(class_labels, pred_labels)
            cm = confusion_matrix(class_labels, pred_labels)

            print i + 1, p_value, acc
            print cm
            print

            fl.write("%s\t%s\t%s\n" % (i + 1, p_value, acc))
def train_model(args):
    workdir = args["workdir"]

    n_trees = args["trees"]
    if n_trees is None:
        print "Number of trees must be specified for training"
        sys.exit(1)

    n_resamples = args["resamples"]
    if n_resamples is None:
        print "Number of additional samples must be specified for training"
        sys.exit(1)

    features = read_features(workdir)

    snp_importances1 = features.snp_importances(n_trees, n_resamples).rank()
    snp_importances2 = features.snp_importances(n_trees, n_resamples).rank()

    write_snps(workdir, snp_importances1, "model1")
    write_snps(workdir, snp_importances2, "model2")
def train(args):
    workdir = args.workdir

    models_dir = os.path.join(workdir, "models")
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    features = read_features(workdir)

    if args.model_type == "PCA":
        pca = PCA(n_components=args.n_components, whiten=True)
    elif args.model_type == "NMF":
        pca = NMF(n_components=args.n_components)
    else:
        raise Exception("Unknown model type %s" % args.model_type)

    projections = pca.fit_transform(features.feature_matrix)

    model = {MODEL_KEY: pca,
             PROJECTION_KEY: projections}

    model_fl = os.path.join(models_dir, "pca.pkl")
    joblib.dump(model, model_fl)
if __name__ == "__main__":
    args = parseargs()

    if not os.path.exists(args.workdir):
        print "Work directory '%s' does not exist." % args.workdir
        sys.exit(1)

    stats_dir = os.path.join(args.workdir, "statistics")
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    project_summary = deserialize(os.path.join(args.workdir,
                                               PROJECT_SUMMARY_FLNAME))

    features = read_features(args.workdir)

    if args.mode == "pairwise":
        if project_summary.feature_encoding != "categories":
            print "Pairwise Cramer's V only works with the 'categories' feature encoding."
            sys.exit(1)

        set_one = None
        if args.subset_1:
            with open(args.subset_1) as fl:
                set_one = set()
                for ln in fl:
                    cols = ln.strip().split()
                    set_one.add((cols[0], cols[1]))

        set_two = None
def run_likelihood_ratio_tests(args):
    if not os.path.exists(args.workdir):
        print "Work directory '%s' does not exist." % args.workdir
        sys.exit(1)

    stats_dir = os.path.join(args.workdir, OUTPUT_DIR)
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    project_summary = deserialize(os.path.join(args.workdir,
                                               PROJECT_SUMMARY_FLNAME))

    data_model = read_features(args.workdir)
    genotypes = data_model.feature_matrix

    n_iter = estimate_lr_iter(len(data_model.class_labels))
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    testing_variables = np.array(data_model.class_labels).reshape(-1, 1)

    null_variables = None
    if args.variables_fl:
        selected_sample_ids, null_variables = parse_variables_file(args.variables_fl)
        selected_indices = select_samples(data_model, selected_sample_ids)

        # select subset and re-order
        genotypes = genotypes[selected_indices, :]
        testing_variables = testing_variables[selected_indices, :]

    N_COPIES = 3
    class_labels = None
    testing_features, null_features = prepare_model_variables(N_COPIES,
                                                              testing_variables,
                                                              null_variables)

    with open(os.path.join(stats_dir, OUTPUT_FLNAME), "w") as fl:
        next_output = 1
        for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
            pos_label, feature_idx = pair
            chrom, pos = pos_label

            pos_genotypes = genotypes[:, feature_idx]
            class_labels = prepare_class_labels(N_COPIES,
                                                pos_genotypes,
                                                class_labels)

            # since we make multiple copies of the original samples,
            # we need to scale the log loss so that it is correct for
            # the original sample size
            p_value = likelihood_ratio_test(testing_features,
                                            class_labels,
                                            lr,
                                            features_null=null_features,
                                            g_scaling_factor=1.0 / N_COPIES)

            if i == next_output:
                print i, "Position", pos_label, "has p-value", p_value
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]))
            fl.write("\n")
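# Illustrative demonstration (not part of the original module) of why the G
# statistic is scaled by 1 / N_COPIES: duplicating every sample N_COPIES times
# multiplies the total (unnormalized) log loss, and therefore the
# likelihood-ratio statistic, by exactly N_COPIES. All names and values below
# are hypothetical.
def _example_log_loss_inflation(n_copies=3):
    import numpy as np
    from sklearn.metrics import log_loss

    labels = np.array([0, 1, 1, 0, 1])
    probs = np.array([0.2, 0.7, 0.9, 0.4, 0.6])

    original = log_loss(labels, probs, normalize=False)
    copied = log_loss(np.tile(labels, n_copies),
                      np.tile(probs, n_copies),
                      normalize=False)

    # copied == n_copies * original (up to floating-point error), so dividing
    # by n_copies recovers the loss at the original sample size
    return original, copied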