# Names of the evaluation measures, in the order they are stored along
# the first axis of `accs`.
accs_names = ['Calinski-Harabaz', 'Silhouette (euc)', 'Silhouette (corr)', 'Silhouette (jacc)', 'ARI']
# One result slot per (measure, mixture parameter, target cluster count).
accs = np.zeros((len(accs_names), len(mixtures), len(num_cluster)))

for i, trg_k in enumerate(num_cluster):
    for j, mix in enumerate(mixtures):
        # The placeholders were previously printed verbatim because
        # .format() was never called on the message.
        print('Iteration k={0} mix={1}'.format(trg_k, mix))

        # --------------------------------------------------
        # 3.1. SETUP SOURCE DATA NMF CLUSTERING
        # --------------------------------------------------
        # A fresh source clustering object is configured for every
        # (k, mix) combination; it stays None when no source data is given.
        src_clustering = None
        if src_data is not None:
            src_clustering = NmfClustering(src_data, src_gene_ids, num_cluster=arguments.src_k)
            src_clustering.add_cell_filter(src_cell_filter_fun)
            src_clustering.add_gene_filter(src_gene_filter_fun)
            src_clustering.set_data_transformation(src_data_transf_fun)

        # --------------------------------------------------
        # 3.2. SETUP TARGET DATA CLUSTERING
        # --------------------------------------------------
        # Compare by value: `is 'NMF'` tested object identity, which is not
        # guaranteed to hold for strings parsed from the command line.
        if arguments.method == 'NMF' and src_data is not None:
            print('Transfer learning method is NMF.')
            trg_clustering = DaNmfClustering(src_clustering, trg_data, trg_gene_ids, num_cluster=trg_k)
            trg_clustering.add_cell_filter(trg_cell_filter_fun)
            trg_clustering.add_gene_filter(trg_gene_filter_fun)
            trg_clustering.set_data_transformation(trg_data_transf_fun)
            trg_clustering.apply(mix=mix,
                                 alpha=arguments.nmf_alpha,
                                 l1=arguments.nmf_l1,
                                 max_iter=arguments.nmf_max_iter,
                                 rel_err=arguments.nmf_rel_err)
# Parse the comma-separated cluster range into a concrete list of ints.
# A list (not a lazy `map` object) is required because the code below
# takes len() of it and iterates it; the builtin `int` replaces the
# removed `np.int` alias.
num_cluster = list(map(int, arguments.cluster_range.split(",")))

accs_names = ['KTA (linear)', 'ARI']
# One result slot per (measure, cluster count).
accs = np.zeros((len(accs_names), len(num_cluster)))

for i, k in enumerate(num_cluster):
    print('Iteration {0}, num-cluster={1}'.format(i, k))

    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    if labels is None:
        # No source labels are provided, generate them via NMF clustering.
        # NOTE(review): `labels` is overwritten here, so this branch runs at
        # most once; later iterations reuse the labels produced with the
        # first k -- confirm this is intended.
        nmf_labels = NmfClustering(data, gene_ids, num_cluster=k, labels=[])
        nmf_labels.add_cell_filter(cell_filter_fun)
        nmf_labels.add_gene_filter(gene_filter_fun)
        nmf_labels.set_data_transformation(data_transf_fun)
        nmf_labels.apply(k=k,
                         alpha=arguments.nmf_alpha,
                         l1=arguments.nmf_l1,
                         max_iter=arguments.nmf_max_iter,
                         rel_err=arguments.nmf_rel_err)
        labels = nmf_labels.cluster_labels

    # Use perfect number of latent states for nmf and sc3:
    # the number of distinct labels determines the cluster count.
    src_labels = np.array(labels, dtype=int)
    src_lbl_set = np.unique(src_labels)
    k_now = src_lbl_set.size

    nmf = NmfClustering_initW(data, gene_ids, labels=labels, num_cluster=k_now)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
    nmf.set_data_transformation(data_transf_fun)
# Cell and gene filter and transformation within the procedure cell_filter_fun = partial(sc.cell_filter, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold) gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold) data_transf_fun = sc.data_transformation_log2 # Generating labels from complete dataset print "Train complete data" complete_nmf = None complete_nmf = NmfClustering(data, np.arange(data.shape[0]), num_cluster=num_cluster) complete_nmf.add_cell_filter(cell_filter_fun) complete_nmf.add_gene_filter(gene_filter_fun) complete_nmf.set_data_transformation(data_transf_fun) complete_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err) # Get labels desc, target_nmf, trg_lbls_pred, mixed_data = method_sc3_filter( complete_nmf, data, [], cell_filter=cell_filter_fun, gene_filter=gene_filter_fun, transformation=data_transf_fun, mix=0.0,
# Pre-filter cells (columns) and genes (rows) once, then log2-transform.
cell_inds = sc.cell_filter(data, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold)
data = data[:, cell_inds]
# NOTE(review): a per-cell `labels` array, if one exists, is NOT subset with
# cell_inds here (that step was commented out) -- confirm this is intended.
gene_inds = sc.gene_filter(data, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold)
data = data[gene_inds, :]
data = sc.data_transformation_log2(data)

# The data is already preprocessed above, so the clustering objects get
# filters with fixed thresholds and no further transformation.
# (presumably these thresholds make the filters pass-through -- verify
# against sc.cell_filter / sc.gene_filter)
cell_filter_fun = partial(sc.cell_filter, num_expr_genes=0, non_zero_threshold=-1)
gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=1, non_zero_threshold=-1)
data_transf_fun = sc.no_data_transformation

# All messages use single-argument print(...) so the script parses on
# Python 3 while emitting the same text as the former Python 2 print
# statements (including the double space after each colon).
print("data dimensions after preprocessing: genes x cells:  {0}".format(data.shape))

# Generating labels from complete dataset
print("Train complete data")
complete_nmf = NmfClustering(data, np.arange(data.shape[0]), num_cluster=num_cluster)
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err)

# Get NMF labels
labels_NMF = complete_nmf.cluster_labels
label_names, label_counts = np.unique(labels_NMF, return_counts=True)
print("Labels NMF:  {0}".format(label_names))
print("Counts NMF:  {0}".format(label_counts))

# Get SC3 labels
desc, target_nmf, trg_lbls_pred, mixed_data = method_sc3_filter(
    complete_nmf, data, [],
    cell_filter=cell_filter_fun,
    gene_filter=gene_filter_fun,
    transformation=data_transf_fun,
    mix=0.0,
    metric='euclidean',
    use_da_dists=False,
    n_trg_cluster=num_cluster)
labels_SC3 = trg_lbls_pred
label_names, label_counts = np.unique(labels_SC3, return_counts=True)
print("Labels SC3:  {0}".format(label_names))
# Names of the evaluation measures, in the order they are stored along
# the first axis of `accs`.
accs_names = [
    'KTA (linear)',
    'Silhouette (euc)',
    'Silhouette (pearson)',
    'Silhouette (spearman)',
    'ARI',
]
# One result slot per (measure, cluster count).
accs = np.zeros((len(accs_names), len(num_cluster)))

for i, k in enumerate(num_cluster):
    # Second placeholder must be {1}; it previously repeated {0} and so
    # printed the iteration index instead of the cluster count.
    print('Iteration {0}, num-cluster={1}'.format(i, k))

    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    nmf = NmfClustering(data, gene_ids, num_cluster=k)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
    nmf.set_data_transformation(data_transf_fun)
    nmf.apply(k=k,
              alpha=arguments.nmf_alpha,
              l1=arguments.nmf_l1,
              max_iter=arguments.nmf_max_iter,
              rel_err=arguments.nmf_rel_err)

    # --------------------------------------------------
    # 3.2. EVALUATE CLUSTER ASSIGNMENT
    # --------------------------------------------------
    print('\nUnsupervised evaluation:')
    # Row 0 corresponds to accs_names[0], 'KTA (linear)'.
    accs[0, i] = unsupervised_acc_kta(nmf.pp_data, nmf.cluster_labels, kernel='linear')
data, labels, mode=1, target_ncells=n_trg, source_ncells=n_src[s]) src_labels = np.array(src_labels, dtype=np.int) #src_labels_SC3 = np.array(src_labels_SC3, dtype=np.int) # 3.c. train source once per repetition print "Train source data of rep {0}".format(r + 1) source_nmf = None source_nmf = NmfClustering(src, np.arange(src.shape[0]), num_cluster=num_cluster) source_nmf.add_cell_filter(cell_filter_fun) source_nmf.add_gene_filter(gene_filter_fun) source_nmf.set_data_transformation(data_transf_fun) source_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err) # Calculate ARIs and KTAs source_aris[s, r] = metrics.adjusted_rand_score( src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels) print 'SOURCE ARI Labels NMF, Method NMF = ', source_aris[s, r] r += 1