def beta_dist(data_sets, kmer_size, n_factor):
    """Load k-mer counts per data set, L1-normalize, optionally NMF-reduce,
    and return shuffled features/labels.

    Parameters
    ----------
    data_sets : iterable of single-element sequences, each wrapping one
        data-set name (the loop unwraps ``data_set[0]``).
    kmer_size : int
        k-mer length forwarded to the loader.
    n_factor : int
        Number of NMF components; 0 skips factorization entirely.

    Returns
    -------
    (x, y) : features and int labels of the LAST data set processed
        (each loop iteration overwrites x/y — preserved from the original).
    """
    for data_set in data_sets:
        data_set = data_set[0]
        # Retrieve diseased data and labels ('0' = healthy, '1' = diseased,
        # judging by the allowed_labels filter passed to the loader).
        allowed_labels = ['0', '1']
        kmer_cnts, accessions, labelz, domain_labels = load_kmer_cnts_jf.load_kmers(
            kmer_size, data_set, allowed_labels)
        # BUG FIX: data_set was already unwrapped above, so data_set[0]
        # printed only the first character of the name.
        print("LOADED DATASET " + str(data_set) + ": " +
              str(len(kmer_cnts)) + " SAMPLES")
        labelz = np.asarray(labelz)
        # BUG FIX: np.int was removed in NumPy 1.24; builtin int is the
        # documented replacement and yields the same default integer dtype.
        labelz = labelz.astype(int)
        # L1-normalize each sample's counts, then (optionally) reduce
        # dimensionality with NMF. The two original branches duplicated
        # the normalize/shuffle logic; only the NMF step differs.
        data_normalized = normalize(kmer_cnts, axis=1, norm='l1')
        if n_factor != 0:
            data_normalized = stats_utils_AEB.NMF_factor(
                data_normalized, kmer_size, n_components=int(n_factor),
                title=(str(data_set) + str(kmer_size) + "mers" +
                       str(n_factor) + "factors"))
        # Fixed random_state keeps the shuffle reproducible across runs.
        x, y = shuffle(data_normalized, labelz, random_state=0)
    return x, y
'w', newline='') as csvfile: fieldnames = [ 'dataset', 'kmer_size', 'n_splits', 'n_repeats', 'acc', 'auc', 'precision', 'recall', 'f1', 'model', 'NMF_factors', 'params' ] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() # Loop over all data sets for data_set in data_sets_to_use: data_set = data_set[0] # Retrieve diseased data and labels allowed_labels = ['0', '1'] kmer_cnts, accessions, labelz, domain_labels = load_kmer_cnts_jf.load_kmers( kmer_size, data_set, allowed_labels) print("LOADED DATASET " + str(data_set[0]) + ": " + str(len(kmer_cnts)) + " SAMPLES") labelz = np.asarray(labelz) labelz = labelz.astype(np.int) # Conduct NMF and resave to data_normalized #enter in desired no.factors for n in [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: if n == 0: data_normalized = normalize(kmer_cnts, axis=1, norm='l1') data_normalized, labels = shuffle(data_normalized, labelz, random_state=0) x = data_normalized y = labels
# Keras backend identifier (K is imported earlier in the file).
backend = K.backend()

import load_kmer_cnts_jf
import deep_learning_models

#################
# Load the data #
#################
kmer_size = 7

# Full list of available studies, kept for quick switching:
# data_sets_healthy = ['HMP', 'Qin_et_al', 'RA', 'MetaHIT', 'Feng',
#                      'Karlsson_2013', 'LiverCirrhosis', 'Zeller_2014']
data_sets_healthy = ['MetaHIT']
allowed_labels = ['0']  # '0' selects healthy samples
kmer_cnts_healthy, accessions_healthy, labels_healthy, domain_labels = \
    load_kmer_cnts_jf.load_kmers(kmer_size, data_sets_healthy, allowed_labels)

data_sets_diseased = ['MetaHIT']
allowed_labels = ['1']  # '1' selects diseased samples
kmer_cnts_diseased, accessions_diseased, labels_diseased, domain_labels = \
    load_kmer_cnts_jf.load_kmers(kmer_size, data_sets_diseased, allowed_labels)

# Stack healthy and diseased samples into single arrays.
kmer_cnts = np.concatenate((kmer_cnts_healthy, kmer_cnts_diseased))
accessions = np.concatenate((accessions_healthy, accessions_diseased))
labels = np.concatenate((labels_healthy, labels_diseased))
labels = np.asarray(labels)
# BUG FIX: np.int was removed in NumPy 1.24; builtin int is the documented
# replacement and yields the same default integer dtype.
labels = labels.astype(int)

# Index tuples of healthy (label 0) vs. diseased (label 1) samples.
healthy = np.where(labels == 0)
disease = np.where(labels == 1)
#################
# Load the data #
#################
# k-mer length; alternative sizes kept commented for quick switching.
#kmer_size=3
kmer_size = 5
#kmer_size=10

# Data-set selection; alternatives kept commented for quick switching.
#data_set='Qin_et_al'
#data_set='RA'
#data_set='MetaHIT'
data_set = 'HMP'
data_sets = ['HMP']

kmer_cnts, accessions, labels = load_kmer_cnts_jf.load_kmers(kmer_size, data_sets)

# Labels arrive as strings here; index the samples by class before any cast.
labels = np.asarray(labels)
healthy = np.where(labels == '0')
disease = np.where(labels == '1')

# Row-wise L1 normalization turns raw counts into per-sample frequencies.
data = pd.DataFrame(kmer_cnts)
data_normalized = normalize(data, axis=1, norm='l1')

################################
# set up a model (autoencoder)
################################
input_dim = len(data_normalized[0])  # this is the number of input kmers
encoding_dim = 10