def output_cluster(inputfiles):
    """Use already trained model to output clustered data."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'clustered')
        data_dir = os.path.join(args.output_dir, 'clustered')
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.mkdir(data_dir)

        tf.reset_default_graph()
        saucie = SAUCIE(None, restore_folder=model_dir)

        print("Finding all binary codes")
        all_codes = {}
        for counter, f in enumerate(inputfiles):
            x = get_data(f)
            load = Loader(data=x, shuffle=False)

            acts = saucie.get_layer(load, 'layer_c')
            acts = acts / acts.max()
            binarized = np.where(acts > .000001, 1, 0)

            unique_rows, counts = np.unique(binarized, axis=0, return_counts=True)
            for unique_row in unique_rows:
                unique_row = tuple(unique_row.tolist())
                if unique_row not in all_codes:
                    all_codes[unique_row] = len(all_codes)

        print("Found {} clusters".format(len(all_codes)))

        print("Starting to output {} clustered files...".format(len(inputfiles)))
        for counter, f in enumerate(inputfiles):
            fname = os.path.split(f)[-1]
            print("Outputting file {}".format(counter))
            x = get_data(f)
            load = Loader(data=x, shuffle=False)

            acts = saucie.get_layer(load, 'layer_c')
            acts = acts / acts.max()
            binarized = np.where(acts > .000001, 1, 0)

            clusters = -1 * np.ones(x.shape[0])
            for code in all_codes:
                rows_equal_to_this_code = np.where(np.all(binarized == code, axis=1))[0]
                clusters[rows_equal_to_this_code] = all_codes[code]

            embeddings = saucie.get_layer(load, 'embeddings')

            rawdata = get_data(f, return_rawfile=True)
            outcols = rawdata.columns.tolist() + ['Cluster', 'Embedding_SAUCIE1', 'Embedding_SAUCIE2']
            rawdata = pd.concat([rawdata, pd.DataFrame(clusters), pd.DataFrame(embeddings[:, 0]),
                                 pd.DataFrame(embeddings[:, 1])], axis=1)

            outfile = os.path.join(data_dir, fname)
            fcswrite.write_fcs(outfile, outcols, rawdata)
    except Exception as ex:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(data_dir)
        raise ex
def output_batch_correction(rawfiles):
    """Use already trained models to output batch corrected data."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'batch_corrected')
        data_dir = os.path.join(args.output_dir, 'batch_corrected')
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.mkdir(data_dir)

        ref = rawfiles[0]
        refx = get_data(ref)
        refname = os.path.split(ref)[-1]

        print("Starting to output {} batch corrected files...".format(len(rawfiles)))
        for counter, nonref in enumerate(rawfiles[1:]):
            nonrefname = os.path.split(nonref)[-1]
            print("Outputting file {}".format(counter))
            nonrefx = get_data(nonref)

            alldata = np.concatenate([refx.values, nonrefx.values], axis=0)
            alllabels = np.concatenate([np.zeros(refx.shape[0]), np.ones(nonrefx.shape[0])], axis=0)

            load = Loader(data=alldata, labels=alllabels, shuffle=False)

            tf.reset_default_graph()
            restore_folder = os.path.join(model_dir, nonrefname)
            saucie = SAUCIE(None, restore_folder=restore_folder)

            recon, labels = saucie.get_layer(load, 'output')
            recon = sinh(recon)

            # write out reference file
            if counter == 0:
                reconref = recon[labels == 0]
                rawdata = get_data(ref, return_rawfile=True)
                for ind, c in enumerate(args.cols):
                    rawdata.iloc[:, c] = reconref[:, ind]
                outfileref = os.path.join(data_dir, refname)
                fcswrite.write_fcs(outfileref, rawdata.columns.tolist(), rawdata)

            # write out nonreference file
            reconnonref = recon[labels == 1]
            rawdata = get_data(nonref, return_rawfile=True)
            for ind, c in enumerate(args.cols):
                rawdata.iloc[:, c] = reconnonref[:, ind]
            outfilenonref = os.path.join(data_dir, nonrefname)
            fcswrite.write_fcs(outfilenonref, rawdata.columns.tolist(), rawdata)
    except Exception as ex:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(data_dir)
        raise ex
def train_batch_correction(rawfiles):
    """Run batch correction on all files."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'batch_corrected')
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.mkdir(model_dir)

        ref = rawfiles[0]
        refx = get_data(ref)

        print("Starting to train {} batch correction models...".format(len(rawfiles[1:])))
        for counter, nonref in enumerate(rawfiles[1:]):
            nonrefname = os.path.split(nonref)[-1]
            print("Training model {}".format(counter))
            nonrefx = get_data(nonref)

            alldata = np.concatenate([refx.values, nonrefx.values], axis=0)
            alllabels = np.concatenate([np.zeros(refx.shape[0]), np.ones(nonrefx.shape[0])], axis=0)

            load = Loader(data=alldata, labels=alllabels, shuffle=True)

            tf.reset_default_graph()
            saucie = SAUCIE(input_dim=refx.shape[1], lambda_b=args.lambda_b)

            for i in range(args.num_iterations):
                saucie.train(load, steps=1000, batch_size=200)

            saucie.save(folder=os.path.join(model_dir, nonrefname))
    except Exception as ex:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(model_dir)
        raise ex
def train_cluster(inputfiles):
    """Run clustering on all files."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'clustered')
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.mkdir(model_dir)

        tf.reset_default_graph()
        x = get_data(inputfiles[0], sample=2)
        saucie = SAUCIE(input_dim=x.shape[1], lambda_d=args.lambda_d, lambda_c=args.lambda_c)

        for i in range(args.num_iterations):
            alldata = []
            for f in inputfiles:
                x = get_data(f, sample=args.num_points_sample)
                alldata.append(x)
            alldata = np.concatenate(alldata, axis=0)

            load = Loader(data=alldata, shuffle=True)
            saucie.train(load, steps=1000, batch_size=400)

        saucie.save(folder=model_dir)
    except Exception as ex:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(model_dir)
        raise ex
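The four functions above form a train-then-output pipeline driven by a module-level `args` namespace. A minimal sketch of how a driver might wire them together; the `--mode` flag, `args.input_files`, and the `parse_args()` call are assumptions for illustration, not the script's actual CLI:

# Hypothetical driver: the real script's argument names and dispatch may differ.
if __name__ == '__main__':
    args = parse_args()                 # assumed to populate output_dir, lambda_b, lambda_c, lambda_d, ...
    inputfiles = args.input_files       # assumed list of input FCS files

    if args.mode == 'cluster':          # assumed flag
        train_cluster(inputfiles)
        output_cluster(inputfiles)
    elif args.mode == 'batch_correct':  # assumed flag
        train_batch_correction(inputfiles)
        output_batch_correction(inputfiles)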
from model import SAUCIE
from loader import Loader
import numpy as np
import matplotlib.pyplot as plt

x = np.concatenate([
    np.random.uniform(-3, -2, (1000, 40)),
    np.random.uniform(2, 3, (1000, 40))
], axis=0)

load = Loader(x, shuffle=False)
saucie = SAUCIE(x.shape[1], lambda_c=.2, lambda_d=.4)
saucie.train(load, 100)

embedding = saucie.get_embedding(load)
num_clusters, clusters = saucie.get_clusters(load)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(embedding[:, 0], embedding[:, 1], c=clusters)
fig.savefig('embedding_by_cluster.png')
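The toy script above exercises the clustering regularizations (lambda_c, lambda_d). A companion sketch of the batch-correction usage, assuming the same `SAUCIE`/`Loader` API, mirroring how `train_batch_correction` and `output_batch_correction` above pass batch labels to the loader and read the `'output'` layer; the synthetic two-batch data and the lambda_b value are made up for illustration:

from model import SAUCIE
from loader import Loader
import numpy as np

# two synthetic "batches" of the same population, offset from each other
batch1 = np.random.normal(0.0, 1, (1000, 40))
batch2 = np.random.normal(0.5, 1, (1000, 40))
data = np.concatenate([batch1, batch2], axis=0)
labels = np.concatenate([np.zeros(1000), np.ones(1000)])

load = Loader(data, labels=labels, shuffle=True)
saucie = SAUCIE(data.shape[1], lambda_b=0.1)   # lambda_b drives batch correction
saucie.train(load, 100)

# evaluate without shuffling so rows stay aligned with the input order
load_eval = Loader(data, labels=labels, shuffle=False)
recon, labels_out = saucie.get_layer(load_eval, 'output')  # as in output_batch_correction above
corrected_batch2 = recon[labels_out == 1]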
"/Users/tkeyes/GitHub/classes/CS_230/course_project/data/saucie_data.csv", sep=",", header=0) x = x.to_numpy(dtype='float64') lambda_cs = [i / 10 for i in range(0, 1)] lambda_ds = [i / 10 for i in range(0, 10)] for lambda_c in lambda_cs: print("lambda_c = " + str(lambda_c)) for lambda_d in lambda_ds: print("lambda_d = " + str(lambda_d)) tf.reset_default_graph() # Construct and train SAUCIE model my_loader = Loader(x, shuffle=False) my_saucie = SAUCIE(x.shape[1], lambda_c=0.01, lambda_d=lambda_d) my_saucie.train(load=my_loader, steps=100, batch_size=3000) #extract features from SAUCIE embedding = my_saucie.get_embedding(my_loader) num_clusters, clusters = my_saucie.get_clusters(my_loader) reconstruction = my_saucie.get_reconstruction(my_loader) # save files output_frame = pd.DataFrame({ "clusters": clusters, "embedding_1": embedding[:, 0], "embedding_2": embedding[:, 1] }) output_frame.to_csv( "/Users/tkeyes/GitHub/classes/CS_230/course_project/data/saucie_output_"
try:
    os.makedirs(dire_name)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise e

args = parse_args()
print("run with these parameters: %s" % str(args))

# Main Part
X = pd.read_csv(args.input, index_col=0)
data = X.values.T

saucie = SAUCIE(data.shape[1])
loadtrain = Loader(data, shuffle=True)
saucie.train(loadtrain, steps=args.steps)

loadeval = Loader(data, shuffle=False)
embedding = saucie.get_embedding(loadeval)
number_of_clusters, clusters = saucie.get_clusters(loadeval)
reconstruction = saucie.get_reconstruction(loadeval)

X_emb = pd.DataFrame(embedding, index=X.columns.values, columns=["D1", "D2"])
X_clusters = pd.DataFrame(clusters, index=X.columns.values, columns=["cluster"])
X_imp = pd.DataFrame(reconstruction.T, index=X.index.values, columns=X.columns.values)
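The fragment above stops after assembling the result frames; presumably they are written out next. A minimal sketch of that step, assuming the directory created at the top (`dire_name`) is the intended destination; the file names here are made up for illustration:

# Hypothetical output step: destination directory and file names are assumptions.
X_emb.to_csv(os.path.join(dire_name, "embedding.csv"))
X_clusters.to_csv(os.path.join(dire_name, "clusters.csv"))
X_imp.to_csv(os.path.join(dire_name, "imputed.csv"))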