Example #1
def output_cluster(inputfiles):
    """Use already trained model to output clustered data."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'clustered')
        data_dir = os.path.join(args.output_dir, 'clustered')
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.mkdir(data_dir)

        tf.reset_default_graph()
        saucie = SAUCIE(None, restore_folder=model_dir)

        print("Finding all binary codes")
        all_codes = {}
        for counter, f in enumerate(inputfiles):
            x = get_data(f)
            load = Loader(data=x, shuffle=False)

            acts = saucie.get_layer(load, 'layer_c')
            acts = acts / acts.max()
            binarized = np.where(acts > .000001, 1, 0)
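            # each distinct binary pattern ("code") in the clustering layer's
            # activations defines one cluster; codes are collected across every
            # input file so cluster IDs stay consistent between files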

            unique_rows, counts = np.unique(binarized, axis=0, return_counts=True)
            for unique_row in unique_rows:
                unique_row = tuple(unique_row.tolist())
                if unique_row not in all_codes:
                    all_codes[unique_row] = len(all_codes)

        print("Found {} clusters".format(len(all_codes)))

        print("Starting to output {} clustered files...".format(len(inputfiles)))
        for counter, f in enumerate(inputfiles):
            fname = os.path.split(f)[-1]
            print("Outputing file {}".format(counter))
            x = get_data(f)
            load = Loader(data=x, shuffle=False)
            acts = saucie.get_layer(load, 'layer_c')
            acts = acts / acts.max()
            binarized = np.where(acts > .000001, 1, 0)

            clusters = -1 * np.ones(x.shape[0])
            for code in all_codes:
                rows_equal_to_this_code = np.where(np.all(binarized == code, axis=1))[0]
                clusters[rows_equal_to_this_code] = all_codes[code]

            embeddings = saucie.get_layer(load, 'embeddings')

            rawdata = get_data(f, return_rawfile=True)
            outcols = rawdata.columns.tolist() + ['Cluster', 'Embedding_SAUCIE1', 'Embedding_SAUCIE2']
            rawdata = pd.concat([rawdata, pd.DataFrame(clusters), pd.DataFrame(embeddings[:, 0]), pd.DataFrame(embeddings[:, 1])], axis=1)
            outfile = os.path.join(data_dir, fname)
            fcswrite.write_fcs(outfile, outcols, rawdata)

    except Exception:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(data_dir)
        raise
Example #2
def output_batch_correction(rawfiles):
    """Use already trained models to output batch corrected data."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'batch_corrected')
        data_dir = os.path.join(args.output_dir, 'batch_corrected')
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.mkdir(data_dir)
        ref = rawfiles[0]
        refx = get_data(ref)
        refname = os.path.split(ref)[-1]

        print("Starting to output {} batch corrected files...".format(len(rawfiles)))
        for counter, nonref in enumerate(rawfiles[1:]):
            nonrefname = os.path.split(nonref)[-1]
            print("Outputing file {}".format(counter))

            nonrefx = get_data(nonref)
            alldata = np.concatenate([refx.values, nonrefx.values], axis=0)
            alllabels = np.concatenate([np.zeros(refx.shape[0]), np.ones(nonrefx.shape[0])], axis=0)
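            # labels mark the batch of origin: 0 = reference file, 1 = file being corrected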

            load = Loader(data=alldata, labels=alllabels, shuffle=False)

            tf.reset_default_graph()
            restore_folder = os.path.join(model_dir, nonrefname)
            saucie = SAUCIE(None, restore_folder=restore_folder)

            recon, labels = saucie.get_layer(load, 'output')

            # map the reconstruction back to the raw scale (the input is presumably
            # arcsinh-transformed inside get_data)
            recon = np.sinh(recon)

            # write out reference file
            if counter == 0:
                reconref = recon[labels == 0]
                rawdata = get_data(ref, return_rawfile=True)
                for ind, c in enumerate(args.cols):
                    rawdata.iloc[:, c] = reconref[:, ind]

                outfileref = os.path.join(data_dir, refname)
                fcswrite.write_fcs(outfileref, rawdata.columns.tolist(), rawdata)

            # write out nonreference file
            reconnonref = recon[labels == 1]
            rawdata = get_data(nonref, return_rawfile=True)
            for ind, c in enumerate(args.cols):
                rawdata.iloc[:, c] = reconnonref[:, ind]
            outfilenonref = os.path.join(data_dir, nonrefname)
            fcswrite.write_fcs(outfilenonref, rawdata.columns.tolist(), rawdata)

    except Exception:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(data_dir)
        raise
Example #3
def train_batch_correction(rawfiles):
    """Run batch correction on all files."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'batch_corrected')
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.mkdir(model_dir)
        ref = rawfiles[0]
        refx = get_data(ref)

        print("Starting to train {} batch correction models...".format(len(rawfiles[1:])))
        for counter, nonref in enumerate(rawfiles[1:]):
            nonrefname = os.path.split(nonref)[-1]
            print("Training model {}".format(counter))

            nonrefx = get_data(nonref)
            alldata = np.concatenate([refx.values, nonrefx.values], axis=0)
            alllabels = np.concatenate([np.zeros(refx.shape[0]), np.ones(nonrefx.shape[0])], axis=0)

            load = Loader(data=alldata, labels=alllabels, shuffle=True)

            tf.reset_default_graph()

            saucie = SAUCIE(input_dim=refx.shape[1], lambda_b=args.lambda_b)

            for i in range(args.num_iterations):
                saucie.train(load, steps=1000, batch_size=200)

            saucie.save(folder=os.path.join(model_dir, nonrefname))

    except Exception:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(model_dir)
        raise
Example #4
def train_cluster(inputfiles):
    """Run clustering on all files."""
    try:
        model_dir = os.path.join(args.output_dir, 'models', 'clustered')
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.mkdir(model_dir)

        tf.reset_default_graph()
        x = get_data(inputfiles[0], sample=2)
        saucie = SAUCIE(input_dim=x.shape[1], lambda_d=args.lambda_d, lambda_c=args.lambda_c)

        for i in range(args.num_iterations):
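            # each outer iteration draws a fresh subsample from every input file
            # and trains the shared model on the pooled data for another 1000 steps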
            alldata = []
            for f in inputfiles:
                x = get_data(f, sample=args.num_points_sample)
                alldata.append(x)
            alldata = np.concatenate(alldata, axis=0)

            load = Loader(data=alldata, shuffle=True)

            saucie.train(load, steps=1000, batch_size=400)

        saucie.save(folder=model_dir)

    except Exception:
        # if it didn't run all the way through, clean everything up and remove it
        shutil.rmtree(model_dir)
        raise
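Taken together, Examples 1-4 assume a module-level args object produced by argparse. A minimal driver is sketched below; the module name saucie_cli, the glob pattern, and all parameter values are assumptions for illustration, not part of the original scripts.

import argparse
import glob
import os

import saucie_cli  # hypothetical module holding the four helpers above

# the helpers read a module-level `args`; mirror the fields they use
saucie_cli.args = argparse.Namespace(
    output_dir='output',
    lambda_c=0.1, lambda_d=0.2, lambda_b=0.1,
    num_iterations=5,
    num_points_sample=10000,
    cols=list(range(10)),  # column indices to batch-correct
)
# the train_* helpers call os.mkdir on <output_dir>/models/..., so the parent must exist
os.makedirs(os.path.join('output', 'models'), exist_ok=True)

fcs_files = sorted(glob.glob('data/*.fcs'))    # illustrative input location
saucie_cli.train_cluster(fcs_files)            # fit the shared clustering model
saucie_cli.output_cluster(fcs_files)           # write clustered + embedded FCS files
saucie_cli.train_batch_correction(fcs_files)   # one model per non-reference file
saucie_cli.output_batch_correction(fcs_files)  # write batch-corrected FCS files

The train_* helpers delete and recreate their own model directories, so the driver only has to guarantee that the output and models directories exist before they run.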
Example #5
from model import SAUCIE
from loader import Loader
import numpy as np
import matplotlib.pyplot as plt

x = np.concatenate([
    np.random.uniform(-3, -2, (1000, 40)),
    np.random.uniform(2, 3, (1000, 40)),
], axis=0)
load = Loader(x, shuffle=False)

saucie = SAUCIE(x.shape[1], lambda_c=.2, lambda_d=.4)

saucie.train(load, 100)
embedding = saucie.get_embedding(load)
num_clusters, clusters = saucie.get_clusters(load)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(embedding[:, 0], embedding[:, 1], c=clusters)
fig.savefig('embedding_by_cluster.png')
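Examples 1 and 2 rebuild models with SAUCIE(None, restore_folder=...), and Examples 3 and 4 persist them with saucie.save(...). The same round trip can be appended to this toy script; the checkpoint folder name below is a placeholder.

import tensorflow as tf

# save the trained weights, then rebuild the model from the checkpoint
saucie.save(folder='saucie_toy_model')
tf.reset_default_graph()
restored = SAUCIE(None, restore_folder='saucie_toy_model')
embedding_restored = restored.get_embedding(load)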
Example #6
    "/Users/tkeyes/GitHub/classes/CS_230/course_project/data/saucie_data.csv",
    sep=",",
    header=0)
x = x.to_numpy(dtype='float64')

lambda_cs = [i / 10 for i in range(0, 1)]
lambda_ds = [i / 10 for i in range(0, 10)]

for lambda_c in lambda_cs:
    print("lambda_c = " + str(lambda_c))
    for lambda_d in lambda_ds:
        print("lambda_d = " + str(lambda_d))
        tf.reset_default_graph()
        # Construct and train SAUCIE model
        my_loader = Loader(x, shuffle=False)
        my_saucie = SAUCIE(x.shape[1], lambda_c=lambda_c, lambda_d=lambda_d)
        my_saucie.train(load=my_loader, steps=100, batch_size=3000)

        #extract features from SAUCIE
        embedding = my_saucie.get_embedding(my_loader)
        num_clusters, clusters = my_saucie.get_clusters(my_loader)
        reconstruction = my_saucie.get_reconstruction(my_loader)

        # save files
        output_frame = pd.DataFrame({
            "clusters": clusters,
            "embedding_1": embedding[:, 0],
            "embedding_2": embedding[:, 1]
        })
        output_frame.to_csv(
            "/Users/tkeyes/GitHub/classes/CS_230/course_project/data/saucie_output_"
Example #7
import errno
import os

import pandas as pd

from model import SAUCIE
from loader import Loader


def make_dir(dire_name):  # function name assumed; the original definition line is not in the snippet
    """Create a directory, ignoring the error if it already exists."""
    try:
        os.makedirs(dire_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


args = parse_args()
print("run with these parametres: %s" % str(args))

# Main Part

X = pd.read_csv(args.input, index_col=0)
data = X.values.T

saucie = SAUCIE(data.shape[1])
loadtrain = Loader(data, shuffle=True)
saucie.train(loadtrain, steps=args.steps)

loadeval = Loader(data, shuffle=False)
embedding = saucie.get_embedding(loadeval)
number_of_clusters, clusters = saucie.get_clusters(loadeval)
reconstruction = saucie.get_reconstruction(loadeval)

X_emb = pd.DataFrame(embedding, index=X.columns.values, columns=["D1", "D2"])
X_clusters = pd.DataFrame(clusters,
                          index=X.columns.values,
                          columns=["cluster"])
X_imp = pd.DataFrame(reconstruction.T,
                     index=X.index.values,
                     columns=X.columns.values)
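The script ends after assembling the three data frames; writing them out is a natural next step (the output paths below are placeholders, not taken from the original).

# persist the embedding, cluster assignments, and imputed expression matrix
X_emb.to_csv("saucie_embedding.csv")
X_clusters.to_csv("saucie_clusters.csv")
X_imp.to_csv("saucie_imputed.csv")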