def fit_transform(self, data):
        # Train scScope; self.k is the latent dimensionality of the embedding.
        self.model = scscope.train(data,
                                   self.k,
                                   use_mask=True,
                                   batch_size=128,
                                   max_epoch=500,
                                   epoch_per_check=10,
                                   T=2,
                                   exp_batch_idx_input=[],
                                   encoder_layers=[1000, 500, 200, 500],
                                   decoder_layers=[200, 500, 1000],
                                   learning_rate=0.001,
                                   beta1=0.5,
                                   num_gpus=1)

        # predict returns (latent, imputed, batch_effect); keep only the embedding.
        embedding, _, _ = scscope.predict(data, self.model, batch_effect=[])
        return embedding
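For orientation, scscope.train fits the model and scscope.predict returns a (latent, imputed, batch_effect) triple, of which this wrapper keeps only the latent embedding. A minimal end-to-end sketch of the same two calls on synthetic data (shapes and hyperparameters here are illustrative assumptions, not part of the original):

import numpy as np
import scscope

# cells x genes matrix, assumed library-size normalized beforehand
data = np.random.rand(256, 1000).astype(np.float32)

model = scscope.train(data, 50, T=2, batch_size=128, max_epoch=10, num_gpus=1)
embedding, imputed, _ = scscope.predict(data, model, batch_effect=[])
print(embedding.shape)  # (256, 50)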
Example #2
# Assumed imports for this snippet:
import time
import Cell_BLAST as cb
import scscope as DeepImpute
import utils  # project-local helpers providing clean_dataset


def main(cmd_args):
    dataset = cb.data.ExprDataSet.read_dataset(
        cmd_args.input, sparsify=True
    ).normalize()
    if cmd_args.clean is not None:
        dataset = utils.clean_dataset(dataset, cmd_args.clean)
    if cmd_args.genes is not None:
        dataset = dataset[:, dataset.uns[cmd_args.genes]]
    dataset = dataset.exprs.log1p().toarray()
    start_time = time.time()
    model = DeepImpute.train(
        dataset, cmd_args.n_latent,
        max_epoch=cmd_args.n_epochs, random_seed=cmd_args.seed
    )
    latent, _imputed_val, _batch_effect = DeepImpute.predict(dataset, model)
    cb.data.write_hybrid_path(
        time.time() - start_time,
        "//".join([cmd_args.output, "time"])
    )
    cb.data.write_hybrid_path(
        latent,
        "//".join([cmd_args.output, "latent"])
    )
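The "file//object" strings are hybrid paths: the part before "//" names an HDF5 file and the part after names the object inside it. A read-back sketch, assuming Cell_BLAST's matching reader read_hybrid_path and an output argument of output.h5:

import Cell_BLAST as cb

elapsed = cb.data.read_hybrid_path("output.h5//time")
latent = cb.data.read_hybrid_path("output.h5//latent")
print(elapsed, latent.shape)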
Example #3
File: demo.py  Project: zorrodong/scScope
# Imports used by this demo:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import phenograph
import scscope as DeepImpute
from sklearn.manifold import TSNE
from sklearn.metrics.cluster import adjusted_rand_score


def RUN_MAIN():

    # 1. Load gene expression matrix of simulated data
    # gene expression with simulated dropouts
    counts_drop = pd.read_csv('counts_1.csv', header=0, index_col=0)
    # ground truth subpopulation assignment
    cellinfo = pd.read_csv('cellinfo_1.csv', header=0, index_col=0)

    group = cellinfo.Group
    label_ground_truth = []
    for g in group:
        g = int(g.split('Group')[1])
        label_ground_truth.append(g)

    # 2. Normalize gene expression based on scanpy (normalize each cell to have same library size)
    # matrix of cells x genes
    gene_expression = sc.AnnData(counts_drop.values)
    # normalize each cell to have same count number
    sc.pp.normalize_per_cell(gene_expression)
    # update datastructure to use normalized data
    gene_expression = gene_expression.X

    latent_dim = 50

    # 3. scScope learning
    if gene_expression.shape[0] >= 100000:
        DI_model = DeepImpute.train(gene_expression,
                                    latent_dim,
                                    T=2,
                                    batch_size=512,
                                    max_epoch=10,
                                    num_gpus=4)
    else:
        DI_model = DeepImpute.train(gene_expression,
                                    latent_dim,
                                    T=2,
                                    batch_size=64,
                                    max_epoch=300,
                                    num_gpus=4)

    # 4. latent representations and imputed expressions
    latent_code, imputed_val, _ = DeepImpute.predict(gene_expression, DI_model)

    # 5. graph clustering
    if latent_code.shape[0] <= 10000:
        label, _, _ = phenograph.cluster(latent_code)
    else:
        label = DeepImpute.scalable_cluster(latent_code)

    # evaluate
    ARI = adjusted_rand_score(label, label_ground_truth)
    print(ARI)
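    # Side note (not in the original demo): adjusted_rand_score is symmetric and
    # invariant to relabeling, so the argument order above does not change the
    # score; e.g. adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]) == 1.0.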

    X_embedded = TSNE(n_components=2).fit_transform(latent_code)

    # visualization of the subpopulation using tSNE
    plt.figure()
    # color each detected cluster (the simulation contains 5 groups)
    for i in np.unique(label):
        idx = np.nonzero(label == i)[0]
        plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1])
    plt.show()
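To run the demo standalone (counts_1.csv and cellinfo_1.csv are the simulated data shipped with the scScope repository), add the usual entry point:

if __name__ == '__main__':
    RUN_MAIN()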
Example #4
# Assumed context: input_bc_gene_mat (a cells x genes array), cell_id, gene_name,
# gene_filter, FLAGS, output_dir, output_feature_h5 and output_raw_h5 are defined
# earlier in the source file.
import h5py
import numpy as np
import scanpy as sc
import scscope as DeepImpute

# per-cell library sizes of the raw counts (summed over genes, shape cells x 1),
# kept so the normalization can be undone after imputation
raw_library_size = input_bc_gene_mat.sum(1, keepdims=True)
filt_adata = sc.AnnData(input_bc_gene_mat)
# 2. Normalize gene expression based on scanpy (normalize each cell to have same library size)
# normalize each cell to have same count number
sc.pp.normalize_per_cell(filt_adata)
# update datastructure to use normalized data
gene_expression_norm = filt_adata.X
latent_dim = 50
# 3. scScope learning
DI_model = DeepImpute.train(gene_expression_norm,
                            latent_dim,
                            num_gpus=FLAGS["num_gpus"],
                            max_epoch=FLAGS["epoch"])

# 4. latent representations and imputed expressions
latent_code, imputed_val, predicted_batch_effect = DeepImpute.predict(
    gene_expression_norm, DI_model)
with h5py.File("{}/{}".format(output_dir, output_feature_h5), "w") as f:
    f["cell_id"] = cell_id.astype(h5py.special_dtype(vlen=str))
    ff_dset_feature = f.create_dataset("feature",
                                       shape=(cell_id.size,
                                              latent_code.shape[1]),
                                       chunks=(1, latent_code.shape[1]),
                                       dtype=np.float32)
    ff_dset_feature[...] = latent_code

# rescale imputed profiles from normalized depth back to each cell's raw depth
imputed_val_resume = imputed_val * raw_library_size / filt_adata.X.sum(
    1, keepdims=True)
with h5py.File("{}/{}".format(output_dir, output_raw_h5), "w") as f:
    f["cell_id"] = cell_id.astype(h5py.special_dtype(vlen=str))
    f["gene_name"] = gene_name[gene_filter].astype(
        h5py.special_dtype(vlen=str))
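The snippet is truncated here; presumably the rescaled matrix imputed_val_resume is written into the same file. A sketch of such a write, with the dataset name "imputation" being an assumption rather than part of the original:

    # Hypothetical continuation inside the same `with` block:
    dset = f.create_dataset("imputation",
                            shape=imputed_val_resume.shape,
                            chunks=(1, imputed_val_resume.shape[1]),
                            dtype=np.float32)
    dset[...] = imputed_val_resume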
Example #5
    import time
    import argparse

    # Assumed imports (as in the earlier examples):
    import numpy as np
    import pandas as pd
    import scanpy as sc
    import scscope as DeepImpute
    parser = argparse.ArgumentParser()
    parser.add_argument('--counts_file', metavar='counts_file', default=None, required=True, help='counts_file file')
    parser.add_argument('--num_pc', metavar='N', type=int, default=None, help='num_pc file')
    parser.add_argument('--out', metavar='out', required=True, help='output file')
    args = parser.parse_args()

    # 1. Load gene expression matrix of simulated data
    counts_drop = pd.read_csv(args.counts_file, header=0, index_col=0)
    # 2. Normalize gene expression based on scanpy (normalize each cell to have same library size)
    # matrix of cells x genes
    start_time = time.time()
    gene_expression = sc.AnnData(counts_drop.values)
    # normalize each cell to have same count number
    sc.pp.normalize_per_cell(gene_expression)
    # update datastructure to use normalized data
    gene_expression = gene_expression.X

    # 3. scScope learning
    if gene_expression.shape[0] >= 100000:
        DI_model = DeepImpute.train(gene_expression, args.num_pc, T=2, batch_size=512, max_epoch=10, num_gpus=4)
    else:
        DI_model = DeepImpute.train(gene_expression, args.num_pc, T=2, batch_size=64, max_epoch=300, num_gpus=4)

    # 4. latent representations and imputed expressions
    latent_code, imputed_val, _ = DeepImpute.predict(gene_expression, DI_model)
    elapsed_time = time.time() - start_time
    print('scScope run time: %.1f s' % elapsed_time)

    np.savetxt(args.out, latent_code, fmt='%.4f')
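A typical invocation (script name assumed; counts_1.csv matches the demo data from Example #3) would be: python run_scscope.py --counts_file counts_1.csv --num_pc 50 --out latent.txt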
Example #6
# Assumed context: args (parsed CLI options) and make_sure_dir_exists are
# defined elsewhere in the source script.
import os

import pandas as pd
import scscope as DeepImpute

X = pd.read_csv(args.input, index_col=0)
X = X.transpose()  # to cells x genes, as scScope expects

DI_model = DeepImpute.train(X.values,
                            args.latent_code_dim,
                            use_mask=not args.no_mask,
                            batch_size=args.batch_size,
                            max_epoch=args.n_epochs,
                            epoch_per_check=args.n_epochs + 1,  # > max_epoch, so intermediate checks never run
                            T=args.T,
                            exp_batch_idx_input=[],
                            encoder_layers=[],
                            decoder_layers=[],
                            learning_rate=args.lr,
                            beta1=args.beta1,
                            num_gpus=1)

latent_code, imputed_val, _ = DeepImpute.predict(X.values, DI_model)

make_sure_dir_exists(args.outputdir)
filename_latent = os.path.join(args.outputdir, "latent.csv")
filename_imputation = os.path.join(args.outputdir, "imputed_values.csv")

pd.DataFrame(latent_code.T, columns=X.index.values).to_csv(filename_latent)
pd.DataFrame(imputed_val.T, columns=X.index.values,
             index=X.columns.values).to_csv(filename_imputation)

print("Done!")