Example #1
def test_solver():
    # Testing exact vs approximate solver
    magic_op = magic.MAGIC(t="auto",
                           decay=20,
                           knn=10,
                           solver="exact",
                           verbose=False,
                           random_state=42)
    data_imputed_exact = magic_op.fit_transform(scdata_norm)
    # should have exactly as many genes stored
    assert magic_op.X_magic.shape[1] == scdata_norm.shape[1]
    # should be non-negative
    assert np.all(data_imputed_exact >= 0)

    magic_op = magic.MAGIC(
        t="auto",
        decay=20,
        knn=10,
        n_pca=150,
        solver="approximate",
        verbose=False,
        random_state=42,
    )
    # magic_op.set_params(solver='approximate')
    data_imputed_apprx = magic_op.fit_transform(scdata_norm)
    # should have n_pca genes stored
    assert magic_op.X_magic.shape[1] == 150
    # make sure they're close-ish
    np.testing.assert_allclose(data_imputed_apprx,
                               data_imputed_exact,
                               atol=0.15)
    # make sure they're not identical
    assert np.any(data_imputed_apprx != data_imputed_exact)
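These tests rely on a module-level fixture `scdata_norm` that the snippet does not show. A minimal sketch of how such a fixture is typically built, mirroring the preprocessing used in Example #15 (the CSV path is a placeholder):

import scprep

scdata = scprep.io.load_csv("../data/test_data.csv")  # placeholder path
scdata = scprep.filter.remove_empty_cells(scdata)
scdata = scprep.filter.remove_empty_genes(scdata)
scdata_norm = scprep.normalize.library_size_normalize(scdata)
scdata_norm = scprep.transform.sqrt(scdata_norm)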
Example #2
def test_dremi():
    magic_op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False)
    # test DREMI: need numerical precision here
    magic_op.set_params(random_state=42)
    magic_op.fit(scdata_norm)
    dremi = magic_op.knnDREMI("VIM", "ZEB1", plot=True)
    np.testing.assert_allclose(dremi, 1.466004, atol=0.0000005)
Example #3
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="File to write output H5 file")
    (options, args) = parser.parse_args()

    dataset_f = args[0]
    out_f = options.out_file

    with h5py.File(dataset_f, 'r') as in_f:
        print('Loading expression matrix from {}...'.format(dataset_f))
        X = in_f['expression'][:]
        print('done.')
        print('Running MAGIC...')
        magic_operator = magic.MAGIC()
        magic_X = magic_operator.fit_transform(X)
        print('done.')
        print('Writing results to {}...'.format(out_f))
        with h5py.File(out_f, 'w') as out:
            out.create_dataset('expression',
                               data=magic_X,
                               compression="gzip")
            # Copy other datasets to new H5 file
            for k in in_f.keys():
                if k != 'expression':
                    out.create_dataset(k, data=in_f[k][:])
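A hypothetical smoke test for the script above, assuming only that the input H5 file carries the 'expression' dataset main() expects; all file names are placeholders:

import h5py
import numpy as np

# build a tiny toy input file
with h5py.File("toy_input.h5", "w") as f:
    f.create_dataset("expression",
                     data=np.random.poisson(1.0, size=(50, 20)).astype(float))
# then run: python this_script.py -o toy_output.h5 toy_input.h5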
Example #4
def test_all_genes():
    magic_op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False, random_state=42)
    int_gene_magic = magic_op.fit_transform(scdata_norm, genes=[-2, -1])
    magic_all_genes = magic_op.fit_transform(scdata_norm, genes="all_genes")
    assert scdata_norm.shape == magic_all_genes.shape
    int_gene_magic2 = magic_op.transform(scdata_norm, genes=[-2, -1])
    np.testing.assert_allclose(int_gene_magic, int_gene_magic2, rtol=0.015)
Example #5
def test_genes_str_int():
    magic_op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False)
    str_gene_magic = magic_op.fit_transform(scdata_norm, genes=["VIM", "ZEB1"])
    int_gene_magic = magic_op.fit_transform(scdata_norm,
                                            graph=magic_op.graph,
                                            genes=[-2, -1])
    assert str_gene_magic.shape[0] == scdata_norm.shape[0]
    np.testing.assert_array_equal(str_gene_magic, int_gene_magic)
Example #6
def magic_process(matrix):
    magic_op = magic.MAGIC(knn=10)
    magiced = magic_op.fit_transform(matrix, genes="all_genes")

    print("after MAGIC:", magiced.shape,
          sum(magiced[magiced == 0].count(axis=1)) / sum(magiced.count()))
    print(magiced.head())
    return magiced, magic_op
Example #7
def magic_impute(adata, knn=5, t=2, verbose=0, **kwargs):
    logg.info(
        "To be used carefully. Magic has not yet been tested for this application."
    )
    import magic

    magic_operator = magic.MAGIC(verbose=verbose, knn=knn, t=t, **kwargs)
    adata.layers["Ms"] = magic_operator.fit_transform(adata.layers["spliced"])
    adata.layers["Mu"] = magic_operator.transform(adata.layers["unspliced"])
Example #8
def main(data_path, n_rows):
    data = Data(data_path, n_rows)
    data.load_data()
    magic_operator = magic.MAGIC()
    X_magic = magic_operator.fit_transform(data.dataframe)
    output_file = 'magic_data_from_{}_{}_rows.npy'.format(
        data_path.replace('.', '').replace('/', ''), n_rows)
    np.save(output_file, X_magic)
    print('data saved in', output_file)
Example #9
def test_anndata():
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    scdata = anndata.read_csv("../data/test_data.csv")
    fast_magic_operator = magic.MAGIC(t='auto', a=None, k=10)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes="all_genes")
    assert np.all(sc_magic.var_names == scdata.var_names)
    assert np.all(sc_magic.obs_names == scdata.obs_names)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes=['VIM', 'ZEB1'])
    assert np.all(sc_magic.var_names.values == np.array(['VIM', 'ZEB1']))
    assert np.all(sc_magic.obs_names == scdata.obs_names)
Example #10
def run_MAGIC(train_adata, test_adata):
    import magic

    train_magic_op = magic.MAGIC()
    if scipy.sparse.issparse(train_adata.X):
        x_train = train_adata.X.toarray()
    else:
        x_train = train_adata.X
    train_emt_magic = train_magic_op.fit_transform(x_train, genes='all_genes')
    train_adata.X = train_emt_magic
    ## standardize the input
    sc.pp.scale(train_adata, zero_center=True, max_value=6)

    test_magic_op = magic.MAGIC()
    if scipy.sparse.issparse(test_adata.X):
        x_test = test_adata.X.toarray()
    else:
        x_test = test_adata.X
    test_emt_magic = test_magic_op.fit_transform(x_test, genes='all_genes')
    test_adata.X = test_emt_magic
    ## standardize the input
    sc.pp.scale(test_adata, zero_center=True, max_value=6)

    return train_adata, test_adata
Example #11
def MAGIC(data):
    """"Adaptor method to call MAGIC to impute
		For this manuscript, we used default parameters
		to call MAGIC develope dby David van Dijk, et al., 2017.

	Parameter:
	---------
	data: data frame, data to be imputed

	Return:
	------
	Imputed gene expression as data frame.
	"""
    magic_operator = magic.MAGIC(verbose=0)
    return magic_operator.fit_transform(data)
Example #12
def test_anndata():
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    scdata = anndata.read_csv(data_path)
    fast_magic_operator = magic.MAGIC(
        t="auto", solver="approximate", decay=None, knn=10, verbose=False
    )
    sc_magic = fast_magic_operator.fit_transform(scdata, genes="all_genes")
    assert np.all(sc_magic.var_names == scdata.var_names)
    assert np.all(sc_magic.obs_names == scdata.obs_names)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes=["VIM", "ZEB1"])
    assert np.all(sc_magic.var_names.values == np.array(["VIM", "ZEB1"]))
    assert np.all(sc_magic.obs_names == scdata.obs_names)
Example #13
def test_scdata():
    scdata = pd.read_csv("../data/test_data.csv")
    scdata_norm = magic.preprocessing.library_size_normalize(scdata)
    assert scdata.shape == scdata_norm.shape
    fast_magic_operator = magic.MAGIC(t='auto', a=20, k=10)
    str_gene_magic = fast_magic_operator.fit_transform(
        scdata_norm, genes=['VIM', 'ZEB1'])
    int_gene_magic = fast_magic_operator.fit_transform(
        scdata_norm, genes=[-2, -1])
    assert str_gene_magic.shape[0] == scdata_norm.shape[0]
    assert np.all(str_gene_magic == int_gene_magic)
    pca_magic = fast_magic_operator.fit_transform(
        scdata_norm, genes="pca_only")
    assert pca_magic.shape[0] == scdata_norm.shape[0]
    assert pca_magic.shape[1] == fast_magic_operator.n_pca
    fast_magic = fast_magic_operator.fit_transform(scdata_norm,
                                                   genes="all_genes")
    assert scdata_norm.shape == fast_magic.shape
Example #14
def impute_magic_expression(expression_matrix, meta_data, **kwargs):
    """
    Use MAGIC (van Dijk et al Cell, 2018, 10.1016/j.cell.2018.05.061) to impute data

    :param expression_matrix: pd.DataFrame
    :param meta_data: pd.DataFrame
    :return imputed, meta_data: pd.DataFrame, pd.DataFrame
    """
    kwargs, random_seed, output_file = process_impute_args(**kwargs)

    import magic
    utils.Debug.vprint('Imputing data with MAGIC ... ')
    imputed = pd.DataFrame(magic.MAGIC(random_state=random_seed, **kwargs).fit_transform(expression_matrix.values),
                           index=expression_matrix.index, columns=expression_matrix.columns)

    if output_file is not None:
        imputed.to_csv(output_file, sep="\t")

    return imputed, meta_data
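Because MAGIC is handed `expression_matrix.values`, it returns a bare array, so the wrapper restores the original index and columns afterwards. A standalone sketch of just that pattern, using made-up toy data:

import magic
import numpy as np
import pandas as pd

expr = pd.DataFrame(np.random.poisson(1.0, size=(100, 20)).astype(float),
                    index=["cell_%d" % i for i in range(100)],
                    columns=["gene_%d" % j for j in range(20)])
imputed = pd.DataFrame(magic.MAGIC(random_state=42).fit_transform(expr.values),
                       index=expr.index, columns=expr.columns)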
Example #15
def test_scdata():
    scdata = scprep.io.load_csv("../data/test_data.csv")
    scdata = scprep.filter.remove_empty_cells(scdata)
    scdata = scprep.filter.remove_empty_genes(scdata)
    scdata_norm = scprep.normalize.library_size_normalize(scdata)
    scdata_norm = scprep.transform.sqrt(scdata_norm)
    assert scdata.shape == scdata_norm.shape
    np.random.seed(42)
    magic_op = magic.MAGIC(t='auto', a=20, k=10)
    str_gene_magic = magic_op.fit_transform(scdata_norm, genes=['VIM', 'ZEB1'])
    int_gene_magic = magic_op.fit_transform(scdata_norm, genes=[-2, -1])
    assert str_gene_magic.shape[0] == scdata_norm.shape[0]
    assert np.all(str_gene_magic == int_gene_magic)
    pca_magic = magic_op.fit_transform(scdata_norm, genes="pca_only")
    assert pca_magic.shape[0] == scdata_norm.shape[0]
    assert pca_magic.shape[1] == magic_op.n_pca
    magic_all_genes = magic_op.fit_transform(scdata_norm, genes="all_genes")
    assert scdata_norm.shape == magic_all_genes.shape
    dremi = magic_op.knnDREMI("VIM", "ZEB1", plot=True)
    np.testing.assert_allclose(dremi, 1.5687165, atol=0.0000005)
Example #16
def main(count_table, out_file):
    print("started main")
    data = scprep.io.load_csv(count_table, cell_axis='column', delimiter='\t')
    print("loaded csv")

    # normalize with our method
    gtot = data.apply(sum, 0)
    ctot = data.apply(sum, 1)

    data_filt = data.loc[ctot >= 200, gtot >= 0]

    totu = data_filt.apply(lambda c: max(1, sum(c)), 1)
    data_norm = data_filt.div(totu, axis=0) * 1000

    print(data_norm.apply(sum, 0).head())
    print("normalized.")

    magic_op = magic.MAGIC()
    fig, ax = plt.subplots()
    magic_op.fit_transform(data_norm, plot_optimal_t=True, ax=ax)
    plt.savefig(out_file)
Example #17
    print('... full PHATE in {:.2f}-min'.format((time.time() - start)/60))


    if True:
        # MELD
        adata.obs['res_sca1'] = [1 if i == 'SCA1' else -1 for i in adata.obs['genotype']]
        adata.obs['ees_sca1'] = meld.MELD().fit_transform(G=G, RES=adata.obs['res_sca1'])
        adata.obs['ees_sca1'] = adata.obs['ees_sca1'] - adata.obs['ees_sca1'].mean()  # mean center
        if True:
            # save adata obj with batch correction
            adata.write(os.path.join(pdfp, 'mouse_MT_bbknn.h5ad'))
            print('\n... saved @' + datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))

    if True:
        # MAGIC
        magic_op = magic.MAGIC().fit(X=adata.X, graph=G)  # running fit_transform produces wrong shape
        adata.layers['imputed_bbknn'] = magic_op.transform(adata.X, genes='all_genes')
        # adata.layers['imputed_bbknn'] = sparse.csr_matrix(magic_op.transform(adata.X, genes='all_genes'))  # causes memory spike

        if True:
            # save adata obj with batch correction & imputation
            adata.write(os.path.join(pdfp, 'mouse_MT_bbknn.h5ad'))
            print('\n... saved @' + datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))

    print('Pre-processing dataset took {:.2f}-min'.format((time.time() - total) / 60))

elif False:
    # save data objects
    start = time.time()
    adata.write(os.path.join(pdfp, 'mouse_MT_bbknn.h5ad'))
    print('saved @' + datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
Example #18
    try:
        os.makedirs(dire_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise e


args = parse_args()
if args.t != "auto":
    args.t = int(args.t)
print("run with these parametres: %s" % str(args))

# Main Part

X = pd.read_csv(args.input, index_col=0)
X = X.transpose()

magic_operator = magic.MAGIC(k=args.k,
                             a=args.a,
                             t=args.t,
                             n_pca=args.n_pca,
                             knn_dist=args.knn_dist,
                             n_jobs=args.n_jobs)

X_magic = magic_operator.fit_transform(X, genes="all_genes")
X_magic = X_magic.transpose()

make_sure_dir_exists(args.outputdir)
X_magic.to_csv(os.path.join(args.outputdir, "magic_output.csv"))
Example #19
import magic
import pandas as pd
import matplotlib.pyplot as plt

X = pd.read_csv('/home/rohit/Desktop/MAGIC-master/data/test_data.csv')
magic_operator = magic.MAGIC()
X_magic = magic_operator.fit_transform(X, genes='all_genes')
# plt.scatter(X_magic['VIM'], X_magic['CDH1'], c=X_magic['ZEB1'], s=1, cmap='inferno')
# plt.show()
# magic.plot.animate_magic(X, gene_x='VIM', gene_y='CDH1', gene_color='ZEB1', operator=magic_operator)

X_magic.to_csv('~/Desktop/exampleOutput.csv', index=False)
Example #20
def main(args):

    # set arguments
    data_path = args.data_dir
    input_path = args.input_dir
    res_dir = args.res_dir
    test_file = args.test_file
    moduleGene_file = args.moduleGene_file
    cm_file = args.stoichiometry_matrix
    sc_imputation = args.sc_imputation

    # choose cpu or gpu automatically
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # read data
    print("Starting load data...")
    geneExpr = pd.read_csv(input_path + '/' + test_file, index_col=0)
    geneExpr = geneExpr.T
    geneExpr = geneExpr * 1.0
    if sc_imputation:
        magic_operator = magic.MAGIC()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            geneExpr = magic_operator.fit_transform(geneExpr)
    if geneExpr.max().max() > 50:
        geneExpr = (geneExpr + 1).apply(np.log2)
    geneExprSum = geneExpr.sum(axis=1)
    stand = geneExprSum.mean()
    geneExprScale = geneExprSum / stand
    geneExprScale = torch.FloatTensor(geneExprScale.values).to(device)

    BATCH_SIZE = geneExpr.shape[0]

    moduleGene = pd.read_csv(data_path + '/' + moduleGene_file,
                             sep=',',
                             index_col=0)
    moduleLen = [
        moduleGene.iloc[i, :].notna().sum() for i in range(moduleGene.shape[0])
    ]
    moduleLen = np.array(moduleLen)

    cmMat = pd.read_csv(data_path + '/' + cm_file, sep=',', header=None)
    cmMat = cmMat.values
    cmMat = torch.FloatTensor(cmMat).to(device)
    print("Load data done.")

    print("Starting process data...")
    emptyNode = []
    gene_names = geneExpr.columns
    cell_names = geneExpr.index.astype(str)
    n_modules = moduleGene.shape[0]
    n_genes = len(gene_names)
    n_cells = len(cell_names)
    n_comps = cmMat.shape[0]
    geneExprDf = pd.DataFrame(columns=['Module_Gene'] + list(cell_names))
    for i in range(n_modules):
        genes = moduleGene.iloc[i, :].values.astype(str)
        genes = [g for g in genes if g != 'nan']
        if not genes:
            emptyNode.append(i)
            continue
        temp = geneExpr.copy()
        temp.loc[:, [g for g in gene_names if g not in genes]] = 0
        temp = temp.T
        temp['Module_Gene'] = ['%02d_%s' % (i, g) for g in gene_names]
        geneExprDf = pd.concat([geneExprDf, temp], ignore_index=True, sort=False)  # DataFrame.append was removed in pandas 2.0
    geneExprDf.index = geneExprDf['Module_Gene']
    geneExprDf.drop('Module_Gene', axis='columns', inplace=True)
    X = geneExprDf.values.T
    X = torch.FloatTensor(X).to(device)

    #prepare data for constraint of module variation based on gene
    df = geneExprDf
    df.index = [i.split('_')[0] for i in df.index]
    df.index = df.index.astype(int)  # must cast to int to ensure correct order; the transposed column order changes otherwise
    #module_scale = df.groupby(df.index).sum(axis=1).T   # pandas version update
    module_scale = df.groupby(df.index).sum().T
    module_scale = torch.FloatTensor(module_scale.values / moduleLen)
    print("Process data done.")

    # =============================================================================
    #NN
    torch.manual_seed(16)
    net = FLUX(X, n_modules, f_in=n_genes, f_out=1).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARN_RATE)

    #Dataloader
    dataloader_params = {
        'batch_size': BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0,
        'pin_memory': False
    }

    dataSet = MyDataset(X, geneExprScale, module_scale)
    train_loader = torch.utils.data.DataLoader(dataset=dataSet,
                                               **dataloader_params)

    # =============================================================================

    # =============================================================================
    print("Starting train neural network...")
    start = time.time()
    #   training
    loss_v = []
    loss_v1 = []
    loss_v2 = []
    loss_v3 = []
    loss_v4 = []
    net.train()
    timestr = time.strftime("%Y%m%d-%H%M%S")
    lossName = "./output/lossValue_" + timestr + ".txt"
    file_loss = open(lossName, "a")
    for epoch in tqdm(range(EPOCH)):
        loss, loss1, loss2, loss3, loss4 = 0, 0, 0, 0, 0

        for i, (X, X_scale, m_scale) in enumerate(train_loader):

            X_batch = Variable(X.float().to(device))
            X_scale_batch = Variable(X_scale.float().to(device))
            m_scale_batch = Variable(m_scale.float().to(device))

            out_m_batch, out_c_batch = net(X_batch, n_modules, n_genes,
                                           n_comps, cmMat)
            loss_batch, loss1_batch, loss2_batch, loss3_batch, loss4_batch = myLoss(
                out_m_batch,
                out_c_batch,
                lamb1=LAMB_BA,
                lamb2=LAMB_NG,
                lamb3=LAMB_CELL,
                lamb4=LAMB_MOD,
                geneScale=X_scale_batch,
                moduleScale=m_scale_batch)

            optimizer.zero_grad()
            loss_batch.backward()
            optimizer.step()

            loss += loss_batch.cpu().data.numpy()
            loss1 += loss1_batch.cpu().data.numpy()
            loss2 += loss2_batch.cpu().data.numpy()
            loss3 += loss3_batch.cpu().data.numpy()
            loss4 += loss4_batch.cpu().data.numpy()

        #print('epoch: %02d, loss1: %.8f, loss2: %.8f, loss3: %.8f, loss4: %.8f, loss: %.8f' % (epoch+1, loss1, loss2, loss3, loss4, loss))
        file_loss.write(
            'epoch: %02d, loss1: %.8f, loss2: %.8f, loss3: %.8f, loss4: %.8f, loss: %.8f. \n'
            % (epoch + 1, loss1, loss2, loss3, loss4, loss))

        loss_v.append(loss)
        loss_v1.append(loss1)
        loss_v2.append(loss2)
        loss_v3.append(loss3)
        loss_v4.append(loss4)


# =============================================================================
    end = time.time()
    print("Training time: ", end - start)

    file_loss.close()
    plt.plot(loss_v, '--')
    plt.plot(loss_v1)
    plt.plot(loss_v2)
    plt.plot(loss_v3)
    plt.plot(loss_v4)
    plt.legend(['total', 'balance', 'negative', 'cellVar', 'moduleVar'])
    imgName = './' + res_dir + '/loss_' + timestr + ".png"
    plt.savefig(imgName)
    timeName = './' + res_dir + '/time_' + timestr + ".txt"
    f = open(timeName, "a")
    runTimeStr = str(end - start)
    f.write(runTimeStr)
    f.close()

    #    Dataloader
    dataloader_params = {
        'batch_size': 1,
        'shuffle': False,
        'num_workers': 0,
        'pin_memory': False
    }

    dataSet = MyDataset(X, geneExprScale, module_scale)
    test_loader = torch.utils.data.DataLoader(dataset=dataSet,
                                              **dataloader_params)

    #testing
    fluxStatuTest = np.zeros((n_cells, n_modules), dtype='f')  #float32
    balanceStatus = np.zeros((n_cells, n_comps), dtype='f')
    net.eval()
    for epoch in range(1):
        loss, loss1, loss2 = 0, 0, 0

        for i, (X, X_scale, _) in enumerate(test_loader):

            X_batch = Variable(X.float().to(device))
            out_m_batch, out_c_batch = net(X_batch, n_modules, n_genes,
                                           n_comps, cmMat)

            # save data
            fluxStatuTest[i, :] = out_m_batch.detach().cpu().numpy()
            balanceStatus[i, :] = out_c_batch.detach().cpu().numpy()

    # save to file
    fileName = "./" + res_dir + "/module" + str(n_modules) + "_cell" + str(n_cells) + "_batch" + str(BATCH_SIZE) + \
                "_LR" + str(LEARN_RATE) + "_epoch" + str(EPOCH) + "_SCimpute_" + str(sc_imputation)[0] + \
                "_lambBal" + str(LAMB_BA) + "_lambSca" + str(LAMB_NG) + "_lambCellCor" + str(LAMB_CELL) + "_lambModCor_1e-2" + \
                '_' + timestr + ".csv"
    setF = pd.DataFrame(fluxStatuTest)
    setF.columns = moduleGene.index
    setF.index = geneExpr.index.tolist()
    setF.to_csv(fileName)

    setB = pd.DataFrame(balanceStatus)
    setB = setB.rename(columns=lambda x: x + 1)  # shift column labels to be 1-based
    setB.index = setF.index
    balanceName = "./output/balance_" + timestr + ".csv"
    setB.to_csv(balanceName)

    print("scFEA job finished. Check result in the desired output folder.")

    return
Example #21
 'MIXL1 (ENSG00000185155)', 'MYCBP (ENSG00000214114)', 'NANOG (ENSG00000111704)',
 'NES (ENSG00000132688)', 'NKX2-1 (ENSG00000136352)', 'NKX2-5 (ENSG00000183072)',
 'NKX2-8 (ENSG00000136327)', 'NPAS1 (ENSG00000130751)', 'NR2F1-AS1 (ENSG00000237187)',
 'OLIG1 (ENSG00000184221)', 'OLIG3 (ENSG00000177468)', 'ONECUT1 (ENSG00000169856)',
 'ONECUT2 (ENSG00000119547)', 'OTX2 (ENSG00000165588)', 'PAX3 (ENSG00000135903)',
 'PAX6 (ENSG00000007372)', 'PDGFRA (ENSG00000134853)', 'PECAM1 (ENSG00000261371)',
 'POU5F1 (ENSG00000204531)', 'SATB1 (ENSG00000182568)', 'SIX2 (ENSG00000170577)',
 'SIX3-AS1 (ENSG00000236502)', 'SIX6 (ENSG00000184302)', 'SOX13 (ENSG00000143842)',
 'SOX10 (ENSG00000100146)', 'SOX15 (ENSG00000129194)', 'SOX17 (ENSG00000164736)',
 'SOX9 (ENSG00000125398)', 'TTLL10 (ENSG00000162571)', 'TAL1 (ENSG00000162367)',
 'TBX15 (ENSG00000092607)', 'TBX18 (ENSG00000112837)', 'TBX5 (ENSG00000089225)',
 'TNNT2 (ENSG00000118194)', 'WT1 (ENSG00000184937)', 'ZBTB16 (ENSG00000109906)',
 'ZIC2 (ENSG00000043355)', 'ZIC5 (ENSG00000139800)', 'ACTB (ENSG00000075624)',
 'HAND1 (ENSG00000113196)']
import magic
data_magic = magic.MAGIC().fit_transform(data, genes=full_marker_genes)

data_phate = phate.PHATE().fit_transform(data)
# alternative: umap.UMAP(), sklearn.manifold.TSNE()
data_phate = pd.DataFrame(data_phate, index=data.index)
plt.figure(figsize=(10,10))
scprep.plot.scatter2d(data_phate, c=metadata['sample'], figsize=(12,8), cmap="Spectral",
                      ticks=False, label_prefix="PHATE")
plt.savefig("phatedata.pdf")
home = os.path.expanduser('./')
file_path = os.path.join(home, 'EBT_counts.pkl.gz')
if not os.path.exists(file_path):
    scprep.io.download.download_google_drive(id='1Xz0ONnRWp2MLC_R6r74MzNwaZ4DkQPcM',
                        destination=os.path.dirname(file_path))

data = pd.read_pickle(file_path)
Example #22
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA.
        If an integer is given, it fixes the seed.
        If `None`, defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC,
        either as integer indices or column names
        if input data is a pandas DataFrame. If "all_genes", the entire
        smoothed matrix is returned. If "pca_only", PCA on the smoothed
        data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix
        is very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    validate : bool, optional (default: False)
        If True, compare the output against a precomputed reference matrix
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.".format(output))
    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
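A hypothetical invocation of the function above for a headered CSV with cells on rows; every keyword shown comes from the docstring, and the file names are placeholders:

run_magic_from_file(
    "data.csv",
    gene_names=True,        # gene names are the CSV column headers
    cell_names=True,        # cell names are the CSV row index
    cell_axis="row",        # data is [n_cells, n_genes]
    transform="sqrt",
    genes=["VIM", "ZEB1"],  # return only these columns
    output="magic_vim_zeb1.csv",
)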
Example #23
            'count': {
                'input': snakemake.input['cmat'],
                'output': snakemake.output['cmat']
            },
            'tpm': {
                'input': snakemake.input['tpm'],
                'output': snakemake.output['tpm']
            }
        }

        for key in data_holder.keys():
            # read in data
            data = pd.read_csv(data_holder[key]['input'], index_col=0)

            # impute with magic
            magic_op = magic.MAGIC()
            imputed = magic_op.fit_transform(data.T, genes='all_genes')

            # write data
            imputed.to_csv(data_holder[key]['output'])

            if not CLUSTER:
                # plot non-imputed data
                orig_heatmap = sns.clustermap(data.T, z_score=1, cmap='Blues')
                plt.savefig(
                    os.path.join(snakemake.params['plot_dir'],
                                 '{}_heatmap.png'.format(key)))
                plt.cla()

                # plot imputed data
                imputed_heatmap = sns.clustermap(imputed,
Example #24
def test_pca_only():
    magic_op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False)
    pca_magic = magic_op.fit_transform(scdata_norm, genes="pca_only")
    assert pca_magic.shape[0] == scdata_norm.shape[0]
    assert pca_magic.shape[1] == magic_op.n_pca
np.save("{}Phate2d.npy".format(data_name), phate_data[pth_idx])

ph.set_params(n_components=3)
phate3_data = ph.transform()
np.save("{}Phate3d.npy".format(data_name), phate3_data[pth_idx])

mnn_graph = graphtools.Graph(data.iloc[pth_idx],
                             sample_idx=sample_labels[pth_idx],
                             n_pca=100,
                             knn=5,
                             random_state=42,
                             decay=15,
                             kernel_symm='theta',
                             theta=0.99)
mg = magic.MAGIC(random_state=42,
                 a=mnn_graph.decay,
                 k=mnn_graph.knn - 1,
                 n_pca=mnn_graph.n_pca)
data_magic = mg.fit_transform(data, graph=mnn_graph)
_ = mg.fit_transform(data)

# reduce memory footprint
del mg.graph.data
del mg.graph.data_nu
del mg.graph._kernel
del mg.graph._diff_op
del mg.graph.subgraphs
del mg.graph.sample_idx

with open('magic.pickle', 'wb') as handle:
    pickle.dump(mg, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #26
def main():
    parser = argparse.ArgumentParser()

    run_group = parser.add_argument_group("run",
                                          description="Per-run parameters")
    run_group.add_argument("--seed", type=int, required=True)
    run_group.add_argument("--data_split",
                           type=float,
                           default=0.9,
                           help="Split for self-supervision")
    run_group.add_argument("--n_trials",
                           type=int,
                           default=10,
                           help="Number of times to resample")
    run_group.add_argument("--median_scale", action="store_true")

    data_group = parser.add_argument_group(
        "data", description="Input and output parameters")
    data_group.add_argument("--dataset", type=pathlib.Path, required=True)
    data_group.add_argument("--output_dir", type=pathlib.Path, required=True)
    data_group.add_argument("--genes",
                            type=int,
                            nargs="+",
                            required=True,
                            help="Genes to smooth (indices)")

    model_group = parser.add_argument_group(
        "model",
        description=
        "Model parameters. [max] or [min, max] or [min, max, interval]",
    )

    model_group.add_argument(
        "--neighbors",
        type=int,
        nargs="+",
        default=(1, 11),
        metavar="K",
        help="Number of neighbors in kNN graph",
    )
    model_group.add_argument(
        "--components",
        type=int,
        nargs="+",
        default=(5, 51, 5),
        metavar="PC",
        help="Maximum number of components to compute",
    )
    model_group.add_argument(
        "--time",
        type=int,
        nargs="+",
        default=(1, 6),
        metavar="T",
        help="Number of time steps for diffusion",
    )

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler())

    dataset_name = args.dataset.parent.name
    output_file = args.output_dir / f"{dataset_name}_magic_mse_{args.seed}.pickle"

    logger.info(f"writing output to {output_file}")

    seed = sum(map(ord, f"biohub_{args.seed}"))
    random_state = np.random.RandomState(seed)

    with open(args.dataset, "rb") as f:
        true_means, true_counts, umis = pickle.load(f)

    k_range = np.arange(*args.neighbors)
    pc_range = np.arange(*args.components)
    t_range = np.arange(*args.time)

    rec_loss = dict()
    mcv_loss = dict()

    # run n_trials for self-supervised sweep
    for i in range(args.n_trials):
        umis_X, umis_Y = ut.split_molecules(umis, args.data_split, 0.0,
                                            random_state)

        if args.median_scale:
            median_count = np.median(umis.sum(axis=1))

            umis_X = umis_X / umis_X.sum(axis=1, keepdims=True) * median_count
            umis_Y = umis_Y / umis_Y.sum(axis=1, keepdims=True) * median_count
        else:
            umis_Y = umis_Y * args.data_split / (1 - args.data_split)

        for n_pcs in pc_range:
            for k in k_range:
                for t in t_range:
                    magic_op = magic.MAGIC(n_pca=n_pcs, verbose=0)
                    magic_op.set_params(knn=k, t=t)
                    denoised = magic_op.fit_transform(umis_X, genes=args.genes)
                    denoised = np.maximum(denoised, 0)

                    rec_loss[i, n_pcs, k,
                             t] = mean_squared_error(denoised,
                                                     umis_X[:, args.genes])
                    mcv_loss[i, n_pcs, k,
                             t] = mean_squared_error(denoised,
                                                     umis_Y[:, args.genes])

    results = {
        "dataset": dataset_name,
        "method": "magic",
        "loss": "mse",
        "normalization": "sqrt",
        "param_range": [pc_range, k_range, t_range],
        "rec_loss": rec_loss,
        "mcv_loss": mcv_loss,
    }

    with open(output_file, "wb") as out:
        pickle.dump(results, out)
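A possible follow-up once the sweep finishes: average the molecular cross-validation loss over trials and take the (n_pcs, k, t) combination that minimizes it. This helper is hypothetical; it assumes only the mcv_loss dictionary keyed by (trial, n_pcs, k, t) built above:

import numpy as np

def best_params(mcv_loss, n_trials, pc_range, k_range, t_range):
    # mean MCV loss per (n_pcs, k, t) across trials
    avg = {
        (p, k, t): np.mean([mcv_loss[i, p, k, t] for i in range(n_trials)])
        for p in pc_range for k in k_range for t in t_range
    }
    return min(avg, key=avg.get)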
Example #27
    def magic_impute(self, data):
        import magic

        model = magic.MAGIC(n_jobs=self.ncores)
        imputed = model.fit_transform(data.values)
        return pd.DataFrame(imputed)
Example #28
    denoised_name = f"{outputDir}/{experiment_name}_denoised_{algorithm}.csv"
    print(denoised_name)
    if os.path.isfile(denoised_name):
        continue

    adata = sc.read(f"{inputDir}/{f}")
    adata = adata.transpose()
    adata.X = np.expm1(adata.X)
    sc.pp.sqrt(adata)

    n = find_pca_comp(
        adata,
        figName=f"{outputDir}/figures/{experiment_name}_variance.png",
        figTitle=f'{experiment_name} Explained Variance')

    magic_op = magic.MAGIC(t=6, n_pca=n)

    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    (mem_registered, adata_denoised) = memory_usage(
        (magic_op.fit_transform, (adata, ), {
            'genes': 'all_genes'
        }),
        retval=True,
        max_usage=True,
        include_children=True)

    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()
    real = end_time - start_time
    systime = end_resources.ru_stime - start_resources.ru_stime
    usertime = end_resources.ru_utime - start_resources.ru_utime
    cpu_time = systime + usertime
Example #29
def main():
    usage = "" # TODO 
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flag")
    parser.add_option("-o", "--out_file", help="File to write H5 dataset")
    (options, args) = parser.parse_args()

    dataset_f = args[0]
    resolution = float(args[1])
    out_f = options.out_file

    with h5py.File(dataset_f, 'r') as in_f:
        print('Loading expression matrix from {}...'.format(dataset_f))
        X = in_f['expression'][:]
        cell_ids = [
            str(x)[2:-1]
            for x in in_f['experiment'][:]
        ]
        gene_ids = [
            str(x)[2:-1]
            for x in in_f['gene_id'][:]
        ]
        ad = AnnData(
            X=X,
            obs=pd.DataFrame(
                data=cell_ids, 
                columns=['cell']
            ),
            var=gene_ids
        )
        sc.pp.neighbors(ad)
        sc.tl.leiden(ad, resolution=resolution)

        new_X = None
        new_cell_ids = []
        clusters = []
        for clust in sorted(set(ad.obs['leiden'])):
            print('Processing cluster {}'.format(clust))
            indices = [
                int(x) 
                for x in ad.obs.loc[ad.obs['leiden'] == clust].index
            ]
            X = ad.X[indices]               
            print('Shape of cluster matrix: {}'.format(X.shape)) 
            magic_operator = magic.MAGIC()
            magic_X = magic_operator.fit_transform(X)
            if new_X is None:
                new_X = magic_X
            else:
                new_X = np.concatenate([new_X, magic_X])
            print('Current shape of final matrix: {}'.format(new_X.shape))
            clusters += [clust for i in indices]
            new_cell_ids += list(np.array(cell_ids)[indices])

        clusters = [
            x.encode('utf-8')
            for x in clusters
        ]
        new_cell_ids = [
            x.encode('utf-8')
            for x in new_cell_ids
        ]

        print('Writing results to {}...'.format(out_f))
        with h5py.File(out_f, 'w') as out:
            out.create_dataset(
                'expression', data=new_X, compression="gzip"
            )
            out.create_dataset('cluster', data=clusters)
            out.create_dataset('experiment', data=new_cell_ids)
            # Copy other datasets to new H5 file
            for k in in_f.keys():
                if k != 'expression' and k != 'experiment':
                    out.create_dataset(k, data=in_f[k][:])
Example #30
import magic
import numpy as np
import pandas as pd

bmmsc_data = magic.io.load_csv('MATRIX.txt')
libsize = bmmsc_data.sum(axis=1)

bmmsc_data = magic.preprocessing.library_size_normalize(bmmsc_data)
bmmsc_data = np.sqrt(bmmsc_data)
bmmsc_data.head()

magic_op = magic.MAGIC(t=4, k=5)
bmmsc_magic = magic_op.fit_transform(bmmsc_data, genes='all_genes')
bmmsc_magic.head()

bmmsc_magic.to_csv('magic.csv', sep='\t')
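A small round-trip check, assuming the tab-separated file written above: reloading with the same separator should reproduce the frame.

bmmsc_reloaded = pd.read_csv('magic.csv', sep='\t', index_col=0)
assert bmmsc_reloaded.shape == bmmsc_magic.shape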