Example #1
import pandas as pd
from dask.distributed import Client
from arboreto.algo import grnboost2


def main(transcriptome_file, regulator_file, species, out_file, n_workers,
         threads_per_worker):
    print('reading data')

    tf_info = pd.read_csv(regulator_file, sep='\t', index_col=0)
    tf_names = list(tf_info.loc[tf_info['Species'] == species].index)

    df = pd.read_csv(transcriptome_file, sep='\t', index_col=0)

    # Identify genes with zero variance (constant expression) across samples
    num_not_expressed = (df.std(axis=1) == 0).sum()

    print(
        f'removing {num_not_expressed} genes with zero variance across all samples'
    )
    df = df.loc[df.std(axis=1) > 0]

    print('starting scheduler')
    client = Client(n_workers=n_workers,
                    threads_per_worker=threads_per_worker,
                    memory_limit='48GB')

    try:
        network = grnboost2(expression_data=df.T,
                            tf_names=tf_names,
                            client_or_address=client,
                            verbose=True)

        network.to_csv(out_file, sep='\t', header=False, index=False)
    except Exception as e:
        print('Module inference error')
        print(e)
    finally:
        client.close()
Example #2
def find_adjacencies_command(args):
    """
    Infer co-expression modules.
    """
    LOGGER.info("Loading expression matrix.")
    ex_mtx = _load_expression_matrix(args)
    tf_names = load_tf_names(args.tfs_fname.name)

    n_total_genes = len(ex_mtx.columns)
    # isin() returns a boolean mask; count the matches rather than taking its length
    n_matching_genes = ex_mtx.columns.isin(tf_names).sum()
    if n_total_genes == 0:
        LOGGER.error("The expression matrix supplied does not contain any genes. Make sure the extension of the file matches the format (tab separation for TSV and comma separation for CSV).")
        sys.exit(1)
    if float(n_matching_genes) / len(tf_names) < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    try:
        network = grnboost2(expression_data=ex_mtx, tf_names=tf_names, verbose=True, client_or_address=client)
    finally:
        shutdown_callback(False)

    LOGGER.info("Writing results to file.")
    network.to_csv(args.output, index=False, sep='\t')
Example #3
def helper_grnboost2(X, theta_true, tf_names=(), BEELINE=False):
    print('Running GRNBoost2 method', X.shape)
    theta_true = theta_true.real
    ex_matrix = pd.DataFrame(X)
    if args.USE_TF_NAMES == 'yes' and len(tf_names) != 0:  # `args` is a module-level config in the source script
        tf_names = ['G' + str(n) for n in tf_names]
    else:
        tf_names = None

    gene_names = ['G' + str(c) for c in ex_matrix.columns]
    ex_matrix.columns = gene_names
    network = grnboost2(expression_data=ex_matrix,
                        gene_names=gene_names,
                        tf_names=tf_names)  #, verbose=True)
    pred_edges = np.array(network[['TF', 'target', 'importance']])
    G_pred = nx.Graph()
    G_pred.add_nodes_from(['G' + str(n) for n in range(len(gene_names))])
    G_pred.add_weighted_edges_from(pred_edges)
    # nx.adj_matrix was deprecated; adjacency_matrix() is the current API
    pred_theta = nx.adjacency_matrix(G_pred).todense() + np.eye(len(gene_names))
    recovery_metrics = report_metrics(np.array(theta_true),
                                      np.array(pred_theta))
    print(
        'GRNBOOST2: FDR, TPR, FPR, SHD, nnz_true, nnz_pred, precision, recall, Fb, aupr, auc'
    )
    print('GRNBOOST2: Recovery of true theta: ',
          *np.around(recovery_metrics, 3))

    res = list(recovery_metrics)
    return res
Example #4
def process(mtx_fname, tfs, net_fname, client):
    network = grnboost2(expression_data=pd.read_csv(mtx_fname,
                                                    sep='\t',
                                                    index_col=0).T,
                        tf_names=tfs,
                        verbose=True,
                        client_or_address=client)
    network.to_csv(net_fname, index=False)
Example #5
def main(args):
    opts, args = parseArgs(args)
    inDF = pd.read_csv(opts.inFile, sep='\t', index_col=0, header=0)

    client = Client(processes=False)

    if opts.algo == 'GENIE3':
        network = genie3(inDF, client_or_address=client)
        network.to_csv(opts.outFile, index=False, sep='\t')

    elif opts.algo == 'GRNBoost2':
        network = grnboost2(inDF, client_or_address=client)
        network.to_csv(opts.outFile, index=False, sep='\t')

    else:
        print("Wrong algorithm name. Should either be GENIE3 or GRNBoost2.")
Example #6
def calcTFs(
        expr,
        tf_names,
        db,
        prefix,
        motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
        out_path='../data/pySCENIC',
        ppn=8):
    """Computes motifs, regulons and trancriptional factor activation using pySCENIC.

    Arguments
    ---------
    expr: `pandas DataFrame` 
        cell X gene raw counts; FPKM; not TPM as coexpression will be calculated
    tf_names: `list` (`str`)
        curated human transcriptional factor downloaded from github: pySCENIC/ref/hs_hgnc_curated_tfs.txt
    db: `list` (`FeatherRankingDatabase()`)
        feather files, ranking genome [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
    prefix: `str` (default: `None`)
        Specify name to save files (eg, cell line names)

    Returns
    -------
    Do not return but write files (the calc takes too long...)
    """

    # Inference of co-expression modules
    adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adjacencies, expr))

    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
    with ProgressBar():
        df = prune2df(db, modules, motif_path, num_workers=ppn)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    # Save the enriched motifs and the discovered regulons to disk.
    df.to_csv('{}/{}_motifs.csv'.format(out_path, prefix))
    with open('{}/{}_regulons.p'.format(out_path, prefix), "wb") as f:
        pickle.dump(regulons, f)

    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    tfs = [tf.strip('(+)') for tf in auc_mtx.columns]
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))

    print('finished calculation for %s' % (prefix))
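
A short usage sketch for calcTFs; the file names, the prefix, and the RankingDatabase import below are placeholder assumptions, not taken from the original script:

import pandas as pd
from arboreto.utils import load_tf_names
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase

expr = pd.read_csv('counts.csv', index_col=0)        # hypothetical cell x gene matrix
tf_names = load_tf_names('hs_hgnc_curated_tfs.txt')  # curated TF list shipped with pySCENIC
db = [RankingDatabase(fname='hg38__refseq-r80__10kb_up_and_down_tss.feather',
                      name='hg38__refseq-r80__10kb_up_and_down_tss')]
calcTFs(expr, tf_names, db, prefix='my_cell_line')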
Example #7
def inferGRN(filename,
             libpath,
             libname,
             lib_both=True,
             savedir=None,
             suffix=None,
             seed=None):
    """
    Top-level script for inferring gene regulatory network
    from a given dataset using the Arboreto GRNboost2 algorithm.
    :filename:  path to CSV file containing gene expression data.
    :libpath:   path to directory containing sub-folders for TF-target
                libraries.
    :libname:   string of TF-target library used for inference.
    :lib_both:  (optional) Boolean flag determining use of an additional
                library (TRANSFACpredicted) for wider TF coverage
    :savedir:   (optional) path to directory for saving final CSV.
    :suffix:    (optional) string appended to the saved file name.
    :seed:      (optional) integer seed for the inference algorithm
    """

    # import cpm + library data
    cpm = importData(filename)
    cpm_array, cpm_genes = processData(cpm)

    tf_all = importTFs(libpath, libname, lib_both)
    tf_names = tf_all["GeneSym"].to_list()

    # setup Dask cluster
    client = Client(LocalCluster())
    print(client.dashboard_link)

    # infer + refine GRN
    grn = grnboost2(expression_data=cpm_array,
                    gene_names=cpm_genes,
                    tf_names=tf_names,
                    client_or_address=client,
                    seed=seed)
    grn_refined = refineGRN(grn, libname, dir_path=libpath)

    if savedir is not None:
        saveGRN(grn_refined, savedir, suffix=suffix)

    client.shutdown()
    return grn_refined
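
A minimal usage sketch for the function above; the paths, library name, and seed are placeholder assumptions, not values from the original repository:

grn = inferGRN('expression_cpm.csv',
               libpath='libraries/',   # hypothetical directory of TF-target libraries
               libname='TRANSFAC',     # hypothetical library name
               savedir='results/',
               seed=42)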
Example #8
import pandas as pd
from dask.distributed import Client
from arboreto.algo import grnboost2


def main(transcriptome_file, regulator_file, species, out_file_prefix,
         n_random_samples, n_runs, n_workers, threads_per_worker):
    print('reading data')

    tf_info = pd.read_csv(regulator_file, sep='\t', index_col=0)
    tf_names = list(tf_info.loc[tf_info['Species'] == species].index)

    df = pd.read_csv(transcriptome_file, sep='\t', index_col=0)

    print('starting scheduler')
    client = Client(n_workers=n_workers,
                    threads_per_worker=threads_per_worker,
                    memory_limit='128GB')
    
    for i in range(n_runs):
        out_file = f'{out_file_prefix}_{i}.tsv'
        subsampled_df = df.sample(n_random_samples, axis=1, random_state=i)

        # Filter genes with zero variance across the subsampled columns
        num_not_expressed = (subsampled_df.std(axis=1) == 0).sum()
        print(
            f'removing {num_not_expressed} genes that have zero',
            'variance across all samples'
        )
        # Bug fix: filter on the subsample's variance, not the full matrix's
        subsampled_df = subsampled_df.loc[subsampled_df.std(axis=1) > 0]

        try:
            network = grnboost2(expression_data=subsampled_df.T,
                                tf_names=tf_names,
                                client_or_address=client,
                                verbose=True)

            network.to_csv(out_file, sep='\t', header=False, index=False)
        except Exception as e:
            print('Module inference error')
            print(e)
    client.close()
Example #9
                            "grn_output_" + inputFilename + ".tsv")
    db_fnames = glob.glob(DATABASES_GLOB)

    def name(fname):
        return os.path.splitext(os.path.basename(fname))[0]

    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]
    print(dbs)
    print("running grnboost")
    print("tf_names head")
    print(tf_names[:5])
    #print("gene names head")
    #print(ex_matrix.iloc[1:5,1:5])
    adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
    adjacencies.head()
    print("identify modules")
    adjacencies.to_csv(out_file, sep='\t', index=False, header=False)
    print("grnboost done")
    modules = list(
        modules_from_adjacencies(adjacencies,
                                 ex_matrix,
                                 rho_mask_dropouts=True))

    #print("writing modules")
    #with open(MODULES_FNAME, 'wb') as f:
    #	pickle.dump(modules, f)

    print("Finding Enriched modules")
    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
Example #10
import os
import pandas as pd
import argparse
from distributed import Client, LocalCluster

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_line', nargs=1, type=str, help='cell line to run on')
    parser.add_argument('--name', nargs=1, type=str, help='name of dataset')
    args = parser.parse_args()

    cl = args.cell_line[0]
    name = args.name[0]

    from arboreto.algo import grnboost2, genie3
    from arboreto.utils import load_tf_names

    ex_matrix = pd.read_csv('~/data/spate116/GCN/%s/%s_expression_matrix_imputed.tsv' % (cl, name), sep='\t').transpose()

    cluster = LocalCluster()
    client = Client(cluster)
    print('here')
    network = grnboost2(expression_data=ex_matrix.to_numpy(), gene_names=ex_matrix.columns, client_or_address=client)
    network.to_csv('~/data/spate116/GCN/%s/%s_GRN.tsv' % (cl, name), sep='\t', header=True, index=False)
    client.close()
    cluster.close()
Example #11
# add 'G' for entrezgene id

# placeholder path: set this to your working directory (quoted so it is valid Python)
workdir = '/path/to/your/file'
#############################################################################
import os, sys, re, gc
import pandas as pd
os.chdir(workdir)

from arboreto.algo import grnboost2, genie3
from arboreto.utils import load_tf_names

ex_matrix = pd.read_csv("feed2python.csv",index_col=0)
matrix = ex_matrix.T

# tf_names = load_tf_names("ChIPBaseV2_regNet_geo.csv")
df = pd.read_csv("regNet_tf.csv")
# Index.get_values() was removed from pandas; slice the columns directly
tf_names = df.columns[1:].tolist()
tf_names= [re.sub("X", "G", x) for x in tf_names]
# print(matrix.head(3))

network = grnboost2(expression_data=matrix, tf_names=tf_names)

network.to_csv('ex_GRNboost2_network.tsv', sep='\t', header=False, index=False)

# release the memory from python
del ex_matrix, matrix, df, df2, tf_names, network

gc.collect()
Example #12
                genes_to_use.add(target)

import numpy as np
from arboreto.algo import grnboost2, genie3
from sklearn.decomposition import PCA

pca = PCA(n_components=1)

if __name__ == '__main__':
    pcafile = open('pc_fraction_explained.log', 'w')
    data_to_use = Normal_Data.loc[list(genes_to_use), :].T
    pca.fit(data_to_use)
    pca_explained = pca.explained_variance_ratio_[0]
    pcafile.write(str(pca_explained) + '\n')
    TFs_to_use = [gene for gene in genes_to_use if gene in TFs]
    network = grnboost2(expression_data=data_to_use, tf_names=list(TFs_to_use))
    print(network.head())
    network.to_csv('networkfiles/biologicalnetwork.log',
                   sep='\t',
                   index=False,
                   header=False)

    original_data = Normal_Data.loc[list(genes_to_use), :]
    for i in range(100):
        print(i)
        data_to_use = original_data.copy()
        for j in range(len(Normal_Samples)):
            l = list(original_data.iloc[:, j])
            l = list(np.random.permutation(l))
            data_to_use[Normal_Samples[j]] = l
        print(data_to_use.shape)
Example #13
    #------------Phase I: Inference of co-expression modules--------------------------------
    #------------GRNBoost-------------------------------------------------------------------

    print("STARTING PHASE I")

    # Define cluster
    local_cluster = LocalCluster(n_workers=nCores, threads_per_worker=1)
    client = Client(local_cluster)
    print(client)

    N_SAMPLES = ex_matrix.shape[0]  # Full dataset
    print(N_SAMPLES)

    adjacencies = grnboost2(expression_data=ex_matrix.sample(n=N_SAMPLES,
                                                             replace=False),
                            tf_names=tf_names,
                            seed=123,
                            verbose=True,
                            client_or_address=client)
    print("DEFINED adjacencies, type and head:")

    adjacencies.to_csv(ADJACENCIES_FNAME, sep='\t')

    #load adjacencies
    adjacencies = pd.read_csv(ADJACENCIES_FNAME,
                              sep='\t',
                              header=0,
                              index_col=0)
    print("READ IN adjacencies, type and  head:")

    print(type(adjacencies))
    print(adjacencies.head())
Example #14
def run_grnboost2(Expr, filename='links.txt', gene_names=None, **kwargs):
    # np.asmatrix is deprecated (removed in NumPy 2.0); a plain 2-D array works here
    links = grnboost2(np.asarray(Expr.T), gene_names=gene_names, **kwargs)
    links.to_csv(filename, sep='\t', index=False, header=False)
Example #15
def crossvalidateGRN(filename,
                     libpath,
                     libname,
                     k,
                     lib_both=True,
                     savedir=None,
                     suffix=None,
                     seed=None):
    """
    Top-level script for k-fold cross validation of gene regulatory 
    network inference using the Arboreto GRNboost2 algorithm.
    :filename:  path to CSV file containing gene expression data.
    :libpath:   path to directory containing sub-folders for TF-target
                libraries.
    :libname:   string of TF-target library used for inference.
    :k:         integer specifying number of folds for CV
    :lib_both:  (optional) Boolean flag determining use of an additional
                library (TRANSFACpredicted) for wider TF coverage
    :savedir:   (optional) path to directory for saving final CSV.
    :suffix:    (optional) string appended to the saved file names.
    :seed:      (optional) integer seed for the inference algorithm
    """

    # import cpm + library data
    cpm = importData(filename)

    tf_all = importTFs(libpath, libname, lib_both)
    tf_names = tf_all["GeneSym"].to_list()

    # create and assign CV folds
    folds = gv.makeFolds(cpm, k)
    training, testing = gv.assignFolds(folds)

    # setup Dask cluster
    client = Client(LocalCluster())
    print(client.dashboard_link)

    # infer + refine GRN for each fold
    for fold in range(k):
        cpm_fold = cpm.loc[:, training[fold]]
        cpm_array, cpm_genes = processData(cpm_fold)

        grn = grnboost2(expression_data=cpm_array,
                        gene_names=cpm_genes,
                        tf_names=tf_names,
                        client_or_address=client,
                        seed=seed)
        grn_refined = refineGRN(grn, libname, dir_path=libpath)

        if savedir is not None:
            saveGRN(grn_refined,
                    savedir,
                    fold=fold,
                    suffix=suffix,
                    trainingset=training,
                    testingset=testing)

        # store all refined GRNs
        grn_refined["fold"] = fold
        if fold == 0:
            grn_all = grn_refined
        else:
            grn_all = pd.concat([grn_all, grn_refined])  # DataFrame.append was removed in pandas 2.0

    client.shutdown()
    return grn_all
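
The cross-validated variant can be driven the same way; again, every argument value here is a placeholder assumption rather than a value from the original repository:

grn_cv = crossvalidateGRN('expression_cpm.csv',
                          libpath='libraries/',   # hypothetical paths, as above
                          libname='TRANSFAC',
                          k=5,                    # 5-fold cross validation
                          savedir='results/',
                          seed=42)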
Example #16
def run_boost():
    # ex_matrix and custom_client are module-level globals in the source script
    return grnboost2(expression_data=ex_matrix.to_numpy(),
                     gene_names=ex_matrix.columns,
                     client_or_address=custom_client)
Example #17
local_cluster = LocalCluster(n_workers=32,
                             threads_per_worker=1,
                             memory_limit=8e10)
custom_client = Client(local_cluster)
sys.stderr.write("done.\n")

# ex_matrix is a DataFrame with gene names as column names
sys.stderr.write("\nReading count matrix...")
ex_matrix = pd.read_csv(in_file, sep='\t', index_col=0, header=None).T
sys.stderr.write("done.\n")

# tf_names is read using a utility function included in Arboreto
sys.stderr.write("\nLoading putative transcription factors...")
tf_names = load_tf_names(tf_file)
sys.stderr.write("done.\n")
sys.stderr.write("\nPredicting co-expression network in chunks...\n")
i = 0
for chunk in grouper(tf_names, 20):
    sys.stderr.write("Working on chunk %s\n" % str(i))
    network = grnboost2(expression_data=ex_matrix,
                        tf_names=chunk,
                        client_or_address=custom_client)
    network.to_csv("network_reddien_" + str(i) + ".csv",
                   sep=",",
                   header=False,
                   index=False)
    i += 1
sys.stderr.write("done.\n")

sys.stderr.write("\n\n# All done\n")
Example #18
                        index_col=0,
                        sep='\t',
                        skiprows=1,
                        header=None).T
#### get tfs
tf_path = '/ddn1/vol1/staging/leuven/stg_00002/lcb/kspan/analyses/ThreeLines10xSCENIC2/hg19_allTFs.lst'
tf_names = load_tf_names(tf_path)
tf_names = list(set(tf_names).intersection(ex_matrix.columns))
print(len(tf_names))

#run grnboost2
outfile = indir + 'grnboost2.tsv'
print('grnboost2 results will be printed to: ' + outfile)
start_time = time.time()
network = grnboost2(expression_data=ex_matrix,
                    tf_names=tf_names,
                    client_or_address=custom_client,
                    verbose=True)
print(time.time() - start_time, "seconds")
print(network.head())
network.to_csv(outfile, sep='\t', index=False, header=False)
sys.stdout = saveout
logs.close()

exit()

# packages in environment at /data/leuven/306/miniconda3/envs/arboreto:
# Name                    Version                   Build  Channel
#arboreto                  0.1.5                      py_0    bioconda
#blas                      1.0                         mkl
#bokeh                     0.13.0                   py36_0
#ca-certificates           2018.03.07                    0
Example #19
    counts = pd.DataFrame(counts, index=gene_info.index, columns=barcodes)

    from arboreto.algo import grnboost2, genie3

    log_scaled_counts = (
        np.log(counts.divide(counts.sum(axis=0), axis=1) * 10000 + 1))

    log_scaled_counts = log_scaled_counts.loc[valid_genes, :]

    # About 14 minutes for 5000 genes x 3000 cells
    old_dir = os.getcwd()
    os.makedirs("/scratch/david.detomaso/temp",
                exist_ok=True)  # Need this or else the workers time out
    os.chdir("/scratch/david.detomaso/temp")

    a = time.time()
    network = grnboost2(log_scaled_counts.T)
    b = time.time()

    print(b - a)

    os.chdir(old_dir)

    # Convert the long edge list to a wide TF x target matrix, then symmetrize

    net_wide = network.pivot(index='TF', columns='target', values='importance')
    z = net_wide.fillna(0)
    z = z + z.T

    z.to_csv(out_file_scores, sep="\t", compression="gzip")
Example #20
    def test_launch_grnboost2(self):
        network_df = grnboost2(df, tf_names=tfs)

        self.assertGreater(len(network_df), 100)
Example #21
df_cnt = pd.DataFrame(adata.X.toarray(),
                      index=adata.obs.index,
                      columns=adata.var.index)

#2. tf genes
tf_name = load_tf_names(f_tf)

#3. ranking databases (only 2 mm10 dbs)
l_fname = list(Path(fd_db).glob('*.feather'))
l_db = [RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#4. run
if __name__ == '__main__':
    #1. Inference of co-expression modules
    print('Inference...')
    df_adj = grnboost2(df_cnt, tf_names=tf_name, verbose=True)
    df_adj.to_csv(f'{fd_out}/adj_{sample}.csv', index=False)

    #2. prune
    df_adj = pd.read_csv(
        f'{fd_out}/adj_{sample}.csv')  # reload from disk; without this the run always gets stuck at 98%
    print('Prune...')
    l_mod = list(modules_from_adjacencies(df_adj, df_cnt))

    with ProgressBar():
        df_prune = prune2df(l_db, l_mod, f_motif)
    df_prune.to_csv(f'{fd_out}/prune_{sample}.csv')

    #3. create regulon
    print('Regulon...')
    regulon = df2regulons(df_prune)
Example #22
import pandas as pd

from distributed import Client, LocalCluster
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

if __name__ == '__main__':

    in_file = 'net1_expression_data.tsv'
    tf_file = 'net1_transcription_factors.tsv'
    out_file = 'net1_grn_output.tsv'

    # ex_matrix is a DataFrame with gene names as column names
    ex_matrix = pd.read_csv(in_file, sep='\t')

    # tf_names is read using a utility function included in Arboreto
    tf_names = load_tf_names(tf_file)

    # instantiate a custom Dask distributed Client
    client = Client(LocalCluster())

    # compute the GRN
    network = grnboost2(expression_data=ex_matrix,
                        tf_names=tf_names,
                        client_or_address=client)

    # write the GRN to file
    network.to_csv(out_file, sep='\t', index=False, header=False)
Example #23
        "GENIE3_import.csv", header=0,
        index_col=0).T  # loads expression matrix, make sure you transpose back
    databases_glob = os.path.join(
        "mm10__*.feather")  # loads cisTarget databases into memory
    db_fnames = glob.glob(databases_glob)

    def name(fname):
        return os.path.basename(fname).split(".")[0]

    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    # GRNBoost2 (an improved successor of GENIE3): returns co-expression adjacencies
    adjacencies = grnboost2(
        ex_matrix, tf_names=tf_names,
        verbose=True)
    modules = list(modules_from_adjacencies(
        adjacencies, ex_matrix))  # identifies modules from the adjacencies

    # save GRNBoost2 product so we don't have to repeat again
    adjacencies.to_csv("grnboost_output.csv")

    # load product in case something goes wrong
    adjacencies = pd.read_csv("grnboost_output.csv", index_col=0)

    # cisTarget process: IDs cis-regulatory footprints from motifs around the TSS
    with ProgressBar(
    ):  # calculate a list of enriched motifs and the corresponding target genes for all modules
        df = prune2df(dbs, modules, "motifs-v9-nr-mgi.txt")
    regulons = df2regulons(
Example #24
input_file = cell_use + '_matrix.csv'
output_grnboost2 = cell_use + '_grnboost2.csv'
output_genie3 = cell_use + '_genie3.csv'

output_grnboost2_txt = cell_use + '_grnboost2.txt'
output_genie3_txt = cell_use + '_genie3.txt'


# load data
ex_matrix = pd.read_csv(input_file, sep=',')
ex_matrix = np.transpose(ex_matrix)
tf_names = load_tf_names('mm_mgi_tfs.txt')

# infer the gene regulatory network
network_n = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)

network_g = genie3(ex_matrix, tf_names=tf_names, verbose=True)


# for the following igraph analysis
network_n.to_csv(output_grnboost2, sep='\t')
network_g.to_csv(output_genie3, sep='\t')


# txt files with no header or index, for FAC calculation
network_n.to_csv(output_grnboost2_txt, sep='\t', header=False, index=False)
network_g.to_csv(output_genie3_txt, sep='\t', header=False, index=False)


####### done!
Example #25
import os
import numpy as np
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

data_dir = '/home/brad/data2/rstudio/birds/scRNA/devin_combined/finch_cells/grn/export_to_numpy_glut'
expr_fname = os.path.join(data_dir, '1.1_exprMatrix_filtered_t.txt')
tf_fname = os.path.join(data_dir, '1.1_inputTFs.txt')

if __name__ == '__main__':
    # ex_matrix is a numpy ndarray, which has no notion of column names
    ex_matrix = np.genfromtxt(expr_fname, delimiter='\t', skip_header=1)

    # we read the gene names from the first line of the file
    with open(expr_fname) as file:
        gene_names = [gene.strip() for gene in file.readline().split('\t')]

    # sanity check to verify the ndarray's nr of columns equals the length of the gene_names list
    assert ex_matrix.shape[1] == len(gene_names)

    # tf_names is read using a utility function included in Arboreto
    tf_names = load_tf_names(tf_fname)

    network = grnboost2(expression_data=ex_matrix,
                        gene_names=gene_names,  # specify the gene_names
                        tf_names=tf_names)

    network.to_csv('output.tsv', sep='\t', index=False, header=False)
Example #26
dt = pd.read_csv(item, index_col=0)  # `item` is a loop variable over input files in the source script
dt.columns = [str(x) for x in dt.columns]
print(dt)
TFname = [
    'KLF6', 'TCEB3', 'LYL1', 'SMARCC1', 'TCOF1', 'ZNF267', 'ZEB2', 'MNDA',
    'ETS2', 'BAZ2B', 'POU2F2', 'MEF2C', 'KDM5A', 'PDLIM7', 'HDGF', 'ZBTB16',
    'ZNF350', 'STAT3', 'TAF1B', 'HIST2H2BE', 'DHX38', 'TP53', 'SMAD3', 'MXD4',
    'ARID5B', 'USF2', 'KDM2A', 'HIVEP3', 'MYBL1', 'HIST1H1E', 'ZNF593', 'BATF',
    'TAX1BP3', 'TRIM28', 'CBFB', 'CHD4', 'ZBTB38', 'PBX2', 'CTNNBIP1',
    'SERTAD2', 'ZMYND11', 'NCOA4', 'PER1', 'ID3', 'POLR2A', 'CDKN1A', 'TGFB1',
    'ZNF277', 'MAPK1', 'NEAT1', 'SP3', 'MAX', 'SMARCA2', 'REL', 'SIN3A',
    'NR4A1', 'ASCL2', 'JUND', 'TFDP2', 'BHLHE40', 'NFKBIA', 'HTT', 'SOX4',
    'SPI1', 'FOS', 'CITED2', 'CREM', 'PURA', 'HEXIM1', 'PKNOX1', 'CEBPB',
    'HHEX', 'BRD8', 'RUNX3', 'MAFB', 'EOMES', 'SERTAD3', 'ZNF143', 'ZNF467',
    'AKT1', 'ATF6', 'PTTG1', 'TBX21', 'UIMC1', 'IRF5', 'EED', 'ID1', 'IRF8',
    'HOPX', 'SUGP2', 'JUN', 'TAF6L', 'PDLIM1', 'SPIB', 'HIST1H1C', 'RNF19A',
    'CREBBP', 'IRF1', 'SUZ12', 'CHD8', 'HDAC5', 'BLZF1', 'SHPRH', 'CUX1',
    'RELB', 'GTF3C1', 'FOSB', 'MLXIP', 'NFIC', 'IRF7', 'BBC3', 'GTF2I', 'MKL1',
    'POLR1C', 'CEBPD', 'SMARCD2', 'IKZF3', 'SLA2'
]
client = Client(processes=False)
gene_name = list(dt.columns)
print(gene_name)
TFname = list(set(TFname) & set(gene_name))
print(dt)
network = grnboost2(dt,
                    client_or_address=client,
                    gene_names=list(dt.columns),
                    tf_names=TFname)
network.to_csv(item + 'TF_grnboost2.csv')
Example #27
		network_fname = os.path.join(data_folder_iter, 'network.csv.gz')
		modules_fname = os.path.join(data_folder_iter, 'modules.p')
		motifs_fname = os.path.join(data_folder_iter, 'motifs.csv')
		regulons_fname = os.path.join(data_folder_iter, 'regulons.p')
		aucell_train_fname = os.path.join(data_folder_iter, 'aucell_train_scores.csv.gz')
		aucell_test_fname = os.path.join(data_folder_iter, 'aucell_test_scores.csv.gz')
		if not os.path.exists(data_folder_iter):
			os.makedirs(data_folder_iter)

		os.chdir(data_folder_iter)



		## Run GRNBoost2 (faster equivalent of GENIE3) from arboreto to infer co-expression modules
		if not os.path.isfile(network_fname):
			adjacencies = grnboost2(data_train, tf_names=tf_names, verbose=True, client_or_address=custom_client, seed=i)
			adjacencies.to_csv(network_fname, sep=',', header=True, index=False, compression='gzip')
		else:
			adjacencies = pd.read_csv(network_fname)



		## Derive potential regulons from co-expression modules
		if not os.path.isfile(modules_fname):
			modules = list(modules_from_adjacencies(adjacencies, data_train, keep_only_activating=False))
			pickle.dump(modules, open(modules_fname, 'wb'))
		else:
			modules = pickle.load(open(modules_fname, 'rb'))
		
		del adjacencies
Example #28
# download and unzip file
url = 'https://tcga.xenahubs.net/download/TCGA.' + canType + '.sampleMap/HiSeqV2.gz'
wget.download(url, 'cancer_data_TCGA.gz')

with gzip.open('cancer_data_TCGA.gz', 'rb') as f_in:
    with open('file.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
os.remove("cancer_data_TCGA.gz")
############################################################################

df = pd.read_table("file.txt", index_col=0, sep='\t')
#ex_matrix.columns.get_values()
cols = [c for c in df.columns if c[13:15] == '03']  # characters 13-15 of a TCGA barcode hold the sample-type code
df = df[cols]

matrix = df.T

network = grnboost2(expression_data=matrix, tf_names=tf_names, verbose=True)

network.to_csv(canType + '_ex_GRNboost2_network.tsv',
               sep='\t',
               header=False,
               index=False)

# release the memory from python
#del ex_matrix, matrix, df, tf_names, network
os.remove('file.txt')
shutil.rmtree('dask-worker-space')
# end function