Exemplo n.º 1
0
def main():
    """Run calc_gene_sim_mat for every gene class on each configured hzome file.

    Loads the gene-info JSON through a Network instance, loads each data file
    listed in ``hzome_names`` into a dataframe, and hands that dataframe to
    ``calc_gene_sim_mat`` once per key of ``gene_info`` together with the
    per-file cutoff table.
    """
    net = Network()

    # genes of interest
    gene_info = net.load_json_to_dict('../grant_pois/gene_info_with_dark.json')

    # data files to process (ENCODE, GTEx, etc); other candidates that have
    # cutoffs defined below: 'my_CCLE_exp.txt', 'ChEA_TF_targets.txt'
    hzome_names = ['ENCODE_TF_targets.txt']

    # every file gets its own similarity cutoff
    cutoffs = {
        'my_CCLE_exp.txt': 0.15,
        'ENCODE_TF_targets.txt': 0.35,  # 0.6 noted as an alternative
        'ChEA_TF_targets.txt': 0.2,
        'my_gtex_Moshe_2017_exp.txt': 0.2,
    }

    # NOTE(review): this value is never used below; the lookup is kept as-is
    kinase_genes = gene_info['KIN']['all']

    for hzome_name in hzome_names:
        data_path = '../hzome_data/' + hzome_name

        print('loading data ')

        if 'my_' not in hzome_name:
            # file is in hzome format: use the dedicated matrix loader
            hzome_data = deepcopy(hzome_to_df.load_matrix(data_path))
        else:
            # self-provided file: go through the regular Network loader
            net.load_file(data_path)
            hzome_data = net.export_df()

        print('data loaded\n')

        for gene_class in gene_info:
            calc_gene_sim_mat(
                hzome_data, net, gene_info, gene_class, hzome_name, cutoffs)
def make_ccle_matrix_subset():
    '''
    Save a subset of the downsampled CCLE matrix using the proteins of interest.

    Reads 'CCLE/CCLE_kmeans_ds_col_100.txt' and
    'proteins_of_interest/proteins_of_interest.json', keeps only the rows whose
    name appears in one of the protein-of-interest lists, and writes two
    tab-separated files:

    * 'CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt' -- plain gene names as index
    * 'CCLE/CCLE_kmeans_ds_col_100_poi.txt' -- index replaced by
      ('gene: <name>', 'type: <protein type>') tuples

    Rows are written in sorted gene-name order so that repeated runs produce
    identical files (plain set iteration order varies between runs).
    '''
    from clustergrammer import Network
    import json_scripts

    print('-- load CCLE downsampled data')

    # load downsampled CCLE data
    net = Network()
    net.load_file('CCLE/CCLE_kmeans_ds_col_100.txt')

    df = net.export_df()

    # load proteins of interest (mapping: protein type -> list of names)
    filename = 'proteins_of_interest/proteins_of_interest.json'
    poi = json_scripts.load_to_dict(filename)

    # Build the gene -> type lookup once. A gene listed under several types
    # keeps the last type in iteration order, as the per-gene scan used to do.
    type_of_gene = {}
    for inst_type in poi:
        for inst_gene in poi[inst_type]:
            type_of_gene[inst_gene] = inst_type

    # only keep pois that are found in the CCLE; sort for a reproducible order
    found_poi = sorted(set(df.index.tolist()) & set(type_of_gene))

    print(
        str(len(found_poi)) +
        ' proteins of interest were found in the CCLE data')

    # filter dataframe rows by label
    df = df.loc[found_poi]

    # save version without protein categories (e.g. kinase)
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt', sep='\t')

    # add protein type to gene names
    row_cats = [
        ('gene: ' + inst_gene, 'type: ' + type_of_gene[inst_gene])
        for inst_gene in found_poi
    ]

    # redefine index
    df.index = row_cats

    print('-- save matrix with proteins_of_interest subset')
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi.txt', sep='\t')
def main():
    """Save the CCLE lung columns that match the cell lines in ccle_cl_names.txt.

    Reads the cell-line names (one per line, blank lines ignored), loads
    'CCLE_lung.txt' through clustergrammer's Network, reduces every column
    label to a simplified cell-line name, prints how many simplified names
    occur in the name list, and writes the columns selected by the name list
    to 'CCLE_CST_lung.txt' (tab-separated).
    """
    from clustergrammer import Network

    data_dir = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/'

    # load CCLE cell lines; the context manager closes the file even if
    # reading fails, and blank lines are skipped so they cannot end up as
    # empty column labels in the selection below
    cl_names = []
    with open(data_dir + 'ccle_cl_names.txt', 'r') as f:
        for inst_line in f:
            inst_line = inst_line.strip()
            if inst_line:
                cl_names.append(inst_line)

    net = Network()
    net.load_file(data_dir + 'CCLE_lung.txt')

    ccle_lung = net.export_df()

    # simplify cols, discard meta-data
    # NOTE(review): assumes each column label is a tuple whose first element
    # looks like '<label>: <cell line>' -- confirm against the input file
    simple_cols = []
    for inst_col in ccle_lung.columns.tolist():
        proc_col = inst_col[0].split(': ')[1].replace('NCI', '')

        # order matters: 'CALU' must be rewritten before the 'CAL' rule runs
        proc_col = proc_col.replace('CALU', 'Calu-')
        proc_col = proc_col.replace('LOU', 'Lou-')
        proc_col = proc_col.replace('CAL', 'CAL-')

        simple_cols.append(proc_col)

    ccle_lung.columns = simple_cols

    # count the simplified column names that appear in the name list
    known_names = set(cl_names)
    found_cols = [inst_col for inst_col in simple_cols if inst_col in known_names]

    print('found ' + str(len(found_cols)))

    # save subset of cell lines that are also found in the CST PTM data;
    # selection is by cl_names, so columns follow the order of the names file
    ccle_cst_lung = ccle_lung[cl_names]

    save_filename = data_dir + 'CCLE_CST_lung.txt'
    ccle_cst_lung.to_csv(save_filename, sep='\t')