def create_and_save_partitions(dataset, study_name, meta_label, test_groups, pretest_groups, valid_groups, save_text_files=True):
    """Split ``dataset`` by a row-metadata label and save every partition.

    Rows whose ``dataset.rowmeta[meta_label]`` value is one of the null-like
    tokens listed below are discarded first. The test, pretest and validation
    partitions are then popped from ``dataset`` in that order, and whatever is
    left in ``dataset`` is saved as the training partition.

    Args:
        dataset: object exposing ``shape``, ``rowmeta``, ``discard(mask, axis)``
            and ``pop(mask, axis)``. It is modified in place; after the call it
            is the object that was saved as the training partition.
        study_name: used as a directory component of the output folder.
        meta_label: key into ``dataset.rowmeta`` holding the grouping values.
        test_groups: grouping values whose rows go to the test partition.
        pretest_groups: grouping values whose rows go to the pretest partition.
        valid_groups: grouping values whose rows go to the validation partition.
        save_text_files: when true, each partition is additionally written
            with ``datasetIO.save_splitdata`` into its own sub-folder.

    Returns:
        None. Output goes to ``../partitioned_data/<study_name>/<orientation>``
        (relative to the current working directory) and progress is printed.

    Note:
        Output folders are created with ``exist_ok=True``, so calling this
        again for the same study and orientation no longer raises
        ``FileExistsError``; the save calls are simply issued against the
        existing folders.
    """
    # determine dataset orientation
    # (decided from the shape before any rows are discarded or popped)
    orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat'

    # discard null categories
    null_labels = ['-666', '', 'NA', 'N/A', 'na', 'n/a', 'NaN', 'NAN', 'nan']
    tobediscarded = np.in1d(dataset.rowmeta[meta_label], null_labels)
    dataset.discard(tobediscarded, 0)
    print('discarding {0!s} samples...'.format(tobediscarded.sum()), flush=True)
    print(dataset, flush=True)

    # partition the data
    partitions = []
    for name, groups in (('test', test_groups),
                         ('pretest', pretest_groups),
                         ('valid', valid_groups)):
        # The mask is rebuilt from the current rowmeta on every pass because
        # the previous pop has (presumably) already removed rows from dataset.
        tobepopped = np.in1d(dataset.rowmeta[meta_label], groups)
        partition = dataset.pop(tobepopped, 0)
        print(' {0}'.format(name.upper()), flush=True)
        print(partition, flush=True)
        partitions.append((name, partition))

    # whatever was not popped is the training partition
    print(' TRAIN', flush=True)
    print(dataset, flush=True)
    partitions.append(('train', dataset))

    # save data partitions
    savefolder = '../partitioned_data/{0}/{1}'.format(study_name, orientation)
    print(' SAVING PARTITIONS TO {0}'.format(savefolder), flush=True)
    os.makedirs(savefolder, exist_ok=True)
    for name, partition in partitions:
        datasetIO.save_datamatrix('{0}/{1}.pickle'.format(savefolder, name), partition)

    if save_text_files:
        for name, partition in partitions:
            textfolder = '{0}/{1}'.format(savefolder, name)
            os.makedirs(textfolder, exist_ok=True)
            datasetIO.save_splitdata(textfolder, partition)
# Apply the discard mask that was computed earlier in the script (axis 0).
gene_atb.discard(tobediscarded, 0)
print(gene_atb, flush=True)

# discard pseudogenes
# Implemented as an allow-list: every entry whose 'locus_type' row metadata
# is NOT one of the types below is discarded.
print('discarding pseudogenes data...', flush=True)
observed_locus_types = np.unique(gene_atb.rowmeta['locus_type'])
print(observed_locus_types.tolist())
kept_locus_types = [
    'RNA, long non-coding',
    'RNA, micro',
    'T cell receptor gene',
    'gene with protein product',
    'immunoglobulin gene',
    'protocadherin',
]
tobediscarded = np.in1d(gene_atb.rowmeta['locus_type'], kept_locus_types, invert=True)
gene_atb.discard(tobediscarded, 0)
print(gene_atb, flush=True)

# add mp metadata
# Column labels are looked up in the pickled id -> name mapping; labels that
# are absent from the mapping get the placeholder string 'nan'.
print('adding mouse phenotype metadata data...', flush=True)
with open('../../original_data/impc/mpid_name_dict.pickle', 'rb') as fr:
    mpid_name = pickle.load(fr)
mp_names = [
    mpid_name[mpid] if mpid in mpid_name else 'nan'
    for mpid in gene_atb.columnlabels
]
gene_atb.columnmeta['mp_name'] = np.array(mp_names, dtype='object')
missing_name_count = (gene_atb.columnmeta['mp_name'] == 'nan').sum()
print('missing phenotype names for {0!s} phenotype ids'.format(missing_name_count), flush=True)

# save the data
# Written twice next to the original data (.pickle and .txt.gz), then written
# as split data into the input folder, into which both files are also copied.
print('saving prepared data...', flush=True)
gene_atb.matrixname += '_prepared'
prepared_pickle_path = '../../original_data/impc/gene_phenotype_impc_trimmed_thresholded_prepared.pickle'
prepared_text_path = '../../original_data/impc/gene_phenotype_impc_trimmed_thresholded_prepared.txt.gz'
datasetIO.save_datamatrix(prepared_pickle_path, gene_atb)
datasetIO.save_datamatrix(prepared_text_path, gene_atb)

savefolder = '../../input_data/impc_binary'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, gene_atb)
shutil.copyfile(prepared_pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(prepared_text_path, '{0}/datamatrix.txt.gz'.format(savefolder))

print('done.', flush=True)
snp_genome.columnmeta[metalabel][np.in1d( snp_genome.columnmeta[metalabel], low_freq_uvals)] = 'NA' # save the data print('saving prepared data...', flush=True) snp_genome.matrixname += '_prepared' datasetIO.save_datamatrix( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle', snp_genome) datasetIO.save_datamatrix( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz', snp_genome) savefolder = '../../input_data/1000genomes_genomes' if not os.path.exists(savefolder): os.makedirs(savefolder) datasetIO.save_splitdata(savefolder, snp_genome) shutil.copyfile( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle', '{0}/datamatrix.pickle'.format(savefolder)) shutil.copyfile( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder)) print('done.', flush=True) # visualization pca_model = PCA(n_components=2).fit(snp_genome.matrix) pca_matrix = pca_model.transform(snp_genome.matrix) fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3)) ax.set_position([0.15 / 6.5, 0.15 / 4.3, 4.0 / 6.5, 4.0 / 4.3]) ax.plot(pca_matrix[:, 0],
# Load the prepared nanostring + clinical data matrix.
source_pickle_path = '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle'
dataset = datasetIO.load_datamatrix(source_pickle_path)
print(dataset, flush=True)

# discard samples
# Entries whose 'irrecist' row metadata equals 'stable disease' are removed.
print('discarding samples...', flush=True)
is_stable_disease = dataset.rowmeta['irrecist'] == 'stable disease'
dataset.discard(is_stable_disease, 0)
print(dataset, flush=True)

# save the data
# Written twice next to the original data (.pickle and .txt.gz), then written
# as split data into the input folder, into which both files are also copied.
print('saving data...', flush=True)
filtered_pickle_path = '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle'
filtered_text_path = '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz'
datasetIO.save_datamatrix(filtered_pickle_path, dataset)
datasetIO.save_datamatrix(filtered_text_path, dataset)

savefolder = '../../input_data/pratfelip_transposed_plus_clinical_no_stabledisease'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
shutil.copyfile(filtered_pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(filtered_text_path, '{0}/datamatrix.txt.gz'.format(savefolder))

# load the data
# Second data matrix (clinical + deconvolution features); it gets the same
# 'stable disease' filter as the first one.
print('loading dataset...', flush=True)
clinical_pickle_path = '../../original_data/pratfelip_symlnk/patient_ft_pratfelip_only_clinical_and_deconv.pickle'
dataset = datasetIO.load_datamatrix(clinical_pickle_path)
print(dataset, flush=True)

# discard samples
print('discarding samples...', flush=True)
is_stable_disease = dataset.rowmeta['irrecist'] == 'stable disease'
dataset.discard(is_stable_disease, 0)
print(dataset, flush=True)

# save the data
print('saving data...', flush=True)
# Report how many entries the gene -> cell type mapping holds, then show the
# data matrix it is about to annotate.
rgep_gene_count = len(gene_cell)
print('rgep_genes: {0!s}'.format(rgep_gene_count), flush=True)
print(atb_gene)

# add cell type metadata
# Every value of the 'symbol' column metadata is looked up directly in
# gene_cell, so a symbol missing from the mapping raises KeyError here.
print('adding cell type metadata...', flush=True)
cell_types = [gene_cell[gene_sym] for gene_sym in atb_gene.columnmeta['symbol']]
atb_gene.columnmeta['rgep_cell_type'] = np.array(cell_types, dtype='object')

# save the data
# Written twice next to the original data (.pickle and .txt.gz), then written
# as split data into the input folder, into which both files are also copied.
# All output names embed rgep_name.
print('saving filtered data...', flush=True)
filtered_pickle_path = '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'.format(rgep_name)
filtered_text_path = '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'.format(rgep_name)
datasetIO.save_datamatrix(filtered_pickle_path, atb_gene)
datasetIO.save_datamatrix(filtered_text_path, atb_gene)

savefolder = '../../input_data/hugolo_transposed_filtered_by_{0}_rgep'.format(rgep_name)
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, atb_gene)
shutil.copyfile(filtered_pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(filtered_text_path, '{0}/datamatrix.txt.gz'.format(savefolder))

print('done.', flush=True)
snp_haplome.reorder(np.random.permutation(snp_haplome.shape[1]), 1) print(snp_haplome, flush=True) # save the data print('saving prepared data...', flush=True) snp_haplome.matrixname += '_prepared' datasetIO.save_datamatrix( '../../original_data/1000genomes/snp_haplome_1000genomes-phased-MHC_prepared.pickle', snp_haplome) datasetIO.save_datamatrix( '../../original_data/1000genomes/snp_haplome_1000genomes-phased-MHC_prepared.txt.gz', snp_haplome) savefolder = '../../input_data/1000genomes_haplomes' if not os.path.exists(savefolder): os.makedirs(savefolder) datasetIO.save_splitdata(savefolder, snp_haplome) shutil.copyfile( '../../original_data/1000genomes/snp_haplome_1000genomes-phased-MHC_prepared.pickle', '{0}/datamatrix.pickle'.format(savefolder)) shutil.copyfile( '../../original_data/1000genomes/snp_haplome_1000genomes-phased-MHC_prepared.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder)) print('done.', flush=True) # visualization pca_model = PCA(n_components=2).fit(snp_haplome.matrix) pca_matrix = pca_model.transform(snp_haplome.matrix) fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3)) ax.set_position([0.15 / 6.5, 0.15 / 4.3, 4.0 / 6.5, 4.0 / 4.3]) ax.plot(pca_matrix[:, 0],