def random_select_subset(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file

    if ':' not in args.input_file:
        raise ValueError('missing group name in input file: ' + args.input_file)
    logger.info('read input file: ' + args.input_file)
    input_file, group_name = args.input_file.split(':')
    with h5py.File(input_file, 'r') as f:
        X = f['/%s/X' % group_name][:]
        chrom = f['/%s/chrom' % group_name][:]
        positions = f['/%s/position' % group_name][:]
    logger.info('select %d SNP subsets of size %d' % (args.n_groups, args.n_snps))
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        if args.method == 'random_choice':
            # sample each subset uniformly without replacement
            for i in range(args.n_groups):
                ind = np.random.choice(X.shape[0], size=args.n_snps, replace=False)
                f.create_dataset('/%d/X' % i, data=X[ind])
                f.create_dataset('/%d/chrom' % i, data=chrom[ind])
                f.create_dataset('/%d/position' % i, data=positions[ind])
        elif args.method == 'seq':
            # split SNPs into consecutive, non-overlapping blocks
            # (integer division; '/' would yield a float in Python 3)
            for i in range(X.shape[0] // args.n_snps):
                ind = np.r_[(i * args.n_snps):((i + 1) * args.n_snps)]
                f.create_dataset('/%d/X' % i, data=X[ind])
                f.create_dataset('/%d/chrom' % i, data=chrom[ind])
                f.create_dataset('/%d/position' % i, data=positions[ind])
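# A minimal usage sketch for random_select_subset (assumption: these subcommand
# functions receive an argparse.Namespace from a CLI dispatcher; the file names
# below are hypothetical). The part after ':' names the HDF5 group that holds
# the X/chrom/position datasets.
def _demo_random_select_subset():
    from argparse import Namespace
    args = Namespace(input_file='genotypes.h5:0',   # hypothetical path:group
                     output_file='snp_subsets.h5',  # hypothetical path
                     n_groups=10, n_snps=1000, method='random_choice')
    random_select_subset(args)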
def classify_diseases(args):
    from keras.models import load_model
    from models import custom_objects
    from utils import read_hdf5, array_lookup, prepare_output_file
    import numpy as np
    import h5py

    logger.info('load model from file: ' + args.model_file)
    model = load_model(args.model_file, custom_objects=custom_objects)
    logger.info('read image ids from file: ' + args.image_id_file)
    image_id = read_hdf5(args.image_id_file, args.image_id_dataset)
    logger.info('read image data from file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as fin:
        X = fin['X'][:]
        image_id_X = fin['image_id'][:]
    if model.input.shape[3] > 1:
        logger.info('convert gray-scale images to 3-channel images')
        X = np.repeat(X[array_lookup(image_id_X, image_id)], 3, axis=3)
    else:
        X = np.take(X, array_lookup(image_id_X, image_id), axis=0)
    logger.info('predict')
    y = model.predict(X, batch_size=args.batch_size)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        fout.create_dataset('y', data=y)
        fout.create_dataset('image_id', data=image_id)
def random_cv_split(args):
    import numpy as np
    import h5py
    from utils import (read_hdf5_single, get_indices_table,
                       prepare_output_file, read_hdf5_dataset)

    logger.info('read training indices file: ' + args.train_index_file)
    train_indices_all = read_hdf5_dataset(args.train_index_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    indices_table, mask = get_indices_table(train_indices_all, parent_table)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for k in range(args.n_datasets):
            # hold out all hybrids in 5 random rows and 5 random columns of
            # the parent table, so held-out hybrids involve held-out parents
            row_indices = np.random.choice(indices_table.shape[0], 5, replace=False)
            col_indices = np.random.choice(indices_table.shape[1], 5, replace=False)
            test_indices = np.union1d(
                indices_table[row_indices, :].reshape((-1,)),
                indices_table[:, col_indices].reshape((-1,)))
            train_indices = np.setdiff1d(train_indices_all, test_indices)
            # restrict both sets to samples present in the training set
            test_indices = np.intersect1d(test_indices, train_indices_all)
            train_indices = np.intersect1d(train_indices, train_indices_all)
            g = f.create_group(str(k))
            g.create_dataset('train', data=train_indices)
            g.create_dataset('test', data=test_indices)
def plot_model(args):
    import keras
    from utils import prepare_output_file

    logger.info('load model from file: ' + args.model_file)
    model = keras.models.load_model(args.model_file)
    logger.info('save model plot to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    keras.utils.plot_model(model, args.output_file, show_shapes=True)
def run_generate_parent_table(args):
    import h5py
    from utils import generate_parent_table, prepare_output_file

    logger.info('read phenotypes from file: ' + args.input_file)
    parent_table = generate_parent_table(args.input_file)
    logger.info('save parent table to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('data', data=parent_table)
def random_select(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file

    logger.info('read genomic positions from file: ' + args.genomic_pos_file)
    positions = {}
    with h5py.File(args.genomic_pos_file, 'r') as f:
        for i in range(1, 11):
            positions['chr%d' % i] = f['chr%d' % i][:]
    n_snps_per_chrom = {chrom: positions[chrom].shape[0]
                        for chrom in positions.keys()}
    n_snps_total = sum(n_snps_per_chrom.values())
    X = [[] for i in range(args.n_select)]
    chroms = [[] for i in range(args.n_select)]
    positions_sel = [[] for i in range(args.n_select)]
    n_sel_total = 0
    for i_chrom in range(1, 11):
        chrom = 'chr%d' % i_chrom
        genotype_file = os.path.join(args.input_dir, chrom)
        logger.info('read genotype file: ' + genotype_file)
        with h5py.File(genotype_file, 'r') as f:
            # allocate selected SNPs proportionally to chromosome size
            n_sel = int(np.round(args.n_snps * float(n_snps_per_chrom[chrom]) / n_snps_total))
            logger.info('select %d SNPs on chromosome %d' % (n_sel, i_chrom))
            X_chrom = f['data'][:]
            for i_select in range(args.n_select):
                ind = np.random.choice(n_snps_per_chrom[chrom], size=n_sel, replace=False)
                X[i_select].append(X_chrom[ind])
                chroms[i_select].append(np.full(n_sel, i_chrom, dtype='int8'))
                positions_sel[i_select].append(positions[chrom][ind])
            del X_chrom
        n_sel_total += n_sel
    logger.info('number of SNPs selected: %d' % n_sel_total)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        for i_select in range(args.n_select):
            X[i_select] = np.concatenate(X[i_select], axis=0)
            chroms[i_select] = np.concatenate(chroms[i_select])
            positions_sel[i_select] = np.concatenate(positions_sel[i_select])
            g = fout.create_group(str(i_select))
            g.create_dataset('X', data=X[i_select])
            g.create_dataset('chrom', data=chroms[i_select])
            g.create_dataset('position', data=positions_sel[i_select])
def phenotypes_to_hdf5(args):
    import pandas as pd
    import h5py
    from utils import prepare_output_file

    logger.info('read phenotype file: ' + args.input_file)
    phenotypes = pd.read_table(args.input_file)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for col in phenotypes.columns:
            # h5py cannot store object (string) columns directly; convert to bytes
            if phenotypes[col].dtype == 'O':
                f.create_dataset(col, data=phenotypes[col].values.astype('S'))
            else:
                f.create_dataset(col, data=phenotypes[col].values)
def convert_train_test_indices(args):
    import numpy as np
    import h5py
    from utils import prepare_output_file

    logger.info('read training sample indices from file: ' + args.train_index_file)
    train_index = np.loadtxt(args.train_index_file, dtype='int')
    logger.info('read test sample indices from file: ' + args.test_index_file)
    test_index = np.loadtxt(args.test_index_file, dtype='int')
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('train', data=train_index)
        f.create_dataset('test', data=test_index)
def phenotypes_to_train_test_indices(args):
    import pandas as pd
    import numpy as np
    import h5py
    from utils import prepare_output_file

    logger.info('read input file: ' + args.input_file)
    phenotypes = pd.read_table(args.input_file)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('train',
                         data=np.nonzero((phenotypes['type'] == 'training').values)[0])
        f.create_dataset('test',
                         data=np.nonzero((phenotypes['type'] == 'test').values)[0])
def normalize_genotypes(args):
    import numpy as np
    import h5py
    from utils import read_hdf5_dataset, prepare_output_file

    logger.info('read input file: ' + args.input_file)
    X, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X.shape
    # allele frequencies
    p = X.sum(axis=1).astype('float32') / n_samples
    # per-SNP scaling factor 1/sqrt(2*p*(1-p))
    multiplier = 1.0 / np.sqrt(2.0 * p * (1.0 - p))
    multiplier = multiplier.astype('float32')
    logger.info('save mean and multipliers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('mean', data=p)
        f.create_dataset('multiplier', data=multiplier)
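# A minimal sketch of how the saved factors might be applied downstream
# (assumption: the 'mean' dataset is subtracted and 'multiplier' applied
# per SNP, matching the usual (x - mean)/sqrt(2p(1-p)) standardization;
# the file names and the 'data' dataset name are hypothetical).
def _apply_normalization(genotype_file, factor_file):
    import numpy as np
    import h5py
    with h5py.File(genotype_file, 'r') as f:
        X = f['data'][:].astype('float32')  # SNPs x samples
    with h5py.File(factor_file, 'r') as f:
        mean = f['mean'][:]
        multiplier = f['multiplier'][:]
    # broadcast the per-SNP factors over the sample axis
    return (X - mean[:, np.newaxis]) * multiplier[:, np.newaxis]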
def extract_snp_pos(args):
    import subprocess
    import numpy as np
    import h5py
    from utils import prepare_output_file

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        for i in range(1, 11):
            genotype_file = os.path.join(args.input_dir, 'chr%d_emaize.genoMat' % i)
            logger.info('read genotype matrix file: ' + genotype_file)
            # column 4 of the genotype matrix holds the SNP position; skip the header
            p = subprocess.Popen(['awk', 'NR>1{print $4}', genotype_file],
                                 stdout=subprocess.PIPE)
            positions = np.loadtxt(p.stdout, dtype='int64')
            fout.create_dataset('chr%d' % i, data=positions)
def create_gsm(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file, read_hdf5_dataset
    # legacy variant kept for reference: sample SNPs per chromosome before
    # computing the GSM
    '''
    logger.info('read genomic positions from file: ' + args.genomic_pos_file)
    positions = {}
    with h5py.File(args.genomic_pos_file, 'r') as f:
        for i in range(1, 11):
            positions['chr%d' % i] = f['chr%d' % i][:]
    n_snps_per_chrom = {chrom: positions[chrom].shape[0] for chrom in positions.keys()}
    n_snps_total = sum(n_snps_per_chrom.values())
    X = []
    for chrom in positions.keys():
        genotype_file = os.path.join(args.input_dir, chrom)
        logger.info('read genotype file: ' + genotype_file)
        with h5py.File(genotype_file, 'r') as f:
            n_sel = int(np.round(args.n_snps * float(n_snps_per_chrom[chrom]) / n_snps_total))
            ind = np.random.choice(n_snps_per_chrom[chrom], size=n_sel)
            X.append(f['data'][:][ind])
    X = np.concatenate(X, axis=0).astype('float32')
    '''
    logger.info('read genotypes from file: ' + args.input_file)
    X = read_hdf5_dataset(args.input_file).astype('float64')
    logger.info('number of selected SNPs: %d' % X.shape[0])
    logger.info('calculate GSM')
    # standardize each SNP (row): zero mean, unit L2 norm
    X -= X.mean(axis=1)[:, np.newaxis]
    X_std = np.sqrt(np.sum(X ** 2, axis=1))
    X_std[np.isclose(X_std, 0.0)] = 1.0  # avoid division by zero for monomorphic SNPs
    X = X / X_std[:, np.newaxis]
    logger.info('calculate K')
    K = np.dot(X.T, X)
    logger.info('run SVD on X')
    U, S, V = np.linalg.svd(X.T, full_matrices=False)
    V = V.T
    logger.info('save GSM to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('K', data=K)
        f.create_dataset('U', data=U)
        f.create_dataset('S', data=S)
        f.create_dataset('V', data=V)
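# A small self-check (not part of the original pipeline): with K = X^T X and
# the SVD X^T = U S V^T, the identity K = U diag(S^2) U^T must hold, so the
# saved SVD factors can be validated against the saved GSM. The file name is
# hypothetical.
def _check_gsm(gsm_file):
    import numpy as np
    import h5py
    with h5py.File(gsm_file, 'r') as f:
        K, U, S = f['K'][:], f['U'][:], f['S'][:]
    K_rec = np.dot(U * (S ** 2), U.T)  # U diag(S^2) U^T
    assert np.allclose(K, K_rec)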
def convert_2bit_to_minor(args):
    import numpy as np
    import h5py
    import numba
    from utils import read_hdf5_dataset, prepare_output_file

    @numba.jit(nopython=True)
    def _2bit_to_minor(X_2bit, X_minor):
        n_snps = X_minor.shape[0]
        n_samples = X_minor.shape[1]
        for i in range(n_snps):
            freq = 0
            for j in range(n_samples):
                # 2bit code (allele0, allele1): 10 -> 0, 11 -> 1, 01 -> 2
                count = X_2bit[i, 1, j] - X_2bit[i, 0, j] + 1
                freq += count
                X_minor[i, j] = count
            # if the coded allele is actually the major allele, swap the coding
            if freq > n_samples:
                for j in range(n_samples):
                    X_minor[i, j] = 2 - X_minor[i, j]

    logger.info('read input file: ' + args.input_file)
    X_2bit, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X_2bit.shape
    n_snps //= 2  # two rows per SNP in the 2bit representation
    logger.info('number of SNPs: %d, number of samples: %d' % (n_snps, n_samples))
    X_2bit = X_2bit.reshape((n_snps, 2, n_samples))
    logger.info('convert from 2bit code to minor copy numbers')
    # assume that the second allele in the 2bit representation is the minor
    # allele; an equivalent vectorized form:
    #   X_minor = np.einsum('ijk,j->ik', X_2bit, np.array([-1, 1])) + 1
    #   allele_freq = np.expand_dims(np.sum(X_2bit[:, 1, :], axis=1), axis=1)
    #   X_minor = np.where(allele_freq <= n_samples/2, X_minor, 2 - X_minor)
    X_minor = np.empty((n_snps, n_samples), dtype='int8')
    _2bit_to_minor(X_2bit, X_minor)
    logger.info('save minor allele copy numbers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset(dataset, data=X_minor)
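# A pure-numpy sketch of the same conversion, usable as a reference
# implementation for testing (assumption: since _2bit_to_minor is nested inside
# convert_2bit_to_minor, this standalone version mirrors its logic rather than
# calling it). X_2bit has shape (n_snps, 2, n_samples).
def _2bit_to_minor_numpy(X_2bit):
    import numpy as np
    n_samples = X_2bit.shape[2]
    # per-sample copy numbers: 10 -> 0, 11 -> 1, 01 -> 2
    X_minor = (X_2bit[:, 1, :] - X_2bit[:, 0, :] + 1).astype('int8')
    # flip SNPs whose coded allele turns out to be the major one
    flip = X_minor.sum(axis=1, dtype='int64') > n_samples
    X_minor[flip] = 2 - X_minor[flip]
    return X_minor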
def analyze_relevance(args):
    import keras
    import numpy as np
    import h5py
    from utils import read_hdf5, array_lookup, prepare_output_file

    logger.info('load model from file: ' + args.model_file)
    model = keras.models.load_model(args.model_file)
    logger.info('read image id file: ' + args.image_id_file)
    image_id = read_hdf5(args.image_id_file, args.image_id_dataset)
    logger.info('read input file: ' + args.input_file)
    X, image_id_X = read_hdf5(args.input_file, [args.input_dataset, 'image_id'])
    if model.input.shape[3] > 1:
        logger.info('convert gray-scale images to 3-channel images')
        X = np.repeat(X[array_lookup(image_id_X, image_id)], 3, axis=3)
    else:
        X = np.take(X, array_lookup(image_id_X, image_id), axis=0)
    logger.info('read target file: ' + args.target_file)
    y, image_id_y = read_hdf5(args.target_file, [args.target_dataset, 'image_id'])
    y = np.take(y, array_lookup(image_id_y, image_id), axis=0)
    if len(y.shape) == 1:
        y = np.expand_dims(y, axis=1)
    if args.method == 'deep_taylor':
        logger.info('input tensor name: ' + model.input.name.split(':')[0])
        logger.info('output tensor name: ' + model.output.name.split(':')[0])
        logger.info('start Deep Taylor decomposition')
        relevance_maps = deep_taylor(model, X, y)
    elif args.method == 'sensitivity':
        logger.info('start sensitivity analysis')
        relevance_maps = sensitivity_analysis(model, X, y, args.batch_size)
    logger.info('save relevance maps to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        fout.create_dataset('X', data=X)
        fout.create_dataset('relevance_map', data=relevance_maps)
        fout.create_dataset('image_id', data=image_id)
def evaluate(args):
    import h5py
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr
    from utils import prepare_output_file, read_hdf5_dataset

    logger.info('read prediction file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read sample indices file: ' + args.sample_indices_file)
    indices = read_hdf5_dataset(args.sample_indices_file)
    y_true = y_true[indices]
    y_pred = y_pred[indices]
    logger.info('save metrics file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with open(args.output_file, 'w') as f:
        f.write('r2\tmse\tpcc\n')
        f.write('%f' % r2_score(y_true, y_pred))
        f.write('\t%f' % mean_squared_error(y_true, y_pred))
        f.write('\t%f' % pearsonr(y_true, y_pred)[0])
        f.write('\n')
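# A quick illustration (not part of the pipeline) of the three metrics the
# evaluate step writes, on toy data; all three calls use the standard
# sklearn/scipy APIs.
def _demo_metrics():
    import numpy as np
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr
    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([1.1, 1.9, 3.2, 3.8])
    print('r2=%f\tmse=%f\tpcc=%f' % (r2_score(y_true, y_pred),
                                     mean_squared_error(y_true, y_pred),
                                     pearsonr(y_true, y_pred)[0]))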
    kfold_male = KFold(args.k_male, shuffle=True)
    for train_index_female, _ in kfold_female.split(ind_female_training):
        train_index_female = ind_female_training[train_index_female]
        for train_index_male, _ in kfold_male.split(ind_male_training):
            train_index_male = ind_male_training[train_index_male]
            parent_table_train = np.ravel(
                parent_table[:, train_index_female][train_index_male])
            train_index.append(parent_table_train[is_training[parent_table_train]])
            parent_table_test = np.setdiff1d(
                np.ravel(parent_table_training), parent_table_train)
            test_index.append(parent_table_test[is_training[parent_table_test]])
    import h5py
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        if args.max_size is not None:
            if len(train_index) > args.max_size:
                logger.info('randomly sample %d sets from %d sets' %
                            (args.max_size, len(train_index)))
                sel = np.random.choice(len(train_index), size=args.max_size,
                                       replace=False)
                train_index = [train_index[i] for i in sel]
                test_index = [test_index[i] for i in sel]
        for i in range(len(train_index)):
            g = fout.create_group(str(i))
            g.create_dataset('train', data=train_index[i])
            g.create_dataset('test', data=test_index[i])
def run_fastlmm(args):
    import pandas as pd
    import numpy as np
    import h5py
    from pysnptools.snpreader import Pheno
    from fastlmm.inference import FastLMM
    import dill as pickle
    from utils import prepare_output_file, read_cvindex

    # set the seed before any sampling so that SNP selection is reproducible
    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)
    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # pysnptools expects two-column (fid, iid) sample identifiers
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis], 2, axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]
    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel, replace=False)
    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, transpose=args.transpose_x,
                            snp_indices=sel_snps, std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' % test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(
            pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)
        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :], y=pheno, K0_train=K0,
                  penalty=args.penalty, Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)
        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values
        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)
        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
def summarize_cv(args):
    import pandas as pd
    import numpy as np
    import h5py
    from utils import array_lookup, prepare_output_file

    columns = {}
    colnames = {}
    # read the cross-validation split definitions
    with h5py.File(os.path.join(args.input_dir, 'cv_split'), 'r') as f:
        n_folds = len(f.keys())
        cv_split = {}
        for cv_fold in f.keys():
            cv_split[int(cv_fold)] = {}
            g = f[cv_fold]
            for key in g.keys():
                cv_split[int(cv_fold)][key] = g[key][:]
    colnames['classification'] = []
    if args.task == 'classification':
        with h5py.File(os.path.join(args.input_dir, 'targets'), 'r') as f:
            class_name = f['class_name'][:]
            y = f['y'][:]
            image_id_y = f['image_id'][:]
        colnames['classification'] = ['train_size', 'test_size']
        colnames['classification'] += ['class_size(%s)' % (class_name[i])
                                       for i in range(len(class_name))]
    columns['cv_fold'] = np.full(n_folds, -1, dtype='int32')
    colnames['metric'] = []
    for cv_fold in range(n_folds):
        cv_dir = os.path.join(args.input_dir, 'cv', str(cv_fold))
        columns['cv_fold'][cv_fold] = cv_fold
        if not os.path.isdir(cv_dir):
            continue
        pred_file = os.path.join(cv_dir, 'predictions')
        with h5py.File(pred_file, 'r') as f:
            g = f['metrics']
            # initialize the metric columns from the first fold seen
            if len(colnames['metric']) == 0:
                for metric in g.keys():
                    if len(g[metric].shape) == 0:
                        colnames['metric'].append(metric)
                    elif metric == 'accuracy_by_class':
                        colnames['metric'] += ['%s(%s)' % (metric, class_name[i])
                                               for i in range(g[metric].shape[0])]
                for metric in colnames['metric']:
                    columns[metric] = np.full(n_folds, np.nan, dtype='float64')
                if args.task == 'classification':
                    for colname in ['train_size', 'test_size']:
                        columns[colname] = np.zeros(n_folds, dtype='int32')
                    for i in range(len(class_name)):
                        columns['class_size(%s)' % (class_name[i])] = np.zeros(n_folds, dtype='int32')
            # fill in this fold's metric values
            for metric in g.keys():
                if len(g[metric].shape) == 0:
                    columns[metric][cv_fold] = g[metric][()]
                elif metric == 'accuracy_by_class':
                    metric_vals = g[metric][:]
                    for i in range(g[metric].shape[0]):
                        columns['%s(%s)' % (metric, class_name[i])][cv_fold] = metric_vals[i]
        if args.task == 'classification':
            columns['train_size'][cv_fold] = cv_split[cv_fold]['train'].shape[0]
            columns['test_size'][cv_fold] = cv_split[cv_fold]['test'].shape[0]
            for i in range(len(class_name)):
                y_test = y[array_lookup(image_id_y, cv_split[cv_fold]['test'])]
                if len(y_test.shape) > 1:
                    # one-hot coding for multi-class
                    columns['class_size(%s)' % (class_name[i])][cv_fold] = np.sum(y_test[:, i])
                else:
                    # integer labels for two-class
                    columns['class_size(%s)' % (class_name[i])][cv_fold] = np.sum(y_test == i)
    summary = pd.DataFrame(columns)
    attribute_keys = []
    if args.attribute is not None:
        # attach user-provided key=value attributes as constant columns
        for a in args.attribute:
            if '=' not in a:
                raise ValueError('missing = in attribute: ' + a)
            ind = a.index('=')
            key = a[:ind].strip()
            val = a[(ind + 1):].strip()
            summary[key] = val
            attribute_keys.append(key)
    summary = summary[attribute_keys + ['cv_fold'] +
                      colnames['classification'] + colnames['metric']]
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    summary.to_csv(args.output_file, sep='\t', index=False)
def plot_predictions(args):
    import h5py
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    from utils import read_hdf5_single, prepare_output_file, read_hdf5_dataset
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.size'] = 12
    plt.rcParams['legend.fontsize'] = 12

    def normalize_phenotype(x, range_pheno=4.0):
        # clip to [-range_pheno, range_pheno] and rescale to [0, 1]
        return (np.clip(x, -range_pheno, range_pheno) + range_pheno) / 2.0 / range_pheno

    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read predictions from file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read training indices from file: ' + args.train_indices_file)
    train_index = read_hdf5_dataset(args.train_indices_file)
    logger.info('read test indices from file: ' + args.test_indices_file)
    test_index = read_hdf5_dataset(args.test_indices_file)
    y_pred_train = np.full(y_pred.shape, np.nan)
    y_pred_train[train_index] = y_pred[train_index]
    y_pred_test = np.full(y_pred.shape, np.nan)
    y_pred_test[test_index] = y_pred[test_index]
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with PdfPages(args.output_file) as pdf:
        # page 1: phenotypes laid out on the parent table
        fig, axes = plt.subplots(4, 1, figsize=(10, 8))
        axes[0].matshow(np.take(np.ravel(normalize_phenotype(y_true)), parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[0].set_title('True phenotypes')
        axes[1].matshow(np.take(np.ravel(normalize_phenotype(y_pred)), parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[1].set_title('Predicted phenotypes')
        axes[2].matshow(np.take(np.ravel(normalize_phenotype(y_pred_train)), parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[2].set_title('Predicted phenotypes (train)')
        axes[3].matshow(np.take(np.ravel(normalize_phenotype(y_pred_test)), parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[3].set_title('Predicted phenotypes (test)')
        plt.tight_layout()
        pdf.savefig(fig)
        plt.clf()
        # page 2: histograms of true and predicted phenotypes
        fig, axes = plt.subplots(2, 3, figsize=(10, 6))
        axes[0, 0].hist(y_true[~np.isnan(y_true)], bins=50)
        axes[0, 0].set_title('True phenotypes')
        axes[0, 1].hist(y_true[train_index], bins=50)
        axes[0, 1].set_title('True phenotypes (train)')
        axes[0, 2].hist(y_true[test_index], bins=50)
        axes[0, 2].set_title('True phenotypes (test)')
        axes[1, 0].hist(y_pred, bins=50)
        axes[1, 0].set_title('Predicted phenotypes')
        axes[1, 1].hist(y_pred[train_index], bins=50)
        axes[1, 1].set_title('Predicted phenotypes (train)')
        axes[1, 2].hist(y_pred[test_index], bins=50)
        axes[1, 2].set_title('Predicted phenotypes (test)')
        for i in range(2):
            for j in range(3):
                axes[i, j].set_xlim(-5, 5)
        plt.tight_layout()
        pdf.savefig(fig)
        plt.clf()
        # page 3: scatter plots of predicted vs. true phenotypes
        fig, axes = plt.subplots(1, 3, figsize=(10, 4))
        axes[0].scatter(y_true[~np.isnan(y_true)], y_pred[~np.isnan(y_true)], s=3)
        axes[0].set_xlabel('True phenotypes')
        axes[0].set_ylabel('Predicted phenotypes')
        axes[0].set_title('All samples')
        axes[1].scatter(y_true[train_index], y_pred[train_index], s=3)
        axes[1].set_xlabel('True phenotypes')
        axes[1].set_ylabel('Predicted phenotypes')
        axes[1].set_title('Training samples')
        axes[2].scatter(y_true[test_index], y_pred[test_index], s=3)
        axes[2].set_xlabel('True phenotypes')
        axes[2].set_ylabel('Predicted phenotypes')
        axes[2].set_title('Test samples')
        plt.tight_layout()
        pdf.savefig(fig)