Example #1
def filter_features_2bit(args):
    import numpy as np
    import h5py
    from utils import read_hdf5_dataset, prepare_output_file

    logger.info('read genotypes from: ' + args.genotype_file)
    genotypes = read_hdf5_dataset(args.genotype_file)
    indices = None
    if args.indices_file is not None:
        logger.info('read indices from: ' + args.indices_file)
        indices = read_hdf5_dataset(args.indices_file)
        genotypes = np.take(genotypes, indices, axis=0)
        logger.info('number of samples: %d' % indices.shape[0])
    pvalues = {}
    for phenotype_file in args.phenotype_file:
        logger.info('read phenotypes from: ' + phenotype_file)
        phenotypes, dataset = read_hdf5_dataset(phenotype_file,
                                                return_name=True)
        if indices is not None:
            phenotypes = np.take(phenotypes, indices)
        if args.metric == 'anova':
            logger.info('calculate ANOVA p-values')
            pvalues[dataset] = fast_anova_2bit(genotypes, phenotypes)
    logger.info('save p-values to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for dataset in pvalues.keys():
            f.create_dataset(dataset, data=pvalues[dataset])
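The helper fast_anova_2bit is defined elsewhere in the project and not shown. For illustration only, here is a minimal sketch of a vectorized per-SNP one-way ANOVA, assuming genotypes coded as minor-allele counts in {0, 1, 2} (the name and the coding are assumptions, not the project's actual implementation):

import numpy as np
from scipy.stats import f as f_dist

def fast_anova_sketch(genotypes, phenotypes):
    """genotypes: (n_snps, n_samples) counts in {0, 1, 2};
    phenotypes: (n_samples,). Returns per-SNP ANOVA p-values."""
    n = phenotypes.shape[0]
    grand_mean = phenotypes.mean()
    sst = ((phenotypes - grand_mean)**2).sum()
    ssb = np.zeros(genotypes.shape[0])
    n_groups = np.zeros(genotypes.shape[0], dtype=int)
    for g in (0, 1, 2):
        mask = (genotypes == g)
        counts = mask.sum(axis=1)
        means = mask.dot(phenotypes) / np.maximum(counts, 1)
        # between-group sum of squares, accumulated per genotype class
        ssb += counts * (means - grand_mean)**2
        n_groups += (counts > 0)
    ssw = sst - ssb  # within-group sum of squares
    df1 = np.maximum(n_groups - 1, 1)
    df2 = n - n_groups
    F = (ssb / df1) / (ssw / df2)
    return f_dist.sf(F, df1, df2)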
Example #2
def random_cv_split(args):
    import numpy as np
    import h5py
    from utils import read_hdf5_single, cv_split_emaize, get_indices_table, prepare_output_file, read_hdf5_dataset

    logger.info('read training indices file: ' + args.train_index_file)
    train_indices_all = read_hdf5_dataset(args.train_index_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    indices_table, mask = get_indices_table(train_indices_all, parent_table)

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for k in range(args.n_datasets):
            row_indices = np.random.choice(indices_table.shape[0],
                                           5,
                                           replace=False)
            col_indices = np.random.choice(indices_table.shape[1],
                                           5,
                                           replace=False)
            test_indices = np.union1d(
                indices_table[row_indices, :].reshape((-1, )),
                indices_table[:, col_indices].reshape((-1, )))
            train_indices = np.setdiff1d(train_indices_all, test_indices)
            test_indices = np.intersect1d(test_indices, train_indices_all)
            train_indices = np.intersect1d(train_indices, train_indices_all)
            g = f.create_group(str(k))
            g.create_dataset('train', data=train_indices)
            g.create_dataset('test', data=test_indices)
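Reading the splits back is plain h5py traversal; a short sketch (the file path is hypothetical):

import h5py

# iterate over the n_datasets groups written above, one train/test pair each
with h5py.File('cv_splits.h5', 'r') as f:
    for k in sorted(f.keys(), key=int):
        train_indices = f[k]['train'][:]
        test_indices = f[k]['test'][:]
        print(k, train_indices.shape[0], test_indices.shape[0])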
Example #3
def test_single_snp(args):
    import os
    import numpy as np
    import pandas as pd
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.sample_indices_file is not None:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(
            (phenotypes['type'] == 'training').values)[0]
    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes[phenotypes['type'] == 'training'].copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno[['fid', 'iid', trait]].to_csv(pheno_file,
                                               index=False,
                                               sep='\t',
                                               header=False)
        pheno = Pheno(pheno_file)
        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps,
                                pheno,
                                K0=K0,
                                count_A1=True,
                                GB_goal=args.GB_goal)
        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_file)
        results_df.to_hdf(result_file, trait)

        if args.manhattan:
            plot_file = os.path.join(args.output_dir,
                                     'manhattan.%s.pdf' % trait)
            logger.info('create Manhattan plot: ' + plot_file)
            plt.clf()
            # DataFrame.as_matrix() was removed from pandas; use .values
            flutil.manhattan_plot(
                results_df[["Chr", "ChrPos", "PValue"]].values,
                pvalue_line=1e-5,
                xaxis_unit_bp=False)
            plt.savefig(plot_file)
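get_snpdata is a project helper not shown above. A hedged sketch of what it might do, wrapping an HDF5 genotype matrix into a pysnptools SnpData; the (snps x samples) layout and the 'snp%d' naming are assumptions:

import numpy as np
from pysnptools.snpreader import SnpData
from utils import read_hdf5_dataset

def get_snpdata(iid, snp_file, sample_indices=None):
    X = read_hdf5_dataset(snp_file)  # assumed layout: (n_snps, n_samples)
    if sample_indices is not None:
        X = np.take(X, sample_indices, axis=1)
        iid = iid[sample_indices]
    sid = np.array(['snp%d' % i for i in range(X.shape[0])])
    # SnpData expects val with shape (n_iid, n_sid)
    return SnpData(iid=iid, sid=sid, val=X.T.astype('float64'))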
Example #4
def anova_linregress(args):
    import numpy as np
    import h5py
    from utils import read_hdf5_dataset, prepare_output_file
    from tqdm import tqdm
    from statsmodels.sandbox.stats.multicomp import multipletests

    logger.info('read genotypes from: ' + args.genotype_file)
    genotypes = read_hdf5_dataset(args.genotype_file)
    indices = None
    if args.sample_indices_file is not None:
        logger.info('read indices from: ' + args.sample_indices_file)
        indices = read_hdf5_dataset(args.sample_indices_file)
        genotypes = np.take(genotypes, indices, axis=1)
        logger.info('number of samples: %d' % indices.shape[0])

    logger.info('read phenotypes from: ' + args.phenotype_file)
    phenotypes, dataset = read_hdf5_dataset(args.phenotype_file,
                                            return_name=True)
    logger.info('perform ANOVA for dataset: %s' % dataset)
    if indices is not None:
        phenotypes = np.take(phenotypes, indices)
    if args.batch_size is not None:
        slicegen = BatchSliceGenerator(genotypes.shape[0],
                                       batch_size=args.batch_size)
        outputs = []
        for start, stop in tqdm(slicegen(), total=slicegen.n_batches):
            outputs.append(fast_linregress(genotypes[start:stop], phenotypes))
        w, b, pvalues = [np.concatenate(a) for a in zip(*outputs)]
        del outputs
    else:
        w, b, pvalues = fast_linregress(genotypes, phenotypes)
    reject, qvalues, _, _ = multipletests(pvalues,
                                          alpha=args.alpha,
                                          method='fdr_bh')
    reject = np.nonzero(reject)[0]

    logger.info('save results to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('pvalue', data=pvalues)
        f.create_dataset('slope', data=w.astype('float32'))
        f.create_dataset('intercept', data=b.astype('float32'))
        f.create_dataset('qvalue', data=qvalues)
        f.create_dataset('reject', data=reject)
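Neither fast_linregress nor BatchSliceGenerator is shown above. A minimal sketch of both, assuming fast_linregress fits an ordinary least-squares line per SNP and returns slopes, intercepts and two-sided p-values, and the slice generator yields (start, stop) chunks:

import numpy as np
from scipy import stats

def fast_linregress(X, y):
    """Vectorized simple regression. X: (n_snps, n_samples), y: (n_samples,)."""
    n = y.shape[0]
    xm = X.mean(axis=1, keepdims=True)
    xc = X - xm
    yc = y - y.mean()
    sxx = (xc**2).sum(axis=1)
    sxy = xc.dot(yc)
    w = sxy / sxx  # per-SNP slope
    b = y.mean() - w * np.ravel(xm)  # per-SNP intercept
    rss = (yc**2).sum() - w * sxy  # per-SNP residual sum of squares
    se = np.sqrt(rss / (n - 2) / sxx)  # standard error of the slope
    t = w / se
    return w, b, 2.0 * stats.t.sf(np.abs(t), n - 2)

class BatchSliceGenerator(object):
    """Yields (start, stop) pairs covering range(n) in fixed-size chunks."""
    def __init__(self, n, batch_size=100000):
        self.n, self.batch_size = n, batch_size
        self.n_batches = (n + batch_size - 1) // batch_size

    def __call__(self):
        for start in range(0, self.n, self.batch_size):
            yield start, min(start + self.batch_size, self.n)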
Example #5
def normalize_genotypes(args):
    from utils import read_hdf5_dataset, prepare_output_file
    import numpy as np
    import h5py

    logger.info('read input file: ' + args.input_file)
    X, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X.shape
    # allele frequencies
    p = X.sum(axis=1).astype('float32')/n_samples
    multiplier = 1.0/np.sqrt(2.0*p*(1.0 - p))
    multiplier = multiplier.astype('float32')
    logger.info('save mean and multipliers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('mean', data=p)
        f.create_dataset('multiplier', data=multiplier)
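A sketch of how the saved per-SNP statistics might be applied downstream; the file path is hypothetical, and the exact centering convention (subtracting the stored mean as-is versus doubling it first) depends on code not shown here:

import h5py
import numpy as np

with h5py.File('normalizer.h5', 'r') as f:  # hypothetical path
    mean = f['mean'][:]
    multiplier = f['multiplier'][:]
# X: genotype matrix of shape (n_snps, n_samples); stand-in data here
X = np.random.randint(0, 2, size=(mean.shape[0], 100)).astype('float32')
X_normalized = (X - mean[:, np.newaxis]) * multiplier[:, np.newaxis]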
Example #6
def create_gsm(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file, read_hdf5_dataset
    '''
    logger.info('read genomic positions from file: ' + args.genomic_pos_file)
    positions = {}
    with h5py.File(args.genomic_pos_file, 'r') as f:
        for i in range(1, 11):
            positions['chr%d'%i] = f['chr%d'%i][:]
    n_snps_per_chrom = {chrom:positions[chrom].shape[0] for chrom in positions.keys()}
    n_snps_total = sum(n_snps_per_chrom.values())
    X = []
    for chrom in positions.keys():
        genotype_file = os.path.join(args.input_dir, chrom)
        logger.info('read genotype file: ' + genotype_file)
        with h5py.File(genotype_file, 'r') as f:
            n_sel = int(np.round(args.n_snps*float(n_snps_per_chrom[chrom])/n_snps_total))
            ind = np.random.choice(n_snps_per_chrom[chrom], size=n_sel)
            X.append(f['data'][:][ind])
    X = np.concatenate(X, axis=0).astype('float32')
    '''
    logger.info('read genotypes from file: ' + args.input_file)
    X = read_hdf5_dataset(args.input_file).astype('float64')
    logger.info('number of selected SNPs: %d' % X.shape[0])
    logger.info('calculate GSM')
    X -= X.mean(axis=1)[:, np.newaxis]
    X_std = np.sqrt(np.sum(X**2, axis=1))
    X_std[np.isclose(X_std, 0.0)] = 1.0
    X = X / X_std[:, np.newaxis]
    logger.info('calculate K')
    K = np.dot(X.T, X)
    logger.info('run SVD on X')
    U, S, V = np.linalg.svd(X.T, full_matrices=False)
    V = V.T
    logger.info('save GSM to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('K', data=K)
        f.create_dataset('U', data=U)
        f.create_dataset('S', data=S)
        f.create_dataset('V', data=V)
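Because the rows of X are centered and scaled to unit norm, K = X^T X is a correlation-like similarity between samples, and the SVD of X^T reproduces it exactly: with X^T = U S V^T, K = U diag(S^2) U^T. A quick self-contained check of that identity:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 10)  # (snps x samples), stand-in data
K = np.dot(X.T, X)
U, S, Vt = np.linalg.svd(X.T, full_matrices=False)
# columns of U scaled by S**2 reconstruct K = U diag(S^2) U^T
assert np.allclose(K, np.dot(U * S**2, U.T))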
Example #7
def convert_2bit_to_minor(args):
    from utils import read_hdf5_dataset, prepare_output_file
    import numpy as np
    import h5py
    import numba

    @numba.jit(nopython=True)
    def _2bit_to_minor(X_2bit, X_minor):
        n_snps = X_minor.shape[0]
        n_samples = X_minor.shape[1]
        max_freq = n_samples
        for i in range(n_snps):
            freq = 0
            for j in range(n_samples):
                count = X_2bit[i, 1, j] - X_2bit[i, 0, j] + 1
                freq += count
                X_minor[i, j] = count
            if freq > n_samples:
                for j in range(n_samples):
                    X_minor[i, j] = 2 - X_minor[i, j]

    logger.info('read input file: ' + args.input_file)
    X_2bit, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X_2bit.shape
    n_snps //= 2  # integer division: two bit-planes per SNP
    logger.info('number of SNPs: %d, number of samples: %d'%(n_snps, n_samples))
    X_2bit = X_2bit.reshape((n_snps, 2, n_samples))
    logger.info('convert from 2bit code to minor copy numbers')
    # assume that the second allele in the 2bit representation is the minor allele
    # 10 -> 0, 11 -> 1, 01 -> 2
    #X_minor = np.einsum('ijk,j->ik', X_2bit, np.array([-1, 1])) + 1
    # swap the two alleles to make sure that the number represents a minor allele
    #allele_freq = np.expand_dims(np.sum(X_2bit[:, 1, :], axis=1), axis=1)
    #X_minor = np.where(allele_freq <= n_samples/2, X_minor, 2 - X_minor)
    X_minor = np.empty((n_snps, n_samples), dtype='int8')
    _2bit_to_minor(X_2bit, X_minor)
    logger.info('save minor allele copy numbers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset(dataset, data=X_minor)
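The commented-out einsum line expresses the same mapping in one step; a tiny worked example of the 10 -> 0, 11 -> 1, 01 -> 2 coding:

import numpy as np

# one SNP, three samples; rows are the two allele bit-planes
X_2bit = np.array([[[1, 1, 0],
                    [0, 1, 1]]], dtype='int8')
# count = bit1 - bit0 + 1, i.e. 10 -> 0, 11 -> 1, 01 -> 2
X_minor = np.einsum('ijk,j->ik', X_2bit, np.array([-1, 1])) + 1
print(X_minor)  # [[0 1 2]]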
Example #8
def evaluate(args):
    import h5py
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr
    from utils import prepare_output_file, read_hdf5_dataset

    logger.info('read prediction file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read sample indices file: ' + args.sample_indices_file)
    indices = read_hdf5_dataset(args.sample_indices_file)
    y_true = y_true[indices]
    y_pred = y_pred[indices]

    logger.info('save metrics file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with open(args.output_file, 'w') as f:
        f.write('r2\tmse\tpcc\n')
        f.write('%f' % r2_score(y_true, y_pred))
        f.write('\t%f' % mean_squared_error(y_true, y_pred))
        f.write('\t%f' % pearsonr(y_true, y_pred)[0])
        f.write('\n')
Example #9
def run_regression(args):
    import os
    import h5py
    import numpy as np
    from utils import read_hdf5_dataset, standardize_genotypes
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr

    if args.gsm_file is not None:
        logger.info('read GSM file: ' + args.gsm_file)
        with h5py.File(args.gsm_file, 'r') as f:
            U = f['U'][:]
            S = f['S'][:]
            U = U*S[np.newaxis, :]
            U = U[:, S**2 > 0.5]
            U = standardize_genotypes(U)
            X = U
    else:
        logger.info('read genotype file: ' + args.genotype_file)
        X = read_hdf5_dataset(args.genotype_file)
        if args.transpose_x:
            logger.info('transpose X')
            X = X.T
        X = standardize_genotypes(X)
    logger.info('read phenotype file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read training indices file: ' + args.train_index_file)
    train_index = read_hdf5_dataset(args.train_index_file)
    logger.info('read test indices file: ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)

    if not os.path.exists(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)
    logger.info('use model: ' + args.model_name)
    if args.model_name == 'mlp':
        import keras
        from keras.models import Sequential
        from keras.layers import Dense, Activation
        from keras import backend as K

        if K.backend() == 'tensorflow':
            # replace the original get_session() function
            # (Python 3: functions expose __code__, not func_code)
            keras.backend.tensorflow_backend.get_session.__code__ = _get_session.__code__
        logger.info('build the model')
        model = Sequential()  # Feedforward
        model.add(Dense(500, input_dim=X.shape[1]))
        model.add(Activation('tanh'))
        model.add(Dense(100))
        model.add(Activation('tanh'))
        model.add(Dense(1))

        optimizer = keras.optimizers.RMSprop()
        model.compile(loss='mean_squared_error', optimizer=optimizer)
        callbacks = [keras.callbacks.CSVLogger(os.path.join(args.output_dir, 'train_log.csv'))]
        logger.info('train the model')
        model.fit(X[train_index], y[train_index], 
            epochs=args.max_epochs,
            callbacks=callbacks)
        # logger.info('save the model')
        # model.save(os.path.join(args.output_dir, 'model'))
    else:
        logger.info('build the model')
        import dill as pickle
        if args.model_name == 'gpr':
            from sklearn.gaussian_process import GaussianProcessRegressor
            from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
            kernel = DotProduct(sigma_0=1.0)**4 + WhiteKernel()
            model = GaussianProcessRegressor(kernel=kernel, optimizer=None)
        elif args.model_name == 'ridge':
            from sklearn.linear_model import Ridge
            model = Ridge(alpha=1)
        logger.info('train the model')
        model.fit(X[train_index], y[train_index])
        # logger.info('save the model')
        # with open(os.path.join(args.output_dir, 'model'), 'wb') as fout:
        #     pickle.dump(model, fout)

    logger.info('test the model')
    y_pred = np.ravel(model.predict(X))

    logger.info('save predictions on the test set')
    fout = h5py.File(os.path.join(args.output_dir, 'predictions'), 'w')
    fout.create_dataset('y_true', data=y)
    fout.create_dataset('y_pred', data=y_pred)
    fout.close()

    for phase in ('train', 'test'):
        if phase == 'train':
            y_ = y[train_index]
            y_pred_ = y_pred[train_index]
        else:
            y_ = y[test_index]
            y_pred_ = y_pred[test_index]
        metrics = {}
        metrics['mean_squared_error'] = mean_squared_error(y_, y_pred_)
        metrics['r2_score'] = r2_score(y_, y_pred_)
        metrics['pearsonr'] = pearsonr(y_, y_pred_)[0]
        for metric_name, metric_value in metrics.items():
            logger.info('%s.%s = %f'%(phase, metric_name, metric_value))
        logger.info('save metrics')
        with open(os.path.join(args.output_dir, 'metrics.%s.txt' % phase), 'w') as fout:
            # write every metric, not just the last one from the loop above
            for metric_name, metric_value in metrics.items():
                fout.write('%s\t%f\n' % (metric_name, metric_value))
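standardize_genotypes comes from utils and is not shown; a plausible minimal sketch (the column-wise convention and the zero-variance guard are assumptions):

import numpy as np

def standardize_genotypes(X):
    """Scale each feature column to zero mean and unit variance."""
    X = X.astype('float64')
    X = X - X.mean(axis=0)
    std = X.std(axis=0)
    std[np.isclose(std, 0.0)] = 1.0  # guard against constant columns
    return X / std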
Example #10
def run_metric_regressor(args):
    import os
    from utils import read_hdf5_dataset
    import h5py
    from metric_regressor import MetricRegressor
    import dill as pickle
    import numpy as np
    from utils import read_hdf5_single, cv_split_emaize, standardize_genotypes, get_indices_table

    logger.info('read genotype file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_genotype:
        X = X.T

    X = standardize_genotypes(X)
    logger.info('read GSM file: ' + args.gsm_file)
    with h5py.File(args.gsm_file, 'r') as f:
        U = f['U'][:]
        S = f['S'][:]
        U = U[:, S ** 2 > 0.5]
        U = standardize_genotypes(U)
    logger.info('read phenotype file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read training indices file: ' + args.train_index_file)
    train_index = read_hdf5_dataset(args.train_index_file)
    logger.info('read test indices file: ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)

    indices_table, mask = get_indices_table(train_index, parent_table)
    if args.cv_type == 's1f':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(indices_table, mask,
                                                                           k=parent_table.shape[0], method='s1f')
    elif args.cv_type == 's0':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(indices_table, mask,
                                                                           k1=parent_table.shape[0] // 5,
                                                                           k2=parent_table.shape[1] // 5,
                                                                           method='s0')
    elif args.cv_type == 's1m':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(indices_table, mask,
                                                                           k=parent_table.shape[1], method='s1m')
    else:
        raise ValueError('unknown cross-validation type: %s' % args.cv_type)

    logger.info('%d rows and %d columns in the indices table' % (indices_table.shape[0], indices_table.shape[1]))
    logger.info('number of cross-validation folds: %d' % len(train_index_list))
    logger.info('number of principal components to use: %d' % U.shape[1])

    X = np.concatenate([X, U], axis=1)
    y = y.reshape((-1, 1))
    X_train, y_train = X[train_index], y[train_index]

    def cv_generator(batch_size=5):
        while True:
            for i in range(len(train_index_list)):
                train_index = train_index_list[i]
                if args.cv_type == 's0':
                    test_index = test_index_list[i][s0_index_list[i]]
                else:
                    test_index = test_index_list[i]
                if (len(train_index) < batch_size) or (len(test_index) < batch_size):
                    continue
                train_index = np.random.choice(train_index, size=batch_size, replace=False)
                test_index = np.random.choice(test_index, size=batch_size, replace=False)
                yield (X_train[train_index], X_train[test_index],
                       y_train[train_index], y_train[test_index])

    model = MetricRegressor(input_dim=X.shape[1], hidden_dim=args.n_hidden, alpha=args.alpha,
                            sparse_rate=args.sparse_rate, kernel=args.kernel)
    model.fit(X_train, y_train, data_generator=cv_generator(batch_size=args.batch_size),
              lr=args.lr, max_iter=args.max_iter, n_batches=len(train_index_list))
    y_pred = model.predict(X)

    logger.info('save results to output directory: ' + args.output_dir)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    #with open(model_file, 'wb') as f:
    #    pickle.dump(model, f)
    model.save(model_file)
    pred_file = os.path.join(args.output_dir, 'predictions')
    with h5py.File(pred_file, 'w') as f:
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('mses', data=model.mses_)
        f.create_dataset('velocities', data=model.velocities_)
        f.create_dataset('mse_grads', data=model.mse_grads_)
Example #11
def plot_predictions(args):
    import h5py
    from utils import read_hdf5_single, prepare_output_file, read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.size'] = 12
    plt.rcParams['legend.fontsize'] = 12
    import numpy as np

    def normalize_phenotype(x, range_pheno=4.0):
        return (np.clip(x, -range_pheno, range_pheno) +
                range_pheno) / 2.0 / range_pheno

    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read predictions from file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read training indices from file: ' + args.train_indices_file)
    train_index = read_hdf5_dataset(args.train_indices_file)
    logger.info('read test indices from file: ' + args.test_indices_file)
    test_index = read_hdf5_dataset(args.test_indices_file)

    y_pred_train = np.full(y_pred.shape, np.nan)
    y_pred_train[train_index] = y_pred[train_index]
    y_pred_test = np.full(y_pred.shape, np.nan)
    y_pred_test[test_index] = y_pred[test_index]

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with PdfPages(args.output_file) as pdf:
        fig, axes = plt.subplots(4, 1, figsize=(10, 8))
        axes[0].matshow(np.take(np.ravel(normalize_phenotype(y_true)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[0].set_title('True phenotypes')

        axes[1].matshow(np.take(np.ravel(normalize_phenotype(y_pred)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[1].set_title('Predicted phenotypes')

        axes[2].matshow(np.take(np.ravel(normalize_phenotype(y_pred_train)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[2].set_title('Predicted phenotypes (train)')

        axes[3].matshow(np.take(np.ravel(normalize_phenotype(y_pred_test)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[3].set_title('Predicted phenotypes (test)')

        plt.tight_layout()
        pdf.savefig(fig)

        plt.clf()
        fig, axes = plt.subplots(2, 3, figsize=(10, 6))
        axes[0, 0].hist(y_true[~np.isnan(y_true)], bins=50)
        axes[0, 0].set_title('True phenotypes')
        axes[0, 1].hist(y_true[train_index], bins=50)
        axes[0, 1].set_title('True phenotypes (train)')
        axes[0, 2].hist(y_true[test_index], bins=50)
        axes[0, 2].set_title('True phenotypes (test)')
        axes[1, 0].hist(y_pred, bins=50)
        axes[1, 0].set_title('Predicted phenotypes')
        axes[1, 1].hist(y_pred[train_index], bins=50)
        axes[1, 1].set_title('Predicted phenotypes (train)')
        axes[1, 2].hist(y_pred[test_index], bins=50)
        axes[1, 2].set_title('Predicted phenotypes (test)')
        for i in range(2):
            for j in range(3):
                axes[i, j].set_xlim(-5, 5)
        plt.tight_layout()
        pdf.savefig(fig)

        plt.clf()
        fig, axes = plt.subplots(1, 3, figsize=(10, 4))
        axes[0].scatter(y_true[~np.isnan(y_true)],
                        y_pred[~np.isnan(y_true)],
                        s=3)
        axes[0].set_xlabel('True phenotypes')
        axes[0].set_ylabel('Predicted phenotypes')
        axes[0].set_title('All samples')

        axes[1].scatter(y_true[train_index], y_pred[train_index], s=3)
        axes[1].set_xlabel('True phenotypes')
        axes[1].set_ylabel('Predicted phenotypes')
        axes[1].set_title('Training samples')

        axes[2].scatter(y_true[test_index], y_pred[test_index], s=3)
        axes[2].set_xlabel('True phenotypes')
        axes[2].set_ylabel('Predicted phenotypes')
        axes[2].set_title('Test samples')

        plt.tight_layout()
        pdf.savefig(fig)
Example #12
def select_best_subset(args):
    import os
    import numpy as np
    import h5py
    from tqdm import tqdm
    import pandas as pd
    from scipy.stats import pearsonr
    from sklearn.metrics import mean_squared_error
    from utils import prepare_output_file, read_hdf5_dataset

    logger.info('read sample indices of test dataset from ' +
                args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)

    traits = args.traits.split(',')

    def iterator():
        cv_type = 's1f'
        for gamma in args.gammas.split(','):
            for n_snps in [int(a) for a in args.n_snps.split(',')]:
                for trait in traits:
                    for snp_set in range(args.n_groups):
                        filename = args.input_dir + '/gamma={gamma}/{n_snps}/{trait}/{snp_set}/{cv_type}/predictions'.format(
                            n_snps=n_snps,
                            trait=trait,
                            snp_set=snp_set,
                            cv_type=cv_type,
                            gamma=gamma)
                        yield ('random_choice', gamma, trait, n_snps, snp_set,
                               cv_type, filename)

    def query_dict(records, **kwargs):
        '''
        Search for records (dicts) that match the key-value pairs
        :param records: a list of dicts
        :param kwargs: key-value pairs
        :return: a list of records that match the query arguments
        '''
        results = []
        for record in records:
            val = True
            for key in kwargs:
                val = val and (record[key] == kwargs[key])
            if val:
                results.append(record)
        return results

    logger.info('read prediction results')
    predictions = []
    for method, gamma, trait, n_snps, snp_set, cv_type, filename in tqdm(
            list(iterator())):
        with h5py.File(filename, 'r') as f:
            predictions.append({
                'trait': trait,
                'gamma': f['best_gamma'][()],
                'alpha': f['best_alpha'][()],
                'method': method,
                'n_snps': n_snps,
                'snp_set': snp_set,
                'y_pred': f['y_pred'][:],
                'cv_type': cv_type,
                'mse_cv': np.ravel(f['mse_cv']),
                'pcc_cv': np.ravel(f['pcc_cv'])
            })

    logger.info('summarize cross-validation metrics')
    summary = []
    for pred in tqdm(predictions):
        summary.append(
            (pred['method'], pred['gamma'], pred['alpha'], pred['trait'],
             pred['n_snps'], pred['snp_set'], pred['cv_type'],
             np.min(pred['pcc_cv']), np.min(pred['mse_cv']),
             np.mean(pred['pcc_cv']), np.mean(pred['mse_cv']),
             np.max(pred['pcc_cv']), np.max(pred['mse_cv']),
             np.median(pred['pcc_cv']), np.median(pred['mse_cv'])))

    summary = pd.DataFrame.from_records(
        summary,
        columns=('method', 'gamma', 'alpha', 'trait', 'n_snps', 'snp_set',
                 'cv_type', 'pcc_cv_min', 'mse_cv_min', 'pcc_cv_mean',
                 'mse_cv_mean', 'pcc_cv_max', 'mse_cv_max', 'pcc_cv_median',
                 'mse_cv_median'))
    ascending = args.by.startswith('mse')
    summary_best = summary.sort_values(
        ['trait', args.by],
        ascending=ascending).groupby('trait').head(3)

    if not os.path.exists(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)

    summary_file = os.path.join(args.output_dir, 'summary.txt')
    logger.info('write summary of all SNP subsets to file: ' + summary_file)
    summary.to_csv(summary_file, sep='\t', index=False)

    summary_best_file = os.path.join(args.output_dir, 'summary_best.txt')
    logger.info('write summary of best SNP subsets to file: ' +
                summary_best_file)
    summary_best.to_csv(summary_best_file, sep='\t', index=False)

    logger.info('extract predictions from best SNP subsets')
    rank = {}
    for trait in traits:
        rank[trait] = 0
    for index, record in summary_best.iterrows():
        pred_file = args.input_dir + '/gamma={gamma:.2f}/{n_snps}/{trait}/{snp_set}/{cv_type}/predictions'.format(
            **record.to_dict())
        with h5py.File(pred_file, 'r') as f:
            y_pred = f['y_pred'][:]
        trait = record['trait']
        test_pred_file = os.path.join(
            args.output_dir, 'prediction.%s.%d.txt' % (trait, rank[trait]))
        logger.info('save test predictions to file: ' + test_pred_file)
        np.savetxt(test_pred_file, y_pred[test_index])
        rank[trait] += 1
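The query_dict helper defined above is not exercised in this path; for example, it could filter the collected records directly:

# all s1f predictions for one trait (the trait name is illustrative)
trait1_records = query_dict(predictions, trait='trait1', cv_type='s1f')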
Example #13
def single_model(args):
    import os
    import h5py
    import pandas as pd
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, prepare_output_file, read_hdf5_single
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_squared_error
    from tqdm import tqdm

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    #phenotypes = pd.read_table(args.phenotype_file)
    phenotypes = read_hdf5_dataset(args.phenotype_file)
    logger.info('read genotypes from file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_x:
        logger.info('transpose X')
        X = X.T
    y = phenotypes
    if args.feature_indices_file:
        logger.info('read feature indices from: ' + args.feature_indices_file)
        feature_indices = read_hdf5_dataset(args.feature_indices_file)
        X = np.take(X, feature_indices, axis=1)
    if args.normalize_x:
        logger.info('normalize X')
        X = StandardScaler().fit_transform(X)
    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(phenotypes))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]
    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)

    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))
    if args.model_name == 'ridge':
        from sklearn.linear_model import Ridge
        model = Ridge(alpha=10000)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'ridge_cv':
        from sklearn.linear_model import Ridge
        alphas = 10.0**np.arange(1, 6)
        train_masks, test_masks = generate_cv_masks(sample_indices,
                                                    parent_table,
                                                    k_female=5,
                                                    k_male=5)
        cv_metrics = {}
        cv_metrics['mse'] = np.zeros((len(alphas), train_masks.shape[0]))
        cv_metrics['r2'] = np.zeros((len(alphas), train_masks.shape[0]))
        pbar = tqdm(total=len(alphas) * train_masks.shape[0])
        for i, alpha in enumerate(alphas):
            for j in range(train_masks.shape[0]):
                model = Ridge(alpha=alpha)
                model.fit(X[train_masks[j]], y[train_masks[j]])
                y_pred = model.predict(X[test_masks[j]])
                cv_metrics['mse'][i, j] = mean_squared_error(
                    y[test_masks[j]], y_pred)
                cv_metrics['r2'][i, j] = r2_score(y[test_masks[j]], y_pred)
                pbar.update(1)
        pbar.close()
        best_alpha = alphas[cv_metrics['r2'].mean(axis=1).argmax()]
        logger.info('optimized alpha = %f' % best_alpha)
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'gpr':
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
        kernel = RBF() + WhiteKernel()
        model = GaussianProcessRegressor(kernel=kernel)
        model.fit(X_train, y_train)
        logger.info('kernel params: %s' % repr(model.get_params()))
        y_pred_train = np.ravel(model.predict(X_train))
        y_pred = np.ravel(model.predict(X))
    elif args.model_name == 'gpy':
        from GPy.kern import Linear
        from GPy.models import GPRegression
        kernel = Linear(input_dim=X_train.shape[1], name='linear')
        # GPy expects a 2D target array
        model = GPRegression(X_train, y_train[:, np.newaxis], kernel=kernel)
        model.optimize()
        # GPy's predict() returns (mean, variance); keep the mean
        y_pred = np.ravel(model.predict(X)[0])
        y_pred_train = y_pred[sample_indices]

    else:
        raise ValueError('unknown model name: ' + args.model_name)

    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        if args.output_residuals:
            f.create_dataset('residual', data=(y - y_pred))
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'ridge_cv':
            f.create_dataset('alpha', data=alphas)
            g = f.create_group('cv_metrics')
            for key in cv_metrics.keys():
                g.create_dataset(key, data=cv_metrics[key])
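generate_cv_masks is not shown above. Judging from how it is called (boolean train/test masks over all samples, derived from the female x male parent_table), a heavily hedged sketch might block the table into k_female x k_male cells and hold out one cell of crosses per fold:

import numpy as np

def generate_cv_masks(sample_indices, parent_table, k_female=5, k_male=5):
    # assumption: parent_table holds sample indices, one per cross
    n_total = parent_table.max() + 1
    in_train = np.zeros(n_total, dtype=bool)
    in_train[sample_indices] = True
    female_folds = np.array_split(np.arange(parent_table.shape[0]), k_female)
    male_folds = np.array_split(np.arange(parent_table.shape[1]), k_male)
    train_masks, test_masks = [], []
    for rows in female_folds:
        for cols in male_folds:
            test = np.zeros(n_total, dtype=bool)
            test[np.ravel(parent_table[np.ix_(rows, cols)])] = True
            test &= in_train  # keep only labeled samples
            train_masks.append(in_train & ~test)
            test_masks.append(test)
    return np.array(train_masks), np.array(test_masks)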
Example #14
def run_mixed_ridge(args):
    import os
    import numpy as np
    from utils import read_hdf5_dataset
    import h5py
    from models import MixedRidge
    import dill as pickle
    from utils import read_hdf5_single, cv_split_emaize, standardize_genotypes, get_indices_table

    logger.info('read genotype file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_genotype:
        X = X.T

    X = standardize_genotypes(X)
    logger.info('read GSM file: ' + args.gsm_file)
    with h5py.File(args.gsm_file, 'r') as f:
        U = f['U'][:]
        S = f['S'][:]
        #U = U*S[np.newaxis, :]
        U = U[:, S**2 > 0.5]
        U = standardize_genotypes(U)
    logger.info('read phenotype file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read training indices file: ' + args.train_index_file)
    train_index = read_hdf5_dataset(args.train_index_file)
    logger.info('read test indices file: ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)

    indices_table, mask = get_indices_table(train_index, parent_table)
    if args.cv_type == 's1f':
        if args.k is None:
            k = parent_table.shape[0]
        else:
            k = args.k
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask, k=k, method='s1f')
    elif args.cv_type == 's0':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table,
            mask,
            k1=parent_table.shape[0] // 5,
            k2=parent_table.shape[1] // 5,
            method='s0')
    elif args.cv_type == 's1m':
        if args.k is None:
            k = parent_table.shape[1]
        else:
            k = args.k
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask, k=k, method='s1m')
    else:
        raise ValueError('unknown cross-validation type: %s' % args.cv_type)
    logger.info('%d rows and %d columns in the indices table' %
                (indices_table.shape[0], indices_table.shape[1]))
    logger.info('number of cross-validation folds: %d' % len(train_index_list))

    logger.info('number of principal components to use: %d' % U.shape[1])
    alpha_list = [float(a) for a in args.alphas.split(',')]
    gamma_list = [float(a) for a in args.gammas.split(',')]
    metrics = {
        'pcc_cv':
        np.zeros((len(alpha_list), len(gamma_list), len(test_index_list))),
        'mse_cv':
        np.zeros((len(alpha_list), len(gamma_list), len(test_index_list)))
    }
    X_train, U_train, y_train = X[train_index], U[train_index], y[train_index]
    n_samples_total = np.prod(parent_table.shape)

    test_index_mask = np.zeros((len(test_index_list), n_samples_total),
                               dtype='bool')
    if args.cv_type == 's0':
        for i in range(len(s0_index_list)):
            test_index_mask[i, train_index[s0_index_list[i]]] = True
    else:
        for i in range(len(test_index_list)):
            test_index_mask[i, train_index[test_index_list[i]]] = True
    for i, alpha in enumerate(alpha_list):
        #model.optimize_grid(X[train_index], U[train_index], y[train_index])
        '''
        mse_cv = np.zeros(len(train_index_list))
        pcc_cv = np.zeros(len(train_index_list))
        for j in range(len(train_index_list)):
            model.fit(X_train[train_index_list[j]],
                      U_train[train_index_list[j]],
                      y_train[train_index_list[j]],
                      gamma=0.1)
            y_pred_cv = model.predict(X_train[test_index_list[j]],
                                      U_train[test_index_list[j]])
            mse_cv[j] = mean_squared_error(y_train[test_index_list[j]], y_pred_cv)
            pcc_cv[j] = pearsonr(y_train[test_index_list[j]], y_pred_cv)[0]
        logger.info('cross-validation (real) MSE = %f, %f, %f'%(np.nanmin(mse_cv), np.nanmean(mse_cv), np.nanmax(mse_cv)))
        logger.info('cross-validation (real) PCC = %f, %f, %f' % (np.nanmin(pcc_cv), np.nanmean(pcc_cv), np.nanmax(pcc_cv)))
        '''
        for j, gamma in enumerate(gamma_list):
            model = MixedRidge(alphas=alpha)
            model.fit(X_train, U_train, y_train, gamma=gamma, cv=True)
            mse_cv = model.kfold(test_index_list,
                                 subset_indices=s0_index_list,
                                 return_mean=False)
            pcc_cv = model.pcc_cv
            metrics['pcc_cv'][i, j] = pcc_cv
            metrics['mse_cv'][i, j] = mse_cv
            logger.info(
                'cross-validation (fast) MSE = %f, %f, %f' %
                (np.nanmin(mse_cv), np.nanmean(mse_cv), np.nanmax(mse_cv)))
            logger.info(
                'cross-validation (fast) PCC = %f, %f, %f' %
                (np.nanmin(pcc_cv), np.nanmean(pcc_cv), np.nanmax(pcc_cv)))
            logger.info('alpha=%f, gamma=%f' % (alpha, gamma))

    pcc_cv_mean = np.nanmean(metrics['pcc_cv'], axis=2)
    i_best = np.argmax(np.max(pcc_cv_mean, axis=1))
    best_alpha = alpha_list[i_best]
    best_gamma = gamma_list[np.argmax(pcc_cv_mean[i_best])]
    logger.info('best model: alpha=%f, gamma=%f' % (best_alpha, best_gamma))

    best_model = MixedRidge(alphas=best_alpha)
    best_model.fit(X_train, U_train, y_train, gamma=best_gamma)
    y_pred_best = best_model.predict(X, U)
    # logger.info('best model on test data: pcc=%f, mse=%f'%(
    #     pearsonr(y[test_index], y_pred_best[test_index])[0],
    #     mean_squared_error(y[test_index], y_pred_best[test_index])))
    logger.info('save results to output directory: ' + args.output_dir)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    #model_file = os.path.join(args.output_dir, 'model')
    #with open(model_file, 'wb') as f:
    #    pickle.dump(best_model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    with h5py.File(pred_file, 'w') as f:
        f.create_dataset('best_alpha', data=best_alpha)
        f.create_dataset('best_gamma', data=best_gamma)
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred_best)
        f.create_dataset('pcc_cv', data=metrics['pcc_cv'])
        f.create_dataset('mse_cv', data=metrics['mse_cv'])
        f.create_dataset('alpha_list', data=np.asarray(alpha_list))
        f.create_dataset('gamma_list', data=np.asarray(gamma_list))
        f.create_dataset('test_index_mask', data=test_index_mask)
Example #15
def mixed_model(args):
    import os
    import h5py
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, read_hdf5_single
    from sklearn.metrics import r2_score, mean_squared_error

    logger.info('read predictions of the first model: ' + args.input_file1)
    with h5py.File(args.input_file1, 'r') as f:
        X1 = f['y_pred'][:][:, np.newaxis]
    logger.info('read predictions of the second model: ' + args.input_file2)
    with h5py.File(args.input_file2, 'r') as f:
        X2 = f['y_pred'][:][:, np.newaxis]
    X = np.concatenate([X1, X2], axis=1)
    logger.info('read phenotypes from file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)

    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(y))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]

    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))
    if args.model_name == 'linear':
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X)
        y_pred_train = y_pred[sample_indices]
        logger.info('coefficients: ' + ', '.join([str(a)
                                                  for a in model.coef_]))
    elif args.model_name == 'linear_cv':
        mix_factors, mse_train, mse_test = linear_cv(X, y, sample_indices,
                                                     parent_table)
        best_mix_factor = mix_factors[np.argmin(mse_test.mean(axis=1))]
        logger.info('best mix factor: %f' % best_mix_factor)
        X_mixed = (X[:, 0] * (1 - best_mix_factor) +
                   X[:, 1] * best_mix_factor)[:, np.newaxis]
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        model.fit(X_mixed[sample_indices], y_train)
        y_pred = model.predict(X_mixed)
        y_pred_train = y_pred[sample_indices]
    else:
        raise ValueError('unknown model name: ' + args.model_name)

    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))
    logger.info('mse = %f' % mean_squared_error(y_train, y_pred_train))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)

    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'linear_cv':
            f.create_dataset('mse_train', data=mse_train)
            f.create_dataset('mse_test', data=mse_test)
            f.create_dataset('mix_factors', data=mix_factors)
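linear_cv is not shown above. A hedged sketch consistent with how it is used (scan candidate mix factors between the two model predictions and score each by cross-validated MSE; generate_cv_masks as sketched under Example #13):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def linear_cv(X, y, sample_indices, parent_table, n_factors=11):
    mix_factors = np.linspace(0.0, 1.0, n_factors)
    train_masks, test_masks = generate_cv_masks(sample_indices, parent_table)
    mse_train = np.zeros((n_factors, train_masks.shape[0]))
    mse_test = np.zeros_like(mse_train)
    for i, factor in enumerate(mix_factors):
        # mix the two prediction columns with the candidate factor
        X_mixed = (X[:, 0]*(1 - factor) + X[:, 1]*factor)[:, np.newaxis]
        for j in range(train_masks.shape[0]):
            model = LinearRegression()
            model.fit(X_mixed[train_masks[j]], y[train_masks[j]])
            mse_train[i, j] = mean_squared_error(
                y[train_masks[j]], model.predict(X_mixed[train_masks[j]]))
            mse_test[i, j] = mean_squared_error(
                y[test_masks[j]], model.predict(X_mixed[test_masks[j]]))
    return mix_factors, mse_train, mse_test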