Example No. 1
def leave_k_out_blup(
        num_cvs=20,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
        k_thres=0.5):
    """

    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print phenotype, env
            s1 = time.time()
            #Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(phen_dict,
                                                             phenotype,
                                                             env,
                                                             k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            assert not sp.any(sp.isnan(snps)), 'Unexpected NaNs in the SNP matrix'
            K = kinship.calc_ibd_kinship(snps)
            print '\nKinship calculated'
            assert not sp.any(sp.isnan(K)), 'Unexpected NaNs in the kinship matrix'
            n = len(Y_means)
            #partition genotypes in k parts.
            gt_ids = d['gt_ids']
            num_ids = len(gt_ids)
            chunk_size = num_ids / num_cvs

            #Create k CV sets of prediction and validation data

            cv_chunk_size = int((n / num_cvs) + 1)
            ordering = sp.random.permutation(n)

            a = sp.arange(n)
            osb_ys = []
            pred_ys = []
            p_herits = []
            for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                cv_str = 'cv_%d' % cv_i
                #print 'Working on CV %d' % cv_i
                end_i = min(n, i + cv_chunk_size)
                validation_filter = sp.in1d(a, ordering[i:end_i])
                training_filter = sp.logical_not(validation_filter)

                train_snps = snps[:, training_filter]
                val_snps = snps[:, validation_filter]

                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                #Calc. kinship
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]
                #Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                #Now the BLUP.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                #delta = (1 - p_herit) / p_herit
                #        if K_inverse == None:
                #            K_inverse = K.I
                #        M = (sp.eye(K.shape[0]) + delta * K_inverse)
                #        u_blup = M.I * Y
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                osb_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)
            corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
            print 'Correlation:', corr
            r2 = corr**2
            print 'R2:', r2
            mean_herit = sp.mean(p_herits)
            print 'Avg. heritability:', mean_herit
            env_dict[env] = {
                'R2': r2,
                'obs_y': osb_ys,
                'pred_y': pred_ys,
                'corr': corr,
                'avg_herit': mean_herit
            }

        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file, 'w')
    for phenotype in phenotypes:
        phen_g = h5f.create_group(phenotype)
        for env in envs:
            d = res_dict[phenotype][env]
            env_g = phen_g.create_group(env)
            env_g.create_dataset('R2', data=[d['R2']])
            env_g.create_dataset('corr', data=[d['corr']])
            env_g.create_dataset('obs_y', data=d['obs_y'])
            env_g.create_dataset('pred_y', data=d['pred_y'])
            env_g.create_dataset('avg_herit', data=[d['avg_herit']])
    h5f.close()
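The prediction step above (the lines building M and u_mean_pred) is the gBLUP mixed-model equation: with pseudo-heritability h2, the training phenotypes have covariance proportional to h2 * K_train + (1 - h2) * I, and the predictions for the validation lines are K_cross times the inverse of that matrix times the mean-centred training phenotypes. Below is a minimal NumPy-only sketch of just that step on made-up data; the gblup_predict helper and all values are illustrative and not part of the original code.

import numpy as np

def gblup_predict(K_train, K_cross, y_train, h2):
    # Centre the training phenotypes, as in the example above.
    y = y_train - y_train.mean()
    # Covariance (up to a scale factor) of the training phenotypes.
    M = h2 * K_train + (1.0 - h2) * np.eye(K_train.shape[0])
    # BLUP of the validation values: K_cross * M^{-1} * y. The h2 scaling of
    # K_cross is omitted here, mirroring the example; including it would only
    # rescale the predictions and leave the reported correlation unchanged.
    return K_cross.dot(np.linalg.solve(M, y))

# Synthetic illustration: 5 training lines and 2 validation lines.
rng = np.random.RandomState(0)
G = rng.randn(7, 50)                     # 7 lines x 50 markers (made-up data)
K = G.dot(G.T) / 50.0                    # a simple marker-based kinship
K_train, K_cross = K[:5, :5], K[5:, :5]
y_train = rng.randn(5)
print(gblup_predict(K_train, K_cross, y_train, h2=0.4))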
Example No. 2
def leave_k_out_blup(num_repeats=20, num_cvs=5, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/', k_thres=0.5):
    """

    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    rep_dict = {}
    for rep_i in range(num_repeats):
        res_dict = {}
        for phenotype in phenotypes:
            env_dict = {}
            for env in envs:
                print phenotype, env
                s1 = time.time()
                # Load data..
                d = hdf5_data.coordinate_cegs_genotype_phenotype(
                    phen_dict, phenotype, env, k_thres=k_thres)
                Y_means = d['Y_means']
                snps = d['snps']
                assert not sp.any(sp.isnan(snps)), 'Unexpected NaNs in the SNP matrix'
                K = kinship.calc_ibd_kinship(snps)
                print '\nKinship calculated'
                assert not sp.any(sp.isnan(K)), 'Unexpected NaNs in the kinship matrix'
                n = len(Y_means)
                # partition genotypes in k parts.
                gt_ids = d['gt_ids']
                num_ids = len(gt_ids)
                chunk_size = num_ids / num_cvs

                # Create k CV sets of prediction and validation data

                cv_chunk_size = int((n / num_cvs) + 1)
                ordering = sp.random.permutation(n)

                a = sp.arange(n)
                osb_ys = []
                pred_ys = []
                p_herits = []
                for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                    cv_str = 'cv_%d' % cv_i
                    # print 'Working on CV %d' % cv_i
                    end_i = min(n, i + cv_chunk_size)
                    validation_filter = sp.in1d(a, ordering[i:end_i])
                    training_filter = sp.logical_not(validation_filter)

                    train_snps = snps[:, training_filter]
                    val_snps = snps[:, validation_filter]

                    train_Y = Y_means[training_filter]
                    val_Y = Y_means[validation_filter]

                    #Calc. kinship
                    K_train = K[training_filter, :][:, training_filter]
                    K_cross = K[validation_filter, :][:, training_filter]
                    # Do gBLUP
                    lmm = lm.LinearMixedModel(train_Y)
                    lmm.add_random_effect(K_train)
                    r1 = lmm.get_REML()

                    # Now the BLUP.
                    y_mean = sp.mean(lmm.Y)
                    Y = lmm.Y - y_mean
                    p_herit = r1['pseudo_heritability']
                    p_herits.append(p_herit)
                    #delta = (1 - p_herit) / p_herit
                    #        if K_inverse == None:
                    #            K_inverse = K.I
                    #        M = (sp.eye(K.shape[0]) + delta * K_inverse)
                    #        u_blup = M.I * Y
                    M = sp.mat(p_herit * sp.mat(K_train) +
                               (1 - p_herit) * sp.eye(K_train.shape[0]))
                    u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                    osb_ys.extend(val_Y)
                    pred_ys.extend(u_mean_pred)
                corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
                print 'Correlation:', corr
                r2 = corr**2
                print 'R2:', r2
                mean_herit = sp.mean(p_herits)
                print 'Avg. heritability:', mean_herit
                env_dict[env] = {'R2': r2, 'obs_y': osb_ys,
                                 'pred_y': pred_ys, 'corr': corr, 'avg_herit': mean_herit}

            res_dict[phenotype] = env_dict
        rep_dict[rep_i] = res_dict
    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file, 'w')
    for rep_i in range(num_repeats):
        res_dict = rep_dict[rep_i]
        rep_g = h5f.create_group('repl_%d' % rep_i)
        for phenotype in phenotypes:
            phen_g = rep_g.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2', data=[d['R2']])
                env_g.create_dataset('corr', data=[d['corr']])
                env_g.create_dataset('obs_y', data=d['obs_y'])
                env_g.create_dataset('pred_y', data=d['pred_y'])
                env_g.create_dataset('avg_herit', data=[d['avg_herit']])
    h5f.close()
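The results file written above contains one group per repetition ('repl_0', 'repl_1', ...), a subgroup per phenotype and environment, and the datasets 'R2', 'corr', 'obs_y', 'pred_y' and 'avg_herit'. Here is a minimal sketch of reading it back; the path assumes the default num_cvs=5 and k_thres=0.5 used in the format string above, so adjust it to wherever the file was actually written.

import h5py

res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_5_BLUP_results_kthres_0.5.hdf5'
with h5py.File(res_hdf5_file, 'r') as h5f:
    for rep_name, rep_g in h5f.items():
        for phenotype, phen_g in rep_g.items():
            for env, env_g in phen_g.items():
                print('%s %s (%s): R2=%.3f, avg. heritability=%.3f' % (
                    rep_name, phenotype, env, env_g['R2'][0], env_g['avg_herit'][0]))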
Example No. 3
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print 'Calculating kinship'
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            elif kinship_type == 'ibd':
                K = kinship.calc_ibd_kinship(d['snps'])
            else:
                raise NotImplementedError

            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            elif phen_type == 'medians':
                lmm = lm.LinearMixedModel(d['Y_medians'])
            else:
                raise NotImplementedError
            lmm.add_random_effect(K)

            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) / 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            #Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env,
                                         phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']],
                result_labels=[label_str],
                line_colors=['green'],
                num_dots=1000,
                title=None,
                max_neg_log_val=6)

            # Perform multiple loci mixed model GWAS
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for i, chrom in enumerate(sp.unique(chromosomes)):
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:  # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    #Toss U and Hets
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Plot manhattan plots?
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for i, chrom in enumerate(sp.unique(chromosomes)):
                plt.plot(x_positions[i],
                         y_log_pvals[i],
                         c=colors[i],
                         ls='',
                         marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf,
                       xmin,
                       xmax,
                       colors='k',
                       linestyles='--',
                       alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png'
                % (kinship_type, phenotype, env, phen_type))
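A minimal usage sketch for the function above, looping over the kinship types and phenotype summaries it supports; it assumes the supporting modules (hdf5_data, kinship, linear_models, analyze_gwas_results) and the hard-coded data paths are available.

# Run the mixed-model GWAS for every supported combination of options.
for kinship_type in ['ibd', 'ibs']:
    for phen_type in ['medians', 'means']:
        perform_cegs_gwas(kinship_type=kinship_type, phen_type=phen_type)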
Example No. 4
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print 'Calculating kinship'
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            elif kinship_type == 'ibd':
                K = kinship.calc_ibd_kinship(d['snps'])
            else:
                raise NotImplementedError

            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            elif phen_type == 'medians':
                lmm = lm.LinearMixedModel(d['Y_medians'])
            else:
                raise NotImplementedError
            lmm.add_random_effect(K)

            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) / 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']],
                result_labels=[label_str],
                line_colors=['green'],
                num_dots=1000,
                title=None,
                max_neg_log_val=6)

            # Perform multiple loci mixed model GWAS
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for i, chrom in enumerate(sp.unique(chromosomes)):
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:  # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    # Toss U and Hets
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Plot manhattan plots?
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for i, chrom in enumerate(sp.unique(chromosomes)):
                plt.plot(x_positions[i], y_log_pvals[i],
                         c=colors[i], ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k',
                       linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                        (kinship_type, phenotype, env, phen_type))
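The horizontal line drawn in the Manhattan plots is a Bonferroni threshold at alpha = 0.05: with m tested SNPs the per-test cutoff is 0.05 / m, so on the -log10 scale it equals -log10(0.05 / m) = log10(20 * m), which is exactly the log_bonf value computed above. A quick check with a hypothetical m:

import math

m = 100000                                # hypothetical number of tested SNPs
log_bonf = -math.log10(1.0 / (20.0 * m))  # same as -log10(0.05 / m)
print(log_bonf)                           # ~6.3, i.e. p ~ 5e-7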