Пример #1
0
    def get_blup(self, pid, K):
        """
        Returns the REML estimate for the BLUP and the pseudo-heritability.

        pid: phenotype identifier, passed to self.get_values() (and to
             self.get_incidence_matrix() when needed).
        K:   kinship matrix.  It must support '*' as a matrix product and
             the result of 'K + array' must expose '.I' (i.e. presumably a
             numpy matrix -- confirm against callers).

        Returns a dictionary with the keys
            'pseudo_heritability': the REML estimate reported by the model,
            'pval':           chi-square (1 d.f.) p-value of the likelihood
                              ratio test of the mixed model against
                              lm.LinearModel,
            'u_blup':         predicted random effects (on mean-centered Y),
            'blup_residuals': mean-centered Y minus 'u_blup'.
        """
        from scipy import stats
        import linear_models as lm
        phen_vals = self.get_values(pid)
        lmm = lm.LinearMixedModel(phen_vals)
        # Duplicate phenotype values are taken as the signal that an
        # incidence matrix is needed to expand K.
        if len(phen_vals) == len(set(phen_vals)):
            lmm.add_random_effect(K)
        else:
            Z = self.get_incidence_matrix(pid)
            lmm.add_random_effect(Z * K * Z.T)
        r1 = lmm.get_REML()
        ll1 = r1['max_ll']
        rlm = lm.LinearModel(phen_vals)
        ll0 = rlm.get_ll()
        lrt_stat = 2 * (ll1 - ll0)
        pval = stats.chi2.sf(lrt_stat, 1)

        # Now the BLUP.
        y_mean = sp.mean(lmm.Y)
        Y = lmm.Y - y_mean
        p_herit = r1['pseudo_heritability']
        # With delta = (1 - h) / h the textbook form is K * (K + delta*I)^-1 * Y.
        # That is algebraically identical to h * K * (h*K + (1 - h)*I)^-1 * Y,
        # which needs no division by h and therefore stays well defined
        # (giving an all-zero BLUP) when the heritability estimate is 0.
        # NOTE(review): when the incidence-matrix branch above is taken the
        # model is fitted with Z*K*Z.T but K itself is used here; confirm
        # that the dimensions of K and lmm.Y agree in that case.
        M = p_herit * K + (1 - p_herit) * sp.eye(K.shape[0])
        u_mean_pred = p_herit * (K * (M.I * Y))
        blup_residuals = Y - u_mean_pred
        return {'pseudo_heritability': p_herit, 'pval': pval,
                'u_blup': u_mean_pred, 'blup_residuals': blup_residuals}
 def _get_estimates_(self):
     """
     Build a mixed model from self.pvls with self.k as random effect and
     store the model in self.lmm and its estimates in self.est.
     """
     print("Initializing mixed model...")
     self.lmm = mixed_model = lm.LinearMixedModel(self.pvls)
     mixed_model.add_random_effect(self.k)
     kinship_eig = mixed_model._get_eigen_L_()
     print("Estimating variance components...")
     self.est = mixed_model.get_estimates(kinship_eig, self.k)
Пример #3
0
def _test_scz_():
    """
    Simulate common, singleton and doubleton SNPs plus 30 traits, run EMMAX
    on every trait, write a phenotype histogram and QQ-plots, and print the
    mean and standard deviation of the permutation-based 5% thresholds for
    each SNP class.

    All output files are written with the prefix '/home/bv25/tmp/test'.
    """
    # Simulate the genotypes (no real data is loaded here).
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)
    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)

    # The kinship depends on the genotypes only, so compute it once instead
    # of once per trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    # Order matters: it fixes the order of the permutation runs and of the
    # summary lines printed at the end.
    snp_sets = [('singletons', singleton_snps),
                ('doubletons', doubleton_snps),
                ('common', common_snps)]
    thresholds = dict((label, []) for label, _ in snp_sets)

    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)
        fig = plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        # p-values are ordered like the rows of 'snps': common, singletons,
        # doubletons (1000 each).
        agr.plot_simple_qqplots_pvals(
            '/home/bv25/tmp/test_%d' % i,
            [ex_res['ps'][:1000], ex_res['ps'][1000:2000],
             ex_res['ps'][2000:]],
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'],
            num_dots=200,
            max_neg_log_val=3)
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this one figure per trait would accumulate.
        plt.close(fig)

        # Now permutations..
        for label, snp_set in snp_sets:
            res = lm.emmax_perm_test(snp_set, y, K, num_perm=1000)
            # First value: the Bonferroni 5% threshold for 1000 tests.
            print('%s %s' % (1.0 / (20 * 1000.0), res['threshold_05']))
            thresholds[label].append(res['threshold_05'][0])

    for label, _ in snp_sets:
        print('%s %s' % (sp.mean(thresholds[label]),
                         sp.std(thresholds[label])))
Пример #4
0
 def __init__(self, y, fixed_effects=None, K=None, Z=None):
     """
     Set up self.lmm, a linear mixed model for the phenotype values y.

     y:             phenotype values, passed to LinearMixedModel as Y.
     fixed_effects: optional iterable of cofactors; each one is added with
                    add_factor().
     K:             kinship matrix used as the random effect.
     Z:             optional incidence matrix.  When given, the random
                    effect is Z * K * Z.T and every cofactor is multiplied
                    by Z before it is added.
     """
     self.lmm = linear_models.LinearMixedModel(Y=y)
     if Z is not None:
         self.lmm.add_random_effect(Z * K * Z.T)
     else:
         self.lmm.add_random_effect(K)
     # Test against None in both cases: the truth value of a numpy array
     # with more than one element is ambiguous and raises ValueError.
     if fixed_effects is not None:
         for cofactor in fixed_effects:
             if Z is not None:
                 self.lmm.add_factor(Z * cofactor)
             else:
                 self.lmm.add_factor(cofactor)
Пример #5
0
    def get_pseudo_heritability(self, K):
        """
        Returns the REML estimate of the heritability together with the
        p-value of a likelihood ratio test against a plain linear model.

        K is used directly as the random effect when all entries of
        self.values are distinct; otherwise it is expanded with the
        incidence matrix to Z * K * Z.T.

        The result is a dictionary with the keys 'pseudo_heritability'
        and 'pval'.
        """
        from scipy import stats
        import linear_models as lm
        # NOTE(review): the mixed model is built from self.phen_vals while
        # the duplicate check and the null model use self.values -- confirm
        # that both attributes hold the same data.
        mixed_model = lm.LinearMixedModel(self.phen_vals)
        all_distinct = len(set(self.values)) == len(self.values)
        if not all_distinct:
            Z = self.get_incidence_matrix()
            mixed_model.add_random_effect(Z * K * Z.T)
        else:
            mixed_model.add_random_effect(K)
        reml = mixed_model.get_REML()
        alt_ll = reml['max_ll']
        null_ll = lm.LinearModel(self.values).get_ll()
        # Likelihood ratio statistic, compared to a chi-square with 1 d.f.
        lrt_stat = 2 * (alt_ll - null_ll)
        result = {}
        result['pseudo_heritability'] = reml['pseudo_heritability']
        result['pval'] = stats.chi2.sf(lrt_stat, 1)
        return result
Пример #6
0
def leave_k_out_blup(
        num_cvs=20,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
        k_thres=0.5):
    """
    Cross-validated gBLUP prediction for the CEGS Drosophila phenotypes.

    For each phenotype/environment combination the individuals are split
    at random into validation chunks of size n // num_cvs + 1.  For every
    chunk a mixed model is fitted by REML on the remaining individuals and
    the chunk's phenotype means are predicted from the kinship between
    validation and training individuals.  The correlation between observed
    and predicted values, its square and the average pseudo-heritability
    are printed and stored in
    '/Users/bjarnivilhjalmsson/data/tmp/leave_<num_cvs>_BLUP_results_kthres_<k_thres>.hdf5'.

    num_cvs:       controls the validation chunk size (see above).
    genotype_file: not used by this function; kept for compatibility.
    k_thres:       passed on to
                   hdf5_data.coordinate_cegs_genotype_phenotype().
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print('%s %s' % (phenotype, env))
            # Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(phen_dict,
                                                             phenotype,
                                                             env,
                                                             k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            # logical_not instead of negative: NumPy >= 1.13 rejects the
            # numerical negative of a boolean array.
            assert sp.all(sp.logical_not(sp.isnan(snps))), 'WTF?'
            K = kinship.calc_ibd_kinship(snps)
            print('\nKinship calculated')
            assert sp.all(sp.logical_not(sp.isnan(K))), 'WTF?'
            n = len(Y_means)

            # Create the CV sets of prediction and validation data
            cv_chunk_size = int(n // num_cvs) + 1
            ordering = sp.random.permutation(n)
            indices = sp.arange(n)

            obs_ys = []
            pred_ys = []
            p_herits = []
            for i in range(0, n, cv_chunk_size):
                # Slicing past the end is fine, the last chunk is just shorter.
                validation_filter = sp.in1d(indices,
                                            ordering[i:i + cv_chunk_size])
                training_filter = sp.logical_not(validation_filter)

                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                # Kinship within the training set, and between validation
                # (rows) and training (columns) individuals.
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]

                # Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                # Now the BLUP, on the mean-centered training phenotypes.
                Y = lmm.Y - sp.mean(lmm.Y)
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                obs_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)

            corr = sp.corrcoef(obs_ys, pred_ys)[1, 0]
            print('Correlation: %s' % (corr,))
            r2 = corr**2
            print('R2: %s' % (r2,))
            mean_herit = sp.mean(p_herits)
            print('Avg. heritability: %s' % (mean_herit,))
            env_dict[env] = {
                'R2': r2,
                'obs_y': obs_ys,
                'pred_y': pred_ys,
                'corr': corr,
                'avg_herit': mean_herit
            }

        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Mode 'a' (create if missing) is what older h5py used when no mode was
    # given; h5py >= 3 defaults to read-only and would fail here.  The
    # context manager closes the file even when writing fails.
    with h5py.File(res_hdf5_file, 'a') as h5f:
        for phenotype in phenotypes:
            phen_g = h5f.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2', data=[d['R2']])
                env_g.create_dataset('corr', data=[d['corr']])
                env_g.create_dataset('obs_y', data=d['obs_y'])
                env_g.create_dataset('pred_y', data=d['pred_y'])
                env_g.create_dataset('avg_herit', data=[d['avg_herit']])
Пример #7
0
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits (4 phenotypes x 2
    environments).

    For every trait the EMMAX F-test is run, a QQ-plot of the p-values is
    produced and a Manhattan plot is saved under
    '/Users/bjarnivilhjalmsson/data/tmp/'.

    kinship_type: 'ibs' or 'ibd', selects the kinship estimator.
    phen_type:    'means' or 'medians', selects d['Y_means'] or
                  d['Y_medians'] as the phenotype values.

    Raises NotImplementedError for any other kinship_type or phen_type.
    """
    # Validate up front so a typo fails before any data is loaded.  (The
    # bare 'NotImplemented' constant is not an exception and cannot be
    # raised, hence NotImplementedError for both arguments.)
    if kinship_type not in ('ibs', 'ibd'):
        raise NotImplementedError(
            'Unknown kinship_type: %r' % (kinship_type,))
    phen_keys = {'means': 'Y_means', 'medians': 'Y_medians'}
    if phen_type not in phen_keys:
        raise NotImplementedError('Unknown phen_type: %r' % (phen_type,))

    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print('%s %s' % (phenotype, env))
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print('Calculating kinship')
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            else:
                K = kinship.calc_ibd_kinship(d['snps'])

            lmm = lm.LinearMixedModel(d[phen_keys[phen_type]])
            lmm.add_random_effect(K)

            print("Running EMMAX")
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) // 60
                secs = secs - mins * 60
                print('Took %d mins and %f seconds.' % (mins, secs))
            else:
                print('Took %f seconds.' % (secs))

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env,
                                         phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']],
                result_labels=[label_str],
                line_colors=['green'],
                num_dots=1000,
                title=None,
                max_neg_log_val=6)

            # Collect per-chromosome x/y values for the Manhattan plot.
            # Chromosomes are laid out side by side by shifting positions
            # with the running sum of the per-chromosome maxima.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            unique_chromosomes = sp.unique(chromosomes)
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in unique_chromosomes:
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:  # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    # Everything else is plotted too, just in another color.
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            # Bonferroni threshold at the 5% level, on the -log10 scale.
            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Manhattan plot
            fig = plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for i in range(len(unique_chromosomes)):
                plt.plot(x_positions[i],
                         y_log_pvals[i],
                         c=colors[i],
                         ls='',
                         marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf,
                       xmin,
                       xmax,
                       colors='k',
                       linestyles='--',
                       alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png'
                % (kinship_type, phenotype, env, phen_type))
            # pyplot keeps figures alive until closed; release this one so
            # the eight figures do not pile up in memory.
            plt.close(fig)
Пример #8
0
    def _emmax_permutations(self,
                            snps,
                            phenotypes,
                            num_perm,
                            K=None,
                            Z=None,
                            method='REML'):
        """
        EMMAX permutation test
        Single SNPs

        snps:       sequence of SNPs; it is sliced in chunks and each chunk
                    is converted to a matrix with one SNP per row.
        phenotypes: phenotype values for the mixed model.
        num_perm:   number of permutations.
        K:          kinship matrix (required).
        Z:          optional incidence matrix.  When given, the random
                    effect is Z * K * Z.T and the SNPs are multiplied by
                    Z.T; otherwise K and the SNPs are used as they are.
        method:     passed on to get_estimates().

        Returns a dictionary with, for every permutation, the minimum
        p-value ('min_ps') and the maximum F-statistic ('max_f_stats')
        over all SNPs.

        Raises ValueError if K is None.
        """
        if K is None:
            raise ValueError('A kinship matrix K is required.')

        lmm = lm.LinearMixedModel(phenotypes)
        if Z is not None:
            lmm.add_random_effect(Z * K * Z.T)
        else:
            lmm.add_random_effect(K)

        eig_L = lmm._get_eigen_L_()

        print('Getting variance estimates')
        res = lmm.get_estimates(eig_L, method=method)

        q = 1  # Single SNP is being tested
        p = len(lmm.X.T) + q
        n = lmm.n
        n_p = n - p
        H_sqrt_inv = res['H_sqrt_inv']

        Y = H_sqrt_inv * lmm.Y  # The transformed outputs.
        h0_X = H_sqrt_inv * lmm.X
        (h0_betas, h0_rss, h0_rank, h0_s) = linalg.lstsq(h0_X, Y)
        # Continue with the residuals of the null model.
        Y = Y - h0_X * h0_betas

        num_snps = len(snps)
        chunk_size = len(Y)
        print("Working with chunk size: " + str(chunk_size))
        print("and " + str(num_snps) + " SNPs.")
        Ys = sp.mat(sp.zeros((chunk_size, num_perm)))

        # One column per permutation; Y is shuffled in place and copied.
        for perm_i in range(num_perm):
            sp.random.shuffle(Y)
            Ys[:, perm_i] = Y

        # A progress dot is printed every 'dot_interval' SNPs.  The max()
        # guards against a modulo by zero when there are fewer SNPs than
        # permutations.
        dot_interval = max(1, num_snps // num_perm)

        min_rss_list = sp.repeat(h0_rss, num_perm)
        for i in range(0, num_snps,
                       chunk_size):  # Do the dot-product in chunks!
            snps_chunk = sp.matrix(snps[i:(i + chunk_size)])
            if Z is not None:
                snps_chunk = snps_chunk * Z.T
            Xs = snps_chunk * (H_sqrt_inv.T)
            Xs = Xs - sp.mat(sp.mean(Xs, axis=1))
            for j in range(len(Xs)):  # for each snp
                # Fit the SNP against all permuted phenotypes at once.  The
                # rank gets its own name so that 'p' above is not clobbered.
                (_betas, rss_list, _rank,
                 _sigma) = linalg.lstsq(Xs[j].T, Ys, overwrite_a=True)
                for k, rss in enumerate(rss_list):
                    if not rss:
                        print('No predictability in the marker, moving on...')
                        continue
                    if min_rss_list[k] > rss:
                        min_rss_list[k] = rss
                if num_snps >= 10 and (i + j + 1) % dot_interval == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

        if num_snps >= 10:
            sys.stdout.write('\n')

        max_f_stats = ((h0_rss / min_rss_list) - 1.0) * n_p / float(q)
        min_pvals = (stats.f.sf(max_f_stats, q, n_p))

        res_d = {'min_ps': min_pvals, 'max_f_stats': max_f_stats}
        print("There are: " + str(len(min_pvals)))
        return res_d
Пример #9
0
def _test_():
    """
    Simulate common, singleton and doubleton SNPs, standardize them,
    simulate 30 traits, run EMMAX on every trait, write a phenotype
    histogram and QQ-plots, and print the mean and standard deviation of
    the permutation-based 5% thresholds for each SNP class.

    All output files are written with the prefix '$HOME/tmp/test'.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)
    # Standardize every SNP (row) to zero mean and unit standard deviation.
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print('%s %s' % (snps, snps.shape))
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    # The kinship depends on the genotypes only, so compute it once instead
    # of once per trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    # Order matters: it fixes the order of the permutation runs and of the
    # summary lines printed at the end.  Note that the permutation tests
    # run on the SNPs as simulated, not on the standardized copies.
    snp_sets = [('singletons', singleton_snps),
                ('doubletons', doubleton_snps),
                ('common', common_snps)]
    thresholds = dict((label, []) for label, _ in snp_sets)

    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)
        fig = plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        # p-values are ordered like the rows of 'snps': common, singletons,
        # doubletons (1000 each).
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i),
            [ex_res['ps'][:1000], ex_res['ps'][1000:2000],
             ex_res['ps'][2000:]],
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'],
            num_dots=200,
            max_neg_log_val=3)
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this one figure per trait would accumulate.
        plt.close(fig)

        # Cholesky permutations..
        for label, snp_set in snp_sets:
            res = lm.emmax_perm_test(snp_set, y, K, num_perm=1000)
            # First value: the Bonferroni 5% threshold for 1000 tests.
            print('%s %s' % (1.0 / (20 * 1000.0), res['threshold_05']))
            thresholds[label].append(res['threshold_05'][0])

        #ATT permutations (Implement)

        #PC permutations (Implement)

    for label, _ in snp_sets:
        print('%s %s' % (sp.mean(thresholds[label]),
                         sp.std(thresholds[label])))
Пример #10
0
def run_emmax(hdf5_filename='/home/bv25/data/Ls154/Ls154_12.hdf5',
              out_file='/home/bv25/data/Ls154/Ls154_results.hdf5',
              min_maf=0.1,
              recalculate_kinship=True,
              chunk_size=1000):
    """
    Apply the EMMAX algorithm to hdf5 formated genotype/phenotype data

    hdf5_filename:       input file.  The code reads the groups
                         'genot_data' (one sub-group per chromosome with
                         'freqs', 'raw_snps' and 'positions') and
                         'indiv_data' ('indiv_ids', 'phenotypes'), the
                         dataset 'num_snps' and, if the kinship is not
                         recalculated, 'kinship'.
    out_file:            results file.  Receives the variance component
                         estimates and, per chromosome, the p-values and
                         positions under 'chrom_results'.
    min_maf:             only SNPs with a minor allele frequency above
                         this value are used.
    recalculate_kinship: if True the kinship is calculated from the
                         MAF-filtered, standardized SNPs, otherwise it is
                         read from the input file.
    chunk_size:          number of SNPs handled at a time when
                         accumulating the kinship matrix.
    """
    # The input is only read, so open it read-only.  The context managers
    # make sure both files are closed even if something fails on the way.
    with h5py.File(hdf5_filename, 'r') as ih5f:
        gg = ih5f['genot_data']
        ig = ih5f['indiv_data']
        n_indivs = len(ig['indiv_ids'][...])

        if recalculate_kinship:
            print('Calculating kinship.')
            k_mat = sp.zeros((n_indivs, n_indivs), dtype='single')

            n_snps = 0
            for chrom in gg.keys():
                print('Working on Chromosome %s' % chrom)
                cg = gg[chrom]
                freqs = cg['freqs'][...]
                mafs = sp.minimum(freqs, 1 - freqs)
                maf_filter = mafs > min_maf
                print('Filtered out %d SNPs with MAF<%0.2f.' % (
                    len(maf_filter) - sp.sum(maf_filter), min_maf))
                snps = cg['raw_snps'][...]
                snps = snps[maf_filter]
                num_snps = len(snps)

                for chunk_i, i in enumerate(range(0, num_snps, chunk_size)):
                    end_i = min(i + chunk_size, num_snps)
                    # Standardize every SNP (row) and accumulate X'X.
                    x = snps[i:end_i]
                    x = x.T
                    x = (x - sp.mean(x, 0)) / sp.std(x, 0)
                    x = x.T
                    n_snps += len(x)
                    k_mat += sp.dot(x.T, x)
                    del x
                    # The backspaces overwrite the previous percentage.
                    sys.stdout.write(
                        '\b\b\b\b\b\b\b%0.2f%%' %
                        (100.0 *
                         (min(1,
                              ((chunk_i + 1.0) * chunk_size) / num_snps))))
                    sys.stdout.flush()
                sys.stdout.write('\b\b\b\b\b\b\b100.00%\n')
            k_mat = k_mat / float(n_snps)
            # Element-wise product with the centering matrix, summed:
            # the quantity the kinship is rescaled by.
            c = sp.sum(
                (sp.eye(len(k_mat)) -
                 (1.0 / len(k_mat)) * sp.ones(k_mat.shape)) * sp.array(k_mat))
            scalar = (len(k_mat) - 1) / c
            print('Kinship scaled by: %0.4f' % scalar)
            k = scalar * k_mat
        else:
            assert 'kinship' in ih5f.keys(
            ), 'Kinship is missing.  Please calculate that first!'
            # Load the values: a bare h5py dataset is only a handle into
            # the file, not an array.
            k = ih5f['kinship'][...]

        # Get the phenotypes
        phenotypes = ig['phenotypes'][...]

        # Initialize the mixed model
        lmm = lm.LinearMixedModel(phenotypes)
        lmm.add_random_effect(k)
        # Calculate pseudo-heritability, etc.
        print('Calculating the eigenvalues of K')
        s0 = time.time()
        eig_L = lmm._get_eigen_L_()
        print('Done.')
        print('Took %0.2f seconds' % (time.time() - s0))
        print("Calculating the eigenvalues of S(K+I)S where S = I-X(X'X)^-1X'")
        s0 = time.time()
        eig_R = lmm._get_eigen_R_(X=lmm.X)
        print('Done')
        print('Took %0.2f seconds' % (time.time() - s0))

        print('Getting variance estimates')
        s0 = time.time()
        res = lmm.get_estimates(eig_L, method='REML',
                                eig_R=eig_R)  # Get the variance estimates..
        print('Done.')
        print('Took %0.2f seconds' % (time.time() - s0))
        print('pseudo_heritability: %s' % (res['pseudo_heritability'],))

        # Initialize results file.  Mode 'a' (create if missing) is what
        # older h5py used when no mode was given; h5py >= 3 defaults to
        # read-only and would fail to create the file.
        with h5py.File(out_file, 'a') as oh5f:
            # Store phenotype_data
            oh5f.create_dataset('pseudo_heritability',
                                data=sp.array(res['pseudo_heritability']))
            oh5f.create_dataset('ve', data=sp.array(res['ve']))
            oh5f.create_dataset('vg', data=sp.array(res['vg']))
            oh5f.create_dataset('max_ll', data=sp.array(res['max_ll']))
            oh5f.create_dataset('num_snps', data=ih5f['num_snps'][...])

            # Construct results data containers
            chrom_res_group = oh5f.create_group('chrom_results')

            for chrom in gg.keys():
                crg = chrom_res_group.create_group(chrom)
                # Get the SNPs
                print('Working on Chromosome: %s' % chrom)
                freqs = gg[chrom]['freqs'][...]
                mafs = sp.minimum(freqs, 1 - freqs)
                maf_filter = mafs > min_maf
                print('Filtered out %d SNPs with MAF<%0.2f.' % (
                    len(maf_filter) - sp.sum(maf_filter), min_maf))
                snps = gg[chrom]['raw_snps'][...]
                snps = snps[maf_filter]
                positions = gg[chrom]['positions'][...]
                positions = positions[maf_filter]

                # Now run EMMAX
                print("Running EMMAX")
                s1 = time.time()
                r = lmm._emmax_f_test_(snps,
                                       res['H_sqrt_inv'],
                                       with_betas=False,
                                       emma_num=0,
                                       eig_L=eig_L)
                secs = time.time() - s1
                if secs > 60:
                    mins = int(secs) // 60
                    secs = secs % 60
                    print('Took %d mins and %0.1f seconds.' % (mins, secs))
                else:
                    print('Took %0.1f seconds.' % (secs))
                crg.create_dataset('ps', data=r['ps'])
                crg.create_dataset('positions', data=positions)
                # Flush per chromosome so finished results survive a crash.
                oh5f.flush()
Пример #11
0
def run_emmax_perm(hdf5_filename='/home/bv25/data/Ls154/Ls154_12.hdf5',
                   out_file='/home/bv25/data/Ls154/Ls154_results_perm.hdf5',
                   min_maf=0.1,
                   recalculate_kinship=True,
                   chunk_size=1000,
                   num_perm=500):
    """
    Apply the EMMAX algorithm to hdf5 formated genotype/phenotype data
    and estimate a genome-wide significance threshold by permutations.

    hdf5_filename:       input file.  The code reads the groups
                         'genot_data' (one sub-group per chromosome with
                         'freqs', 'raw_snps' and 'positions') and
                         'indiv_data' ('indiv_ids', 'phenotypes').
    out_file:            results file.  Receives the kinship, the variance
                         component estimates, the per-chromosome p-values
                         and positions, and the permutation results.
    min_maf:             only SNPs with a minor allele frequency above
                         this value are used.
    recalculate_kinship: if False and the input file contains a 'kinship'
                         dataset, that kinship is used; in every other
                         case the kinship is calculated from the
                         MAF-filtered, standardized SNPs.
    chunk_size:          number of SNPs handled at a time when
                         accumulating the kinship matrix.
    num_perm:            number of permutations.

    The permutation test is run on the SNPs of all chromosomes except the
    last one returned by the genotype group.
    """
    # The input is only read, so open it read-only.  The context managers
    # make sure both files are closed even if something fails on the way.
    with h5py.File(hdf5_filename, 'r') as ih5f:
        gg = ih5f['genot_data']
        ig = ih5f['indiv_data']
        n_indivs = len(ig['indiv_ids'][...])
        # A list, so that the last chromosome can be addressed by index.
        chromosomes = list(gg.keys())

        if recalculate_kinship or 'kinship' not in ih5f:
            print('Calculating kinship.')
            k_mat = sp.zeros((n_indivs, n_indivs), dtype='single')

            n_kinship_snps = 0
            for chrom in chromosomes:
                print('Working on Chromosome %s' % chrom)
                cg = gg[chrom]
                freqs = cg['freqs'][...]
                mafs = sp.minimum(freqs, 1 - freqs)
                maf_filter = mafs > min_maf
                print('Filtered out %d SNPs with MAF<%0.2f.' % (
                    len(maf_filter) - sp.sum(maf_filter), min_maf))
                snps = cg['raw_snps'][...]
                snps = snps[maf_filter]
                num_snps = len(snps)

                for chunk_i, i in enumerate(range(0, num_snps, chunk_size)):
                    end_i = min(i + chunk_size, num_snps)
                    # Standardize every SNP (row) and accumulate X'X.
                    x = snps[i:end_i]
                    x = x.T
                    x = (x - sp.mean(x, 0)) / sp.std(x, 0)
                    x = x.T
                    n_kinship_snps += len(x)
                    k_mat += sp.dot(x.T, x)
                    del x
                    # The backspaces overwrite the previous percentage.
                    sys.stdout.write(
                        '\b\b\b\b\b\b\b%0.2f%%' %
                        (100.0 *
                         (min(1,
                              ((chunk_i + 1.0) * chunk_size) / num_snps))))
                    sys.stdout.flush()
                sys.stdout.write('\b\b\b\b\b\b\b100.00%\n')
            k_mat = k_mat / float(n_kinship_snps)
            # Element-wise product with the centering matrix, summed:
            # the quantity the kinship is rescaled by.
            c = sp.sum(
                (sp.eye(len(k_mat)) -
                 (1.0 / len(k_mat)) * sp.ones(k_mat.shape)) * sp.array(k_mat))
            scalar = (len(k_mat) - 1) / c
            print('Kinship scaled by: %0.4f' % scalar)
            k = scalar * k_mat
        else:
            # Load the values: a bare h5py dataset is only a handle into
            # the file, not an array.
            k = ih5f['kinship'][...]

        # Initialize results file and store the kinship.  Mode 'a' (create
        # if missing) is what older h5py used when no mode was given;
        # h5py >= 3 defaults to read-only and would fail to create the file.
        with h5py.File(out_file, 'a') as oh5f:
            oh5f.create_dataset('kinship', data=k)
            oh5f.flush()

            # Count the SNPs passing the MAF filter, in total and on all
            # chromosomes but the last.
            num_tot_snps = 0
            num_12_chr_snps = 0
            for chrom in chromosomes:
                freqs = gg[chrom]['freqs'][...]
                mafs = sp.minimum(freqs, 1 - freqs)
                num_chrom_snps = int(sp.sum(mafs > min_maf))
                num_tot_snps += num_chrom_snps
                if chrom != chromosomes[-1]:
                    num_12_chr_snps += num_chrom_snps

            # Get the phenotypes
            phenotypes = ig['phenotypes'][...]

            # Initialize the mixed model
            lmm = lm.LinearMixedModel(phenotypes)
            lmm.add_random_effect(k)
            # Calculate pseudo-heritability, etc.
            print('Calculating the eigenvalues of K')
            s0 = time.time()
            eig_L = lmm._get_eigen_L_()
            print('Done.')
            print('Took %0.2f seconds' % (time.time() - s0))
            print("Calculating the eigenvalues of S(K+I)S where S = I-X(X'X)^-1X'")
            s0 = time.time()
            eig_R = lmm._get_eigen_R_(X=lmm.X)
            print('Done')
            print('Took %0.2f seconds' % (time.time() - s0))

            print('Getting variance estimates')
            s0 = time.time()
            res = lmm.get_estimates(eig_L, method='REML',
                                    eig_R=eig_R)  # Get the variance estimates..
            print('Done.')
            print('Took %0.2f seconds' % (time.time() - s0))
            print('pseudo_heritability: %s' % (res['pseudo_heritability'],))

            # Store phenotype_data
            oh5f.create_dataset('pseudo_heritability',
                                data=sp.array(res['pseudo_heritability']))
            oh5f.create_dataset('ve', data=sp.array(res['ve']))
            oh5f.create_dataset('vg', data=sp.array(res['vg']))
            oh5f.create_dataset('max_ll', data=sp.array(res['max_ll']))
            # The total over all chromosomes, not just the count of the
            # chromosome that happened to be processed last.
            oh5f.create_dataset('num_snps', data=sp.array(num_tot_snps))

            # Construct results data containers
            chrom_res_group = oh5f.create_group('chrom_results')
            chr12_snps = sp.empty((num_12_chr_snps, n_indivs))
            i = 0
            for chrom in chromosomes:
                crg = chrom_res_group.create_group(chrom)
                # Get the SNPs
                print('Working on Chromosome: %s' % chrom)
                freqs = gg[chrom]['freqs'][...]
                mafs = sp.minimum(freqs, 1 - freqs)
                maf_filter = mafs > min_maf
                print('Filtered out %d SNPs with MAF<%0.2f.' % (
                    len(maf_filter) - sp.sum(maf_filter), min_maf))
                snps = gg[chrom]['raw_snps'][...]
                snps = snps[maf_filter]
                positions = gg[chrom]['positions'][...]
                positions = positions[maf_filter]
                n = len(snps)
                # Collect the SNPs for the permutation test.
                if chrom != chromosomes[-1]:
                    chr12_snps[i:i + n] = snps
                # Now run EMMAX
                print("Running EMMAX")
                s1 = time.time()
                r = lmm._emmax_f_test_(snps,
                                       res['H_sqrt_inv'],
                                       with_betas=False,
                                       emma_num=0,
                                       eig_L=eig_L)
                secs = time.time() - s1
                if secs > 60:
                    mins = int(secs) // 60
                    secs = secs % 60
                    print('Took %d mins and %0.1f seconds.' % (mins, secs))
                else:
                    print('Took %0.1f seconds.' % (secs))
                crg.create_dataset('ps', data=r['ps'])
                crg.create_dataset('positions', data=positions)
                # Flush per chromosome so finished results survive a crash.
                oh5f.flush()
                i += n

            print('Starting permutation test for detecting the genome-wide significance threshold')
            s1 = time.time()
            perm_res = lmm._emmax_permutations_(chr12_snps,
                                                k,
                                                res['H_sqrt_inv'],
                                                num_perm=num_perm)
            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) // 60
                secs = secs % 60
                print('Took %d mins and %0.1f seconds.' % (mins, secs))
            else:
                print('Took %0.1f seconds.' % (secs))

            # Smallest p-values first, largest F-statistics first, so that
            # the same index picks the 5% most extreme value in both.  The
            # reversed slice has to be assigned back: on its own it only
            # builds a view that is thrown away.
            perm_res['min_ps'].sort()
            perm_res['max_f_stats'].sort()
            perm_res['max_f_stats'] = perm_res['max_f_stats'][::-1]
            five_perc_i = int(num_perm // 20)
            print("The 0.05 genome-wide significance threshold is %0.4e, and the corresponding statistic is %0.4e." % (
                perm_res['min_ps'][five_perc_i],
                perm_res['max_f_stats'][five_perc_i]))
            oh5f.create_dataset('perm_min_ps', data=perm_res['min_ps'])
            oh5f.create_dataset('perm_max_f_stats',
                                data=perm_res['max_f_stats'])
            oh5f.create_dataset('five_perc_perm_min_ps',
                                data=perm_res['min_ps'][five_perc_i])
            oh5f.create_dataset('five_perc_perm_max_f_stats',
                                data=perm_res['max_f_stats'][five_perc_i])