Пример #1
0
 def test_gensmall(self):
     """Smoke-test ``snp_gen`` on a grid of small and empty problem sizes.

     The main goal is that generation does not raise. The result is also
     checked for the requested SNP count, for no more than the requested
     number of individuals, and for upper bounds on the three ``pos`` columns.
     """

     def per_chr_limit(sid_total, chr_total):
         # Only called on the right-hand side of an ``or`` below, i.e. when
         # ``pos`` is non-empty, so the division stays as lazy as it would be
         # if written inline.
         return int(max(1, np.ceil(float(sid_total) / chr_total)))

     for n_iid in (10, 5, 3, 2, 1, 0):
         for n_sid in (0, 10, 5, 3, 2, 1):
             for n_chr in (30, 10, 5, 3, 2, 1, 0):
                 if n_sid > 0 and n_chr == 0:
                     # Skip just this combination (deliberately ``continue``,
                     # not ``break``).
                     continue
                 logging.debug("{0}, {1}, {2}".format(n_iid, n_sid, n_chr))
                 generated = snp_gen(
                     fst=0.1,
                     dfr=0.5,
                     iid_count=n_iid,
                     sid_count=n_sid,
                     maf_low=0.05,
                     seed=6,
                     chr_count=n_chr,
                 )
                 # Fewer individuals than requested is tolerated; the SNP
                 # count must match exactly.
                 assert generated.iid_count <= n_iid
                 assert generated.sid_count == n_sid
                 # Column 0 of ``pos`` is bounded by the chromosome count;
                 # columns 1 and 2 by the per-chromosome SNP limit.
                 assert len(generated.pos) == 0 or (
                     max(generated.pos[:, 0]) <= n_chr
                 )
                 assert len(generated.pos) == 0 or (
                     max(generated.pos[:, 1]) <= per_chr_limit(n_sid, n_chr)
                 )
                 assert len(generated.pos) == 0 or (
                     max(generated.pos[:, 2]) <= per_chr_limit(n_sid, n_chr)
                 )
Пример #2
0
 def gen_and_compare(self, output_file, **kwargs):
     """Generate SNP data and compare it with a stored reference file.

     :param output_file: file name, relative to ``<currentFolder>/expected/``,
         of the reference Bed file to compare against
     :param kwargs: forwarded unchanged to ``snp_gen``
     :returns: the freshly generated SNP data
     :raises AssertionError: if ``TestSnpGen.is_same`` reports a mismatch
     """
     generated = snp_gen(**kwargs)
     # Disabled snippet kept from the original; it would write the generated
     # data under <currentFolder>/tempdir/ instead of only comparing it:
     # pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
     # Bed.write(generated, self.currentFolder + "/tempdir/" + output_file)
     reference_path = self.currentFolder + "/expected/" + output_file
     reference = Bed(reference_path).read()
     matches = TestSnpGen.is_same(generated, reference)
     assert matches, "Failure on " + output_file
     return generated
Пример #3
0
 def test_gensmall(self):
     """Smoke-test ``snp_gen`` on a grid of small and empty problem sizes.

     The main goal is that generation does not raise. The result is also
     checked for the requested SNP count, for no more than the requested
     number of individuals, and for upper bounds on the three ``pos`` columns.
     """

     def per_chr_limit(sid_total, chr_total):
         # Only called on the right-hand side of an ``or`` below, i.e. when
         # ``pos`` is non-empty, so the division stays as lazy as it would be
         # if written inline.
         return int(max(1, np.ceil(float(sid_total) / chr_total)))

     for n_iid in (10, 5, 3, 2, 1, 0):
         for n_sid in (0, 10, 5, 3, 2, 1):
             for n_chr in (30, 10, 5, 3, 2, 1, 0):
                 if n_sid > 0 and n_chr == 0:
                     # Skip just this combination (deliberately ``continue``,
                     # not ``break``).
                     continue
                 logging.debug("{0}, {1}, {2}".format(n_iid, n_sid, n_chr))
                 generated = snp_gen(
                     fst=.1,
                     dfr=.5,
                     iid_count=n_iid,
                     sid_count=n_sid,
                     maf_low=.05,
                     seed=6,
                     chr_count=n_chr,
                 )
                 # Fewer individuals than requested is tolerated; the SNP
                 # count must match exactly.
                 assert generated.iid_count <= n_iid
                 assert generated.sid_count == n_sid
                 # Column 0 of ``pos`` is bounded by the chromosome count;
                 # columns 1 and 2 by the per-chromosome SNP limit.
                 assert len(generated.pos) == 0 or (
                     max(generated.pos[:, 0]) <= n_chr
                 )
                 assert len(generated.pos) == 0 or (
                     max(generated.pos[:, 1]) <= per_chr_limit(n_sid, n_chr)
                 )
                 assert len(generated.pos) == 0 or (
                     max(generated.pos[:, 2]) <= per_chr_limit(n_sid, n_chr)
                 )
Пример #4
0
 def gen_and_compare(self, output_file, **kwargs):
     """Generate SNP data and compare it with a stored reference file.

     :param output_file: file name, relative to ``<currentFolder>/expected/``,
         of the reference Bed file to compare against
     :param kwargs: forwarded unchanged to ``snp_gen``
     :returns: the freshly generated SNP data
     :raises AssertionError: if ``TestSnpGen.is_same`` reports a mismatch
     """
     generated = snp_gen(**kwargs)
     # Disabled snippet kept from the original; it would write the generated
     # data under <currentFolder>/tempdir/ instead of only comparing it:
     # pstutil.create_directory_if_necessary(self.currentFolder + "/tempdir/" + output_file,isfile=True)
     # Bed.write(generated, self.currentFolder + "/tempdir/" + output_file)
     reference_path = self.currentFolder + "/expected/" + output_file
     reference = Bed(reference_path).read()
     matches = TestSnpGen.is_same(generated, reference)
     assert matches, "Failure on " + output_file
     return generated
def generate_discrete_ascertained(prevalence, iid_count, snp_args, phenotype_args, seed=0):
    """
    Generate discrete ascertained data. Internally, cases are generated at the requested
    prevalence. Before returning, however, the controls are randomly subsampled so
    that in the returned data, case and control have the same number of examples.

    :param prevalence: Prior probability of a case, e.g. .1
    :type prevalence: a float greater than 0.0 and at most 0.5

    :param iid_count: The number of examples desired in the returned data. Because of
        rounding during data generation the actual number may differ. If this happens,
        a warning is logged.
    :type iid_count: int

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`. Do not include
        'iid_count' or 'seed'; both are passed explicitly by this function.
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`. Do not include
        'snp_data' or 'seed'; both are passed explicitly by this function.
    :type phenotype_args: dictionary

    :param seed: a random seed to control random number generation. If None, the global
        NumPy random state is not re-seeded before the controls are sampled.
    :type seed: int

    :rtype: a :class:`pysnptools.snpreader.SnpData` of genotype data and a nparray of 0,1 phenotype values
        (controls first, then cases).

    :Example:

    >>> snp_args = {"fst":.1,"dfr":.5,"sid_count":200,"maf_low":.05}
    >>> phenotype_args = {"causals":10,"genetic_var":0.5, "noise_var":0.5}
    >>> snps,pheno = generate_discrete_ascertained(prevalence=.1,iid_count=100,seed=5,snp_args=snp_args,phenotype_args=phenotype_args)
    >>> print("{0} {1} {2}".format(int(snps.val.shape[0]), int(snps.val.shape[1]), int(len(pheno))))
    98 200 98

    """
    assert 0 < prevalence <= .5, "Expect prevalence to be between 0.0 (exclusive) and .5 (inclusive)"
    assert int(iid_count) == iid_count and iid_count >= 0, "Expect iid_count to be a non-negative integer"

    # Generate more examples than we ultimately want, so that the top
    # `prevalence` fraction supplies roughly iid_count / 2 cases.
    iid_count2 = int(float(iid_count) / 2.0 / prevalence)
    # Imported locally, as in the original code (presumably to avoid a circular import).
    from GWAS_benchmark import snp_gen
    snp2 = snp_gen(iid_count=iid_count2, seed=seed, **snp_args)
    pheno2 = generate_phenotype(snp_data=snp2, seed=seed, **phenotype_args)

    # Sort the individuals by ascending pheno2 value.
    snps2_sorted = snp2[pheno2.argsort(), :]

    # We want the top iid_count * prevalence individuals as cases
    # and a random sample, of the same size, from the rest as controls.
    case_count = int(snps2_sorted.iid_count * prevalence)
    # e.g. if case_count is 3, then -1,-2,-3. An integer array (rather than a
    # range) keeps the concatenated index below integer-typed even when empty.
    case_index = np.arange(-1, -(case_count + 1), -1)
    control_count = case_count

    if control_count + case_count != iid_count:
        logging.warning("iid_count is {0} instead of {1} because of rounding".format(control_count + case_count, iid_count))

    if seed is not None:
        # sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere.
        # NOTE(review): np.random.seed itself only accepts 32-bit unsigned values - confirm callers never pass larger seeds.
        max_int = getattr(sys, "maxint", sys.maxsize)
        np.random.seed(int(seed % max_int))

    # The "if..else" is a workaround because the Linux version of np.random.choice doesn't like to select
    # zero items from an empty list. We need to call random in either case so that the random seed ends up
    # in the expected state.
    control_index = np.random.choice(np.arange(snps2_sorted.iid_count - case_count if control_count > 0 else 1), control_count, replace=False)

    # Controls first, then cases; the phenotype vector below relies on this order.
    snp_final = snps2_sorted[np.concatenate((control_index, case_index)), :].read()
    pheno_final = np.zeros(control_count + case_count)
    pheno_final[control_count:] = 1

    return snp_final, pheno_final
def generate_discrete_ascertained(prevalence,
                                  iid_count,
                                  snp_args,
                                  phenotype_args,
                                  seed=0):
    """
    Generate discrete ascertained data. Internally, cases are generated at the requested
    prevalence. Before returning, however, the controls are randomly subsampled so
    that in the returned data, case and control have the same number of examples.

    :param prevalence: Prior probability of a case, e.g. .1
    :type prevalence: a float greater than 0.0 and at most 0.5

    :param iid_count: The number of examples desired in the returned data. Because of
        rounding during data generation the actual number may differ. If this happens,
        a warning is logged.
    :type iid_count: int

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`. Do not include
        'iid_count' or 'seed'; both are passed explicitly by this function.
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`. Do not include
        'snp_data' or 'seed'; both are passed explicitly by this function.
    :type phenotype_args: dictionary

    :param seed: a random seed to control random number generation. If None, the global
        NumPy random state is not re-seeded before the controls are sampled.
    :type seed: int

    :rtype: a :class:`pysnptools.snpreader.SnpData` of genotype data and a nparray of 0,1 phenotype values
        (controls first, then cases).

    :Example:

    >>> snp_args = {"fst":.1,"dfr":.5,"sid_count":200,"maf_low":.05}
    >>> phenotype_args = {"causals":10,"genetic_var":0.5, "noise_var":0.5}
    >>> snps,pheno = generate_discrete_ascertained(prevalence=.1,iid_count=100,seed=5,snp_args=snp_args,phenotype_args=phenotype_args)
    >>> print("{0} {1} {2}".format(int(snps.val.shape[0]), int(snps.val.shape[1]), int(len(pheno))))
    98 200 98

    """
    assert 0 < prevalence <= .5, "Expect prevalence to be between 0.0 (exclusive) and .5 (inclusive)"
    assert int(
        iid_count
    ) == iid_count and iid_count >= 0, "Expect iid_count to be a non-negative integer"

    # Generate more examples than we ultimately want, so that the top
    # `prevalence` fraction supplies roughly iid_count / 2 cases.
    iid_count2 = int(float(iid_count) / 2.0 / prevalence)
    # Imported locally, as in the original code (presumably to avoid a circular import).
    from GWAS_benchmark import snp_gen
    snp2 = snp_gen(iid_count=iid_count2, seed=seed, **snp_args)
    pheno2 = generate_phenotype(snp_data=snp2, seed=seed, **phenotype_args)

    # Sort the individuals by ascending pheno2 value.
    snps2_sorted = snp2[pheno2.argsort(), :]

    # We want the top iid_count * prevalence individuals as cases
    # and a random sample, of the same size, from the rest as controls.
    case_count = int(snps2_sorted.iid_count * prevalence)
    # e.g. if case_count is 3, then -1,-2,-3. An integer array (rather than a
    # range) keeps the concatenated index below integer-typed even when empty.
    case_index = np.arange(-1, -(case_count + 1), -1)
    control_count = case_count

    if control_count + case_count != iid_count:
        logging.warning(
            "iid_count is {0} instead of {1} because of rounding".format(
                control_count + case_count, iid_count))

    if seed is not None:
        # sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere.
        # NOTE(review): np.random.seed itself only accepts 32-bit unsigned values - confirm callers never pass larger seeds.
        max_int = getattr(sys, "maxint", sys.maxsize)
        np.random.seed(int(seed % max_int))

    # The "if..else" is a workaround because the Linux version of np.random.choice doesn't like to select
    # zero items from an empty list. We need to call random in either case so that the random seed ends up
    # in the expected state.
    control_index = np.random.choice(
        np.arange(snps2_sorted.iid_count -
                  case_count if control_count > 0 else 1),
        control_count,
        replace=False)

    # Controls first, then cases; the phenotype vector below relies on this order.
    snp_final = snps2_sorted[np.concatenate(
        (control_index, case_index)), :].read()
    pheno_final = np.zeros(control_count + case_count)
    pheno_final[control_count:] = 1

    return snp_final, pheno_final