예제 #1
0
def get_data_top_k_independent(input_fn, ks, birth, expr):
  """Delegate to get_idabs_top_k_independent with optional birth data.

  When `birth` is truthy it is stored in config.birth_type and
  utils.synthetic_dob(2504) supplies the birth data; otherwise an empty
  list is forwarded instead.
  """
  dobs = []
  if birth:
    # Set the birth type before generating data; synthetic_dob presumably
    # reads it -- TODO confirm.
    config.birth_type = birth
    dobs = utils.synthetic_dob(2504)
  return get_idabs_top_k_independent(input_fn, ks, birth, dobs, expr)
예제 #2
0
def get_data_top_k_independent(input_fn, ks, birth, expr):
    """Return get_idabs_top_k_independent(...) for the given inputs.

    A falsy `birth` forwards an empty birth-data list. A truthy `birth` is
    written to config.birth_type and 2504 synthetic entries are requested
    from utils.synthetic_dob.
    """
    if not birth:
        dobs = []
    else:
        config.birth_type = birth
        dobs = utils.synthetic_dob(2504)
    return get_idabs_top_k_independent(input_fn, ks, birth, dobs, expr)
예제 #3
0
def snp_sampling(input_fn):
  """ Vary the size of subsets of SNPs (10%, 20%, ..., 100%), 
      and show how identifiability is affected. """
  subset_sizes = [10 * (i+1) for i in range(10)]
  sub_fn = input_fn[:-4] + '.subset.vcf'
  uniqs_dob = []
  uniqs = []
  dobs = utils.synthetic_dob(2504)
  for size in subset_sizes:
    print 'Using random', size, '% of SNPs'
    utils.make_toy(data_path+input_fn, data_path+sub_fn, size)
    uniq_dob, snp_num = snp_info.identifiability(data_path+sub_fn, use_dob=True, dobs=dobs)
    uniq, snp_num = snp_info.identifiability(data_path+sub_fn, use_dob=False, dobs=dobs)
    uniqs_dob.append(uniq_dob)
    uniqs.append(uniq)
    print

  if config.birth_type == 'date':
    dob = 'DOB'
  elif config.birth_type == 'year':
    dob = 'YOB'
  plt.plot(subset_sizes, uniqs, 'b', label='w/o '+dob)
  plt.plot(subset_sizes, uniqs_dob, 'r', label='w/ '+dob)
  ax = plt.subplot()
  d = (subset_sizes[1] - subset_sizes[0]) * 0.1
  xlim = [subset_sizes[0] - d, subset_sizes[-1] + d]
  ylim = [-0.05, 1.05]
  ax.set_xlim(xlim)
  ax.set_ylim(ylim)
  plt.legend(loc='best', shadow=True)
  plt.title('Identifiability using random X % of '+ str(snp_num) +' SNPs')
  plt.savefig(plot_path+'snp_sampling_'+input_fn[:-4]+'_'+dob+'.pdf')
  plt.close()
예제 #4
0
def get_data_vary_af_threshold(input_fn, thresholds, birth, expr):
    """Delegate to get_idabs_vary_af_threshold with optional birth data.

    A truthy `birth` is stored in config.birth_type and 2504 synthetic
    entries are requested from utils.synthetic_dob; otherwise an empty list
    is forwarded.
    """
    # NOTE(review): the birth data is passed last here (after `expr`);
    # confirm this matches the signature of get_idabs_vary_af_threshold.
    dobs = []
    if birth:
        config.birth_type = birth
        dobs = utils.synthetic_dob(2504)
    return get_idabs_vary_af_threshold(input_fn, thresholds, birth, expr, dobs)
예제 #5
0
def get_data_top_k_entropy(input_fn, ks, birth, expr):
    """Delegate to get_idabs_top_k_entropy with optional birth data.

    A truthy `birth` is stored in config.birth_type and 2504 synthetic
    entries are requested from utils.synthetic_dob; otherwise an empty list
    is used. The number of birth entries is printed before delegating.
    """
    dobs = []
    if birth:
        config.birth_type = birth
        dobs = utils.synthetic_dob(2504)
    print(len(dobs))
    return get_idabs_top_k_entropy(input_fn, ks, birth, dobs, expr)
예제 #6
0
def get_data_vary_rsq(input_fn, rsqs, birth, expr):
    """Delegate to get_idabs_vary_rsq with optional birth data.

    A falsy `birth` forwards an empty list. A truthy `birth` is written to
    config.birth_type and utils.synthetic_dob(2504) supplies the data. The
    length of the birth data is printed before delegating.
    """
    if not birth:
        dobs = []
    else:
        config.birth_type = birth
        dobs = utils.synthetic_dob(2504)
    print(len(dobs))
    return get_idabs_vary_rsq(input_fn, rsqs, birth, dobs, expr)
예제 #7
0
def get_data_top_k_entropy(input_fn, ks, birth, expr):
  """Return get_idabs_top_k_entropy(...) for the given inputs.

  With a truthy `birth`, config.birth_type is updated and 2504 synthetic
  entries come from utils.synthetic_dob; with a falsy one the birth data is
  an empty list. Its length is printed either way.
  """
  if not birth:
    dobs = []
  else:
    config.birth_type = birth
    dobs = utils.synthetic_dob(2504)
  print(len(dobs))
  return get_idabs_top_k_entropy(input_fn, ks, birth, dobs, expr)
예제 #8
0
def get_data_vary_af_threshold(input_fn, thresholds, birth, expr):
  """Return get_idabs_vary_af_threshold(...) for the given thresholds.

  With a truthy `birth`, config.birth_type is updated and 2504 synthetic
  entries come from utils.synthetic_dob; with a falsy one the birth data is
  an empty list.
  """
  if not birth:
    dobs = []
  else:
    config.birth_type = birth
    dobs = utils.synthetic_dob(2504)
  # NOTE(review): the birth data goes after `expr` in this call; verify
  # against the signature of get_idabs_vary_af_threshold.
  return get_idabs_vary_af_threshold(input_fn, thresholds, birth, expr, dobs)
예제 #9
0
def get_data_vary_rsq(input_fn, rsqs, birth, expr):
  """Return get_idabs_vary_rsq(...) for the given inputs.

  A truthy `birth` is stored in config.birth_type and 2504 synthetic
  entries are requested from utils.synthetic_dob; otherwise the birth data
  is an empty list. Its length is printed before delegating.
  """
  dobs = []
  if birth:
    config.birth_type = birth
    dobs = utils.synthetic_dob(2504)
  print(len(dobs))
  return get_idabs_vary_rsq(input_fn, rsqs, birth, dobs, expr)
예제 #10
0
def expr_vary_C(W, C, crange, birth=''):
  if birth:
    config.birth_type = birth
    dobs = utils.synthetic_dob(2504)
  else:
    dobs = []
  idabs = []
  for c in crange:
    value, knap_snps = solve_knapsack(W, c)
    idab, _ = snp_info.identifiability(input_vcf, True, dobs=dobs, snps=knap_snps)
    print 'accuracy at', c, 'th round:', idab
    idabs.append(idab)
  return idabs
예제 #11
0
def expr_vary_W(W, C, birth=''):
  idabs = []
  if birth:
    config.birth_type = birth
    dobs = utils.synthetic_dob(2504)
  else:
    dobs = []
  for w in range(1, W+1):
    value, knap_snps = solve_knapsack(w, C)
    idab, _ = snp_info.identifiability(input_vcf, False, dobs=dobs, snps=knap_snps)
    print 'accuracy at', w, 'th round:', idab
    idabs.append(idab)
  return idabs
예제 #12
0
def make_hash(input_fn, output_fn, use_dob=True, dobs=None):
    """Generate hash codes for the individuals in an input file.

    Opens data_path + input_fn with vcf.Reader to count its samples (m) and
    records (n), builds genotypes via snp_info.genotypes, hashes them with
    hash_func and writes one hash per line to data_path + output_fn.

    Args:
        input_fn: Input file name, relative to data_path.
        output_fn: Output file name, relative to data_path.
        use_dob: Forwarded to snp_info.genotypes.
        dobs: Birth data forwarded to snp_info.genotypes. When None,
            utils.synthetic_dob(m) is used instead.
    """
    # The reader is only needed for the two counts, so release the file
    # handle as soon as they are known (it was previously never closed).
    with open(data_path + input_fn, 'rb') as vcf_file:
        vcf_reader = vcf.Reader(vcf_file)
        m = len(vcf_reader.samples)
        n = sum(1 for _ in vcf_reader)
    # Identity test: `== None` is evaluated element-wise by array-like dobs.
    if dobs is None:
        dobs = utils.synthetic_dob(m)
    genotypes = snp_info.genotypes(input_fn, m, n, use_dob, dobs)

    hashes = hash_func(genotypes)
    with open(data_path + output_fn, 'wb') as outfile:
        # `code` rather than `hash`, to avoid shadowing the builtin.
        for code in hashes:
            outfile.write(code)
            outfile.write('\n')
예제 #13
0
def make_hash(input_fn, output_fn, use_dob=True, dobs=None):
  """Generate hash codes for the individuals in an input file.

  Opens data_path + input_fn with vcf.Reader to count its samples (m) and
  records (n), builds genotypes via snp_info.genotypes, hashes them with
  hash_func and writes one hash per line to data_path + output_fn.

  Args:
    input_fn: Input file name, relative to data_path.
    output_fn: Output file name, relative to data_path.
    use_dob: Forwarded to snp_info.genotypes.
    dobs: Birth data forwarded to snp_info.genotypes. When None,
      utils.synthetic_dob(m) is used instead.
  """
  # The reader is only needed for the two counts, so release the file
  # handle as soon as they are known (it was previously never closed).
  with open(data_path+input_fn, 'rb') as vcf_file:
    vcf_reader = vcf.Reader(vcf_file)
    m = len(vcf_reader.samples)
    n = sum(1 for _ in vcf_reader)
  # Identity test: `== None` is evaluated element-wise by array-like dobs.
  if dobs is None:
    dobs = utils.synthetic_dob(m)
  genotypes = snp_info.genotypes(input_fn, m, n, use_dob, dobs)

  hashes = hash_func(genotypes)
  with open(data_path+output_fn, 'wb') as outfile:
    # `code` rather than `hash`, to avoid shadowing the builtin.
    for code in hashes:
      outfile.write(code)
      outfile.write('\n')
예제 #14
0
def snp_sampling(input_fn):
    """ Vary the size of subsets of SNPs (10%, 20%, ..., 100%), 
      and show how identifiability is affected. """
    subset_sizes = [10 * (i + 1) for i in range(10)]
    sub_fn = input_fn[:-4] + '.subset.vcf'
    uniqs_dob = []
    uniqs = []
    dobs = utils.synthetic_dob(2504)
    for size in subset_sizes:
        print 'Using random', size, '% of SNPs'
        utils.make_toy(data_path + input_fn, data_path + sub_fn, size)
        uniq_dob, snp_num = snp_info.identifiability(data_path + sub_fn,
                                                     use_dob=True,
                                                     dobs=dobs)
        uniq, snp_num = snp_info.identifiability(data_path + sub_fn,
                                                 use_dob=False,
                                                 dobs=dobs)
        uniqs_dob.append(uniq_dob)
        uniqs.append(uniq)
        print

    if config.birth_type == 'date':
        dob = 'DOB'
    elif config.birth_type == 'year':
        dob = 'YOB'
    plt.plot(subset_sizes, uniqs, 'b', label='w/o ' + dob)
    plt.plot(subset_sizes, uniqs_dob, 'r', label='w/ ' + dob)
    ax = plt.subplot()
    d = (subset_sizes[1] - subset_sizes[0]) * 0.1
    xlim = [subset_sizes[0] - d, subset_sizes[-1] + d]
    ylim = [-0.05, 1.05]
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    plt.legend(loc='best', shadow=True)
    plt.title('Identifiability using random X % of ' + str(snp_num) + ' SNPs')
    plt.savefig(plot_path + 'snp_sampling_' + input_fn[:-4] + '_' + dob +
                '.pdf')
    plt.close()
예제 #15
0
def test_synthetic_dob():
  """Check that utils.synthetic_dob returns as many entries as requested."""
  requested = 2504
  generated = utils.synthetic_dob(requested)
  assert len(generated) == requested, "Error in utils.synthetic_dob()"
예제 #16
0
def test_synthetic_dob():
    """Verify utils.synthetic_dob(2504) yields exactly 2504 entries."""
    expected_count = 2504
    assert len(utils.synthetic_dob(expected_count)) == expected_count, \
        "Error in utils.synthetic_dob()"