Example #1
def add_pheno(plink_in, multigenecity, out, h=0.85, p_cases=0.5):
  plink_file = plinkfile.open( plink_in )
  if not plink_file.one_locus_per_row( ):
    print( "This script requires that snps are rows and samples columns." )
    exit(1)

  sample_list = plink_file.get_samples()

  locus_list = plink_file.get_loci()
  n = len(sample_list)
  p = len(locus_list)
  edge_offset = 100
  causal_mut_index = np.linspace(edge_offset, p-edge_offset, multigenecity, dtype=int)
  gen_effect_size_unnormalized = {item: np.random.normal(loc=0,
    scale=float(h)/np.sqrt(multigenecity)) for item in causal_mut_index}
  print(causal_mut_index)
  causal_mutations = set()
  mutation_meta = {}
  prs = np.zeros(n)
  for i, variant in enumerate(locus_list):
    row = plink_file.next()
    if i in causal_mut_index:
      genotypes = np.fromiter(row, dtype=float)
      genotypes[genotypes==3] = np.mean(genotypes[genotypes!=3])
      prs += genotypes * gen_effect_size_unnormalized[i]

  plink_file.close()
  del causal_mut_index, gen_effect_size_unnormalized
  env_rs_unnormalized = np.random.normal(loc=0, scale=np.sqrt(1-h**2), size=n)

  gen_effect_size = h * (prs - np.mean(prs)) / np.std(prs)
  env_effect_size = np.sqrt(1-h**2) * (env_rs_unnormalized - np.mean(env_rs_unnormalized)
      ) / np.std(env_rs_unnormalized)

  burden = gen_effect_size + env_effect_size
  sorted_i = np.argsort(burden)[::-1]

  ncases = int(n * p_cases)
  cases_i = set(sorted_i[:ncases])

  # write new plink file
  for i, sample in enumerate(sample_list):
    sample_list[i].affection = int(i in cases_i)

  #plink_write = plinkfile.create(out, sample_list)
  plink_write = plinkfile.WritablePlinkFile( out, sample_list )
  #plinkio doesn't have seek? so we close it when we don't need it and reopen it here
  plink_file = plinkfile.open( plink_in )
  for i, variant in tqdm.tqdm(enumerate(locus_list)):
    row = plink_file.next()
    plink_write.write_row(variant, row)

  plink_write.close()
  plink_file.close()
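
A minimal usage sketch, assuming a PLINK prefix `mydata` (mydata.bed/.bim/.fam) exists; the output prefix `mydata_pheno` and the parameter values are illustrative only:

# Simulate a phenotype with 100 causal SNPs, heritability 0.85 and a 50/50 case/control
# split, then copy the genotypes to a new PLINK file carrying the simulated affection status.
add_pheno("mydata", multigenecity=100, out="mydata_pheno", h=0.85, p_cases=0.5)
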
def main(plink_fn, pheno_fn, output_plink_fn):
    plink_file = plinkfile.open(plink_fn)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)
    sample_list = plink_file.get_samples()
    for x in sample_list:
        print(x.iid)
    pheno_df = pd.read_csv(pheno_fn, sep="\t")
    iids = pheno_df['Strain'] + '__' + pheno_df['Animal_Id'].astype(str)
    # print iids
    strain_count = pheno_df.groupby(['Strain']).count()['Animal_Id'].to_dict()
    # print strain_count
    output_sample_list = create_output_sample_list(sample_list, iids)
    # for x in output_sample_list:
    #     print x.iid
    out_plink = plinkfile.create(output_plink_fn, output_sample_list)
    locus_list = plink_file.get_loci()
    count = 0
    for locus, row in zip(locus_list, plink_file):
        count += 1
        if count % 1000 == 0:
            print('At SNP {}'.format(count))
        strain_genotype_dict = {x[0].iid: x[1] for x in zip(sample_list, row)}
        sample_strains = [sample.iid.split('__')[0].replace('-', '') for sample in output_sample_list]
        output_row = [strain_genotype_dict[x] for x in sample_strains]
        out_plink.write_row(locus, output_row)
    return
Example #3
def test_read_write():
    with tempfile.TemporaryDirectory() as temp_dir:
        plink_prefix = os.path.join(temp_dir, "test")

        samples = [
            Sample("fid1", "iid1", "0", "0", 0, 0),
            Sample("fid2", "iid2", "0", "0", 0, 1)
        ]
        loci = [
            Locus(1, "chr1:1", 1.0, 1, "A", "C"),
            Locus(2, "chr1:2", 2.0, 2, "G", "T")
        ]
        rows = [[0, 1], [1, 2]]

        writer = plinkfile.create(plink_prefix, samples)

        for i, locus in enumerate(loci):
            writer.write_row(locus, rows[i])

        writer.close()

        reader = plinkfile.open(plink_prefix)
        assert samples == reader.get_samples()
        assert loci == reader.get_loci()

        for row, reader_row in zip(rows, reader):
            assert row == list(reader_row)
Example #4
def getdata(chr):

	total = 0
	sel = 1
	infile = "{}/2_Exome_36K_Ws_chr{}".format(data_dir, chr) 

	print(infile)
	plink_file = plinkfile.open( infile )
# 	sample_list = plink_file.get_samples( )
	locus_list = plink_file.get_loci( )

# 	pl.cplinkio.reset_row(pl.plink_file.handle)
	snps = dict()
	for locus in locus_list:

		total+=1
		if not(total%10000): print(total, sel)

		if (locus.chromosome == chr):
	
			iter = plink_file.next()
			maf_ = iter.allele_counts()
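			# maf_ holds per-genotype counts (0 = hom allele1, 1 = het, 2 = hom allele2);
			# the line below computes the allele2 frequency as (2*hom2 + het) / (2 * genotyped samples).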
			maf = (2*maf_[2]+maf_[1])/(2.*float(maf_[0]+maf_[1]+maf_[2]))
			if not(total%10000): print(total, sel, maf)
			if not(sel%1000): print(total, sel, maf)
			sel+=1
			snps[locus.name] = [int(locus.bp_position), maf, iter]
			
	return snps
def get_pred_exp_correlation_matrix(ref_panel, locus_genes):
    # Get predicted expression correlations across STARNET individuals
    assert top_gene in locus_genes
    model_weights = {}
    for gene in locus_genes:
        model_weights[gene] = get_gene_model(gene, ref_panel)
    model_weights = pd.DataFrame(model_weights, columns=model_weights)
    model_weights.fillna(0, inplace=True)
    rs_numbers_in_block = model_weights.index.values
    model_weights = model_weights.values
    STARNET_rs_number_file = 'STARNET_rs_numbers.txt'
    open(STARNET_rs_number_file, 'w').write('\n'.join(rs_numbers_in_block))
    weight_dir = f'fusion_twas/WEIGHTS/{ref_panel}'
    plink_file = f'{weight_dir}/{top_gene}_locus'
    individuals_with_expression_file = f'{weight_dir}/individuals_with_expression_thread_0.txt'
    subprocess.check_call(
        f'plink --bfile data/STARNET/genotypes/STARNET '
        f'--extract {STARNET_rs_number_file} '
        f'--keep-fam {individuals_with_expression_file} '
        f'--maf 1e-10 '  # re-filter on MAF because we've removed some indivs
        f'--make-bed --out {plink_file} > /dev/null',
        shell=True)
    plinkio_file = plinkfile.open(plink_file)
    variants = np.array(tuple(plinkio_file))
    plinkio_rs_numbers = np.array(
        [locus.name for locus in plinkio_file.get_loci()])
    sorted_order = np.argsort(plinkio_rs_numbers)[np.argsort(
        np.argsort(rs_numbers_in_block))]
    assert (plinkio_rs_numbers[sorted_order] == rs_numbers_in_block).all()
    variants = variants[sorted_order]
    predicted_expression = model_weights.T.dot(variants)
    pred_exp_correlation_matrix = np.corrcoef(predicted_expression)
    return pred_exp_correlation_matrix
    def test_merge(
            self, my_result_dir, my_interim_dir, my_processed_dir):
        # create a temporary directory using the context manager
        with tempfile.TemporaryDirectory() as tmpdirname:
            working_dir = pathlib.Path(tmpdirname)
            results_dir = DATA_DIR / "processed"

            # assign return value to mocked property
            my_result_dir.return_value = results_dir
            my_interim_dir.return_value = working_dir
            my_processed_dir.return_value = working_dir

            result = self.runner.invoke(
                merge_datasets,
                [
                    "--species",
                    "sheep",
                    "--assembly",
                    "OAR3"
                ]
            )

            self.assertEqual(0, result.exit_code, msg=result.exception)

            smarter_tag = f"SMARTER-OA-OAR3-top-{__version__}"
            plink_path = working_dir / "OAR3" / smarter_tag
            plink_file = plinkfile.open(str(plink_path))

            sample_list = plink_file.get_samples()
            locus_list = plink_file.get_loci()

            self.assertEqual(len(sample_list), 2)
            self.assertEqual(len(locus_list), 3)
Example #7
def compare(hdf_file, plink_file):
  pfile = plinkfile.open(plink_file)
  if not pfile.one_locus_per_row():
    logging.error("""This script requires the snps to be rows and samples to be columns.""")
    sys.exit(1)

  locus_list = pfile.get_loci( )
  pset = { (l.chromosome, l.bp_position) for l in locus_list }
#  pset = {item for item in pset if item[0] == 1}#TODO remove
  total_intersection = 0
  total_hset_length = 0
  total_pset_length = len(pset)
  with h5py.File(hdf_file, "r") as hfile: 
    for key in hfile.keys():
      if key == "meta":
        continue 
      ikey = int(key)
      hset = {(ikey, int(pos)) for pos in hfile[key].keys()}
#      testpset= {(i[0], i[1]) for i in pset if i[0] == ikey}
#      if len(testpset - hset) > 0 or len( hset - testpset) > 0:
#        pdb.set_trace()
      pset_len = len(pset)
#      hmp = sorted([i for _,i in hset - pset])
#      pmh = sorted([i for _,i in pset - hset])
      pset = pset - hset
      total_intersection += pset_len - len(pset)
      total_hset_length += len(hset)

  return(total_intersection, total_pset_length, total_hset_length)
    def test_import_from_text_plink(self, my_working_dir, my_result_dir):
        # create a temporary directory using the context manager
        with tempfile.TemporaryDirectory() as tmpdirname:
            working_dir = pathlib.Path(tmpdirname)
            results_dir = working_dir / "results"

            # assign return value to mocked property
            my_working_dir.return_value = working_dir
            my_result_dir.return_value = results_dir

            # copy test data files
            self.link_files(working_dir)

            result = self.runner.invoke(import_from_plink, [
                "--dataset", "test.zip", "--file", "plinktest", "--chip_name",
                self.chip_name, "--assembly", "OAR3"
            ])

            self.assertEqual(0, result.exit_code, msg=result.exception)
            self.assertEqual(SampleSheep.objects.count(), 2)

            # check imported chip_name attribute
            for sample in SampleSheep.objects:
                self.assertEqual(sample.chip_name, self.chip_name)

            plink_path = results_dir / "OAR3" / "plinktest_updated"
            plink_file = plinkfile.open(str(plink_path))

            sample_list = plink_file.get_samples()
            locus_list = plink_file.get_loci()

            self.assertEqual(len(sample_list), 2)
            self.assertEqual(len(locus_list), 3)
def main(plink_fn, pheno_fn, output_plink_fn):
    plink_file = plinkfile.open(plink_fn)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)
    sample_list = plink_file.get_samples()
    for x in sample_list:
        print(x.iid)
    pheno_df = pd.read_csv(pheno_fn, sep="\t")
    iids = pheno_df['Strain'] + '__' + pheno_df['Animal_Id'].astype(str)
    # print iids
    strain_count = pheno_df.groupby(['Strain']).count()['Animal_Id'].to_dict()
    # print strain_count
    output_sample_list = create_output_sample_list(sample_list, iids)
    # for x in output_sample_list:
    #     print x.iid
    out_plink = plinkfile.create(output_plink_fn, output_sample_list)
    locus_list = plink_file.get_loci()
    count = 0
    for locus, row in zip(locus_list, plink_file):
        count += 1
        if count % 1000 == 0:
            print('At SNP {}'.format(count))
        strain_genotype_dict = {x[0].iid: x[1] for x in zip(sample_list, row)}
        sample_strains = [
            sample.iid.split('__')[0].replace('-', '')
            for sample in output_sample_list
        ]
        output_row = [strain_genotype_dict[x] for x in sample_strains]
        out_plink.write_row(locus, output_row)
    return
Example #10
    def test_iter(self):
        pf = plinkfile.open( "./data/wgas" )

        num_rows = 0
        for row in pf:
            num_rows += 1

        self.assertEqual( num_rows, 228694 )
Example #11
    def test_iter(self):
        pf = plinkfile.open("./data/wgas")

        num_rows = 0
        for row in pf:
            num_rows += 1

        self.assertEqual(num_rows, 228694)
Example #12
def load_plinkfile(basepath):
    plink_file = plinkfile.open(basepath)
    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    my_array = np.zeros((len(plink_file.get_loci( )), len(plink_file.get_samples( ))))
    for i, el in enumerate(plink_file): 
        my_array[i] = el
    return(sample_list, locus_list, my_array.astype(int))
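
A brief hedged usage sketch (the prefix `mydata`, pointing at mydata.bed/.bim/.fam, is hypothetical):

samples, loci, genotypes = load_plinkfile("mydata")
# genotypes has one row per locus and one column per sample
print(genotypes.shape, len(samples), len(loci))
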
def check_plinkfile(plink_fn):
    out_plink = plinkfile.open(plink_fn)
    samples = out_plink.get_samples()
    locuses = out_plink.get_loci()
    count = 0
    for locus, row in zip(locuses, out_plink):
        for sample, genotype in zip(samples, row):
            print("Individual {0} has genotype {1} for snp {2}.".format(sample.iid, genotype, locus.name))
        count += 1
        if count >= 2:
            break
    print(len(locuses))
def check_plinkfile(plink_fn):
    out_plink = plinkfile.open(plink_fn)
    samples = out_plink.get_samples()
    locuses = out_plink.get_loci()
    count = 0
    for locus, row in zip(locuses, out_plink):
        for sample, genotype in zip(samples, row):
            print("Individual {0} has genotype {1} for snp {2}.".format(
                sample.iid, genotype, locus.name))
        count += 1
        if count >= 2:
            break
    print(len(locuses))
Example #15
def run_gwas(imputed, toPCA, out, npcs=5):
    # Compute PCA
    plinkpca = plinkfile.open(toPCA)
    if not plinkpca.one_locus_per_row():
        print("The plink file is f****d")
        exit(1)

    sample_list = plinkpca.get_samples()
    locus_list = plinkpca.get_loci()

    demo = pd.read_table('data/popres_European.ind', delimiter='\t')
    famIDs = set(int(i.fid) for i in sample_list)
    demography = [
        row.country for _, row in demo.iterrows() if row.famID in famIDs
    ]
    demo = pd.read_table('data/popres_European.ind', delimiter='\t')
    ids = [int(row.famID) for _, row in demo.iterrows() if row.famID in famIDs]

    del demo, famIDs

    n = len(sample_list)
    p = len(locus_list)
    gen_mat = np.empty((n, p), dtype=np.float32)

    loc = 0
    for i, row in enumerate(plinkpca):
        arr = np.fromiter(row, dtype=np.float32)
        arr[arr == 3] = np.nan
        sd = np.nanstd(arr)
        mu = np.nanmean(arr)
        arr -= mu
        arr[np.isnan(arr)] = 0
        arr /= sd
        gen_mat[:, loc] = arr  #np.fromiter(row, dtype=np.float32)
        loc += 1

    pca = decomp.PCA()
    U, S, V = pca._fit_truncated(gen_mat,
                                 n_components=npcs,
                                 svd_solver='arpack')

    np.savetxt(out + '.V.txt', V)
    np.savetxt(out + '.U.txt', U)
    np.savetxt(out + '.sigma.txt', S)
    np.savetxt(out + '.ids.txt', ids, fmt='%i')
    with open(out + '.countries', 'w') as f:
        f.write("\n".join(demography))

    #del S, V, gen_mat
    U_id_dict = dict((key, value) for (key, value) in zip(ids, U[:, :npcs]))
    run_regressions(imputed, U_id_dict, out + 'betas.txt', npcs)
Example #16
    def load_genotypes(self):
        """
        Load the plink BED format genotype data file.
        Assumes samples in columns and SNP loci in rows.

        Needs plinkio.
        https://github.com/fadern/libplinkio
        """
        from plinkio import plinkfile

        bed_file = plinkfile.open(self.file_name)
        for counter, row in enumerate(bed_file):
            self.genotype[counter,:] = list(row)
            if counter % 100000 == 99999:
                print(counter+1)
        bed_file.close()
Example #17
    def load_genotypes(self):
        """
        Load the plink BED format genotype data file.
        Assumes samples in columns and SNP loci in rows.

        Needs plinkio.
        https://github.com/fadern/libplinkio
        """
        from plinkio import plinkfile

        bed_file = plinkfile.open(self.file_name)
        for counter, row in enumerate(bed_file):
            self.genotype[counter, :] = list(row)
            if counter % 100000 == 99999:
                print(counter + 1)
        bed_file.close()
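
The method above expects the enclosing object to provide `self.file_name` and a pre-allocated `self.genotype` array with one row per locus and one column per sample. A minimal hedged wrapper illustrating that contract (the class name and pre-allocation strategy are assumptions, not part of the original source):

from plinkio import plinkfile
import numpy as np

class GenotypeData:
    def __init__(self, file_name):
        self.file_name = file_name
        pf = plinkfile.open(file_name)
        # one row per SNP locus, one column per sample, as load_genotypes assumes
        self.genotype = np.zeros((len(pf.get_loci()), len(pf.get_samples())), dtype=np.int8)
        pf.close()
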
Example #18
def add_pheno(plink_in, multigenecity, out, h=0.85, p_cases=0.5):
    plink_file = plinkfile.open(plink_in)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)
    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    n = len(sample_list)
    p = len(locus_list)
    fids = np.array([item.fid for item in sample_list])
    iids = np.array([item.iid for item in sample_list])
    edge_offset = 100
    causal_mut_index = np.linspace(
        edge_offset, p-edge_offset, multigenecity, dtype=int)
    gen_effect_size_unnormalized = {
            item: np.random.normal(loc=0, scale=float(h)/np.sqrt(multigenecity))
            for item in causal_mut_index}
    prs = np.zeros(n)
    for i, variant in enumerate(locus_list):
        row = plink_file.next()
        if i in causal_mut_index:
            genotypes = np.fromiter(row, dtype=float)
            genotypes[genotypes == 3] = np.mean(genotypes[genotypes != 3])
            prs += genotypes * gen_effect_size_unnormalized[i]
    plink_file.close()
    del causal_mut_index, gen_effect_size_unnormalized
    # Draw random environmental effects
    env_rs_unnormalized = np.random.normal(
        loc=0, scale=np.sqrt(1-h**2), size=n)
    gen_effect_size = h * (prs - np.mean(prs)) / np.std(prs)
    env_effect_size = (np.sqrt(1-h**2)
                       * (env_rs_unnormalized - np.mean(env_rs_unnormalized)) / np.std(env_rs_unnormalized))
    burden = gen_effect_size + env_effect_size
    sorted_i = np.argsort(burden)[::-1]
    ncases = int(n * p_cases)
    cases_i = sorted_i[:ncases]
    affection = np.zeros(n, dtype=np.int8)
    affection[cases_i] = 2
    affection[affection == 0] = 1
    towrite = np.column_stack((fids, iids, affection))
    np.savetxt(out, towrite, delimiter='\t', fmt=['%s', '%s', '%s'], header='FID\tID\tpheno',)
def read_geno_mat_plinkio(bed_location):
    """
    Reads in a genotype matrix using plinkio.
    Takes a lot of memory, and a lot of time, if there are many individuals.

    :param bed_location: path of the bed file to read. Make sure the file has been trimmed down enough before you continue.
    :return: genotype matrix, and the plinkio documentation

    """
    plink_file = plinkfile.open(bed_location)

    genotype_mat = np.zeros((len(plink_file.loci),len(plink_file.samples)), dtype=float)
    snp_names = [x.name for x in plink_file.loci]
    i_ids = [x.fid + " " +  x.iid for x in plink_file.samples]

    for i in range(len(plink_file.loci)):
        check_object = plink_file.__next__()


        genotype_mat[i, :] = check_object

    return genotype_mat, plink_file
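
A hedged usage sketch; the prefix `mydata` is hypothetical and plinkio resolves it to mydata.bed/.bim/.fam:

genotype_mat, plink_handle = read_geno_mat_plinkio("mydata")
print(genotype_mat.shape)   # (number of variants, number of individuals)
plink_handle.close()
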
Example #20
def getdata2(chr, df_catalog):

	snps_cmn = dict()
	snp_list_cmn = ['exm-'+str(x) for x in df_catalog.SNP.tolist()]

	infile ="{}/2_Exome_36K_Ws_chr{}".format(data_dir, chr) 
	plink_file = plinkfile.open( infile )
	locus_list = plink_file.get_loci( )
	total = 0
	
	for locus in locus_list:
		total+=1

		iter = plink_file.next()

		if (locus.name in snp_list_cmn) and (locus.chromosome == chr):
			maf_ = iter.allele_counts()
			maf = (2*maf_[2]+maf_[1])/(2.*float(maf_[0]+maf_[1]+maf_[2]))
			if not(total%10000): print(total, maf)

			snps_cmn[locus.name] = [int(locus.bp_position), maf, iter]

	return snps_cmn
Example #21
def snps_match(plinkName, store_name, position_dset=None):
    # WARNING: this only works if positions are unique.
    with h5py.File(store_name, 'r', libver='latest') as store:
        # check the plink file
        plink_file = plinkfile.open(plinkName)
        locus_list = plink_file.get_loci()
        plink_file.close()
        plinkSet = set((l.chromosome, l.bp_position) for l in locus_list)
        del locus_list
        len_plink = len(plinkSet)
        if position_dset is None:
            position_dset = 'positions'
        for key in store:
            if key == 'meta':
                continue
            positions = store["{}/{}".format(key, position_dset)].value
            ikey = int(key)
            hset = set((ikey, int(pos)) for pos in positions)
            len_plink -= len(hset)
            plinkSet -= hset
    if len(plinkSet) == 0 and len_plink == 0:
        return True
    return False
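
A short hedged usage sketch; the PLINK prefix `mydata` and HDF5 store `mydata.h5py` are hypothetical and must already share the layout expected above (one group per chromosome containing a `positions` dataset):

if snps_match("mydata", "mydata.h5py"):
    print("PLINK file and HDF5 store describe the same (chromosome, position) set")
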
Example #22
def main(args):
    usage = """python %s <plink root> <h5 file>
    Convert binary PLINK files into h5 file.
    E.g.: py plink2h5.py mydata_final_clean mydata_final_clean.h5\n""" % args[0]
    if len(args) != 3:
        sys.stderr.write(usage)
        sys.exit(0)

    plinkRoot = args[1]
    h5fname = args[2]

    plinkTitle = plinkRoot.split("/")[-1]

    # Read binary PLINK files
    plinkF = pf.open(plinkRoot)
    numSnps = len(plinkF.get_loci())
    numSamples = len(plinkF.get_samples())
    print "%d SNPs x %d samples" % (numSnps, numSamples)

    # Create the empty array to store genotypes
    atom = tables.Int8Atom()
    h5F = tables.open_file(h5fname, 'w', title=plinkTitle)
    genotype = h5F.create_carray(h5F.root,
                                 'genotype',
                                 atom, (numSnps, numSamples),
                                 title='Genotype',
                                 filters=tables.Filters(complevel=5,
                                                        complib='blosc'))

    # populate
    for counter, row in enumerate(plinkF):
        genotype[counter, :] = list(row)
        if counter % 10000 == 9999:
            print(counter + 1, 'SNPs read')
    plinkF.close()
    h5F.close()
Example #23
 def __init__(self, path, chrm=None, fids=None, iids=None):
     self.chrm = chrm
     self.handle = plinkfile.open(path)
     if not self.handle.one_locus_per_row():
         raise Exception("This script requires that SNPs are rows and samples columns.")
     
     samples = self.handle.get_samples()
     self._subset_idxs = None
     
     if fids is None and iids is None:
         self._samples = dict((s.iid, s) for s in samples)
     
     else:
         if fids is not None:
             fids = set(fids)
         if iids is not None:
             iids = set(iids)
         def keep(s):
             return (fids is None or s.fid in fids) and (iids is None or s.iid in iids)
         self._subset_idxs = set(i for i, sample in enumerate(samples) if keep(sample))
         self._samples = index_map(samples[i].iid for i in self._subset_idxs)
     
     self._loci = self.handle.get_loci()
     self._iter = zip(self._loci, self.handle)
Example #24
def run_regressions(plink_file, U_dict, out_file, npcs, buf=100 ):
  plink_file = plinkfile.open(plink_file)
  if not plink_file.one_locus_per_row():
    print("The plink file is f****d")
    exit(1)

  locus_list = plink_file.get_loci()
  sample_list = plink_file.get_samples()
  n = len(sample_list)
  p = len(locus_list)
  y = np.array([sample.affection for sample in sample_list])


  X = np.empty((n, npcs + 1))
  betas = np.empty((buf, 2 * X.shape[1] + 2), dtype = np.float32)
  #X_design = np.ones((n,2))
  V = np.matrix(np.zeros(shape = (X.shape[0], X.shape[0])))
  X[:,1:] = [U_dict[int(sample.iid)] for sample in sample_list]
  X[:,1:] /= np.std(X[:,1:], axis = 0)
  covp = X.shape[1]

  # High C corresponds to less regularization.
  model = LogisticRegression(fit_intercept=False,  tol=1e-5, C=1e4)
  k = 0
  # fit nuisance
  model.fit(X[:,1:], y)
  y_model = model.predict_proba(X[:,1:])
  l_null = log_loss(y, y_model, normalize=False)

  with open(out_file, 'w') as out_f:
    i = 0
    logging.info("Iterating over SNPs")
    for j, row in tqdm.tqdm(enumerate(plink_file), total = p):
      locus = locus_list[j]
      arr = np.fromiter(row, dtype = np.float32)
      mu = np.mean(arr[arr!=3])
      std = np.std(arr[arr!=3])
      arr[arr==3] = mu
      arr -= mu
      if std > 0:
        arr /= std
        X[:,0] = arr
        model.fit(X, y)
        # Wald Test
        y_model = model.predict_proba(X)
        X_design= X
        np.fill_diagonal(V, np.multiply(y_model[:,0], y_model[:,1]))
        covLogit = np.linalg.inv(X_design.T * V * X_design)
        coefs = np.array(model.coef_)#np.insert(model.coef_, 0, model.intercept_)
        z = (coefs / np.sqrt(np.diag(covLogit))) ** 2
        # Chi-squared test
        l_fit = log_loss(y, y_model, normalize=False)
        D = l_fit - l_null
        p = chi2.sf(z, 1)
        betas[i, :covp] = coefs
        betas[i, covp:2*covp] = p
        betas[i, 2*covp] = D
      else:
        betas[i,:] = np.nan
      betas[i, 2*covp+1] = locus.chromosome
      i += 1
      if i == buf:
        i = 0
        np.savetxt(out_f, betas, delimiter='\t')
    np.savetxt(out_f, betas[:i,:], delimiter='\t')  # write the remaining
    logging.info("Finished iterating")
Example #25
 def test_get_path(self):
     path = "./data/wgas"
     pf = plinkfile.open(path)
     self.assertEqual(path, pf.get_path())
Example #26
def plinkToH5(client_config, env):
    """Gets plink prefix, produces an HDF file with the same prefix"""
    pfile = client_config['plinkfile']
    store_name = shared.get_plink_store(pfile)
    logger.info(f'Opening plinkfile: {pfile}')
    try:
        plink_file = plinkfile.open(pfile)
    except MemoryError as e:
        logger.error('MemoryError!')
        logger.error(e)
    if not plink_file.one_locus_per_row():
        logger.error("""This script requires that snps are
            rows and samples columns.""")
        sys.exit(1)
    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    n_tot = len(sample_list)
    logger.info(f'Opening h5py file:{store_name}')
    with h5py.File(store_name, 'w', libver='latest') as store:
        store.attrs['n'] = len(sample_list)
        store.attrs['has_local_AF'] = False
        store.attrs['has_global_AF'] = False
        store.attrs['has_centering'] = False
        store.attrs['has_normalization'] = False
        potential_pheno_file = pfile + ".pheno"
        if os.path.isfile(pfile + ".pheno"):
            affection = np.loadtxt(potential_pheno_file, dtype=int, usecols=2)
        else:
            affection = [sample.affection for sample in sample_list]
        if len(np.unique(affection)) > 2:
            raise ValueError(
                "phenotype is not binary. We only support binary for now")
        write_or_replace(store, 'meta/Status', affection, np.int8)
        ids = [sample.iid for sample in sample_list]
        write_or_replace(store, 'meta/id', ids, 'S11')
        del ids, affection
        # Read Demographic file
        logger.info(f'Reading demographic file at {pfile}.ind')
        logger.info(f'File exists: {os.path.isfile(pfile + ".ind")}')
        with open(pfile + ".ind", 'r') as dem_f:
            dem = [(row.split("\t")[2]).encode("UTF8") for row in dem_f]
            write_or_replace(store, 'meta/regions', dem)
        # Read chromosome data
        current_chr = 1
        positions = []
        rsids = []
        all_counts = []
        current_group = store.require_group(str(current_chr))
        genotypes = np.zeros(n_tot, dtype=np.float32)
        for locus, row in zip(locus_list, plink_file):
            if locus.chromosome != current_chr:
                if len(positions) == 0:
                    del store[str(current_chr)]
                else:
                    write_or_replace(current_group,
                                     'positions',
                                     positions,
                                     dtype=np.uint)
                    write_or_replace(current_group, 'rsids', rsids)
                    write_or_replace(current_group, 'counts', all_counts,
                                     np.uint32)

                    send_positions_to_server(positions, current_chr,
                                             client_config, env)
                    positions = []
                    # rsid = []
                    all_counts = []
                current_chr = locus.chromosome
                if current_chr == 23:
                    break
                current_group = store.require_group(str(current_chr))
            pos = str(locus.bp_position)
            counts, geno = process_plink_row(row, genotypes)
            # This should be a try except
            try:
                current_group.create_dataset(pos, data=geno)
            except Exception:
                logger.error(
                    f"Cannot write position: chr{locus.chromosome} {pos}")
            rsids.append(locus.name.encode('utf8'))
            positions.append(pos)
            all_counts.append(counts)
        if locus.chromosome != 23:
            write_or_replace(current_group, 'positions', positions, np.uint32)
            write_or_replace(current_group, 'rsids', rsids)
            write_or_replace(current_group, 'counts', all_counts, np.uint32)
            send_positions_to_server(positions, current_chr, client_config,
                                     env)
    plink_file.close()
    logger.info('Finished writing plink to hdf5.')
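
Based on the layout written above (per-chromosome groups holding per-position datasets plus `positions`, `rsids`, `counts`, and a `meta` group), a minimal hedged sketch of reading the store back; the store name `mydata.h5py` is an assumption, since the real name comes from shared.get_plink_store:

import h5py

with h5py.File("mydata.h5py", "r") as store:            # hypothetical store name
    status = store["meta/Status"][...]                   # binary phenotype per sample
    for chrom in (k for k in store if k != "meta"):
        positions = store[chrom]["positions"][...]
        print(chrom, len(positions), "variants")
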
Example #27
 def setUp(self):
     pf = plinkfile.open("./data/wgas")
     self.row = next(pf)
     pf.close()
Example #28
 def test_get_samples(self):
     pf = plinkfile.open("./data/wgas")
     self.assertEqual(len(pf.get_samples()), 90)
Example #29
    else:
        if str(sampleId) in reps:
            return str(sampleId)
        else:
            #if not sampleId.startswith("HGDP"):
            #    pdb.set_trace()
            return ""


droppingRelatives = False

basename = 'uae_hgdp1LD'

wdir = "/research/gutsybugs/KUMI/Data/"
hetfile = "%s.het" % basename  ## to be calculated
plink_file = plinkfile.open(basename)

dmfile = "%s.mdist" % basename  #"mergedQCLD.mdist" # "merged_1ibs.mdist" #"merged.dist"
dmfileIDs = dmfile + ".id"
## Loading distance matrix, produced by plink, putting ids to dataframe
ids = pd.read_csv(dmfileIDs, delimiter='\t', header=None)[1]
dm = pd.read_csv(dmfile, header=None, delimiter='\t')
het = pd.read_csv(hetfile, delimiter='\s+')
sampleInfo = pd.read_csv("hgdp/HGDPid_populations.csv",
                         sep=',',
                         index_col='Id')
fam = pd.read_csv('%s.fam' % basename, sep=' ', header=None)

#reps = map(str, set(pd.read_csv('100medoids.txt', header=None)[0]))
reps = '10187 12742 13120 13076 10651 10347 10215 10725 10926 12599'.split()
Example #30
from plinkio import plinkfile, cplinkio

import collections
import numpy as np
import pandas
from prettytable import PrettyTable
import math
import scipy.stats as stats
from operator import itemgetter

import utils
data_dir = utils.data_dir

# Initialize plink library and read phenotype file
infile = "{}/2_Exome_36K_Ws".format(data_dir)
plink_file = plinkfile.open( infile )

if not plink_file.one_locus_per_row( ):
	 print( "This script requires that snps are rows and samples columns." )
	 exit( 1 )

sample_list = plink_file.get_samples( )
locus_list = plink_file.get_loci( )

# Phenotype  file
phenotypefile = "../data/pheno_Exome_36k_MCC_Ws_MIN2.txt"
# phenotypefile = "../data/pheno_autism_exclutions2.txt"
df_full_pheno = pandas.read_csv(phenotypefile, sep='\t')


Example #31
 def test_get_samples(self):
     pf = plinkfile.open( "./data/wgas" )
     self.assertEqual( len( pf.get_samples( ) ), 90 );
Example #32
 def test_get_loci(self):
     pf = plinkfile.open( "./data/wgas" )
     self.assertEqual( len( pf.get_loci( ) ), 228694 );
Example #33
 def test_open(self):
     pf = plinkfile.open( "./data/wgas" )
     self.assertNotEqual( pf, None );
Example #34
 def test_get_path(self):
     path = "./data/wgas"
     pf = plinkfile.open( path )
     self.assertEqual( path, pf.get_path( ) )
Example #35
 def test_fail_open(self):
     with self.assertRaises(IOError):
         plinkfile.open("/")
Example #36
from plinkio import plinkfile
import pandas as pd
import numpy as np
import MySQLdb
from MySQLdb.cursors import DictCursor
from scipy.spatial.distance import squareform
import scipy.cluster.hierarchy as sch
import pylab

#filename = '/research/gutsybugs/Software/Plink/Tutorial/hapmap1'
wdir = "/research/gutsybugs/KUMI/Data/"
filename = '%s/mergedQC' % wdir
dmfile = "%s.dist" % filename
hetfile = "%s.het" % filename
plink_file = plinkfile.open(filename)
#plink_file = plinkfile.open('/research/gutsybugs/KUMI/Data/mergedQC') ## TAKES LOOOOOONG!

sample_list = plink_file.get_samples()
locus_list = plink_file.get_loci()

makeSampleID = lambda sample: "%s_%s" % (sample.fid, sample.iid)
sampleIDs = [makeSampleID(sample) for sample in sample_list]

ntdict = {'N': np.nan, 'A': 1, 'C': 2, 'G': 3, 'T': 4}
if filename.endswith('hapmap1') or filename.endswith('mergedQC'):
    snpFct = lambda x: x
else:
    snpFct = lambda x: ntdict[x]

makerow = lambda row, locus: [snpFct(snp) for snp in row] + [
    locus.chromosome, locus.name, locus.position, locus.bp_position
]
Example #37
def write_records(prefix, phenotype_file,
                  nfolds=5,
                  phenotype_idcol=0,
                  phenotype_col=1,
                  phenotype_categorical=True,
                  save_tfrecords=True,
                  save_npy=False,
                  num_class = None):

    assert save_tfrecords or save_npy, 'Either TFRecords or NPY must be specified'

    create_diet_dir(prefix)

    # Read plink files
    Xt_plink = plinkfile.open(prefix)
    num_snps = len(Xt_plink.get_loci())
    num_ind = len(Xt_plink.get_samples())

    # Read sample ids from the .fam file
    fam_ids = np.array([s.iid for s in Xt_plink.get_samples()])
    pheno = pd.read_csv(phenotype_file, sep=None, engine='python')
    assert len(fam_ids) == pheno.shape[0], "Number of records in .fam file "\
                                           "and phenotype file do not match."

    assert np.all(fam_ids ==
            np.array(pheno.iloc[:,phenotype_idcol].to_numpy())),\
           "IDs of .fam file and phenotype file do not match"

    pheno_list = pheno.iloc[:, phenotype_col]

    if phenotype_categorical:
        pheno_list_cat = pheno_list.astype('category').cat
        pheno_list_values = pheno_list_cat.categories.values
        pheno_map = pd.DataFrame({'Phenotype': pheno_list_values,
                                  'Codes': range(len(pheno_list_values))},
                                  columns=('Phenotype', 'Codes'))

        pheno_map.to_csv(_templ['phenomap'].format(pref=prefix),
                         sep='\t', index=False)

        labels = pheno_list_cat.codes.astype(np.uint8)
        num_class = num_class or len(set(labels))
    else:
        # TODO: Test that
        labels = pheno_list.to_numpy()

    # Prepare indices for k-fold cv and train/valid/test split
    cv_indices = []
    for cv_trainval, cv_test in KFold(nfolds, shuffle=True, random_state=42).split(range(num_ind)):
        cv_train, cv_val = train_test_split(cv_trainval, test_size=1/(nfolds-1))
        cv_indices.append((cv_train, cv_val, cv_test))

    # Save metadata as json
    with open(_templ['metadata'].format(pref=prefix), 'w') as f:
        json.dump({'num_snp': num_snps,
                   'num_ind': num_ind,
                   'phenotype_categorical': phenotype_categorical,
                   'nfolds': nfolds,
                   'num_ind_per_fold': [(len(x),len(y),len(z)) for x,y,z in cv_indices],
                   'num_class': num_class
        }, f)

    # Transpose bed file to get X matrix
    trans_filename = _templ['plinktrans'].format(pref=prefix)
    # Produces transposed BED file
    print('Transposing plink file...')
    assert Xt_plink.transpose(trans_filename), 'Transpose failed'

    # Open transposed file and iterate over records
    X_plink = plinkfile.open(trans_filename)
    assert not X_plink.one_locus_per_row(), 'PLINK file should be transposed'
    assert len(labels) == num_ind, 'Number of labels is not equal to num individuals'

    if save_tfrecords:
        wr = lambda i, t: tf.python_io.TFRecordWriter(_templ['fold'].format(pref=prefix,
                                                                            k=i,
                                                                            set=t))
        tf_writers = [{
            'train': wr(i+1, 'train'),
            'valid': wr(i+1, 'valid'),
            'test':  wr(i+1, 'test')} for i in range(nfolds)]

        tf_writer_all = tf.python_io.TFRecordWriter(_templ['data'].format(pref=prefix))

    if save_npy:
        X = np.zeros((num_ind, num_snps), np.int8)

    # Write k-fold train/valid/test splits
    for i, (row, label) in enumerate(zip(X_plink, labels)): #iterates over individuals

        if save_tfrecords:
            # Save TFRecords
            example = tf.train.Example(features=tf.train.Features(feature={
                'genotype': tf.train.Feature(int64_list=_int_feature(list(row))),
                'label':    tf.train.Feature(int64_list=_int_feature([int(label)]))}))

            for fold, (train_idx, valid_idx, test_idx) in zip(range(nfolds), cv_indices):
                serialized_example = example.SerializeToString()
                if i in train_idx:
                    tf_writers[fold]['train'].write(serialized_example)
                elif i in valid_idx:
                    tf_writers[fold]['valid'].write(serialized_example)
                elif i in test_idx:
                    tf_writers[fold]['test'].write(serialized_example)
                else:
                    raise ValueError('Not a valid index')
            tf_writer_all.write(serialized_example)

        if save_npy:
            X[i, :] = list(row)

        if i % 100 == 0:
            print('Writing genotypes... {:.2f}% completed'.format((i/num_ind)*100), end='\r')
            sys.stdout.flush()

    # Save fold as npy if requested
    if save_npy:
        for i, (train_idx, valid_idx, test_idx) in zip(range(nfolds), cv_indices):
            fold_filename = _templ['npy_fold'].format(pref=prefix, k=i+1, set='train')
            np.save(fold_filename, X[train_idx,])
            fold_filename = _templ['npy_fold'].format(pref=prefix, k=i+1, set='valid')
            np.save(fold_filename, X[valid_idx,])
            fold_filename = _templ['npy_fold'].format(pref=prefix, k=i+1, set='test')
            np.save(fold_filename, X[test_idx,])

        np.save(_templ['npy'].format(pref=prefix), X)

    print('\nDone')

    if save_tfrecords:
        for fold in range(nfolds):
            tf_writers[fold]['train'].close()
            tf_writers[fold]['valid'].close()
            tf_writers[fold]['test'].close()
        tf_writer_all.close()

    Xt = np.zeros([num_snps, num_ind], np.int8)
    for i, row in enumerate(Xt_plink): #iterates over snps
        Xt[i,:] = row
        if i % 1000 == 0:
            print('Writing X transpose matrix... {:.2f}% completed'.format((i/num_snps)*100), end='\r')
            sys.stdout.flush()
    print('\nDone')

    # Save X^T as numpy arrays
    np.save(_templ['x_t'].format(pref=prefix), Xt)
Example #38
 def test_open(self):
     pf = plinkfile.open("./data/wgas")
     self.assertNotEqual(pf, None)
Example #39
 def test_fail_open(self):
     with self.assertRaises( IOError ):
         plinkfile.open( "/" )
Example #40
# load libraries for plotting
if not args.no_plot:
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap
    from scipy.ndimage import label

# importing data
sampleLoc = np.loadtxt(args.coords, delimiter=args.coord_sep)
if np.shape(sampleLoc)[1] != 2:
    raise IndexError('{} has {} separate columns, but ought to be 2 '
                     '(check COORDS_SEP?)'.format(args.coords,
                                                  np.shape(sampleLoc)[1]))

if args.plink:
    sampleData = plinkfile.open(args.plink)
    print('Using data file (PLINK):  {}'.format(args.plink), file=sys.stderr)
    printEveryNthLine = 10000
else:
    sampleData = np.transpose(
        np.loadtxt(args.non_genetic, delimiter=args.non_gen_sep))
    if np.shape(sampleData)[1] != np.shape(sampleLoc)[0]:
        raise IndexError(
            'Make sure that {} ({} rows) has one row per position '
            'in {} ({} positions)!'.format(args.non_genetic,
                                           np.shape(sampleData)[1],
                                           args.coords,
                                           np.shape(sampleLoc)[0]))
    print('Using data file:  {}'.format(args.non_genetic), file=sys.stderr)
    printEveryNthLine = int(np.shape(sampleData)[0] / 10)
Example #41
 def test_get_loci(self):
     pf = plinkfile.open("./data/wgas")
     self.assertEqual(len(pf.get_loci()), 228694)
Example #42
def load_data(path, prefix, phenotype_file):
    """
    loads the prefixed files: prefix.bed, prefix.fam, ...
    and saves, but it may make more sense to save in the load_1000 function above.
    Args:
        prefix: path with last elem as prefix of .bed, .fam, ...
    Returns:
        genomic_data: numpy array
        label_data: numpy array of labels
    """
    prefix = os.path.join(path, prefix)
    
    print("loading plink files...")
    Xt_plink = plinkfile.open(prefix)
    num_snps = len(Xt_plink.get_loci())
    num_ind = len(Xt_plink.get_samples())
    num_class = 26
    print("loaded.")

    # save metafile for info
    print("writing meta file...")
    with open(os.path.join(path,"_metadata.json"), 'w') as f:
        json.dump({'num_snps': num_snps,
                       'num_ind': num_ind,
                       'num_class': num_class}, f)    
    print("written.")

    # have to transpose the plinkfile to get X
    trans_filename = os.path.join(path,"trans")
    print("transposing plink file...")
    assert Xt_plink.transpose(trans_filename), "transpose failed"
    print("done.")

    # Now Open the transpose as X
    print("make genomic_data matrix...")
    X_plink = plinkfile.open(trans_filename)
    assert not X_plink.one_locus_per_row(), "Plink file should be transposed"

    # save the data as a npy file:
    genomic_data = np.zeros((num_ind, num_snps), np.int8)
    for i, row in enumerate(X_plink):
        genomic_data[i, :] = list(row)
    print("made.")
    
    # lets save labels
    print("loading labels and making one-hot rep...")
    pheno = pd.read_csv(os.path.join(path,phenotype_file), sep=None, engine= "python")
    pheno_list = pheno.iloc[:, 1]
    pheno_list_cat = pheno_list.astype('category').cat
    pheno_list_values = pheno_list_cat.categories.values
    pheno_map = pd.DataFrame({'Phenotype': pheno_list_values,
                              'Codes': range(len(pheno_list_values))},
                             columns=('Phenotype', 'Codes'))
    pheno_map.to_csv(os.path.join(path,"pheno_map"))
    # okay get labels now that we have a map
    labels = pheno_list_cat.codes.astype(np.uint8)
    nb_class = len(pheno_list_values)
    targets = np.array(labels).reshape(-1)
    
    # makes one hot matrix for label data class1 = [1,0,...,0]
    label_data = np.eye(nb_class)[targets]
    print("just made the one-hot matrix for labels")

    return genomic_data, label_data
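
A hedged usage sketch; the directory `data/`, the prefix `1kg`, and the phenotype file name are placeholders:

X, y = load_data("data", "1kg", "phenotypes.txt")
print(X.shape)   # (num_ind, num_snps) genotype matrix
print(y.shape)   # (num_ind, num_class) one-hot label matrix
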
Example #43
 def setUp(self):
     pf = plinkfile.open( "./data/wgas" )
     self.row = next( pf )
     pf.close( )
Example #44
def import_genetics(path_to_plink_files):
    """
    Import genetics data

    Note about how to read the bed/bim/fam files:
        ## affection
        - unknown: 0 (fam) -> -9 (python)
        - unaffected: 1 (fam) -> 0 (python)
        - affected: 2 (fam) -> 1 (python)

        ## sex
        - male: 1 (fam) -> 0 (python)
        - female: 2 (fam) -> 1 (python)

        ## genotype
        - genotype 0: code 00 Homozygote "0"/"0"
        - genotype 1: code 01 Heterozygote
        - genotype 2: code 11 Homozygote "1"/"1"
        - genotype 3: unknown
    -------------------------------------------------------
    More details are available on https://web.njit.edu/~zhiwei/GAS_101.pdf
    and on https://github.com/mfranberg/libplinkio
    and on http://www.gwaspi.org/?page_id=671

    Args:
        path_to_plink_files: path to a folder containing a bed, bim and fam file

    Returns:
        snp_list: list of snps
        patients: list of patients
        matrix: matrix with patient as line and snp as column

    """
    from plinkio import plinkfile
    import numpy as np

    plink_file = plinkfile.open(path_to_plink_files)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)

    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    """
    for sample in sample_list:
        print sample.fid, sample.iid, sample.father_iid, sample.mother_iid, sample.sex, sample.affection
    for locus in locus_list:
        print locus.chromosome, locus.name, locus.position, locus.bp_position, locus.allele1, locus.allele2
    """

    num_snp = len(locus_list)
    num_patient = len(sample_list)

    matrix_patient_snp = np.zeros((num_snp, num_patient))
    patient_list = np.zeros(num_patient, dtype='|S30')
    snp_list = np.zeros(num_snp, dtype='|S30')
    for i, row, locus in zip(range(num_snp), plink_file, locus_list):
        matrix_patient_snp[i] = row
        snp_list[i] = locus.name
    matrix_patient_snp = matrix_patient_snp.T
    for i, sample in zip(range(num_patient), sample_list):
        patient_list[i] = sample.iid
    return snp_list, patient_list, matrix_patient_snp
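
Following the genotype coding described in the docstring (3 marks a missing call), a small hedged post-processing sketch; the prefix `mydata` is hypothetical:

import numpy as np

snps, patients, genotypes = import_genetics("mydata")
genotypes = genotypes.astype(float)
genotypes[genotypes == 3] = np.nan                       # mask missing calls
alt_allele_freq = np.nanmean(genotypes, axis=0) / 2.0    # per-SNP frequency of allele "1"
print(alt_allele_freq[:5])
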
Example #45
def case_control_split(to_split,
                       num_case_holders,
                       num_control_holders,
                       split_prefix,
                       seed=1234,
                       pheno_file=None,
                       create=True):
    """Distributes the rows of h5py dataset at to_split into num_case_holders, num_control_holders groups 
  of approximately equal size adding up to the total number of individuals
  This function copies by shamelessly iterating over everything so it can be 
  very slow"""
    # Figure out how many #cases and #controls
    if pheno_file is None:
        with h5py.File(to_split, 'r') as to_split_fp:
            status = to_split_fp['meta/Status'].value
            num_cases = np.sum(status)
            controls = status == 0
            num_controls = np.sum(controls)
            control_rows = np.where(controls)
            case_rows = np.where(~controls)
        del controls
    else:  # It must be a plink file
        plink_file = plinkfile.open(pheno_file)
        sample_list = plink_file.get_samples()
        status = np.array([i.affection for i in sample_list])
        ids = np.array([i.iid for i in sample_list])
        case_rows = ids[status == 1]
        control_rows = ids[status == 0]
        num_cases = len(case_rows)
        num_controls = len(control_rows)
        del status, ids

    if num_case_holders > 1:
        case_per_silo = [num_cases // num_case_holders] * (num_case_holders - 1)
        case_per_silo.append(num_cases - sum(case_per_silo))
    else:
        case_per_silo = [num_cases]
    to_create = list(zip(case_per_silo, ['case'] * num_case_holders))

    if num_control_holders > 1:
        control_per_silo = [num_controls // num_control_holders
                            ] * (num_control_holders - 1)
        control_per_silo.append(num_controls - sum(control_per_silo))
    else:
        control_per_silo = [num_controls]

    to_create += list(zip(control_per_silo, ['control'] * num_control_holders))
    to_create = set(to_create)

    names = []

    def group_copy(name, node, rows, fp):
        dtype = node.dtype
        value = node[...]
        fp.require_dataset(name,
                           data=value[rows],
                           shape=(len(rows), ),
                           dtype=dtype)

    i = 0
    with h5py.File(to_split, 'r') as to_split_fp:
        if pheno_file is not None:
            ids = to_split_fp["meta/id"].value
            case_rows = np.where([ind in case_rows for ind in ids])[0]
            control_rows = np.where([ind in control_rows for ind in ids])[0]

        np.random.seed(seed)
        case_rows = np.random.permutation(case_rows)
        control_rows = np.random.permutation(control_rows)
        while len(to_create):
            count, status = to_create.pop()
            split_name = split_prefix + status + str(i) + '.h5py'
            names.append(split_name)
            if not create:
                i += 1
                continue
            logging.info("-Constructing: " + split_name)
            if status == 'case':
                chosen_rows = case_rows[:count]
                case_rows = case_rows[count:]
            else:
                chosen_rows = control_rows[:count]
                control_rows = control_rows[count:]

            with h5py.File(split_name, 'w') as copy_to_fp:
                for key in to_split_fp.keys():
                    dset_to_copy = to_split_fp[key]
                    dset_to_copyto = copy_to_fp.require_group(key)
                    copier = partial(group_copy,
                                     rows=chosen_rows,
                                     fp=dset_to_copyto)
                    dset_to_copy.visititems(copier)
            i += 1

    return names
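
A hedged usage sketch; the file name `all.h5py` and prefix `silo_` are placeholders, and the split is driven by the store's `meta/Status` dataset because no pheno_file is given:

# Split one combined HDF5 store into two case silos and one control silo.
names = case_control_split("all.h5py", 2, 1, split_prefix="silo_")
print(names)   # three new .h5py files, one per silo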