def _add_datasets(self, group, j, track_times):
    """Populate *group* with a table plus array/carray/earray/vlarray copies
    of its columns, all created with the given ``track_times`` setting."""
    # Build the table and fill it row by row.
    table = self.h5file.create_table(group, f'table{j}', Record,
                                     title=self.title, filters=None,
                                     track_times=track_times)
    row = table.row
    for n in range(self.nrows):
        row['var1'] = '%04d' % (self.nrows - n)
        row['var2'] = n
        row['var3'] = n * 2
        row.append()  # inject the Record values
    table.flush()

    # Read the column data back out of the table.
    col1 = [r['var1'] for r in table.iterrows()]
    col3 = [r['var3'] for r in table.iterrows()]

    # Plain array holding the string column.
    self.h5file.create_array(group, f'array{j}', col1, f"col {j}",
                             track_times=track_times)
    # Chunked array holding the int column.
    self.h5file.create_carray(group, name=f'carray{j}', obj=col3,
                              title=f"col {j + 2}", track_times=track_times)
    # Extendable array, filled via append().
    earr = self.h5file.create_earray(group, f'earray{j}',
                                     StringAtom(itemsize=4), (0, ),
                                     f"col {j + 4}", track_times=track_times)
    earr.append(col1)
    # Variable-length array, filled via append().
    vlarr = self.h5file.create_vlarray(group, f'vlarray{j}', Int16Atom(),
                                       f"col {j + 6}",
                                       track_times=track_times)
    vlarr.append(col3)
def setup(self, node, block_size, blob_name, index_name):
    """Create (if not yet present) the blob earray under *node*, plus the
    lookup table that maps an index to a (start, size) slice of the blob."""
    if not hasattr(node, blob_name):
        self.file_.create_earray(node, blob_name,
                                 StringAtom(itemsize=block_size), (0, ),
                                 filters=filters)
    # Every column that appears in a ``where`` method call should/must be
    # indexed!  This is not only for performance but for correct lookup as
    # well (strange bugs were observed otherwise).
    layout = {
        "index": Int64Col(pos=0),
        "start": UInt32Col(pos=1),
        "size": UInt32Col(pos=2),
    }
    string_index = self.file_.create_table(node, index_name, layout,
                                           filters=None)
    string_index.cols.index.create_index()
def test_from_kind_04(self):
    """from_kind('string') with explicit itemsize/dflt equals a StringAtom
    constructed directly with the same parameters."""
    built = Atom.from_kind('string', itemsize=5, dflt=b'hello')
    expected = StringAtom(itemsize=5, shape=(), dflt=b'hello')
    self.assertEqual(built, expected)
    self.assertEqual(str(built), str(expected))
def test_from_dtype_03(self):
    """A Unicode dtype is converted (emitting a warning) into the
    equivalent byte-string atom."""
    with self.assertWarns(Warning):
        converted = Atom.from_dtype(numpy.dtype('U5'), dflt=b'hello')
    reference = StringAtom(itemsize=5, shape=(), dflt=b'hello')
    self.assertEqual(converted, reference)
    self.assertEqual(str(converted), str(reference))
def test_init_parameters_03(self):
    """copy() rejects unknown keyword arguments with a TypeError."""
    atom = StringAtom(itemsize=12)
    with self.assertRaises(TypeError):
        atom.copy(foobar=42)
def test_init_parameters_02(self):
    """copy() honours overridden itemsize and shape keyword arguments."""
    original = StringAtom(itemsize=12)
    clone = original.copy(itemsize=100, shape=(2, 2))
    self.assertEqual(clone,
                     StringAtom(itemsize=100, shape=(2, 2), dflt=b''))
def test_init_parameters_01(self):
    """copy() without arguments yields an equal but distinct atom object."""
    atom1 = StringAtom(itemsize=12)
    atom2 = atom1.copy()
    self.assertEqual(atom1, atom2)
    self.assertEqual(str(atom1), str(atom2))
    # assertIsNot reports both operands on failure, unlike the original
    # assertFalse(atom1 is atom2), which only says "True is not false".
    self.assertIsNot(atom1, atom2)
def combine_sample_genotypes(path_x, path_y, output_path, contig,
                             samples_x=None, samples_y=None):
    """Merge the genotype calls of two HDF5 variant files into one output.

    Both inputs must describe the same *contig* with identical variant
    positions.  Sites that are not biallelic (in the first file's allele
    counts) are dropped from both inputs before merging.

    Parameters
    ----------
    path_x, path_y : str
        Input HDF5 files (scikit-allel layout: <contig>/calldata, /variants).
    output_path : str
        Destination file; must not already exist.
    contig : str
        Chromosome/contig group name present in both inputs.
    samples_x, samples_y : list of str, optional
        Subset (and ordering) of sample names to keep from each input.

    Raises
    ------
    FileExistsError
        If *output_path* already exists.
    """
    if isfile(output_path):
        raise FileExistsError("out path already exists")
    h5file = openFile(output_path, mode="w")
    # Open inputs read-only: relying on h5py's historical default mode ('a')
    # risks accidental modification and fails on newer h5py versions.
    fh_a = h5py.File(path_x, "r")
    fh_b = h5py.File(path_y, "r")
    # Load genotypes.
    ga = allel.GenotypeCArray.from_hdf5(fh_a[contig]["calldata"]["genotype"])
    gb = allel.GenotypeCArray.from_hdf5(fh_b[contig]["calldata"]["genotype"])
    alleles = ga.count_alleles()
    biallelic = np.array(alleles.max_allele() < 2)
    # Load positions.
    pos_a = fh_a[contig]["variants"]["POS"][:]
    pos_b = fh_b[contig]["variants"]["POS"][:]
    # The two files must describe identical sites; check BEFORE filtering.
    # (The original compared the filtered pos_a against the unfiltered pos_b.)
    assert np.array_equal(pos_a, pos_b)
    # Filter out non-biallelic sites from BOTH inputs so rows stay aligned.
    # (The original compressed only ga, so the later hstack misaligned rows
    # whenever any site was dropped.)
    ga = ga.compress(biallelic, axis=0)
    gb = gb.compress(biallelic, axis=0)
    pos = np.compress(biallelic, pos_a, axis=0)
    ref = np.compress(biallelic, fh_a[contig]["variants"]["REF"][:], axis=0)
    alt = np.compress(biallelic, fh_a[contig]["variants"]["ALT"][:], axis=0)
    # Optionally subset/reorder samples in each input.
    samplesa = fh_a[contig]["samples"][:]
    samplesb = fh_b[contig]["samples"][:]
    if samples_y:
        names = [s.decode() for s in samplesb]
        idx = [names.index(s) for s in samples_y]
        gb = gb.take(idx, axis=1)
        samplesb = samples_y
    if samples_x:
        names = [s.decode() for s in samplesa]
        idx = [names.index(s) for s in samples_x]
        ga = ga.take(idx, axis=1)
        samplesa = samples_x
    root = h5file.root
    # Create the output group layout.
    chrom = h5file.create_group(root, contig)
    grp_calldata = h5file.create_group(chrom, "calldata")
    grp_variants = h5file.create_group(chrom, "variants")
    filters = Filters(complevel=1, complib='zlib')
    sample_names = np.concatenate([samplesa, samplesb]).astype("|S10")
    h5file.create_array(chrom, 'samples', sample_names)
    number_sites = ga.shape[0]
    position = h5file.create_earray(grp_variants, name='POS',
                                    atom=IntAtom(itemsize=4),
                                    expectedrows=number_sites,
                                    shape=(0, ), filters=filters)
    reference = h5file.create_earray(grp_variants, name='REF',
                                     atom=StringAtom(itemsize=1),
                                     expectedrows=number_sites,
                                     shape=(0, ), filters=filters)
    alternate = h5file.create_earray(grp_variants, name='ALT',
                                     atom=StringAtom(itemsize=1),
                                     expectedrows=number_sites,
                                     shape=(0, 3), filters=filters)
    genotypes = h5file.create_earray(grp_calldata, name='genotype',
                                     atom=IntAtom(itemsize=1),
                                     expectedrows=number_sites,
                                     shape=(0, sample_names.size, 2),
                                     filters=filters)
    # Copy data chunk-wise.  Append the final boundary instead of overwriting
    # the last chunk start: the original `chunks[-1] = number_sites` produced
    # an EMPTY schedule (nothing written) whenever number_sites <= chunk_size.
    # NOTE(review): `chunk_size` appears to be a module-level setting —
    # confirm it is defined where this function lives.
    boundaries = np.append(np.arange(0, number_sites, chunk_size),
                           number_sites)
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        genotypes.append(np.hstack([ga[start:stop], gb[start:stop]]))
        position.append(pos[start:stop])
        reference.append(ref[start:stop])
        alternate.append(alt[start:stop])
    # Close everything (the original leaked the two input handles).
    fh_a.close()
    fh_b.close()
    h5file.close()