def test_count_alleles(self):
    '''Exercise counts_by_row on HDF5 fixtures and on hand-built genotypes.'''
    # Fixture file: the first chunk must yield the known per-row counts.
    snps_h5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
    first_chunk = first(snps_h5.iterate_chunks())
    observed = counts_by_row(first_chunk['/calls/GT'], missing_value=-1)
    expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
    assert numpy.all(expected == observed)

    # Larger fixture: grow the first genotype chunk with the remaining ones
    # and count the result.
    ril_h5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    gt_chunks = (
        chunk['/calls/GT']
        for chunk in ril_h5.iterate_chunks(kept_fields=['/calls/GT'])
    )
    gt_matrix = first(gt_chunks)
    for _ in range(20):
        extend_matrix(gt_matrix, gt_chunks)
    # NOTE(review): the result is not asserted; this only checks that the
    # call completes without raising.
    counts_by_row(gt_matrix, missing_value=-1)

    # One row mixing missing calls (-1) with allele 0: only the six 0
    # alleles are expected to be counted.
    partly_missing = numpy.array(
        [[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]]
    )
    observed = counts_by_row(partly_missing, missing_value=-1)
    assert numpy.all(observed == [[6]])

    # Two rows in which every call is allele 0.
    # NOTE(review): this comparison broadcasts, so it passes for any result
    # made up only of 6s whose shape broadcasts against (1, 2), including
    # (2, 1) -- confirm the intended expected shape.
    single_allele = numpy.array(
        [[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]]]
    )
    observed = counts_by_row(single_allele, missing_value=-1)
    assert numpy.all(observed == [[6, 6]])
def gts_as_mat012(self):
    '''Recode the genotype matrix relative to each row's major allele.

    Every call is replaced by the number of its alleles that differ from
    the most counted allele of its row: for diploid calls, 0 is homozygous
    for the major allele, 1 is heterozygous and 2 is homozygous for another
    allele. Calls holding any missing allele are set to MISSING_INT.

    When counts_by_row returns None, a matrix with the first two dimensions
    of the genotypes, filled with MISSING_INT, is returned.
    '''
    genotypes = self[GT_FIELD]
    allele_counts = counts_by_row(genotypes, missing_value=MISSING_INT)

    if allele_counts is None:
        n_rows, n_cols = genotypes.shape[0], genotypes.shape[1]
        return numpy.full((n_rows, n_cols), fill_value=MISSING_INT)

    # Column index of the highest count in each row.
    # NOTE(review): assumes column i of the counts corresponds to allele
    # code i -- confirm against counts_by_row.
    most_counted = numpy.argmax(allele_counts, axis=1)

    if is_dataset(genotypes):
        # Read the dataset fully so the comparisons below run on its
        # contents.
        genotypes = genotypes[:]

    differs_from_major = genotypes != most_counted[:, None, None]
    has_missing_allele = numpy.any(genotypes == MISSING_INT, axis=2)

    recoded = numpy.sum(differs_from_major, axis=2)
    recoded[has_missing_allele] = MISSING_INT
    return recoded
def allele_count(self):
    '''Count the alleles of every row, reading the genotypes chunk by chunk.

    Each chunk of '/calls/GT' is counted with counts_by_row and the
    per-chunk results are stacked row-wise. Chunks that report fewer allele
    columns than the widest one are right-padded with zero columns.

    counts_by_row can return None for a chunk (presumably when the chunk
    holds no non-missing allele -- confirm). Such a chunk contributes one
    all-zero row per genotype row, so the rows of the result stay aligned
    with the rows of the genotype matrix.

    Returns:
        A 2-D array with one row per genotype row and one column per
        allele, or None when there are no chunks or every chunk yielded
        None.
    '''
    missing = MISSING_VALUES[int]
    gt_chunks = select_dset_from_chunks(self.iterate_chunks(), '/calls/GT')

    # One (number of rows, counts or None) entry per chunk. Everything is
    # stacked once at the end instead of re-concatenating inside the loop.
    per_chunk = []
    for gt_chunk in gt_chunks:
        chunk_counts = counts_by_row(gt_chunk, missing_value=missing)
        per_chunk.append((gt_chunk.shape[0], chunk_counts))

    available = [cnts for _, cnts in per_chunk if cnts is not None]
    if not available:
        return None

    n_cols = max(cnts.shape[-1] for cnts in available)
    filler_dtype = available[0].dtype

    blocks = []
    for n_rows, cnts in per_chunk:
        if cnts is None:
            # Nothing was counted in this chunk: keep its rows, all zero.
            cnts = numpy.zeros((n_rows, n_cols), dtype=filler_dtype)
        elif cnts.shape[-1] < n_cols:
            pad_shape = list(cnts.shape)
            pad_shape[-1] = n_cols - cnts.shape[-1]
            padding = numpy.zeros(pad_shape, dtype=cnts.dtype)
            cnts = numpy.hstack((cnts, padding))
        blocks.append(cnts)

    return numpy.concatenate(blocks, axis=0)
def allele_count(self):
    '''Return the counts_by_row result for the whole '/calls/GT' matrix.

    Alleles equal to MISSING_VALUES[int] are passed to counts_by_row as
    the missing value.
    '''
    return counts_by_row(self['/calls/GT'],
                         missing_value=MISSING_VALUES[int])