def _extend_array_with_iter(array, matrices):
    """Extend *array* with every matrix yielded by the *matrices* iterator.

    The iterator is consumed in batches sized so that one batch should fit
    in AVAILABLE_MEM (estimated from the size of the first matrix); each
    batch is handed to _extend_array.

    Parameters
    ----------
    array : the growable array-like to extend (whatever _extend_array takes)
    matrices : iterator of matrices; may be empty
    """
    try:
        # peek at the first matrix to estimate the per-matrix memory cost
        matrix = first(matrices)
    except ValueError:
        # empty iterator: nothing to extend
        return
    # push the peeked matrix back onto the front of the stream
    matrices = chain([matrix], matrices)
    # NOTE(review): sys.getsizeof is shallow; presumably these are numpy
    # arrays owning their buffer (then the data is counted) -- confirm
    matrix_size = sys.getsizeof(matrix)
    # Integer floor division avoids float-precision loss that
    # math.floor(a / b) can suffer with very large integers; always
    # process at least one matrix per batch.
    mats_in_group = max(1, AVAILABLE_MEM // matrix_size)
    for mats_in_mem in group_items(matrices, mats_in_group):
        _extend_array(array, mats_in_mem)
def chunks(self):
    """Yield the parsed variations as VariationsArrays chunks.

    Consumes ``self.vars_parser.variations`` in groups of
    ``self.vars_in_chunk`` SNPs, builds one matrix per field for each
    group (delegated to ``_snp_tuple_to_dict`` and
    ``_put_snp_in_matrices``), and yields a ``VariationsArrays`` holding
    those matrices, the decoded sample names and, when the parser exposes
    metadata, the prepared metadata.
    """
    vars_parser = self.vars_parser
    vars_in_chunk = self.vars_in_chunk
    snps = vars_parser.variations
    # Caches shared across all chunks: field-name -> matrix-path maps,
    # per-field missing values, and one exemplar matrix per field
    # (presumably used later for metadata inference -- named so).
    field_paths = {'filter': {}, 'calls': {}, 'info': {}}
    missing_values = {}
    # FILTER ids declared in the parser metadata (empty set when absent)
    filter_field_names = set(getattr(vars_parser, 'metadata',
                                     {}).get('FILTER', {}).keys())
    exemplar_matrices_for_metadata = {}
    for chunk in group_items(snps, vars_in_chunk):
        chunk = list(chunk)
        n_snps_in_chunk = len(chunk)
        matrices = {}
        n_non_none_snps = 0
        for snp in chunk:
            # the last group may be padded with None entries
            if snp is None:
                continue
            snp_dict = self._snp_tuple_to_dict(snp, field_paths,
                                               filter_field_names)
            self._put_snp_in_matrices(matrices, snp_dict, n_non_none_snps,
                                      n_snps_in_chunk, missing_values,
                                      exemplar_matrices_for_metadata)
            n_non_none_snps += 1
        # cut the empty snps from the end
        if n_non_none_snps < n_snps_in_chunk:
            matrices = {path: mat[:n_non_none_snps, ...]
                        for path, mat in matrices.items()}
        varis = VariationsArrays()
        for path, mat in matrices.items():
            varis[path] = mat
        samples = [sample.decode() for sample in vars_parser.samples]
        varis.samples = samples
        try:
            metadata = _prepare_metadata(vars_parser.metadata)
            varis._set_metadata(metadata)
        except AttributeError:
            # parsers without metadata are allowed; yield without it
            pass
        yield varis
def chunks(self):
    """Yield the parsed variations as VariationsArrays chunks.

    For every group of ``self.vars_in_chunk`` SNPs, pre-allocates one
    numpy matrix per field (shape/dtype/missing value taken from
    ``_build_matrix_structures``), fills it from the SNP tuples, drops
    SNPs whose data overflowed a field (when ``hdf5.ignore_overflows``),
    and yields a ``VariationsArrays`` with the surviving rows, the
    decoded sample names and the prepared metadata.
    """
    vars_parser = self.vars_parser
    hdf5 = self.hdf5
    vars_in_chunk = self.vars_in_chunk
    kept_fields = self.kept_fields
    ignored_fields = self.ignored_fields
    max_field_lens = self.max_field_lens
    max_field_str_lens = self.max_field_str_lens
    log = self.log
    ignore_overflows = hdf5.ignore_overflows
    snps = vars_parser.variations
    # one entry per matrix path: dict with 'shape', 'dtype',
    # 'missing_value', 'basepath' and (for FILTER/INFO/CALLS) 'field'
    mat_structure = _build_matrix_structures(vars_parser, vars_in_chunk,
                                             kept_fields, ignored_fields,
                                             hdf5.ignore_undefined_fields,
                                             log, max_field_lens,
                                             max_field_str_lens)
    for chunk in group_items(snps, vars_in_chunk):
        # pre-allocate every field matrix filled with its missing value
        mats = {}
        for path, struct in mat_structure.items():
            mat = numpy.full(struct['shape'], struct['missing_value'],
                             struct['dtype'])
            mats[path] = mat
        good_snp_idxs = []
        for idx, snp in enumerate(chunk):
            # group_items pads the last group with None; padding is only
            # at the end, so we can stop at the first None
            if snp is None:
                break
            log['variations_processed'] += 1
            # snp tuple layout: chrom, pos, id, ref, alt, qual,
            # filters, info, calls (see the dispatch below)
            filters = snp[6]
            info = snp[7]
            calls = snp[8]
            info = dict(info) if info else {}
            calls = dict(calls) if calls else {}
            ignore_snp = False
            for path, struct in mat_structure.items():
                basepath = struct['basepath']
                # map the matrix path to the corresponding tuple slot
                if path == '/variations/chrom':
                    item = snp[0]
                elif path == '/variations/pos':
                    item = snp[1]
                elif path == '/variations/id':
                    item = snp[2]
                elif path == '/variations/ref':
                    item = snp[3]
                elif path == '/variations/alt':
                    item = snp[4]
                elif path == '/variations/qual':
                    item = snp[5]
                elif basepath == 'FILTER':
                    if struct['field'] == b'PASS':
                        # PASS is encoded as "no filters failed"
                        item = True if filters == [] else False
                    else:
                        item = struct['field'] in filters
                elif basepath == 'INFO':
                    item = info.get(struct['field'], None)
                elif basepath == 'CALLS':
                    item = calls.get(struct['field'], None)
                # NOTE(review): if none of the branches above matched,
                # `item` would be stale from the previous field;
                # presumably every path falls in one category -- confirm
                shape = struct['shape']
                if item is not None:
                    n_dims = len(shape)
                    mat = mats[path]
                    if n_dims == 1:
                        # scalar per SNP (e.g. chrom, pos, qual)
                        try:
                            mat[idx] = item
                        except ValueError:
                            # a length-1 sequence is unwrapped; anything
                            # longer cannot fit in a scalar slot
                            if hasattr(item, '__len__'):
                                if len(item) == 1:
                                    mat[idx] = item[0]
                                else:
                                    # NOTE(review): unlike the 2-D/3-D
                                    # overflow paths this break does not
                                    # set ignore_snp, so the SNP is still
                                    # stored -- confirm this is intended
                                    log['data_no_fit'][path] += 1
                                    break
                            else:
                                raise
                    elif n_dims == 2:
                        # vector per SNP (e.g. alt alleles, INFO lists)
                        if len(item) > mat.shape[1]:
                            if ignore_overflows:
                                ignore_snp = True
                                log['data_no_fit'][path] += 1
                                break
                            else:
                                msg = 'Data no fit in field:'
                                msg += path
                                msg += '\n'
                                msg += str(item)
                                raise RuntimeError(msg)
                        try:
                            mat[idx, 0:len(item)] = item
                        except (ValueError, TypeError):
                            # per-sample values may be None or wrapped in
                            # a 1-item sequence; normalise and retry
                            missing_val = struct['missing_value']
                            item = [missing_val if val is None else val[0]
                                    for val in item]
                            mat[idx, 0:len(item)] = item
                    elif n_dims == 3:
                        # matrix per SNP (per-sample, per-allele calls)
                        if len(item[0]) > mat.shape[2]:
                            if ignore_overflows:
                                ignore_snp = True
                                log['data_no_fit'][path] += 1
                                break
                            else:
                                msg = 'Data no fit in field:'
                                msg += path
                                msg += '\n'
                                msg += str(item)
                                raise RuntimeError(msg)
                        try:
                            mat[idx, :, 0:len(item[0])] = item
                        except ValueError:
                            # dump the offending field before re-raising
                            print(path, item)
                            raise
                    else:
                        raise RuntimeError('Fixme, we should not be here.')
            if not ignore_snp:
                good_snp_idxs.append(idx)
                log['variations_stored'] += 1
        varis = VariationsArrays()
        # keep only the rows of SNPs that were fully stored
        for path, mat in mats.items():
            varis[path] = mat[good_snp_idxs]
        samples = [sample.decode() for sample in vars_parser.samples]
        varis.samples = samples
        metadata = _prepare_metadata(vars_parser.metadata)
        varis._set_metadata(metadata)
        yield varis