def extract_rawdata(ids, features, aggregators):
    storage_loc_template = get_storage_loc_template()

    rawdata = []
    col_inds = {}
    col_inds_start = 0

    for feature in features:
        storage_loc = storage_loc_template.format(feature.name)

        if feature.is_fixed_length:
            # Fixed-length features can be stacked into a matrix directly.
            rawdata_ = bs.retrieve(ids, storage_loc, flat=True)
            rawdata_stacked = np.stack(rawdata_)
            rawdata.append(rawdata_stacked)

            ncols = rawdata_stacked.shape[1]
            col_inds[feature.name] = (col_inds_start, col_inds_start + ncols)
            col_inds_start += ncols
        else:
            # Variable-length features are stored per aggregator, in a
            # subdirectory named after the aggregator.
            fa_storage_loc_template = os.path.join(storage_loc, '{}')
            for aggregator in aggregators:
                fa_storage_loc = fa_storage_loc_template.format(aggregator.name)
                rawdata_ = bs.retrieve(ids, fa_storage_loc, flat=True)
                rawdata_stacked = np.stack(rawdata_)
                rawdata.append(rawdata_stacked)

                ncols = rawdata_stacked.shape[1]
                col_inds['{}_{}'.format(feature.name, aggregator.name)] = \
                    (col_inds_start, col_inds_start + ncols)
                col_inds_start += ncols

    rawdata = np.concatenate(rawdata, axis=1)
    return rawdata, col_inds
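# Hypothetical usage sketch (not part of the original code): extract_rawdata
# returns the concatenated feature matrix together with col_inds, a map from
# feature name (or "<feature>_<aggregator>") to the half-open column range that
# feature occupies. The names `ids`, `features`, `aggregators` and the feature
# name 'mfcc' below are assumptions for illustration only.
def _example_slice_feature_columns(ids, features, aggregators):
    rawdata, col_inds = extract_rawdata(ids, features, aggregators)
    start, end = col_inds['mfcc']        # assumed feature name
    mfcc_block = rawdata[:, start:end]   # rows are ordered the same as `ids`
    return mfcc_block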
def _test_update(self, nupdate):
    _, arrs_for_update = create_random_id_based_dataset(nupdate)
    id2arr = {x: y for x, y in zip(self.ids, self.arrs)}

    # We want to make sure there are new ids (to be appended) and old ids (to be updated)
    nkeeps = nupdate // 2
    nnews = nupdate - nkeeps

    maxid = np.max(self.ids)
    new_ids = np.arange(maxid + 1, maxid + nnews + 1)
    keep_ids = self.ids[:nkeeps]

    ids_for_update = np.concatenate((keep_ids, new_ids))
    for x, y in zip(ids_for_update, arrs_for_update):
        id2arr[x] = y

    self.ids = np.array(list(id2arr.keys()))
    np.random.shuffle(self.ids)
    self.arrs = [id2arr[i] for i in self.ids]

    with tictoc('Test update {} items'.format(nupdate)):
        bs.store(ids_for_update, arrs_for_update, self.loc)

    retrieved_arrs = bs.retrieve(self.ids, self.loc)
    for id, retrieved_arr in zip(self.ids, retrieved_arrs):
        self.assertTrue(np.allclose(id2arr[id], retrieved_arr))
def extract_rawdata(ids, features):
    storage_loc_template = get_storage_loc_template()
    data_by_id = {id: [] for id in ids}

    for feature in features:
        storage_loc = storage_loc_template.format(feature.name)
        with tictoc('{}'.format(feature.name)):
            feature_values = bs.retrieve(ids, storage_loc)
            for id, feature_value in zip(ids, feature_values):
                data_by_id[id].append(feature_value)

    # Reassemble in the original order of `ids`: one list of feature values per id.
    data = []
    for id in ids:
        feature_values = data_by_id[id]
        data.append(feature_values)

    return data
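# Hypothetical usage sketch (not part of the original code): unlike the
# aggregated variant above, this extract_rawdata keeps each feature value as-is
# (possibly variable-length), grouped per id and ordered like `features`.
# The names `ids` and `features` below are assumptions for illustration only.
def _example_inspect_raw_feature_values(ids, features):
    data = extract_rawdata(ids, features)
    for id, row in zip(ids, data):
        for feature, value in zip(features, row):
            print(id, feature.name, getattr(value, 'shape', None))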
def _test_retrieve(self, nselected, shuffle=True):
    selected_ids = copy.deepcopy(self.ids)
    if shuffle:
        np.random.shuffle(selected_ids)
    selected_ids = selected_ids[:nselected]

    selected_ids_inds = [np.where(self.ids == x)[0][0] for x in selected_ids]
    selected_arrs = [self.arrs[i] for i in selected_ids_inds]

    with tictoc('Test retrieving {} items shuffle={}'.format(nselected, shuffle)):
        retrieved_arrs = bs.retrieve(selected_ids, self.loc)

    self.assertEqual(len(selected_ids), len(retrieved_arrs))

    for i in range(len(selected_ids)):
        selected_arr = selected_arrs[i]
        retrieved_arr = retrieved_arrs[i]
        try:
            self.assertTrue(np.allclose(selected_arr, retrieved_arr))
        except TypeError:
            # np.allclose raises TypeError for non-numeric dtypes; skip those arrays.
            pass
def aggregate_feature_values(runner, tids, features, aggregators, force=False):
    """
    Compress all variable-length feature sequences into fixed-length vectors.

    :param runner: progress runner exposing start(limit=...), tick(n) and wrapping_up()
    :param tids: ids of the items whose features are to be aggregated
    :param features: list of features; fixed-length ones are skipped
    :param aggregators: list of aggregators, each exposing a name and process(arr)
    :param force: if True, recalculate even for ids that already have aggregated values
    :return: None
    """
    if features is None or len(features) == 0:
        raise Exception('must provide non-empty list of features')

    storage_loc_template = get_storage_loc_template()

    if len(tids) == 0:
        runner.wrapping_up()
        return

    tid_min = tids.min()
    tid_max = tids.max()

    n_calculations = 0
    jobss = []

    for feature in features:
        if feature.is_fixed_length:
            continue

        jobs = []
        storage_loc = storage_loc_template.format(feature.name)
        fa_storage_loc_template = os.path.join(storage_loc, '{}')

        if force:
            combined_tids = [tids]
        else:
            combined_tids = []

        for aggregator in aggregators:
            fa_storage_loc = fa_storage_loc_template.format(aggregator.name)
            mkdirp(fa_storage_loc)

            if force:
                tids_target = tids
            else:
                # Only aggregate ids that don't already have a stored value.
                existing_tids = bs.retrieve_ids(fa_storage_loc, (tid_min, tid_max))
                sorted_ids = np.unique(existing_tids)
                non_existing_idx = np.where(np.logical_not(np.isin(tids, sorted_ids)))
                missing_tids = tids[non_existing_idx]
                tids_target = np.array(sorted(missing_tids))

            n_tids_target = len(tids_target)
            if not force and n_tids_target:
                combined_tids.append(tids_target)

            if n_tids_target:
                n_calculations += n_tids_target
                jobs.append((tids_target, aggregator, fa_storage_loc))

        if len(combined_tids):
            combined_tids = np.unique(np.concatenate(combined_tids).astype(np.int32))
            jobss.append((combined_tids, storage_loc, jobs))

    if not n_calculations:
        return

    runner.start(limit=n_calculations)
    for combined_tids, storage_loc, jobs in jobss:
        batches = get_batches(combined_tids, batch_size=100)
        for batch_tids in batches:
            batch_size = len(batch_tids)
            batch_arrs = bs.retrieve(batch_tids, storage_loc)

            for tids_target, aggregator, fa_storage_loc in jobs:
                aggregateds = []
                aggregated_ids = []

                # Find which of this job's target ids fall inside the current batch.
                target_batch_ind = np.searchsorted(batch_tids, tids_target)
                batch_id_within_range = np.where(target_batch_ind < batch_size)
                target_batch_ind = target_batch_ind[batch_id_within_range]
                tids_within_range = tids_target[batch_id_within_range]

                for batch_ind, tid in zip(target_batch_ind, tids_within_range):
                    # searchsorted returns 0 both for a genuine match at index 0
                    # and for ids smaller than every id in the batch; skip the latter.
                    if batch_ind == 0 and batch_tids[0] != tid:
                        continue
                    aggregated_ids.append(tid)
                    arr = batch_arrs[batch_ind]
                    aggregated = aggregator.process(arr)
                    aggregateds.append(aggregated)

                if len(aggregated_ids):
                    aggregated_ids = np.array(aggregated_ids)
                    bs.store(aggregated_ids, aggregateds, fa_storage_loc)
                    runner.tick(len(aggregated_ids))
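# Hypothetical aggregator sketch (not part of the original code):
# aggregate_feature_values only relies on an aggregator exposing a `name`
# attribute and a process(arr) method that turns a variable-length feature
# sequence into a fixed-length vector. This minimal stand-in illustrates that
# contract; the project's real aggregator classes are not shown here.
class _ExampleMeanAggregator:
    name = 'example_mean'

    def process(self, arr):
        # Average over the last axis, yielding one value per feature dimension.
        return np.mean(arr, axis=-1)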
def _test_retrieve_error(self):
    # Ids far outside the range of stored ids must not be retrievable.
    non_existing_ids = NUM_POINTS * 100 + np.random.randint(100, size=NUM_POINTS // 2)
    with self.assertRaises((ValueError, FileNotFoundError)):
        bs.retrieve(non_existing_ids, self.loc)
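# Hypothetical round-trip sketch (not part of the original code): the tests
# above exercise the storage backend through bs.store(ids, arrs, loc) and
# bs.retrieve(ids, loc), with retrieval of unknown ids expected to raise.
# The location string is an assumption; depending on the backend it may need
# to exist beforehand (the production code calls mkdirp before storing).
def _example_store_retrieve_roundtrip(loc='/tmp/example-loc'):
    ids = np.array([1, 2, 3])
    arrs = [np.random.rand(5), np.random.rand(7), np.random.rand(4)]
    bs.store(ids, arrs, loc)
    return bs.retrieve(ids, loc)  # arrays come back in the same order as `ids`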