def get_pos_negs_all_v2(dbidxs, label_db: LabelDB, vec_meta: pd.DataFrame):
    """Split the vectors of the given images into positives and negatives.

    For each image id in ``dbidxs`` the boxes of its vectors (rows of
    ``vec_meta`` whose ``dbidx`` equals the id) are compared with the image's
    label boxes through ``box_iou``. The resulting matrix is indexed as one
    row per label box and one column per vector box:

    * a vector is a negative when its column sums to 0 (no overlap with any
      label box);
    * for every label box, the vector with the largest value in that row is
      a positive, provided the value is greater than 0.

    Args:
        dbidxs: iterable of image ids to process.
        label_db: label store; ``label_db.get(idx, format="df")`` supplies the
            label boxes of one image.
        vec_meta: per-vector metadata with a ``dbidx`` column. The returned
            bitmaps hold values of its index.

    Returns:
        ``(posidxs, negidxs)``: two ``pr.BitMap`` of ``vec_meta`` index
        values. Both are empty when no image contributed any vector.
    """
    idxs = pr.BitMap(dbidxs)
    relvecs = vec_meta[vec_meta.dbidx.isin(idxs)]

    pos = []
    neg = []
    for idx in dbidxs:
        acc_vecs = relvecs[relvecs.dbidx == idx]
        if acc_vecs.shape[0] == 0:
            # An image without vectors contributes nothing, and the
            # reductions below raise on a zero-length axis.
            continue

        acc_boxes = get_boxes(acc_vecs)
        label_boxes = label_db.get(idx, format="df")
        ious = box_iou(label_boxes, acc_boxes)

        # Vectors that overlap no label box at all.
        total_iou = ious.sum(axis=0)
        negatives = total_iou == 0
        negvec_positions = acc_vecs.index[negatives].values

        # Best-overlapping vector for each label box.
        max_ious_id = np.argmax(ious, axis=1)
        max_ious = np.max(ious, axis=1)
        pos_idxs = pr.BitMap(max_ious_id[max_ious > 0])
        # Deliberately no assertion that a labelled image yields a positive:
        # some label boxes have size 0 because of a bug in the data.
        posvec_positions = acc_vecs.index[pos_idxs].values

        pos.append(posvec_positions)
        neg.append(negvec_positions)

    if not pos:
        # np.concatenate rejects an empty list of arrays.
        return pr.BitMap(), pr.BitMap()

    posidxs = pr.BitMap(np.concatenate(pos))
    negidxs = pr.BitMap(np.concatenate(neg))
    return posidxs, negidxs
def get_pos_negs(box, vec_meta):
    """Pick positive and negative chunk vectors for one image's boxes.

    Args:
        box: dataframe of boxes for the image (may have zero rows).
        vec_meta: metadata of the image's chunk vectors, with grid columns
            ``iis`` and ``jjs``; the returned bitmaps hold its index values.

    Returns:
        ``(pos_idxs, neg_idxs)`` as ``pr.BitMap``:

        * ``neg_idxs``: chunks lying outside the grid span of every box
          (rows ``i1..i2-1`` by columns ``j1..j2-1`` from ``box2ij``);
        * ``pos_idxs``: chunks sitting on the cell ``nearest_ij`` reports for
          any box.

        With no boxes, every chunk is negative and none is positive.
    """
    if box.shape[0] == 0:
        return pr.BitMap(), pr.BitMap(vec_meta.index.values)

    box_spans = box2ij(box, base_size=224)
    box_centers = nearest_ij(box, base_size=224)
    grid_i = vec_meta.iis
    grid_j = vec_meta.jjs

    outside_sets = []
    center_sets = []
    for span, center in zip(box_spans.itertuples(), box_centers.itertuples()):
        within_rows = grid_i.between(span.i1, span.i2 - 1)
        within_cols = grid_j.between(span.j1, span.j2 - 1)
        # Chunks with no overlap whatsoever with this box.
        outside = vec_meta[~(within_rows & within_cols)]
        outside_sets.append(pr.BitMap(outside.index))

        on_center = vec_meta[(grid_i == center.i) & (grid_j == center.j)]
        center_sets.append(pr.BitMap(on_center.index))

    # Negative only if outside every box; positive if a centre of any box.
    neg_idxs = pr.BitMap.intersection(*outside_sets)
    pos_idxs = pr.BitMap.union(*center_sets)
    return pos_idxs, neg_idxs
def hard_neg_tuples(v, Xt, yt, max_tups):
    """Return indices for the ``max_tups`` hardest (positive, negative) pairs.

    Rows of ``Xt`` are scored as ``Xt @ v``. Every pair made of a positive row
    (``yt > 0``) and a negative row (``yt < 1``) is ranked by
    ``score(positive) - score(negative)``, and the pairs with the smallest
    difference are kept.

    Returns:
        ``(ridx, piis, pjjs)``: ``ridx`` is the ascending array of distinct
        rows of ``Xt`` taking part in a kept pair; ``piis`` and ``pjjs`` are
        positions into ``ridx``, so ``ridx[piis[k]]`` and ``ridx[pjjs[k]]``
        are the positive and negative row of the k-th pair.
    """
    pos_rows = np.where(yt > 0)[0]
    neg_rows = np.where(yt < 1)[0]
    assert pos_rows.shape[0] > 0
    assert neg_rows.shape[0] > 0

    scores = Xt @ v.reshape(-1, 1)
    # margins[a, b] = score of a-th positive minus score of b-th negative.
    margins = scores[pos_rows].reshape(-1, 1) - scores[neg_rows].reshape(1, -1)

    # Flat positions of the smallest margins, mapped back to (a, b).
    hardest = np.argsort(margins, axis=None)[:max_tups]
    which_pos, which_neg = np.unravel_index(hardest, margins.shape)
    pps = pos_rows[which_pos]
    nns = neg_rows[which_neg]

    involved = pr.BitMap(pps).union(pr.BitMap(nns))
    ridx = np.array(involved)

    # Row of Xt -> position inside ridx (-1 for rows that are not involved).
    lookup_tab = np.zeros(Xt.shape[0], dtype="int") - 1
    lookup_tab[ridx] = np.arange(ridx.shape[0], dtype="int")
    piis = lookup_tab[pps]
    pjjs = lookup_tab[nns]

    # Round trip: X[ridx][piis] must address the same rows as X[pps].
    assert (ridx[piis] == pps).all()
    return ridx, piis, pjjs
def query(self, *, topk, mode, vector=None, exclude=None, startk=None, **kwargs):
    """Return the ``topk`` best-scoring images that are not excluded.

    Args:
        topk: number of images wanted; lowered to the number of candidates
            when fewer remain.
        mode: must be ``"dot"``.
        vector: query vector scored by dot product against the candidate
            vectors; when ``None`` the candidates get random scores.
        exclude: ``pr.BitMap`` of image ids to leave out (default: none).
        startk, **kwargs: accepted but not used here.

    Returns:
        A dict with ``"dbidxs"`` (chosen image ids), ``"nextstartk"``
        (``len(exclude)`` plus the number returned) and ``"activations"``
        (one single-row DataFrame per image with a fixed 0..224 box and the
        score). When no candidate remains, a pair of empty arrays is
        returned instead.

    NOTE(review): positions are mapped back to ids through the sorted
    candidate set, which presumably relies on ``vector_meta`` holding exactly
    one row per image in ascending ``dbidx`` order - confirm.
    """
    excluded = pr.BitMap([]) if exclude is None else exclude
    candidates = pr.BitMap(self.all_indices).difference(excluded)
    n_candidates = len(candidates)
    if n_candidates == 0:
        return np.array([]), np.array([])
    if n_candidates <= topk:
        topk = n_candidates

    assert mode == "dot"
    row_mask = self.vector_meta.dbidx.isin(candidates)
    candidate_vecs = self.vectors[row_mask]
    if vector is not None:
        all_scores = candidate_vecs @ vector.reshape(-1)
    else:
        all_scores = np.random.randn(candidate_vecs.shape[0])

    best = np.argsort(-all_scores)[:topk]
    ret = np.array(candidates)[best]
    best_scores = all_scores[best]

    assert ret.shape[0] == best_scores.shape[0]
    distinct = pr.BitMap(ret)
    assert len(distinct) == ret.shape[0]  # no repeats
    assert ret.shape[0] == topk  # return quantity asked, in theory could be less
    assert distinct.intersection_cardinality(excluded) == 0  # honor exclude request

    activations = [
        pd.DataFrame.from_records(
            [dict(x1=0, y1=0, x2=224, y2=224, dbidx=dbidx, score=sc)])
        for sc, dbidx in zip(best_scores, ret)
    ]
    return {
        "dbidxs": ret,
        "nextstartk": len(excluded) + ret.shape[0],
        "activations": activations,
    }
def __init__(self, index: AccessMethod):
    """Bind the object to ``index`` and start with empty bookkeeping."""
    self.index = index
    self.startk = 0
    # images returned from index (not necessarily seen yet)
    self.returned = pr.BitMap()
    self.label_db = LabelDB()
def query(self, *, topk, vector, exclude=None, startk=None, **kwargs):
    """Rank images by the best dot-product score among their stored vectors.

    Every row of ``self.vector_meta`` whose ``dbidx`` is not excluded is
    scored as ``row.vectors @ vector``. An image's score is the maximum over
    its rows, and the ``topk`` highest-scoring images are returned.

    Args:
        topk: maximum number of images to return.
        vector: query vector (flattened before use).
        exclude: ``pr.BitMap`` of image ids to leave out (default: none).
        startk, **kwargs: accepted but not used here.

    Returns:
        A dict with ``"dbidxs"`` (int array of chosen image ids, best
        first), ``"nextstartk"`` (fixed placeholder value 100) and
        ``"activations"`` (per chosen image, the metadata rows reaching that
        image's maximum score, with a ``score`` column added). When no image
        remains after exclusion, a pair of empty arrays is returned instead,
        as before.
    """
    if exclude is None:
        exclude = pr.BitMap([])
    included = pr.BitMap(self.all_indices).difference(exclude)
    if len(included) == 0:
        return np.array([]), np.array([])

    fullmeta = self.vector_meta[self.vector_meta.dbidx.isin(included)]
    query_vec = vector.reshape(-1)
    out_columns = ["x1", "y1", "x2", "y2", "dbidx", "score", "filename"]

    # Collect one entry per image that actually has vectors. Sizing the
    # arrays by len(included) left phantom slots (dbidx 0, score 0) for
    # images without vectors, which could be ranked into the result.
    frame_ids = []
    frame_scores = []
    activations = []
    for dbidx, frame_vec_meta in fullmeta.groupby("dbidx"):
        boxscs = np.array(
            [image_vector @ query_vec
             for image_vector in frame_vec_meta.vectors.values],
            dtype=float,
        )
        scored = frame_vec_meta.assign(score=boxscs)
        # Keep only the row(s) achieving this image's best score.
        activations.append(scored[scored.score == scored.score.max()][out_columns])
        frame_ids.append(dbidx)
        frame_scores.append(np.max(boxscs))

    dbidxs = np.array(frame_ids)
    dbscores = np.array(frame_scores, dtype=float)
    # Slicing already caps the result at the number of scored images.
    topkidx = np.argsort(-dbscores)[:topk]
    return {
        "dbidxs": dbidxs[topkidx].astype("int"),
        "nextstartk": 100,  # placeholder; no real estimate is computed
        "activations": [activations[idx] for idx in topkidx],
    }
def __init__(
    self,
    gdm: GlobalDataManager,
    dataset: SeesawDatasetManager,
    hdb: AccessMethod,
    params: SessionParams,
):
    """Set up a fresh session over ``hdb`` and log its creation.

    Stores the collaborators, resets all per-session bookkeeping, opens a new
    query on the index, builds the ``SeesawLoop`` driving it and finally
    records an ``"init"`` entry through ``self._log``.
    """
    # Collaborators handed in by the caller.
    self.gdm = gdm
    self.dataset = dataset
    self.params = params
    self.index = hdb

    # Per-session bookkeeping, all initially empty.
    self.acc_indices = []
    self.acc_activations = []
    self.seen = pr.BitMap([])
    self.accepted = pr.BitMap([])
    self.init_q = None
    self.timing = []
    self.image_timing = {}

    # Query state and the loop that operates on it.
    self.q = hdb.new_query()
    self.loop = SeesawLoop(self.gdm, self.q, params=self.params)

    self.action_log = []
    self._log("init")
def __init__(self, filepath, mode='r'):
    """Open a serialized bitmap file and load it into ``self.map``.

    The file stays open as ``self._fp``, rewound to offset 0, after its
    contents have been read.

    Args:
        filepath: path of the file holding the serialized bitmap.
        mode: ``'r'`` opens the file as ``'rb'``; ``'rw'`` opens it as
            ``'r+b'`` and, when the file does not exist yet, first creates
            it containing an empty serialized bitmap.

    Raises:
        ValueError: if ``mode`` is neither ``'r'`` nor ``'rw'``.
    """
    self._mode = mode
    try:
        fmode = {'r': 'rb', 'rw': 'r+b'}[self._mode]
    except KeyError:
        # The internal lookup failure is not useful to the caller.
        raise ValueError('invalid mode') from None

    if self._mode == 'rw' and not os.path.isfile(filepath):
        with open(filepath, 'wb') as fp:
            fp.write(roaring.BitMap().serialize())

    self._fp = open(filepath, fmode)
    try:
        buff = self._fp.read()
        self._fp.seek(0)
        self.map = roaring.BitMap.deserialize(buff)
    except Exception:
        # Construction failed, so nobody will close the handle for us.
        self._fp.close()
        raise
def get_metric_summary(res: BenchResult):
    """Summarise which positions of a benchmark session were accepted.

    All image entries of ``res.session.gdata`` are numbered consecutively, in
    iteration order across its groups; the positions of the entries for
    which ``is_image_accepted`` is true are reported as hits.

    Returns:
        A dict with ``hit_indices`` (array of hit positions), ``nseen``
        (number of image entries visited) and ``nimages``, ``ntotal`` and
        ``total_time`` copied from ``res``.
    """
    shown = [imdata for ent in res.session.gdata for imdata in ent]
    hits = [
        position
        for position, imdata in enumerate(shown)
        if is_image_accepted(imdata)
    ]

    hit_set = pr.BitMap(hits)
    assert len(hit_set) == len(hits)

    return {
        "hit_indices": np.array(hit_set),
        "nseen": len(shown),
        "nimages": res.nimages,
        "ntotal": res.ntotal,
        "total_time": res.total_time,
    }
def restrict_fine_grained(vec_meta, vec, indxs):
    """Keep only the vectors of the requested images and renumber ``dbidx``.

    Args:
        vec_meta: per-vector metadata with a ``dbidx`` column, one row per
            row of ``vec``.
        vec: the vectors, indexable by a boolean row mask.
        indxs: strictly increasing array of image ids to keep.

    Returns:
        ``(vec_meta, vec)`` restricted to rows whose ``dbidx`` is in
        ``indxs``. In the restricted metadata ``dbidx`` is replaced by the
        position of the old id inside ``indxs`` and the index is reset.
        When every row already belongs to ``indxs`` the inputs are returned
        unchanged (no renumbering, no completeness check).

    Raises:
        AssertionError: if the inputs disagree in length, ``indxs`` is not
            strictly increasing, or some requested id has no vector.
    """
    assert vec_meta.shape[0] == vec.shape[0]
    assert (indxs[1:] > indxs[:-1]).all(), "must be sorted"

    mask = vec_meta.dbidx.isin(pr.BitMap(indxs))
    if mask.all():
        return vec_meta, vec

    vec_meta = vec_meta[mask]
    vec = vec[mask]

    # Old id -> new id table. Size it from the requested ids, not from the
    # ids that survived the mask: the surviving ids are a subset of indxs,
    # and a requested id above the largest surviving one would otherwise
    # fail with a bare IndexError before the completeness assertion below.
    n_requested = indxs.shape[0]
    table_size = int(indxs.max()) + 1 if n_requested else 0
    lookup_table = np.zeros(table_size).astype("int") - 1
    if n_requested:
        lookup_table[indxs] = np.arange(n_requested, dtype="int")

    new_dbidx = lookup_table[vec_meta.dbidx]
    assert (new_dbidx >= 0).all()
    vec_meta = vec_meta.assign(
        dbidx=new_dbidx)  # this line shows up in profiler

    assert (
        vec_meta.dbidx.unique().shape[0] == indxs.shape[0]
    ), "missing fine-grained embedding for some of the indices requested"
    assert vec_meta.shape[0] == vec.shape[0]
    return vec_meta.reset_index(drop=True), vec
def get_nns(startk, topk):
    """Query the vector index, widening the search until enough is found.

    Uses ``self``, ``vector``, ``vec_meta`` and ``exclude`` from the
    enclosing scope. The index is asked for ``startk + deltak`` neighbours,
    with ``deltak`` starting at ``topk * 100`` and doubling each round,
    until the hits cover at least ``topk`` images outside ``exclude``.

    The search also stops once the request covers every row of
    ``vec_meta``: the returned ids index into that table, so asking for
    more is assumed unable to surface new images (previously the loop
    never ended in that situation). The caller may therefore receive hits
    covering fewer than ``topk`` new images.

    Returns:
        ``(vec_idxs, scores)`` from the last index query.
    """
    dbidx_values = vec_meta.dbidx.values
    n_vectors = len(dbidx_values)
    i = 0
    deltak = topk * 100
    while True:
        if i > 1:
            print(
                "warning, we are looping too much. adjust initial params?"
            )
        requested = startk + deltak
        vec_idxs, scores = self.vec_index.query(vector, top_k=requested)
        found_idxs = pr.BitMap(dbidx_values[vec_idxs])
        newidxs = found_idxs.difference(exclude)
        if len(newidxs) >= topk:
            break
        if requested >= n_vectors:
            # Index exhausted; widening further cannot help.
            break
        deltak = deltak * 2
        i += 1
    return vec_idxs, scores
def get_seen(self):
    """Return the keys of ``self.ldata`` as a ``pr.BitMap``."""
    seen_keys = self.ldata.keys()
    return pr.BitMap(seen_keys)
def _query_prelim(self, *, vector, topk, zoom_level, exclude=None, startk=None):
    """Find the vectors belonging to the ``topk`` best not-yet-excluded images.

    Vectors are ranked by score against ``vector``, through
    ``self.vec_index`` when it is set and by an exact dot product over
    ``self.vectors`` otherwise. The score of the ``topk``-th best new image
    becomes a cutoff, and every non-excluded vector scoring at least the
    cutoff is kept.

    Args:
        vector: query vector.
        topk: number of images wanted; lowered when fewer can be retrieved.
        zoom_level: accepted but not used here.
        exclude: ``pr.BitMap`` of image ids to leave out (default: none).
        startk: previous estimate of how many neighbours must be fetched;
            ``None`` is treated as 0.

    Returns:
        ``(vector_index_values, candidates, allscores, nextstartk)``: the
        ``vector_meta`` index values of the kept vectors, the ``pr.BitMap``
        of their image ids, the dataframe of all scored vectors
        (``dbidx``, ``score``) and the estimate for the next call's
        ``startk``. When nothing can be returned the result is
        ``([], [], [])``.

    NOTE(review): the empty result has three elements while the normal
    result has four; kept as is because callers are not visible here.
    """
    if exclude is None:
        exclude = pr.BitMap([])
    if startk is None:
        # The declared default used to fail in the arithmetic below.
        startk = 0

    included_dbidx = pr.BitMap(self.all_indices).difference(exclude)
    vec_meta = self.vector_meta

    if len(included_dbidx) == 0:
        print("no dbidx included")
        return [], [], []

    if len(included_dbidx) <= topk:
        topk = len(included_dbidx)

    # We want proposals only for images not seen yet, but the index library
    # cannot filter, so guess how many neighbours are needed and check.
    def get_nns(startk, topk):
        dbidx_values = vec_meta.dbidx.values
        n_vectors = len(dbidx_values)
        i = 0
        deltak = topk * 100
        while True:
            if i > 1:
                print(
                    "warning, we are looping too much. adjust initial params?"
                )
            requested = startk + deltak
            vec_idxs, scores = self.vec_index.query(vector, top_k=requested)
            found_idxs = pr.BitMap(dbidx_values[vec_idxs])
            newidxs = found_idxs.difference(exclude)
            if len(newidxs) >= topk:
                break
            if requested >= n_vectors:
                # The request already covers every vector; asking for more
                # is assumed unable to surface new images, and without this
                # stop the loop never ended.
                break
            deltak = deltak * 2
            i += 1
        return vec_idxs, scores

    def get_nns_by_vector_exact():
        scores = self.vectors @ vector.reshape(-1)
        vec_idxs = np.argsort(-scores)
        return vec_idxs, scores[vec_idxs]

    if self.vec_index is not None:
        idxs, scores = get_nns(startk, topk)
    else:
        idxs, scores = get_nns_by_vector_exact()

    # work only with the two columns here bc dataframe can be large
    topscores = vec_meta[["dbidx"]].iloc[idxs]
    topscores = topscores.assign(score=scores)
    allscores = topscores

    newtopscores = topscores[~topscores.dbidx.isin(exclude)]
    scoresbydbidx = (newtopscores.groupby("dbidx").score.max().sort_values(
        ascending=False))

    # Fewer new images than requested may be retrievable (e.g. included
    # images without vectors); return what exists instead of indexing past
    # the end.
    n_available = scoresbydbidx.shape[0]
    if n_available == 0:
        return [], [], []
    if n_available < topk:
        topk = n_available

    score_cutoff = scoresbydbidx.iloc[topk - 1]  # kth largest score
    newtopscores = newtopscores[newtopscores.score >= score_cutoff]

    # Vectors (excluded or not) that had to be passed to reach the cutoff,
    # blended with the previous estimate to guess the next search depth.
    nextstartk = (allscores.score >= score_cutoff).sum()
    nextstartk = math.ceil(startk * 0.8 + nextstartk * 0.2)

    candidates = pr.BitMap(newtopscores.dbidx)
    assert len(candidates) >= topk
    assert candidates.intersection_cardinality(exclude) == 0
    return newtopscores.index.values, candidates, allscores, nextstartk