def add_perm(self, rphrase, rperm):
    """Register permutation `rperm` under the already-known phrase `rphrase`.

    Nothing happens when `rperm` is already recorded for that phrase.
    Otherwise the permutation receives the next free index, the per-phrase,
    per-permutation and per-element bookkeeping is updated, and one record
    built from the concatenated element bit vectors is added to the
    bitvecdb handle.

    Raises:
        AssertionError: if `rphrase` is unknown, or if `rperm` is already in
            the stored permutation list without being linked to this phrase
            (that situation is not dealt with yet).
    """
    phrase_idx = self.__d_rphrase_to_iphrase.get(rphrase, -1)
    assert phrase_idx != -1, ('Error. add_perm received for unknown rphrase: '
                              + str(rphrase))

    known_idx = self.__d_rperm_to_iperm.get(rperm, -1)
    if known_idx != -1 and known_idx in self.__ll_iperms[phrase_idx]:
        # Already registered for this very phrase: nothing left to do.
        return

    new_idx = len(self.__l_rperms)
    if rperm in self.__l_rperms:
        print('Error. rperm', rperm,
              'passed to cl_bitvec_db already in __l_rperms')
        assert False, 'not dealt with yet'

    # The new index is the position the permutation takes in __l_rperms.
    self.__l_rperms.append(rperm)
    self.__d_rperm_to_iperm[rperm] = new_idx
    self.__ll_iperms[phrase_idx].append(new_idx)
    self.__l_perm_iphrase.append(phrase_idx)

    eids = self.__phraseperms.get_perm_eids(rperm)
    num_els = len(eids)
    self.__l_perm_len.append(num_els)
    self.__max_perm_len = max(self.__max_perm_len, num_els)

    bits = []
    for eid in eids:
        bits.extend(self.__el_nlb_mgr.get_bin_by_id(eid))
        # Remember which permutations contain this element, then notify the
        # notifier object about the element.
        self.__d_iel_to_l_iperm.setdefault(eid, set()).add(new_idx)
        self.__nlb_mgr_notifier.notify_on_iel(eid)

    bitvecdb.add_rec(self.__hcbdb, num_els,
                     utils.convert_charvec_to_arr(bits))
def get_el_hd_recs(self, pos, hd, el, num_cands, cands_arr):
    """Ask bitvecdb which of the candidate records match element `el` at `pos`.

    Args:
        pos: forwarded unchanged to bitvecdb.
        hd: forwarded unchanged to bitvecdb (presumably a Hamming-distance
            limit - confirm against the bitvecdb API).
        el: element whose bit vector is fetched with `get_el_bin`.
        num_cands: number of candidates; also the size of the result buffer.
        cands_arr: candidate array handed to bitvecdb as-is.

    Returns:
        Tuple `(num_ret, irec_arr)`: the count reported by bitvecdb and the
        int array of size `num_cands` that was passed as its output buffer.
    """
    el_bits_arr = utils.convert_charvec_to_arr(
        self.__el_nlb_mgr.get_el_bin(el))
    irec_arr = bitvecdb.intArray(num_cands)
    num_ret = bitvecdb.get_el_hd_recs_by_list(
        self.__hcbdb, irec_arr, cands_arr, num_cands, pos, hd, el_bits_arr)
    return num_ret, irec_arr
def get_close_recs(self, idb, plen, hd_thresh, l_qbits):
    """Filter the records of length `plen` down to those bitvecdb reports.

    The candidate list comes from `get_plen_irecs(idb, plen)`; it is then
    passed to `bitvecdb.get_thresh_recs_by_list` together with the
    thresholds `hd_thresh` and the query bits `l_qbits`.

    Returns:
        Tuple `(num_ret, ret_arr)`: the count reported by bitvecdb and the
        int array used as its output buffer (sized to the candidate count).
    """
    num_cands, cands_arr = self.get_plen_irecs(idb, plen)
    ret_arr = bitvecdb.intArray(num_cands)
    thresh_arr = utils.convert_intvec_to_arr(hd_thresh)
    qbits_arr = utils.convert_charvec_to_arr(l_qbits)
    num_ret = bitvecdb.get_thresh_recs_by_list(
        self.__hcbdb, ret_arr, plen, thresh_arr, cands_arr, num_cands,
        qbits_arr)
    return num_ret, ret_arr
def get_rperms_with_eid_at(self, idb, eid, pos, num_cands, cands_arr):
    """Ask bitvecdb which candidate records hold element `eid` at `pos`.

    Args:
        idb: forwarded unchanged to bitvecdb.
        eid: element id whose bit vector is fetched with `get_bin_by_id`.
        pos: forwarded unchanged to bitvecdb.
        num_cands: number of candidates; also the size of the result buffer.
        cands_arr: candidate array handed to bitvecdb as-is.

    Returns:
        Tuple `(num_ret, irec_arr)`: the count reported by bitvecdb and the
        int array of size `num_cands` that was passed as its output buffer.
    """
    # NOTE(review): `bitvec_size` is not a local or an attribute here; it is
    # assumed to be a module-level name - confirm it exists in this file.
    el_bits_arr = utils.convert_charvec_to_arr(
        self.__el_nlb_mgr.get_bin_by_id(eid), bitvec_size)
    irec_arr = bitvecdb.intArray(num_cands)
    num_ret = bitvecdb.get_irecs_with_eid_by_list(
        self.__hcbdb, irec_arr, idb, pos, cands_arr, num_cands, el_bits_arr)
    return num_ret, irec_arr
def get_cluster(self, l_phrase_bits):
    """Return the centroid record indices bitvecdb reports for a phrase.

    Args:
        l_phrase_bits: flat sequence of phrase bits; its length is divided
            by the per-element bit-vector size to get the phrase length.

    Returns:
        List with the first `num_ret` entries of the result buffer filled by
        `bitvecdb.get_thresh_recs` on the centroid db handle.
    """
    # Floor division: the phrase length is an element count and must stay an
    # int. True division would produce a float on Python 3.
    plen = len(l_phrase_bits) // self.__bitvec_size
    # Each cent is an array of el bitvecs; each cent also has an array of hd,
    # one for each el.
    num_recs = len(self.__l_cent_hd)
    ret_arr = bitvecdb.intArray(num_recs)
    null_arr = bitvecdb.intArray(0)
    num_ret = bitvecdb.get_thresh_recs(
        self.__hcdb_cent, ret_arr, plen, null_arr,
        utils.convert_charvec_to_arr(l_phrase_bits), False, True)
    l_rcents = [ret_arr[i] for i in range(num_ret)]
    return l_rcents
def get_rec_rule_names(self, nd_cent, hd_thresh, plen, num_recs, l_rule_names):
    """Map the records bitvecdb returns for a centroid to rule names.

    Args:
        nd_cent: centroid; must provide `tolist()`.
        hd_thresh: thresholds, converted to an int array for bitvecdb.
        plen: forwarded unchanged to bitvecdb.
        num_recs: size of the result buffer handed to bitvecdb.
        l_rule_names: sequence indexed by rphrase.

    Returns:
        One entry of `l_rule_names` per returned record, in the order
        bitvecdb reported them.
    """
    iperm_arr = bitvecdb.intArray(num_recs)
    cent_arr = utils.convert_charvec_to_arr(nd_cent.tolist())
    hd_arr = utils.convert_intvec_to_arr(hd_thresh)
    num_ret = bitvecdb.get_cluster(
        self.__hcbdb, iperm_arr, num_recs, cent_arr, plen, hd_arr)

    l_ret = []
    for iret in range(num_ret):
        # record index -> owning phrase index -> rphrase -> rule name
        iphrase = self.__l_perm_iphrase[iperm_arr[iret]]
        rphrase = self.__l_phrase_rphrases[iphrase]
        l_ret.append(l_rule_names[rphrase])
    return l_ret
def iel_bitvec_changed(self, iel, bitvec):
    """Rebuild the bitvecdb record of every permutation that contains `iel`.

    Args:
        iel: id of the element whose bit vector changed.
        bitvec: not read here; the current bit vectors are re-fetched from
            the element manager for every element of each permutation.
    """
    for iperm in self.__d_iel_to_l_iperm.get(iel, ()):
        perm_eids = self.__phraseperms.get_perm_eids(self.__l_rperms[iperm])
        perm_bits = []
        for eid in perm_eids:
            perm_bits.extend(self.__el_nlb_mgr.get_bin_by_id(eid))
        bitvecdb.change_rec(self.__hcbdb, len(perm_eids),
                            utils.convert_charvec_to_arr(perm_bits), iperm)
def get_irecs_with_eid(self, idb, eid, rphrase_src, l_rphrase_excl):
    """Collect the phrases owning the records bitvecdb returns for `eid`.

    Args:
        idb: forwarded unchanged to bitvecdb.
        eid: element id whose bit vector is fetched with `get_bin_by_id`.
        rphrase_src: phrase that is always left out of the result.
        l_rphrase_excl: further phrases to leave out; any iterable, or None
            for no additional exclusions.

    Returns:
        Set of rphrases owning at least one returned record, without
        `rphrase_src` and without the entries of `l_rphrase_excl`.
    """
    bufsize = len(self.__l_rperms)
    irec_arr = bitvecdb.intArray(bufsize)
    el_bitvec = self.__el_nlb_mgr.get_bin_by_id(eid)
    # NOTE(review): `bitvec_size` is not a local or an attribute here; it is
    # assumed to be a module-level name - confirm it exists in this file.
    # The -1 sits where the by-list variant of this query takes a position;
    # presumably it means "any position" - confirm against bitvecdb.
    num_ret = bitvecdb.get_irecs_with_eid(
        self.__hcbdb, irec_arr, idb, -1,
        utils.convert_charvec_to_arr(el_bitvec, bitvec_size))

    # The exclusion list does not depend on the record being inspected, so
    # it is built once rather than once per returned record.
    l_rphrase_skip = [rphrase_src]
    if l_rphrase_excl is not None:
        l_rphrase_skip.extend(l_rphrase_excl)

    s_rphrases_close = set()
    for iret in range(num_ret):
        rphrase = self.__l_phrase_rphrases[
            self.__l_perm_iphrase[irec_arr[iret]]]
        if rphrase not in l_rphrase_skip:
            s_rphrases_close.add(rphrase)
    return s_rphrases_close
def get_closest_recs(self, k, phrase_eids, iskip, shrink=0):
    """Query bitvecdb for up to `k` records closest to the given phrase.

    Args:
        k: size of the result buffers handed to bitvecdb.
        phrase_eids: element ids of the query phrase; their bit vectors are
            concatenated to form the query.
        iskip: forwarded unchanged to bitvecdb.
        shrink: forwarded unchanged to bitvecdb.

    Returns:
        Tuple `(l_idexs_ret, l_hds_arr, nd_obits)`: the first `num_ret`
        entries of the two int result buffers as lists, and an int8 array of
        shape `(num_ret, bitvec_size)` built from the char result buffer.
    """
    ret_arr = bitvecdb.intArray(k)
    hds_arr = bitvecdb.intArray(k)
    # NOTE(review): `bitvec_size` is not a local or an attribute here; it is
    # assumed to be a module-level name - confirm it exists in this file.
    obits_arr = bitvecdb.charArray(k * bitvec_size)

    qdata = []
    for eid in phrase_eids:
        qdata.extend(self.__el_nlb_mgr.get_bin_by_id(eid))

    num_ret = bitvecdb.get_closest_recs(
        self.__hcbdb, k, ret_arr, hds_arr, obits_arr, len(phrase_eids),
        utils.convert_charvec_to_arr(qdata), iskip, shrink)

    l_idexs_ret = []
    l_hds_arr = []
    for ir in range(num_ret):
        l_idexs_ret.append(ret_arr[ir])
        l_hds_arr.append(hds_arr[ir])

    # The char buffer yields characters; ord() turns each into its int value.
    flat_obits = [ord(obits_arr[ib]) for ib in range(num_ret * bitvec_size)]
    nd_obits = np.array(flat_obits, dtype=np.int8).reshape(
        (num_ret, bitvec_size))
    return l_idexs_ret, l_hds_arr, nd_obits
def process_clusters(self):
    """Load every stored centroid into the centroid bitvecdb handle.

    Phrase lengths are visited from the highest index of
    `__ll_cent_hd_thresh` down to 0. For each centroid a debug line with the
    closest word per element is printed, the flattened centroid and its
    thresholds are appended to the in-memory lists, and the record plus its
    thresholds are registered with bitvecdb.
    """
    print('Cluster closest els:')
    el_size = self.__bitvec_size
    per_plen = list(enumerate(self.__ll_cent_hd_thresh))
    for plen, l_cent_hd_thresh in reversed(per_plen):
        for i_lencent, hd_thresh in enumerate(l_cent_hd_thresh):
            nd_centroid = self.__l_nd_centroids[plen][i_lencent]
            # Index the new record will get: the list length before appending.
            irec = len(self.__ll_centroids)

            # One slice of `el_size` bits per element of the phrase.
            close_phrase = [
                self.__nlb_mgr.dbg_closest_word(
                    nd_centroid[iel * el_size:(iel + 1) * el_size])
                for iel in range(plen)
            ]
            print('rcent:', irec, 'plen:', plen, ', hd:', hd_thresh, ',',
                  close_phrase)

            flat_centroid = np.reshape(nd_centroid, -1).tolist()
            self.__ll_centroids.append(flat_centroid)
            self.__l_cent_hd.append(hd_thresh)

            bitvecdb.add_rec(self.__hcdb_cent, plen,
                             utils.convert_charvec_to_arr(flat_centroid))
            bitvecdb.set_hd_thresh(self.__hcdb_cent, irec,
                                   utils.convert_intvec_to_arr(hd_thresh),
                                   len(hd_thresh))