def get_encoder(verbLev=0):
    # builds a BPE encoder from local encoder.json / vocab.bpe and cached distribution & token-length stats
    path = os.path.dirname(os.path.realpath(__file__))
    with open(path + '/encoder.json', 'r') as f:
        encoder = json.load(f)
    with open(path + '/vocab.bpe', 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    distribution = r_pickle(path + '/enc.dist')
    tok_len = r_pickle(path + '/enc.tLen')
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    enc = BPEncoder(
        encoder=        encoder,
        bpe_merges=     bpe_merges,
        distribution=   distribution,
        tok_len=        tok_len)
    if verbLev > 0:
        print('\nGot BPE Encoder (%s)' % path)
        print(' > encoder length', len(enc.decoder))
        print(' > distribution', type(distribution))
        print(' > tLen', type(tok_len))
    return enc
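# Usage sketch (illustrative, not part of the original module): load the encoder with
# verbose reporting and check the vocabulary size via the 'decoder' dict used above;
# further encode/decode calls depend on the BPEncoder API and are not assumed here.
if __name__ == '__main__':
    enc = get_encoder(verbLev=1)
    print('vocabulary size:', len(enc.decoder))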
def load(self, save_dir: str):
    # loads saved results, falling back to the backup copy if the main file is missing or corrupted
    try:
        obj = r_pickle(self.__get_srl_path(save_dir))
    except Exception:
        obj = r_pickle(self.__get_srl_backup_path(save_dir))
    self.paspa = obj.paspa
    self.__np_smooth = obj.__np_smooth
    self.plot_axes = obj.plot_axes
    self.__srL = obj.__srL
    self.__avg_dst = obj.__avg_dst
def get_test_batch(
        size :int,          # batch size
        mcs :int,           # n montecarlo samples
        with_ASC=   True):  # with all seven cards (dict)

    fn = '_cache/s%d_m%d.batch' % (size, mcs)
    test_batch = lim.r_pickle(fn)
    if test_batch:
        print('\nGot test batch from file: %s' % fn)
    else:
        print('\nPreparing test batch (%d,%d)...' % (size, mcs))
        test_batch = prep2X7Batch(
            bs=         size,
            n_monte=    mcs,
            asc=        ASC('_cache/asc.dict') if with_ASC else None,
            verb=       1)
        lim.w_pickle(test_batch, fn)

    c_tuples = []
    for ix in range(size):
        c_tuples.append(tuple(sorted(test_batch['cA'][ix])))
        c_tuples.append(tuple(sorted(test_batch['cB'][ix])))
    print('Got %d hands in test_batch' % len(c_tuples))
    c_tuples = dict.fromkeys(c_tuples, 1)
    print('of which %d are unique' % len(c_tuples))

    return test_batch, c_tuples
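# Usage sketch (the batch size and number of Monte Carlo samples are illustrative):
# builds the test batch (or loads it from the cache file) and reports unique hands.
test_batch, c_tuples = get_test_batch(size=2000, mcs=100000, with_ASC=True)
print('unique hands in the test batch:', len(c_tuples))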
def save(self, folder: str):
    # keeps a backup copy of the previous save before overwriting it
    old_res = r_pickle(self.__get_srl_path(folder))
    if old_res:
        w_pickle(old_res, self.__get_srl_backup_path(folder))
    w_pickle(self, self.__get_srl_path(folder))
    self.plot(folder=folder)
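# Minimal self-contained sketch of the backup-then-overwrite pattern used by save()/load()
# above, written with plain pickle; the 'results.pkl' / 'results.backup.pkl' file names
# are assumptions for illustration, not the class' actual file layout.
import os
import pickle

def safe_save(obj, folder: str):
    main_fp = os.path.join(folder, 'results.pkl')            # assumed file name
    backup_fp = os.path.join(folder, 'results.backup.pkl')   # assumed file name
    if os.path.isfile(main_fp):
        os.replace(main_fp, backup_fp)  # previous save becomes the backup
    with open(main_fp, 'wb') as f:
        pickle.dump(obj, f)

def safe_load(folder: str):
    main_fp = os.path.join(folder, 'results.pkl')
    backup_fp = os.path.join(folder, 'results.backup.pkl')
    try:
        with open(main_fp, 'rb') as f:
            return pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError):
        with open(backup_fp, 'rb') as f:  # fall back to the backup copy
            return pickle.load(f)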
def __init__(self, file_FP: str, use_QMP=True):
    super().__init__()

    print('\nReading ASC dict cache...')
    as_cards = lM.r_pickle(file_FP)
    if as_cards:
        print(' > using cached ASC dict')
    else:
        print(' > cache not found, building all seven cards rank dict...')
        as_cards = {}
        comb_list = list(itertools.combinations([x for x in range(52)], 7))  # all 7-card combinations of 52 cards

        if use_QMP:
            # worker function: ranks a chunk of 7-card combinations
            def iPF(task):
                tv = []
                for t in task:
                    tv.append((t, PDeck.cards_rank(t)[1]))
                return tv

            dqmp = DeQueMP(func=iPF, user_tasks=True, verb=1)
            np = 0      # number of tasks put to the multiprocessing queue
            tcmb = []
            for cmb in comb_list:
                tcmb.append(cmb)
                if len(tcmb) > 10000:
                    dqmp.put_task({'task': tcmb})
                    tcmb = []
                    np += 1
            if tcmb:
                dqmp.put_task({'task': tcmb})
                np += 1
            for _ in tqdm(range(np)):
                res = dqmp.get_result()
                for r in res:
                    as_cards[r[0]] = r[1]
            dqmp.close()
        else:
            as_cards = {cmb: PDeck.cards_rank(cmb)[1] for cmb in tqdm(comb_list)}

        lM.w_pickle(as_cards, file_FP)

    self.update(as_cards)
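# Usage sketch (the cache path is an assumption matching the one used in get_test_batch):
# ASC behaves like a dict mapping a 7-card tuple (ints 0..51) to its rank.
asc = ASC('_cache/asc.dict')
some_hand = (0, 1, 2, 3, 4, 5, 6)  # illustrative 7-card combination
print('rank of hand:', asc[some_hand])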
def put_cn_ckpts(dmk_name: str):
    # copies pretrained cardNet encoder checkpoints into the DMK folder
    nestarter(log_folder=None, devices=False, verb=0)
    dmk_FD = f'{DMK_MODELS_FD}/{dmk_name}/'
    file_mdict = r_pickle(f'{dmk_FD}mdict.dct')
    c_embW = file_mdict['c_embW']
    cn_name = get_cardNet_name(c_embW)
    cardNet_FD = f'{CN_MODELS_FD}/{cn_name}/'
    if not os.path.isdir(cardNet_FD):
        return False  # there is no cardNet for this dmk
    mrg_ckpts(
        ckptA=          'enc_vars',
        ckptA_FD=       cardNet_FD,
        ckptB=          None,
        ckptB_FD=       None,
        ckptM=          'enc_vars',
        ckptM_FD=       dmk_FD,
        replace_scope=  dmk_name,
        verb=           0)
    return True
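# Usage sketch (the DMK name is illustrative): True means the cardNet encoder variables
# were merged into the DMK checkpoint folder, False means no matching pretrained
# cardNet folder was found.
if put_cn_ckpts('dmk_example'):
    print('cardNet encoder checkpoints merged into the DMK folder')
else:
    print('no pretrained cardNet found for this DMK')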
def __load(self):
    if os.path.isfile(self.__obj_FN()):
        return r_pickle(self.__obj_FN(), obj_type=ParaDict)
    return {}
def __init__(
        self,
        fName=      None,   # vec filename
        useOOV=     True,   # add vector for OOV words (random)
        usePAD=     True,   # add vector for padding purposes (zeroes)
        verb=       0):

    self.verbLev = verb
    self._useOOV = useOOV
    self._usePAD = usePAD
    if self.verbLev > 0:
        print('\nFTVec inits (useOOV %s, usePAD %s)' % (self._useOOV, self._usePAD))

    if fName is None:
        if self.verbLev > 0:
            print('using default VEC file: %s' % FTVec.DEF_VEC)
        path = os.path.dirname(os.path.realpath(__file__))
        fName = path + '/' + FTVec.DEF_VEC
        assert os.path.isfile(fName[:-4] + '.dicts'), 'ERR: default .dicts for VEC file does not exist!'

    pickleDictFN = fName[:-4] + '.dicts'
    pickle = lim.r_pickle(pickleDictFN)
    if pickle:
        vec, vecSTI, vecITS = pickle
        if self.verbLev:
            print(' > got VEC from .dict file')

    # read VEC and save .dict
    else:
        if self.verbLev:
            print(' > builds VEC...')
        with open(fName, 'r') as file:
            fileLines = [line[:-1] for line in file]
        vec = []
        vecSTI = {}
        vecITS = {}
        ix = 0
        if len(fileLines[0].split()) == 2:
            fileLines = fileLines[1:]  # trim first (header) line of the FT .vec file
        for line in fileLines:
            split = line.split()
            if len(split) == 301:  # 300-dim vector + word >> skips words with spaces
                vec.append([float(x) for x in split[1:]])
                vecITS[ix] = split[0]
                vecSTI[split[0]] = ix
                ix += 1
        if 'not' in vecSTI and 'n\'t' not in vecSTI:
            vecSTI['n\'t'] = vecSTI['not']  # add n't to dictionary
        vecWidth = len(vec[0])
        oovVEC = np.random.normal(loc=0.0, scale=0.1, size=[1, vecWidth]).astype(dtype=np.float16)
        padVEC = np.zeros(shape=[1, vecWidth], dtype=np.float16)
        vec = np.asarray(vec, dtype=np.float16)
        vec = np.concatenate([vec, oovVEC, padVEC])
        lim.w_pickle((vec, vecSTI, vecITS), pickleDictFN)

    self._vecARR = vec
    self._vecSTI = vecSTI   # string to int
    self._vecITS = vecITS   # int to string
    self.vecNum = self._vecARR.shape[0]  # num of vectors
    self.vecLen = self._vecARR.shape[1]  # vector length
    self.oovID = self.vecNum - 2 if self._useOOV else None
    self.padID = self.vecNum - 1 if self._usePAD else None
    if self.verbLev:
        print(' > FTVec got %d vec of width %d' % (self.vecNum, self.vecLen))
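# Usage sketch (illustrative): builds FTVec from the default .vec file (or its cached
# .dicts) and inspects the public attributes set in __init__; word-to-id lookups beyond
# these attributes depend on the rest of the FTVec API and are not assumed here.
ftv = FTVec(verb=1)
print('vectors: %d, width: %d' % (ftv.vecNum, ftv.vecLen))
print('OOV id: %s, PAD id: %s' % (ftv.oovID, ftv.padID))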
def __init__(
        self,
        uDD: UDD=           None,   # for None uses ONLY cache
        # preparation settings
        translate_labels=   True,   # translates labels to ints (may be False for labels already given as ints)
        merge_multisen=     False,  # merges uDD multi-sen into one sequence
        seed=               12321,  # seed for random shuffle of data (for data distribution)
        vl_split=           0.1,    # VL data split (from TR)
        ts_split=           0.1,    # TS data split (from TR)
        cache_file=         None,   # path to cache file
        # TODO: mergeAB= None, # token int, merges tokens sequences into one (with token separator)
        verb=               0):

    self.verb = verb
    if self.verb > 0: print('\n*** DVCdata *** initializes...')

    if cache_file and os.path.isfile(cache_file):
        if self.verb > 0: print(' > loading the data from cache (%s)...' % cache_file)
        uDD, lbl_dictL = r_pickle(cache_file)  # load already preprocessed uDD

    # preprocess uDD and save cache
    else:
        if self.verb > 0: print(' > preprocessing of uDD...')

        # gather all texts
        all_texts = []
        for key in uDD.keys():
            if 'sen' in key and len(uDD[key]):  # every key of sentences
                for ls in uDD[key]:             # every list from tuple
                    all_texts += ls

        # prepare lbl_dictL [list (per classifier) of label dictionaries that translate each label to int <0,n-1>]
        if not translate_labels:
            max_L = [max(c_lab) for c_lab in uDD['TRlbl']]
            lbl_dictL = [{x: x for x in range(mx + 1)} for mx in max_L]
        else:
            lbl_dictL = [
                {lab: x for x, lab in enumerate(sorted(set(c_lab)))}
                for c_lab in uDD['TRlbl']] if uDD['TRlbl'] else None

        # and translate
        for key in uDD.keys():
            if 'lbl' in key:  # label key
                if uDD[key]:
                    trans_labels = []
                    for cix in range(len(lbl_dictL)):
                        trans_labels.append([lbl_dictL[cix][lbl] for lbl in uDD[key][cix]])
                    uDD[key] = trans_labels

        if cache_file: w_pickle((uDD, lbl_dictL), cache_file)  # save preprocessed uDD

    self.uDD = uDD
    self.lbl_dictL = lbl_dictL

    self.uDD_size = data_size(uDD)  # data length
    if self.verb > 0:
        for PT in DVCData.DATA_PARTS:
            print(' >> got %s data of %d samples' % (PT, self.uDD_size[PT]))

    self.vl_split = vl_split
    self.ts_split = ts_split
    if self.uDD_size['VL']: self.vl_split = 0  # in case of explicitly given VL do not split it from train
    if self.uDD_size['TS']: self.ts_split = 0  # in case of explicitly given TS do not split it from train

    # resolve present data types
    self.got_sen = False  # got sentences
    self.got_vec = False  # got vector
    self.got_tks = False  # got tokens sequence
    self.got_seq = False  # got vectors sequence
    for PT in DVCData.DATA_PARTS:
        for tp in DVCData.DATA_TYPES:
            if tp != 'lbl':
                if PT + tp in uDD:
                    if tp == 'sen' and uDD[PT + tp]: self.got_sen = True
                    if tp == 'vec' and uDD[PT + tp]: self.got_vec = True
                    if tp == 'tks' and uDD[PT + tp]: self.got_tks = True
                    if tp == 'seq' and uDD[PT + tp]: self.got_seq = True
    if self.verb > 0:
        print(' > data types present:')
        if self.got_sen: print(' >> sen (sentences)')
        if self.got_vec: print(' >> vec (vector)')
        if self.got_tks: print(' >> tks (sequence of tokens)')
        if self.got_seq: print(' >> seq (sequence of vectors)')

    # resolve multi_sen
    self.multi_sen = 0
    for key in uDD.keys():
        if type(uDD[key]) is tuple:
            if len(uDD[key]) > 0:
                self.multi_sen = len(uDD[key])
                break
    if self.verb > 0: print(' > data multi-sen (tuple len): %d' % self.multi_sen)

    # merge multi-sen
    if self.multi_sen > 1 and merge_multisen:
        if self.verb > 0: print(' > merging multi-sen...')
        for key in uDD.keys():
            if key != 'sen' and type(uDD[key]) is tuple:
                # tuple with list of multi-sen samples concatenated over time(0) axis
                uDD[key] = ([
                    np.concatenate([uDD[key][eix][six] for eix in range(len(uDD[key]))], axis=0)
                    for six in range(len(uDD[key][0]))],)
        self.multi_sen = 1

    # report labels distribution
    if self.verb > 0 and self.lbl_dictL:
        inv_parts = ['TR', 'TS']
        for PT in inv_parts:
            if uDD[PT + 'lbl']:
                print(' > got %s labels of %d classifiers with distribution:' % (PT, len(self.lbl_dictL)))
                for cix in range(len(self.lbl_dictL)):
                    print(' >> classifier %d' % cix)
                    clD = self.lbl_dictL[cix]
                    inv_clD = {clD[key]: key for key in clD}  # inverted dictionary of classifier labels
                    labDist = {lab: 0 for lab in sorted(list(clD.keys()))}
                    for lab in uDD[PT + 'lbl'][cix]:
                        labDist[inv_clD[lab]] += 1
                    n_labels = len(uDD[PT + 'lbl'][cix])
                    for lab in sorted(list(clD.keys())):
                        labDist[lab] = labDist[lab] / n_labels * 100
                        print(' >>> label: %d - (%.1f%%) [original label: %s]' % (clD[lab], labDist[lab], lab))

    self.tdata = None
    self.data_dist_seed = None
    self.new_data_distribution(seed)
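# Usage sketch (the cache path is an assumption): with uDD=None the data is taken ONLY
# from an already preprocessed cache file; seed controls the data distribution shuffle.
dvc_data = DVCData(
    uDD=        None,
    cache_file= '_cache/dvc_data.cache',  # assumed cache file path
    seed=       12321,
    verb=       1)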