Example #1
def get_encoder(verbLev=0):

    path = os.path.dirname(os.path.realpath(__file__))

    with open(path + '/encoder.json', 'r') as f:
        encoder = json.load(f)
    with open(path + '/vocab.bpe', 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    distribution = r_pickle(path + '/enc.dist')
    tok_len = r_pickle(path + '/enc.tLen')
    bpe_merges = [
        tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]
    ]

    enc = BPEncoder(encoder=encoder,
                    bpe_merges=bpe_merges,
                    distribution=distribution,
                    tok_len=tok_len)

    if verbLev > 0:
        print('\nGot BPE Encoder (%s)' % path)
        print(' > encoder length', len(enc.decoder))
        print(' > distribution', type(distribution))
        print(' > tLen', type(tok_len))

    return enc
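
All of the examples on this page rely on the `r_pickle` / `w_pickle` helpers from piteren/ptools, whose implementation is not shown here. The sketch below is only an illustration of the behaviour the examples assume (a falsy return value for a missing file, so checks like `if test_batch:` work, plus an optional `obj_type` check as used in Example #7); it is not the actual ptools code.

import os
import pickle

def r_pickle(file_path, obj_type=None):
    # return the unpickled object, or None when the file does not exist
    if not os.path.isfile(file_path):
        return None
    with open(file_path, 'rb') as f:
        obj = pickle.load(f)
    # optional type check (behaviour inferred from Example #7)
    if obj_type is not None:
        assert isinstance(obj, obj_type), 'unpickled object has unexpected type'
    return obj

def w_pickle(obj, file_path):
    # pickle the object to the given path
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)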
Example #2
File: hpmser.py Project: piteren/ptools
    def load(self, save_dir: str):
        try:
            obj = r_pickle(self.__get_srl_path(save_dir))
        except:
            # fall back to the backup file when the main results file cannot be read
            obj = r_pickle(self.__get_srl_backup_path(save_dir))
        self.paspa = obj.paspa
        self.__np_smooth = obj.__np_smooth
        self.plot_axes = obj.plot_axes
        self.__srL = obj.__srL
        self.__avg_dst = obj.__avg_dst
Example #3
def get_test_batch(
        size: int,          # batch size
        mcs: int,           # number of Monte Carlo samples
        with_ASC=True):     # with all-seven-cards (ASC) dict

    fn = '_cache/s%d_m%d.batch'%(size,mcs)
    test_batch = lim.r_pickle(fn)
    if test_batch: print('\nGot test batch from file: %s'%fn)
    else:
        print('\nPreparing test batch (%d,%d)...'%(size,mcs))
        test_batch = prep2X7Batch(
            bs=         size,
            n_monte=    mcs,
            asc=        ASC('_cache/asc.dict') if with_ASC else None,
            verb=       1)
        lim.w_pickle(test_batch, fn)
    c_tuples = []
    for ix in range(size):
        c_tuples.append(tuple(sorted(test_batch['cA'][ix])))
        c_tuples.append(tuple(sorted(test_batch['cB'][ix])))
    print('Got %d hands in test_batch' % len(c_tuples))
    c_tuples = dict.fromkeys(c_tuples, 1)
    print('of which %d are unique' % len(c_tuples))

    return test_batch, c_tuples
Example #4
File: hpmser.py Project: piteren/ptools
    def save(self, folder: str):

        # back up the previously saved result (if any)
        old_res = r_pickle(self.__get_srl_path(folder))
        if old_res: w_pickle(old_res, self.__get_srl_backup_path(folder))

        w_pickle(self, self.__get_srl_path(folder))
        self.plot(folder=folder)
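
Examples #2 and #4 together form a simple save-with-backup scheme: save() first re-pickles the previously stored result to a backup path and only then overwrites the main file, so load() can fall back to the backup whenever reading the main file raises an exception.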
Example #5
File: podeck.py Project: piteren/pypoks
    def __init__(self, file_FP: str, use_QMP=True):

        super().__init__()

        print('\nReading ASC dict cache...')
        as_cards = lM.r_pickle(file_FP)
        if as_cards: print(' > using cached ASC dict')
        else:
            print(' > cache not found, building all seven cards rank dict...')
            as_cards = {}
            comb_list = list(itertools.combinations(range(52), 7))  # all 7-card combinations from a 52-card deck

            if use_QMP:

                def iPF(task):
                    # worker function: rank every 7-card combination from the task
                    tv = []
                    for t in task:
                        tv.append((t, PDeck.cards_rank(t)[1]))
                    return tv

                dqmp = DeQueMP(func=iPF, user_tasks=True, verb=1)

                np = 0  # number of task packages sent to the workers
                tcmb = []
                for cmb in comb_list:
                    tcmb.append(cmb)
                    if len(tcmb) > 10000:
                        dqmp.put_task({'task': tcmb})
                        tcmb = []
                        np += 1
                if tcmb:
                    dqmp.put_task({'task': tcmb})
                    np += 1
                for _ in tqdm(range(np)):
                    res = dqmp.get_result()
                    for r in res:
                        as_cards[r[0]] = r[1]
                dqmp.close()

            else:
                as_cards = {
                    cmb: PDeck.cards_rank(cmb)[1]
                    for cmb in tqdm(comb_list)
                }

            lM.w_pickle(as_cards, file_FP)

        self.update(as_cards)
Example #6
def put_cn_ckpts(dmk_name: str):

    nestarter(log_folder=None, devices=False, verb=0)

    dmk_FD = f'{DMK_MODELS_FD}/{dmk_name}/'

    file_mdict = r_pickle(f'{dmk_FD}mdict.dct')
    c_embW = file_mdict['c_embW']
    cn_name = get_cardNet_name(c_embW)

    cardNet_FD = f'{CN_MODELS_FD}/{cn_name}/'

    if not os.path.isdir(cardNet_FD):
        return False  # there is no cardNet for this dmk

    mrg_ckpts(ckptA='enc_vars',
              ckptA_FD=cardNet_FD,
              ckptB=None,
              ckptB_FD=None,
              ckptM='enc_vars',
              ckptM_FD=dmk_FD,
              replace_scope=dmk_name,
              verb=0)
    return True
Example #7
    def __load(self):
        if os.path.isfile(self.__obj_FN()):
            return r_pickle(self.__obj_FN(), obj_type=ParaDict)
        return {}
Example #8
    def __init__(
            self,
            fName=None,  # vec filename
            useOOV=True,  # add vector for OOV words (random)
            usePAD=True,  # add vector for padding purposes (zeroes)
            verb=0):

        self.verbLev = verb

        self._useOOV = useOOV
        self._usePAD = usePAD

        if self.verbLev > 0:
            print('\nFTVec inits (useOOV %s, usePAD %s)' %
                  (self._useOOV, self._usePAD))

        if fName is None:
            if self.verbLev > 0:
                print('using default VEC file: %s' % FTVec.DEF_VEC)
            path = os.path.dirname(os.path.realpath(__file__))
            fName = path + '/' + FTVec.DEF_VEC
            assert os.path.isfile(fName[:-4] + '.dicts'), \
                'ERR: default .dicts for VEC file does not exist!'

        pickleDictFN = fName[:-4] + '.dicts'
        pickle = lim.r_pickle(pickleDictFN)
        if pickle:
            vec, vecSTI, vecITS = pickle
            if self.verbLev: print(' > got VEC from .dict file')
        # read VEC and save .dict
        else:
            if self.verbLev: print(' > builds VEC...')
            with open(fName, 'r') as file:
                fileLines = [line[:-1] for line in file]
            vec = []
            vecSTI = {}
            vecITS = {}
            ix = 0
            if len(fileLines[0].split()) == 2:
                fileLines = fileLines[1:]  # trim the header line (from FT .vec file)
            for line in fileLines:
                split = line.split()
                if len(split) == 301:  # keep only lines with one token + 300 floats (skips words containing spaces)
                    vec.append([float(x) for x in split[1:]])
                    vecITS[ix] = split[0]
                    vecSTI[split[0]] = ix
                    ix += 1

            if 'not' in vecSTI and 'n\'t' not in vecSTI:
                vecSTI['n\'t'] = vecSTI['not']  # add n't to dictionary

            vecWidth = len(vec[0])
            oovVEC = np.random.normal(loc=0.0, scale=0.1, size=[1, vecWidth]).astype(np.float16)
            padVEC = np.zeros(shape=[1, vecWidth], dtype=np.float16)
            vec = np.asarray(vec, dtype=np.float16)
            vec = np.concatenate([vec, oovVEC, padVEC])

            lim.w_pickle((vec, vecSTI, vecITS), pickleDictFN)

        self._vecARR = vec
        self._vecSTI = vecSTI  # string to int
        self._vecITS = vecITS  # int to string

        self.vecNum = self._vecARR.shape[0]  # num of vectors
        self.vecLen = self._vecARR.shape[1]  # vector length

        self.oovID = self.vecNum - 2 if self._useOOV else None
        self.padID = self.vecNum - 1 if self._usePAD else None
        if self.verbLev:
            print(' > FTVec got %d vec of width %d' %
                  (self.vecNum, self.vecLen))
Example #9
    def __init__(
        self,
        uDD: UDD = None,  # if None, the data is loaded ONLY from cache
        # preparation settings
        translate_labels=True,  # translate labels to ints (may be False for labels already given as ints)
        merge_multisen=False,  # merges uDD multi-sen into one sequence
        seed=12321,  # seed for random shuffle of data (for data distribution)
        vl_split=0.1,  # VL data split (from TR)
        ts_split=0.1,  # TS data split (from TR)
        cache_file=None,  # path to cache file
        # TODO: mergeAB=                None,   # token int, merges tokens sequences into one (with token separator)
        verb=0):

        self.verb = verb
        if self.verb > 0: print('\n*** DVCdata *** initializes...')

        if cache_file and os.path.isfile(cache_file):
            if self.verb > 0:
                print(' > loading the data from cache (%s)...' % cache_file)
            uDD, lbl_dictL = r_pickle(cache_file)  # load already preprocessed uDD
        # preprocess uDD and save cache
        else:
            if self.verb > 0: print(' > preprocessing of uDD...')

            # gather all texts
            all_texts = []
            for key in uDD.keys():
                if 'sen' in key and len(uDD[key]):  # every key of sentences
                    for ls in uDD[key]:  # every list from tuple
                        all_texts += ls

            # prepare lbl_dictL: one dictionary per classifier that translates each label to an int in <0, n-1>
            if not translate_labels:
                max_L = [max(c_lab) for c_lab in uDD['TRlbl']]
                lbl_dictL = [{x: x for x in range(mx + 1)} for mx in max_L]
            else:
                lbl_dictL = [
                    {lab: x for x, lab in enumerate(sorted(set(c_lab)))}
                    for c_lab in uDD['TRlbl']
                ] if uDD['TRlbl'] else None
            # and translate
            for key in uDD.keys():
                if 'lbl' in key:  # label key
                    if uDD[key]:
                        trans_labels = []
                        for cix in range(len(lbl_dictL)):
                            trans_labels.append(
                                [lbl_dictL[cix][lbl] for lbl in uDD[key][cix]])
                        uDD[key] = trans_labels

            if cache_file:
                w_pickle((uDD, lbl_dictL), cache_file)  # save preprocessed uDD

        self.uDD = uDD
        self.lbl_dictL = lbl_dictL

        self.uDD_size = data_size(uDD)  # data length
        if self.verb > 0:
            for PT in DVCData.DATA_PARTS:
                print(' >> got %s data of %d samples' %
                      (PT, self.uDD_size[PT]))

        self.vl_split = vl_split
        self.ts_split = ts_split
        if self.uDD_size['VL']:
            self.vl_split = 0  # in case of explicit given VL do not split it from train
        if self.uDD_size['TS']:
            self.ts_split = 0  # in case of explicit given TS do not split it from train

        # resolve present data types
        self.got_sen = False  # got sentences
        self.got_vec = False  # got vector
        self.got_tks = False  # got tokens sequence
        self.got_seq = False  # got vectors sequence
        for PT in DVCData.DATA_PARTS:
            for tp in DVCData.DATA_TYPES:
                if tp != 'lbl':
                    if PT + tp in uDD:
                        if tp == 'sen' and uDD[PT + tp]: self.got_sen = True
                        if tp == 'vec' and uDD[PT + tp]: self.got_vec = True
                        if tp == 'tks' and uDD[PT + tp]: self.got_tks = True
                        if tp == 'seq' and uDD[PT + tp]: self.got_seq = True
        if self.verb > 0:
            print(' > data types present:')
            if self.got_sen: print(' >> sen (sentences)')
            if self.got_vec: print(' >> vec (vector)')
            if self.got_tks: print(' >> tks (sequence of tokens)')
            if self.got_seq: print(' >> seq (sequence of vectors)')

        # resolve multi_sen
        self.multi_sen = 0
        for key in uDD.keys():
            if type(uDD[key]) is tuple:
                if len(uDD[key]) > 0:
                    self.multi_sen = len(uDD[key])
                    break
        if self.verb > 0:
            print(' > data multi-sen (tuple len): %d' % self.multi_sen)

        # merge multi-sen
        if self.multi_sen > 1 and merge_multisen:
            if self.verb > 0: print(' > merging multi-sen...')
            for key in uDD.keys():
                if key != 'sen' and type(uDD[key]) is tuple:
                    # tuple with a list of multi-sen samples concatenated over the time (0) axis
                    uDD[key] = ([
                        np.concatenate(
                            [uDD[key][eix][six] for eix in range(len(uDD[key]))],
                            axis=0)
                        for six in range(len(uDD[key][0]))
                    ],)
            self.multi_sen = 1

        # report labels distribution
        if self.verb > 0 and self.lbl_dictL:
            inv_parts = ['TR', 'TS']
            for PT in inv_parts:
                if uDD[PT + 'lbl']:
                    print(' > got %s labels of %d classifiers with distribution:' % (PT, len(self.lbl_dictL)))
                    for cix in range(len(self.lbl_dictL)):
                        print(' >> classifier %d' % cix)
                        clD = self.lbl_dictL[cix]
                        inv_clD = {clD[key]: key for key in clD}  # inverted dictionary of classifier labels
                        labDist = {lab: 0 for lab in sorted(clD.keys())}
                        for lab in uDD[PT + 'lbl'][cix]:
                            labDist[inv_clD[lab]] += 1
                        n_samples = len(uDD[PT + 'lbl'][cix])  # number of labels for this classifier
                        for lab in sorted(clD.keys()):
                            labDist[lab] = labDist[lab] / n_samples * 100
                            print(' >>> label: %d - (%.1f%%) [original label: %s]' % (clD[lab], labDist[lab], lab))

        self.tdata = None
        self.data_dist_seed = None
        self.new_data_distribution(seed)