Example #1
    def __init__(self, V, K, rank, doc_per_set, dir='./', alpha=0.01, beta=0.0001, word_partition=2000, set_name='t_saved',
                 test_doc='test_doc', silent=False, single=True):
        """ dir: indicates the root folder of each data folder, tmp file folder shall be created in here
            NOTICE: mask is incomplete if dis, manually add it
            NOTICE: digits in set name are used to ident the ndk nd and z"""
        # ******************************* store parameter *********************************************
        self.K = K
        self.V = V
        self.doc_per_set = doc_per_set
        self.suffix = time.strftime('_%m%d_%H%M%S', time.localtime()) + str(rank)
        self.data_dir = dir + str(rank) + '/'
        self.tmp_dir = dir + 'tmp' + self.suffix + '/'
        makedirs(self.tmp_dir)
        self.alpha = alpha
        self.beta = beta
        self.beta_bar = beta * V
        self.alpha_bar = alpha * K
        util_funcs.set_srand()
        self.set_name = set_name
        self.rank = rank
        self.bak_time = 0

        # ******************************* init the matrices *********************************************
        self.name = self.tmp_dir + 'nkw' + self.suffix + '.h5'
        self.node_name = 'nkw'
        nkw_file = h5py.File(self.name, 'w')
        self.nkw = nkw_file.create_dataset('nkw', (K, self.V), dtype='int32')
        self.nk = np.zeros(K, dtype=np.int32)

        self.mask = np.zeros(self.V, dtype=bool)
        self.test_doc = np.load(self.data_dir + test_doc + '.npy').tolist()

        # ******************************* init counts & mask ******************************************
        self.train_cts_set = []
        for file_name in listdir(self.data_dir):
            if self.set_name in file_name:
                self.train_cts_set.append((file_name, np.load(self.data_dir + file_name).tolist()))

        if single:
            ndk = np.zeros((self.doc_per_set, self.K), dtype=np.int32)
            nd = np.zeros(self.doc_per_set, dtype=np.int32)
            z = np.empty(self.doc_per_set, dtype=object)  # object array; every entry starts as None

            start = 0
            while start < self.V:
                end = min(start + word_partition, self.V)  # clamp the final chunk at V

                nkw_part = np.zeros((K, end - start), dtype=np.int32)
                self.init_cnts(nkw_part, ndk, nd, z, start, end, silent)
                self.nkw[:, start:end] = nkw_part  # flush this chunk to the on-disk dataset

                start = end
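
The loop above fills the on-disk nkw dataset one vocabulary chunk at a time, so only a (K, word_partition) slice is ever held in memory. A minimal, self-contained sketch of the same pattern; the file name and sizes are illustrative, not taken from the original code:

import h5py
import numpy as np

K, V, word_partition = 8, 5000, 2000              # illustrative sizes
with h5py.File('nkw_demo.h5', 'w') as f:
    nkw = f.create_dataset('nkw', (K, V), dtype='int32')
    start = 0
    while start < V:
        end = min(start + word_partition, V)      # clamp the final chunk
        part = np.zeros((K, end - start), dtype=np.int32)
        # ... fill `part` with counts for words [start, end) ...
        nkw[:, start:end] = part                  # flush the chunk to disk
        start = end
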
Example #2
    def __init__(self,
                 V,
                 K,
                 rank,
                 doc_per_set,
                 dir='./',
                 alpha=0.01,
                 beta=0.0001,
                 word_partition=2000,
                 set_name='t_saved',
                 test_doc='test_doc',
                 silent=False,
                 single=True):
        """ dir: indicates the root folder of each data folder, tmp file folder shall be created in here
            NOTICE: mask is incomplete if dis, manually add it
            NOTICE: digits in set name are used to ident the ndk nd and z"""
        # ******************************* store parameter *********************************************
        self.K = K
        self.V = V
        self.doc_per_set = doc_per_set
        self.suffix = time.strftime('_%m%d_%H%M%S',
                                    time.localtime()) + str(rank)
        self.data_dir = dir + str(rank) + '/'
        self.tmp_dir = dir + 'tmp' + self.suffix + '/'
        makedirs(self.tmp_dir)
        self.alpha = alpha
        self.beta = beta
        self.beta_bar = beta * V
        self.alpha_bar = alpha * K
        util_funcs.set_srand()
        self.set_name = set_name
        self.rank = rank
        self.bak_time = 0

        # ******************************* init the matrices *********************************************
        self.name = self.tmp_dir + 'nkw' + self.suffix + '.h5'
        self.node_name = 'nkw'
        nkw_file = h5py.File(self.name, 'w')
        self.nkw = nkw_file.create_dataset('nkw', (K, self.V), dtype='int32')
        self.nk = np.zeros(K, dtype=np.int32)

        self.mask = np.zeros(self.V, dtype=bool)
        self.test_doc = np.load(self.data_dir + test_doc + '.npy').tolist()

        # ******************************* init counts & mask ******************************************
        self.train_cts_set = []
        for file_name in listdir(self.data_dir):
            if self.set_name in file_name:
                self.train_cts_set.append(
                    (file_name, np.load(self.data_dir + file_name).tolist()))

        if single:
            ndk = np.zeros((self.doc_per_set, self.K), dtype=np.int32)
            nd = np.zeros(self.doc_per_set, dtype=np.int32)
            z = np.empty(self.doc_per_set, dtype=object)  # object array; every entry starts as None

            start = 0
            while start < self.V:
                end = min(start + word_partition, self.V)  # clamp the final chunk at V

                nkw_part = np.zeros((K, end - start), dtype=np.int32)
                self.init_cnts(nkw_part, ndk, nd, z, start, end, silent)
                self.nkw[:, start:end] = nkw_part  # flush this chunk to the on-disk dataset

                start = end
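
For reference, the ndk, nd and z buffers in the `single` branch follow the usual collapsed-Gibbs bookkeeping: z[d] holds one topic id per token of document d, ndk[d, k] counts how many of d's tokens sit in topic k, and nd[d] is d's length. init_cnts is not shown here, so the sketch below only illustrates that invariant with made-up assignments:

import numpy as np

K, doc_per_set = 4, 3
ndk = np.zeros((doc_per_set, K), dtype=np.int32)
nd = np.zeros(doc_per_set, dtype=np.int32)
z = np.empty(doc_per_set, dtype=object)           # one assignment array per doc

z[0] = np.array([0, 2, 2])                        # made-up topic ids per token
z[1] = np.array([1])
z[2] = np.array([3, 0])

for d in range(doc_per_set):
    ndk[d] = np.bincount(z[d], minlength=K)       # tokens of doc d per topic
    nd[d] = z[d].size                             # total tokens in doc d
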
Example #3
    def __init__(self, H, dir, rank, D, K, W, max_len, apprx, batch_size=50, alpha=0.01, beta=0.0001,
                 a=10**5.2, b=10**(-6), c=0.33, samples_per_update=50, test_doc='test_doc', suffix=None):
        # set the related parameters
        self.K = K
        self.batch_size = batch_size
        self.step_size_params = (a, b, c)
        self.samples_per_update = samples_per_update

        self.W = W
        self.D = D
        self.H = H

        self.alpha = alpha
        self.beta = beta
        self.beta_bar = beta * self.W
        self.alpha_bar = alpha * K

        self.update_ct = 0
        self.rank = rank
        if suffix is None:
            suffix = time.strftime('_%m%d_%H%M%S', time.localtime()) + '_' + str(rank)
        self.dir = dir
        self.data_dir = dir + str(rank) + '/'
        self.tmp_dir = dir + 'tmp' + suffix + '/'
        makedirs(self.tmp_dir)
        self.current_set = None
        self.batch_loc = [0, 0]
        self.time_bak = 0
        self.apprx = apprx

        # used to map between real w and the sliced cnts matrix in memory
        self.batch_map = np.zeros(self.W, dtype=np.int32)
        self.batch_map_4w = np.zeros(self.W, dtype=np.int32)
        self.w4_cnt = None
        util_funcs.set_srand()

        # allocate the file
        theta_file = h5py.File(self.tmp_dir + 'theta' + suffix + '.h5', 'w')
        self.theta = theta_file.create_dataset('theta', (K, self.W), dtype='float32')
        self.norm_const = np.zeros((self.K, 1), dtype=np.float32)

        start = 0
        while start < self.W:
            end = min(start + max_len, self.W)  # clamp the final chunk at W
            tmp = np.random.gamma(1, 1, (self.K, end - start))
            collect()
            self.theta[:, start:end] = tmp
            self.norm_const[:] += np.sum(tmp, 1)[:, np.newaxis]

            start = end
            tmp = None
            collect()  # release the chunk before loading the next one

        self.ndk = np.zeros((self.batch_size, K), dtype=np.int32)
        self.ndk_avg = np.zeros((self.batch_size, K), dtype=np.float32)
        self.nd = np.zeros(self.batch_size, dtype=np.int32)
        self.nk = np.zeros(K, dtype=np.int32)

        self.table_h = np.zeros(self.K, dtype=np.int32)
        self.table_l = np.zeros(self.K, dtype=np.int32)
        self.table_p = np.zeros(self.K, dtype=np.float32)
        self.samples = None
        with open(self.data_dir + test_doc, 'r') as f:
            self.test_doc = load(f)

        self.mask = np.ones(self.W, dtype=bool)

        self.iters_per_doc = 50
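
Because norm_const accumulates the row sums of theta, the topic-word matrix is recovered as phi = theta / norm_const (Example #4 spells this relation out in its comments). A small sketch of that normalisation under the same chunked access pattern; the in-memory theta here stands in for the class's on-disk dataset, and the sizes are illustrative:

import numpy as np

K, W, max_len = 4, 10, 3                          # illustrative sizes
theta = np.random.gamma(1, 1, (K, W)).astype(np.float32)
norm_const = theta.sum(axis=1, keepdims=True)     # row sums, shape (K, 1)

row_mass = np.zeros((K, 1), dtype=np.float32)
start = 0
while start < W:
    end = min(start + max_len, W)
    phi_chunk = theta[:, start:end] / norm_const  # slice of phi for words [start, end)
    row_mass += phi_chunk.sum(axis=1, keepdims=True)
    start = end
# row_mass is now ~1.0 for every topic, i.e. each row of phi is a distribution
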
Example #4
    def __init__(self, H, dir, rank, D, K, W, max_len, apprx, batch_size=50, alpha=0.01, beta=0.0001,
                 a=10**5.2, b=10**(-6), c=0.33, samples_per_update=50, test_doc='test_doc', suffix=None):
        """
        H: value sqrt(m)*sigma^(1+0.3)
        dir: indicates the root folder of each data folder, tmp file folder shall be created in here
        rank: indicate the subfolder where the docs exist
        D: the total docs in the local training set
        K: the num of topic
        W: the len of vocab
        max_len: maximum number of slices we can load into memory (ignore for 10708 prj)
        approx: a hack for counting the time (ignore for 10708 prj)
        samples_per_update = how many iterations needed to approximate the expectation term in SGLD update
        test_doc: file path of test documents
        time_bak: time used to load, correct with 1.5s; you need to set 0 every time you use it (ignore for 10708 prj)

        train_set: has form [ [[d],[d],[d],[d]], map[], mask[], flag, [maskd[], maskd[], maskd[], maskd[]] ]
        test_doc: [ [[w], [..], ...], [[test_w], [..], ..], mask[], map[] ]
        """

        # set the related parameters
        self.K = K
        self.batch_size = batch_size
        self.step_size_params = (a, b, c)
        self.samples_per_update = samples_per_update

        self.W = W
        self.D = D
        self.H = H

        self.alpha = alpha
        self.beta = beta
        self.beta_bar = beta * self.W
        self.alpha_bar = alpha * K

        self.update_ct = 0
        self.rank = rank
        if suffix is None:
            suffix = time.strftime('_%m%d_%H%M%S', time.localtime()) + '_' + str(rank)
        self.dir = dir
        self.data_dir = dir + str(rank) + '/'
        self.tmp_dir = dir + 'tmp' + suffix + '/'
        makedirs(self.tmp_dir)
        self.current_set = None
        self.batch_loc = [0, 0]
        self.time_bak = 0
        self.apprx = apprx

        # used to map between real w and the sliced cnts matrix in memory
        self.batch_map = np.zeros(self.W, dtype=np.int32)
        self.batch_map_4w = np.zeros(self.W, dtype=np.int32)
        self.w4_cnt = None
        util_funcs.set_srand()

        # allocate the file
        theta_file = h5py.File(self.tmp_dir + 'theta' + suffix + '.h5', 'w')
        self.theta = theta_file.create_dataset('theta', (K, self.W), dtype='float32')
        self.norm_const = np.zeros((self.K, 1), dtype=np.float32)

        # comments for the 10708 prj:
        # theta here is the T matrix referred to in the paper; what the paper calls theta is named g_theta
        # (global theta). theta is initialised one chunk at a time to save memory.
        # norm_const is the normalising constant of theta, so that phi (the topic-word matrix) = theta / norm_const
        start = 0
        while start < self.W:
            end = min(start + max_len, self.W)  # clamp the final chunk at W
            tmp = np.random.gamma(1, 1, (self.K, end - start))
            collect()
            self.theta[:, start:end] = tmp
            self.norm_const[:] += np.sum(tmp, 1)[:, np.newaxis]

            start = end
            tmp = None
            collect()  # release the chunk before loading the next one

        # ndk, nd and nk are count matrices similar to those used in vanilla collapsed Gibbs sampling; we use them
        # to accumulate counts within each minibatch
        self.ndk = np.zeros((self.batch_size, K), dtype=np.int32)
        self.ndk_avg = np.zeros((self.batch_size, K), dtype=np.float32)
        self.nd = np.zeros(self.batch_size, dtype=np.int32)
        self.nk = np.zeros(K, dtype=np.int32)

        # these are the alias tables for LightLDA's fast per-token sampling; see the summary in my paper for a
        # refresher
        self.table_h = np.zeros(self.K, dtype=np.int32)
        self.table_l = np.zeros(self.K, dtype=np.int32)
        self.table_p = np.zeros(self.K, dtype=np.float32)
        self.samples = None
        with open(self.data_dir + test_doc, 'r') as f:
            self.test_doc = load(f)

        self.mask = np.ones(self.W, dtype=bool)
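
table_h, table_l and table_p match the classic Walker alias-table layout behind LightLDA's O(1) per-token proposals: for each of the K bins, table_p holds the cut probability, table_l the bin's own (low-mass) outcome and table_h the donor (high-mass) outcome. The construction code is not shown in these examples, so the sketch below only assumes that layout:

import numpy as np

def build_alias(p):
    """Build a Walker alias table for a length-K distribution p."""
    K = p.size
    table_p = np.zeros(K, dtype=np.float32)
    table_l = np.zeros(K, dtype=np.int32)
    table_h = np.zeros(K, dtype=np.int32)
    scaled = p.astype(np.float64) / p.sum() * K   # mean bin mass is now 1
    low = [k for k in range(K) if scaled[k] < 1.0]
    high = [k for k in range(K) if scaled[k] >= 1.0]
    while low and high:
        l, h = low.pop(), high.pop()
        table_p[l], table_l[l], table_h[l] = scaled[l], l, h
        scaled[h] -= 1.0 - scaled[l]              # donor h tops up bin l
        (low if scaled[h] < 1.0 else high).append(h)
    for k in low + high:                          # leftovers are exactly full
        table_p[k], table_l[k], table_h[k] = 1.0, k, k
    return table_p, table_l, table_h

def sample_alias(table_p, table_l, table_h):
    k = np.random.randint(table_p.size)           # pick a bin uniformly
    if np.random.rand() < table_p[k]:
        return table_l[k]
    return table_h[k]

table_p, table_l, table_h = build_alias(np.array([0.5, 0.2, 0.2, 0.1]))
draws = [sample_alias(table_p, table_l, table_h) for _ in range(10)]
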