def __init__(self, opt=None):
    """Initialize the CuLDA wrapper around the C++/CUDA binding.

    Parses ``opt`` into a ``CuLDAConfigProto``, validates the CUDA block
    dimension, serializes the options to a temporary JSON file, and hands
    that file to the native binding's ``init``.

    Args:
        opt: dict (or None) of options; converted to the config proto.

    Raises:
        AssertionError: if ``block_dim`` is not a multiple of WARP_SIZE,
            exceeds ``WARP_SIZE ** 2``, or the native binding fails to
            initialize from the generated options file.
    """
    self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
    self.logger = aux.get_logger("culda", level=self.opt.py_log_level)
    # block_dim must be warp-aligned and fit within a CUDA thread block
    assert self.opt.block_dim <= WARP_SIZE ** 2 and \
        self.opt.block_dim % WARP_SIZE == 0, \
        f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})"

    # The native binding reads its configuration from a JSON file path,
    # so dump the proto to a named temp file. delete=False is required
    # because the binding reopens the file by name after we close it.
    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
        tmp.write(opt_content)
        tmp.close()
        self.logger.info("opt: %s", opt_content)
        self.obj = CuLDABind()
        assert self.obj.init(bytes(tmp.name, "utf8")), \
            f"failed to load {tmp.name}"
    finally:
        # Previously the temp file leaked when an assertion fired before
        # os.remove; always clean it up.
        tmp.close()
        os.remove(tmp.name)

    self.words, self.num_words, self.num_docs = None, None, None
    self.alpha, self.beta, self.grad_alpha, self.new_beta = \
        None, None, None, None
    self.tmp_dirs = []
    # make sure temporary directories are removed at interpreter exit
    atexit.register(self.remove_tmp)
def preprocess_data(self):
    """Convert the raw bag-of-words file to HDF5 for training.

    No-op when ``skip_preprocess`` is set. If ``processed_data_path`` is
    unset, the output goes into a fresh temporary directory that is
    tracked in ``self.tmp_dirs`` for later cleanup.
    """
    if self.opt.skip_preprocess:
        return
    iou = IoUtils(aux.proto_to_dict(self.opt.io))
    if not self.opt.processed_data_path:
        # Use mkdtemp: it creates the directory and leaves its lifetime
        # to us (cleaned up via self.tmp_dirs). The previous
        # tempfile.TemporaryDirectory().name pattern discarded the
        # TemporaryDirectory object, whose finalizer deletes the
        # directory as soon as it is garbage-collected.
        data_dir = tempfile.mkdtemp()
        self.tmp_dirs.append(data_dir)
        self.opt.processed_data_path = pjoin(data_dir, "token.h5")
    iou.convert_bow_to_h5(self.opt.data_path, self.opt.processed_data_path)
def preprocess_data(self):
    """Convert the raw text stream into vocabulary and HDF5 token files.

    No-op when ``skip_preprocess`` is set. If ``processed_data_dir`` is
    unset, a fresh temporary directory is created and tracked in
    ``self.tmp_dirs`` for later cleanup.
    """
    if self.opt.skip_preprocess:
        return
    iou = IoUtils(aux.proto_to_dict(self.opt.io))
    if not self.opt.processed_data_dir:
        # Use mkdtemp so the directory survives until we remove it
        # ourselves. The previous tempfile.TemporaryDirectory().name
        # pattern dropped the TemporaryDirectory object, whose GC
        # finalizer can delete the directory before it is used.
        self.opt.processed_data_dir = tempfile.mkdtemp()
        self.tmp_dirs.append(self.opt.processed_data_dir)
    iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count,
                             self.opt.processed_data_dir)
def __init__(self, opt=None):
    """Initialize the IoUtils wrapper around the C++ binding.

    Parses ``opt`` into an ``IoUtilsConfigProto``, serializes the options
    to a temporary JSON file, and hands that file to the native binding's
    ``init``.

    Args:
        opt: dict (or None) of options; converted to the config proto.

    Raises:
        AssertionError: if the native binding fails to initialize from
            the generated options file.
    """
    self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto)
    self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level)
    # The native binding reads its configuration from a JSON file path;
    # delete=False lets it reopen the file by name after we close it.
    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
        tmp.write(opt_content)
        tmp.close()
        self.logger.info("opt: %s", opt_content)
        self.obj = IoUtilsBind()
        assert self.obj.init(bytes(tmp.name, "utf8")), \
            f"failed to load {tmp.name}"
    finally:
        # Previously the temp file leaked when init failed; always clean up.
        tmp.close()
        os.remove(tmp.name)
def init_model(self):
    """Load the vocabulary, build sampling structures, and initialize
    the input/output embedding matrices, then push them to the GPU.

    Reads ``keys.txt`` / ``count.txt`` / ``token.h5`` written by the
    preprocessing step from ``self.opt.processed_data_dir``.

    Raises:
        AssertionError: if the key and count files disagree in length.
    """
    # load vocabulary: one word per line in keys.txt, one integer count
    # per line in count.txt (parallel files written by preprocessing)
    data_dir = self.opt.processed_data_dir
    keys_path = pjoin(data_dir, "keys.txt")
    count_path = pjoin(data_dir, "count.txt")
    self.logger.info("load key, count from %s, %s", keys_path, count_path)
    with open(keys_path, "rb") as fin:
        self.words = [line.strip().decode("utf8") for line in fin]
    with open(count_path, "rb") as fin:
        self.word_count = np.array([int(line.strip()) for line in fin],
                                   dtype=np.int64)
    self.num_words = len(self.words)
    assert len(self.words) == len(self.word_count)

    # count number of docs; the context manager guarantees the handle is
    # closed even if the read raises (the original closed it manually)
    with h5py.File(pjoin(data_dir, "token.h5"), "r") as h5f:
        self.num_docs = h5f["indptr"].shape[0] - 1
    self.logger.info("number of words: %d, docs: %d",
                     self.num_words, self.num_docs)

    # smooth and normalize word counts into a distribution
    # (count_power < 1 flattens it, as in word2vec-style subsampling —
    # exact semantics live in the native binding; confirm there)
    word_count = np.power(self.word_count, self.opt.count_power,
                          dtype=np.float64)
    word_count /= np.sum(word_count)
    if self.opt.neg:
        # negative sampling: presumably a pre-drawn sampling table
        self.obj.build_random_table(word_count, self.opt.random_size)
    else:
        # hierarchical softmax path: huffman tree over word frequencies
        self.obj.build_huffman_tree(word_count.astype(np.float32))

    # randomly initialize embeddings; scale keeps initial norms ~O(1)
    np.random.seed(self.opt.seed)
    scale = 1 / np.sqrt(self.opt.num_dims)
    self.emb_in = np.random.normal(loc=0, scale=scale, \
        size=(self.num_words, self.opt.num_dims)).astype(np.float32)
    # huffman tree has num_words - 1 internal nodes, hence one fewer
    # output vector on the hierarchical-softmax path
    out_words = self.num_words if self.opt.neg else self.num_words - 1
    self.emb_out = np.random.normal(loc=0, scale=scale, \
        size=(out_words, self.opt.num_dims)).astype(np.float32)
    self.logger.info("emb_in %s, emb_out %s initialized",
                     self.emb_in.shape, self.emb_out.shape)
    if self.opt.pretrained_model.filename:
        self.load_word2vec_format(
            **aux.proto_to_dict(self.opt.pretrained_model))

    # push it to gpu
    self.obj.load_model(self.emb_in, self.emb_out)