Example #1
    def __init__(self, opt=None):
        self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
        self.logger = aux.get_logger("culda", level=self.opt.py_log_level)

        assert self.opt.block_dim <= WARP_SIZE ** 2 and \
          self.opt.block_dim % WARP_SIZE == 0, \
          f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})"

        # serialize the resolved options to a temporary JSON file for the C++ binding
        tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
        opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
        tmp.write(opt_content)
        tmp.close()

        self.logger.info("opt: %s", opt_content)
        self.obj = CuLDABind()
        assert self.obj.init(bytes(tmp.name,
                                   "utf8")), f"failed to load {tmp.name}"
        os.remove(tmp.name)

        self.words, self.num_words, self.num_docs = None, None, None
        self.alpha, self.beta, self.grad_alpha, self.new_beta = \
          None, None, None, None

        # track temporary directories so they can be cleaned up at exit
        self.tmp_dirs = []
        atexit.register(self.remove_tmp)
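The constructor above resolves a plain option dict into a protobuf, so usage amounts to passing a dict and calling the methods shown in the other examples. A minimal sketch follows, assuming the wrapping class is exported as CuLDA from cusim; only the keys visible in these snippets (py_log_level, block_dim, data_path, skip_preprocess, processed_data_path) come from the examples, and every value is illustrative.

# illustrative usage only; the import path and option values are assumptions
from cusim import CuLDA

opt = {
    "py_log_level": 2,            # forwarded to the Python logger
    "block_dim": 128,             # must be a multiple of WARP_SIZE and <= WARP_SIZE ** 2
    "data_path": "docword.bow",   # hypothetical raw input
    "skip_preprocess": False,
}
lda = CuLDA(opt)       # serializes opt to JSON and initializes the C++ binding
lda.preprocess_data()  # converts data_path to token.h5 (see Example #2)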
Example #2
 def preprocess_data(self):
     if self.opt.skip_preprocess:
         return
     iou = IoUtils(aux.proto_to_dict(self.opt.io))
     if not self.opt.processed_data_path:
         # mkdtemp keeps the directory on disk; it is tracked in tmp_dirs
         # and removed by the exit handler registered in the constructor
         data_dir = tempfile.mkdtemp()
         self.tmp_dirs.append(data_dir)
         self.opt.processed_data_path = pjoin(data_dir, "token.h5")
     iou.convert_bow_to_h5(self.opt.data_path, self.opt.processed_data_path)
Example #3
 def preprocess_data(self):
   if self.opt.skip_preprocess:
     return
   iou = IoUtils(aux.proto_to_dict(self.opt.io))
   if not self.opt.processed_data_dir:
      # mkdtemp keeps the directory on disk; it is tracked in tmp_dirs
      # and removed by the exit handler registered in the constructor
      self.opt.processed_data_dir = tempfile.mkdtemp()
      self.tmp_dirs.append(self.opt.processed_data_dir)
   iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count,
                            self.opt.processed_data_dir)
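Examples #2 and #3 are two variants of the same preprocessing step: the first converts an already tokenized bag-of-words file into a single token.h5 at processed_data_path, while the second processes a raw text stream, filters words by word_min_count, and fills processed_data_dir, presumably with the files read back in Example #5. Below is a hedged sketch of calling the same IoUtils conversions directly; the paths, the option dict, and the minimum count are assumptions.

# illustrative only: paths, options, and the minimum count are assumptions
iou = IoUtils({"py_log_level": 2})
iou.convert_bow_to_h5("docword.bow", "processed/token.h5")  # Example #2: bag-of-words input
iou.convert_stream_to_h5("corpus.txt", 5, "processed")      # Example #3: raw text stream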
Example #4
  def __init__(self, opt=None):
    self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto)
    self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level)

    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
    tmp.write(opt_content)
    tmp.close()

    self.logger.info("opt: %s", opt_content)
    self.obj = IoUtilsBind()
    assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
    os.remove(tmp.name)
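Examples #1 and #4 share the same initialization pattern: resolve the option dict into a protobuf, dump it to a temporary JSON file, and hand that file's path to the C++ binding's init(). A minimal sketch of the pattern in isolation; init_bind and its arguments are placeholders for illustration, not part of the library.

import json
import os
import tempfile

def init_bind(bind_obj, opt_dict):
    # write the options to a temporary JSON file the C++ side can parse
    tmp = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
    json.dump(opt_dict, tmp, indent=2)
    tmp.close()
    try:
        # the binding reads the file during init(); fail loudly if it cannot
        assert bind_obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
    finally:
        os.remove(tmp.name)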
Example #5
  def init_model(self):
    # load the vocabulary keys and per-word counts
    data_dir = self.opt.processed_data_dir
    keys_path = pjoin(data_dir, "keys.txt")
    count_path = pjoin(data_dir, "count.txt")
    self.logger.info("load key, count from %s, %s", keys_path, count_path)
    with open(keys_path, "rb") as fin:
      self.words = [line.strip().decode("utf8") for line in fin]
    with open(count_path, "rb") as fin:
      self.word_count = np.array([int(line.strip()) for line in fin],
                                 dtype=np.int64)
    self.num_words = len(self.words)
    assert len(self.words) == len(self.word_count)

    # count number of docs: indptr has num_docs + 1 entries
    with h5py.File(pjoin(data_dir, "token.h5"), "r") as h5f:
      self.num_docs = h5f["indptr"].shape[0] - 1

    self.logger.info("number of words: %d, docs: %d",
                     self.num_words, self.num_docs)

    # smooth the counts with count_power and normalize into a sampling distribution
    word_count = np.power(self.word_count, self.opt.count_power,
                          dtype=np.float64)
    word_count /= np.sum(word_count)
    if self.opt.neg:
      # negative sampling: build a table that samples words by smoothed frequency
      self.obj.build_random_table(word_count, self.opt.random_size)
    else:
      # hierarchical softmax: build a Huffman tree from the word frequencies
      self.obj.build_huffman_tree(word_count.astype(np.float32))

    # randomly initialize the input and output embedding matrices
    np.random.seed(self.opt.seed)
    scale = 1 / np.sqrt(self.opt.num_dims)
    self.emb_in = np.random.normal(
      loc=0, scale=scale,
      size=(self.num_words, self.opt.num_dims)).astype(np.float32)
    # hierarchical softmax uses the num_words - 1 inner nodes of the Huffman tree
    out_words = self.num_words if self.opt.neg else self.num_words - 1
    self.emb_out = np.random.normal(
      loc=0, scale=scale,
      size=(out_words, self.opt.num_dims)).astype(np.float32)
    self.logger.info("emb_in %s, emb_out %s initialized",
                     self.emb_in.shape, self.emb_out.shape)

    if self.opt.pretrained_model.filename:
      self.load_word2vec_format(**aux.proto_to_dict(self.opt.pretrained_model))

    # push the embeddings to the GPU
    self.obj.load_model(self.emb_in, self.emb_out)
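init_model expects processed_data_dir (filled in Example #3) to contain keys.txt, count.txt, and token.h5 with an indptr dataset. A small sketch, assuming a hypothetical directory path, that inspects those artifacts the same way the method does.

import h5py
import numpy as np
from os.path import join as pjoin

data_dir = "processed"  # hypothetical output of convert_stream_to_h5

with open(pjoin(data_dir, "keys.txt"), "rb") as fin:
    words = [line.strip().decode("utf8") for line in fin]
with open(pjoin(data_dir, "count.txt"), "rb") as fin:
    counts = np.array([int(line.strip()) for line in fin], dtype=np.int64)
with h5py.File(pjoin(data_dir, "token.h5"), "r") as h5f:
    num_docs = h5f["indptr"].shape[0] - 1

print(f"{len(words)} words, {num_docs} docs, {counts.sum()} tokens counted")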