def optimize(self):
    opt = self.opt.optimize
    iters, max_trials = 0, opt.get('max_trials', -1)
    space = self._get_space(opt.space)
    with log.pbar(log.INFO, desc='optimizing... ',
                  total=None if max_trials == -1 else max_trials,
                  mininterval=30) as pbar:
        # Swap the tensorboard option out (to None) so the nested training runs
        # triggered by _optimize() do not open their own tensorboard writer;
        # it is swapped back in only around the tensorboard calls below.
        tb_opt = None
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        if opt.start_with_default_parameters:
            with log.supress_log_level(log.WARN):
                loss = self._optimize({})
            self.logger.info(f'Starting with default parameter result: {loss}')
            self._optimization_info['best'] = loss  # NOTE: need better way
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        self.initialize_tensorboard(1000000 if max_trials == -1 else max_trials,
                                    name_postfix='.optimize')
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        while max_trials:
            with log.supress_log_level(log.WARN):
                best = fmin(fn=self._optimize,
                            space=space,
                            algo=tpe.suggest,
                            max_evals=len(self._optimization_info['trials'].trials) + 1,
                            trials=self._optimization_info['trials'],
                            show_progressbar=False)
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            self.update_tensorboard_data(self._optimize_loss)
            tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
            iters += 1
            max_trials -= 1
            # 987654321 acts as a sentinel default so the first evaluated loss always wins
            if self._optimization_info.get('best', {}).get('loss', 987654321) > self._optimize_loss['loss']:
                is_first_time = self._optimization_info['best'] == {}
                # we cannot use the return value of hyperopt due to the randint behavior patch
                best = self._optimize_loss
                self.logger.info(f'Found new best parameters: {best} @ iter {iters}')
                self._optimization_info['best'] = best
                if opt.deployment and (is_first_time or not opt.min_trials or opt.min_trials >= iters):
                    if not self.opt.model_path:
                        raise RuntimeError('Failed to dump model: model path is not defined')
                    self.logger.info('Saving model... to {}'.format(self.opt.model_path))
                    self.save(self.opt.model_path)
            if self.optimize_after_callback_fn:
                self.optimize_after_callback_fn(self)
            pbar.update(1)
            self.logger.debug('Params({}) Losses({})'.format(
                self._optimize_params, self._optimize_loss))
        tb_opt, self.opt.tensorboard = self.opt.tensorboard, tb_opt  # trick
        self.finalize_tensorboard()

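
# Hedged, self-contained sketch (not part of the class above) of the incremental
# hyperopt pattern optimize() relies on: calling fmin() repeatedly against one
# persistent Trials object with max_evals bumped by one runs exactly one new
# evaluation per loop iteration. The objective and search space here are made up.
from hyperopt import Trials, fmin, hp, tpe


def _objective(params):
    # stand-in for self._optimize(); returns the loss that hyperopt minimizes
    return (params['lr'] - 0.01) ** 2 + params['dim'] * 1e-4


space = {'lr': hp.loguniform('lr', -7, 0), 'dim': hp.quniform('dim', 10, 200, 10)}
trials = Trials()
best_loss = float('inf')
for it in range(20):  # plays the role of max_trials
    fmin(fn=_objective, space=space, algo=tpe.suggest,
         max_evals=len(trials.trials) + 1, trials=trials,
         show_progressbar=False)
    last = trials.results[-1]
    if last['loss'] < best_loss:
        best_loss = last['loss']  # a new best: the point where optimize() would save the model
print('best loss:', best_loss)
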
def _iterate(self, buf, group='rowwise'):
    header = self.data.get_header()
    # end = header['num_users'] if group == 'rowwise' else header['num_items']
    int_group = 0 if group == 'rowwise' else 1
    st = time.time()
    self.obj.precompute(int_group)
    el, st = time.time() - st, time.time()
    # running numerator / denominator of the training loss, accumulated over batches
    loss_nume, loss_deno = 0.0, 0.0
    update_t, feed_t, updated = el, 0, 0
    buf.set_group(group)
    with log.pbar(log.DEBUG, desc='%s' % group,
                  total=header['num_nnz'], mininterval=30) as pbar:
        for sz in buf.fetch_batch():
            updated += sz
            start_x, next_x, indptr, keys, vals = buf.get()
            _feed_t, st = time.time() - st, time.time()
            _loss_nume, _loss_deno = self.obj.partial_update(
                start_x, next_x, indptr, keys, vals, int_group)
            loss_nume += _loss_nume
            loss_deno += _loss_deno
            _update_t, st = time.time() - st, time.time()
            pbar.update(sz)
            feed_t += _feed_t
            update_t += _update_t
    self.logger.debug(
        f'{group} updated: processed({updated}) '
        f'elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)')
    return loss_nume, loss_deno

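
# Hedged sketch (not library code) of how a caller might consume the pair
# returned by _iterate(): the names suggest a numerator/denominator split so
# batch results can be summed before the final division, but the exact
# reduction used by the library is an assumption here.
def epoch_loss(loss_nume, loss_deno, eps=1e-10):
    # eps guards against an empty group (loss_deno == 0)
    return loss_nume / (loss_deno + eps)

# usage (hypothetical): loss = epoch_loss(*self._iterate(buf, group='rowwise'))
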
def _build_compressed_triplets(self, db, job_files, num_lines, max_key, is_colwise=0):
    self.logger.info('Total job files: %s' % len(job_files))
    with log.pbar(log.INFO, total=len(job_files), mininterval=10) as pbar:
        indptr_index = 0
        data_index = 0
        RECORD_SIZE = 12  # each binary record: int32 row, int32 col, float32 value
        prev_key = 0
        for job in job_files:
            with open(job, 'rb') as fin:
                total_size = fin.seek(0, 2)
                if total_size == 0:
                    continue
                total_records = int(total_size / RECORD_SIZE)
                fin.seek(0, 0)
                # np.fromstring is deprecated for binary data; np.fromfile reads
                # the records directly into a writable structured array
                data = np.fromfile(fin,
                                   dtype=np.dtype([('u', 'i'), ('i', 'i'), ('v', 'f')]),
                                   count=total_records)
                U, I, V = data['u'], data['i'], data['v']
                if is_colwise:
                    U, I = I, U
                U -= 1  # convert 1-based ids to 0-based indexes
                I -= 1
                V = self.value_prepro(V)
                self.logger.debug('minU: {}, maxU: {}'.format(U[0], U[-1]))
                assert data_index + total_records <= num_lines, \
                    'Requested data size(%s) exceeds capacity(%s)' % (
                        data_index + total_records, num_lines)
                db['key'][data_index:data_index + total_records] = I
                db['val'][data_index:data_index + total_records] = V
                # extend indptr so that db['indptr'][r] ends up holding the index
                # one past the last record of row r
                indptr = [data_index for j in range(U[0] - prev_key)]
                indptr += [data_index + i
                           for i in range(1, total_records)
                           for j in range(U[i] - U[i - 1])]
                db['indptr'][indptr_index:indptr_index + len(indptr)] = indptr
                assert indptr_index + len(indptr) <= max_key
                data_index += total_records
                indptr_index += len(indptr)
                prev_key = U[-1]
            pbar.update(1)
        db['indptr'][indptr_index:] = data_index
    for path in job_files:
        os.remove(path)

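
# Toy illustration (standalone, not library code) of the indptr layout built above:
# with sorted 0-based row ids, indptr[r] holds the index one past row r's last
# record, so empty rows simply repeat the previous offset. np.searchsorted gives
# the same answer as the list-comprehension construction in the method.
import numpy as np

rows = np.array([0, 0, 2, 2, 2, 3])   # sorted row ids; row 1 has no records
keys = np.array([5, 9, 1, 4, 7, 2])   # one column id per record
num_rows = 5

indptr = np.searchsorted(rows, np.arange(num_rows), side='right')
print(indptr)  # -> [2 2 5 6 6]
# row r's records are keys[start:indptr[r]] with start = indptr[r - 1] (0 for row 0)
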
def _iterate(self):
    header = self.data.get_header()
    end = header['num_users']
    update_t, feed_t, updated = 0, 0, 0
    self.buf.set_group('rowwise')
    with log.pbar(log.DEBUG, total=header['num_nnz'], mininterval=15) as pbar:
        start_t = time.time()
        for sz in self.buf.fetch_batch():
            updated += sz
            feed_t += time.time() - start_t
            start_x, next_x, indptr, keys = self.buf.get()
            start_t = time.time()
            self.obj.add_jobs(start_x, next_x, indptr, keys)
            update_t += time.time() - start_t
            pbar.update(sz)
            start_t = time.time()  # reset so the next feed measurement excludes the update above
    self.logger.debug(
        f'processed({updated}) elapsed(data feed: {feed_t:0.3f}s update: {update_t:0.3f}s)')

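
# Hedged sketch in plain Python (the class above hands batches to a native
# backend via obj.add_jobs): the same feed/dispatch timing pattern with a
# thread pool standing in for the asynchronous workers. Batch contents and
# work() are made up for illustration.
import time
from concurrent.futures import ThreadPoolExecutor


def work(batch):
    return sum(batch)  # placeholder for the real per-batch update


def iterate(batches, n_workers=4):
    feed_t = update_t = 0.0
    futures = []
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        start_t = time.time()
        for batch in batches:                         # "data feed": producing the next batch
            feed_t += time.time() - start_t
            start_t = time.time()
            futures.append(pool.submit(work, batch))  # "update": hand the job off
            update_t += time.time() - start_t
            start_t = time.time()
        results = [f.result() for f in futures]       # drain the workers
    return results, feed_t, update_t
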
def _iterate(self, buf, group='user'):
    assert group in ["user", "item", "context"], \
        f"group {group} is not properly provided"
    header = self.data.get_scale_info(with_sppmi=True)
    err, update_t, feed_t, updated = 0, 0, 0, 0
    if group == "user":
        self.obj.precompute("item".encode("utf8"))
        total = header["num_nnz"]
        _groups = ["rowwise"]
    elif group == "item":
        self.obj.precompute("user".encode("utf8"))
        total = header["num_nnz"] + header["sppmi_nnz"]
        _groups = ["colwise", "sppmi"]
    elif group == "context":
        total = header["sppmi_nnz"]
        _groups = ["sppmi"]
    with log.pbar(log.DEBUG, desc='%s' % group, total=total, mininterval=30) as pbar:
        st = time.time()
        for start_x, next_x in buf.fetch_batch_range(_groups):
            feed_t += time.time() - st
            _err, _updated, _update_t, _feed_t = \
                self.partial_update(buf, group, start_x, next_x)
            update_t += _update_t
            updated += _updated
            err += _err
            pbar.update(_updated)
            st = time.time()
        pbar.refresh()
    self.logger.debug(
        f'updated {group} processed({updated}) '
        f'elapsed(data feed: {feed_t:.3f} update: {update_t:.3f})')
    return err

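
# Background sketch (standalone, not library code): the 'sppmi' group above holds
# shifted positive PMI co-occurrence statistics. This is the textbook SPPMI
# definition (Levy & Goldberg); the library's actual preprocessing may differ.
import numpy as np


def sppmi(cooc, k=1.0):
    """cooc: dense (n, n) co-occurrence counts; k: shift constant (number of negative samples)."""
    total = cooc.sum()
    row = cooc.sum(axis=1, keepdims=True)
    col = cooc.sum(axis=0, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(cooc * total / (row * col))
    pmi[~np.isfinite(pmi)] = 0.0             # zero out empty co-occurrences
    return np.maximum(pmi - np.log(k), 0.0)  # shift by log(k), keep positives only
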
def _create(self, data_path, P, H):
    def get_max_column_length(fname):
        with open(fname) as fin:
            max_col = 0
            for l in fin:
                max_col = max(max_col, len(l))
        return max_col

    uid_path, iid_path, main_path = P['uid_path'], P['iid_path'], P['main_path']
    num_users, num_items, num_nnz = map(int, H.split())
    # Manually updating progress bar is a bit naive
    with log.pbar(log.DEBUG, total=5, mininterval=30) as pbar:
        uid_max_col = len(str(num_users)) + 1
        if uid_path:
            uid_max_col = get_max_column_length(uid_path) + 1
        pbar.update(1)
        iid_max_col = len(str(num_items)) + 1
        if iid_path:
            iid_max_col = get_max_column_length(iid_path) + 1
        pbar.update(1)
        try:
            db = self._create_database(data_path,
                                       num_users=num_users,
                                       num_items=num_items,
                                       num_nnz=num_nnz,
                                       uid_max_col=uid_max_col,
                                       iid_max_col=iid_max_col)
            idmap = db['idmap']
            # if not given, assume id as is
            if uid_path:
                with open(uid_path) as fin:
                    idmap['rows'][:] = np.loadtxt(fin, dtype=f'S{uid_max_col}')
            else:
                idmap['rows'][:] = np.array([str(i) for i in range(1, num_users + 1)],
                                            dtype=f'S{uid_max_col}')
            pbar.update(1)
            if iid_path:
                with open(iid_path) as fin:
                    idmap['cols'][:] = np.loadtxt(fin, dtype=f'S{iid_max_col}')
            else:
                idmap['cols'][:] = np.array([str(i) for i in range(1, num_items + 1)],
                                            dtype=f'S{iid_max_col}')
            pbar.update(1)
            num_header_lines = 0
            with open(main_path) as fin:
                for line in fin:
                    if line.strip().startswith('%'):
                        num_header_lines += 1
                    else:
                        break
            pbar.update(1)
        except Exception as e:
            self.logger.error('Cannot create db: %s' % (str(e)))
            self.logger.error(traceback.format_exc())
            raise
    return db, num_header_lines

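
# Illustration with a hypothetical file (not library code): H is the MatrixMarket
# size line "num_users num_items num_nnz", and the leading '%'-prefixed lines in
# main_path are comments that have to be skipped before the triplet data starts.
example = """%%MatrixMarket matrix coordinate real general
% comment lines start with '%'
3 4 5
1 1 1.0
1 3 2.0
2 2 1.0
3 1 4.0
3 4 1.0
"""

lines = example.splitlines()
num_header_lines = 0
for line in lines:
    if line.strip().startswith('%'):
        num_header_lines += 1
    else:
        break
H = lines[num_header_lines]                             # "3 4 5"
num_users, num_items, num_nnz = map(int, H.split())
print(num_header_lines, num_users, num_items, num_nnz)  # -> 2 3 4 5
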
def _create_working_data(self, db, source_path, ignore_lines):
    """
    Args:
        source_path: source data file path
        ignore_lines: number of lines to skip from the start of the file
    """
    vali_indexes = [] if 'vali' not in db else db['vali']['indexes']
    vali_lines = []
    file_path = aux.get_temporary_file(self.opt.data.tmp_dir)
    with open(file_path, 'w') as w, open(source_path, mode='r') as fin:
        file_size = fin.seek(0, 2)
        fin.seek(0, 0)
        for _ in range(ignore_lines):
            fin.readline()
        total = file_size - fin.tell()
        buffered = ''
        CHUNK_SIZE = 4096 * 1000
        total_lines = 0
        vali_indexes = sorted(vali_indexes)
        target_index = vali_indexes[0] if vali_indexes else -1
        vali_indexes = vali_indexes[1:]
        with log.pbar(log.INFO, total=total, mininterval=10) as pbar:
            while True:
                buffered += fin.read(CHUNK_SIZE)
                if buffered == '':
                    break
                current_file_position = fin.tell()
                pbar.update(CHUNK_SIZE)
                num_lines_on_buffer = buffered.count('\n')
                # search the position of the validation sample and extract
                # it from the training data
                while target_index >= 0 and target_index <= (total_lines + num_lines_on_buffer):
                    no_line = total_lines
                    new_buffered = ''
                    from_index = 0
                    for idx, c in enumerate(buffered):
                        if c == '\n':
                            if no_line == target_index:
                                vali_lines.append(buffered[from_index:idx])
                                if from_index > 0:
                                    w.write(buffered[0:from_index])
                                new_buffered = buffered[idx + 1:]
                                no_line += 1
                                total_lines += 1
                                num_lines_on_buffer -= 1
                                break
                            no_line += 1
                            total_lines += 1
                            from_index = idx + 1
                            num_lines_on_buffer -= 1
                    buffered = new_buffered
                    if vali_indexes:
                        target_index, vali_indexes = vali_indexes[0], vali_indexes[1:]
                    else:
                        target_index = -1
                where = buffered.rfind('\n')
                total_lines += num_lines_on_buffer
                if where != -1:
                    w.write(buffered[:where + 1])
                    buffered = buffered[where + 1:]
                elif current_file_position == file_size:
                    w.write(buffered)
                    buffered = ''
    return file_path, vali_lines

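
# Simplified reference version (standalone, not the streaming implementation
# above): split a small file into training text and validation lines by line
# index. The chunked code above does the same job without loading the whole
# file into memory.
def split_train_vali(source_path, vali_indexes, ignore_lines=0):
    vali_indexes = set(vali_indexes)
    train_parts, vali_lines = [], []
    with open(source_path) as fin:
        for _ in range(ignore_lines):
            fin.readline()
        for lineno, line in enumerate(fin):
            if lineno in vali_indexes:
                vali_lines.append(line.rstrip('\n'))  # validation lines are kept without the newline
            else:
                train_parts.append(line)
    return ''.join(train_parts), vali_lines
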