def __init__(self): super(BertPolicy, self).__init__() # load config infer_config = self.load_config() # download data model_dir = os.path.join(infer_config["data_path"], "trained_model") # model_dir = os.path.join('/xhp/src/xbot/output/policy/bert', 'Epoch-19-f1-0.903') infer_config["model_dir"] = model_dir self.download_data(infer_config, model_dir) # 应该保持和训练使用的一致,否则 label 顺序不一致,TODO 训练时对 act_ontology 排序 self.act_ontology = load_json(infer_config["act_ontology"]) self.num_act = len(self.act_ontology) model_config = BertConfig.from_pretrained(infer_config["model_dir"]) model_config.num_labels = self.num_act self.model = BertForSequenceClassification.from_pretrained( infer_config["model_dir"], config=model_config) self.tokenizer = BertTokenizer.from_pretrained( infer_config["model_dir"]) self.model.eval() self.model.to(infer_config["device"]) self.db = Database() self.config = infer_config self.threshold = infer_config["threshold"]
def load_config() -> dict:
    """Load and merge the config for inference.

    Reads the common and inference config files, merges them (values from
    the common config take precedence over the inference config), selects
    the torch device, and ensures the data directory exists.

    Returns:
        config dict
    """
    common_config_path = os.path.join(get_config_path(),
                                      BertPolicy.common_config_name)
    infer_config_path = os.path.join(get_config_path(),
                                     BertPolicy.inference_config_name)
    common_config = load_json(common_config_path)
    infer_config = load_json(infer_config_path)
    # NOTE: update() makes common_config values override inference ones
    infer_config.update(common_config)
    infer_config["device"] = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    infer_config["data_path"] = os.path.join(get_data_path(),
                                             "crosswoz/policy_bert_data")
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(infer_config["data_path"], exist_ok=True)
    return infer_config
def load_act_ontology() -> Tuple[List[str], int]:
    """Load the cached action ontology.

    Returns:
        The action ontology and the number of actions it contains.
    """
    ontology_path = os.path.join(
        get_data_path(), "crosswoz/policy_bert_data/act_ontology.json")
    act_ontology = load_json(ontology_path)
    return act_ontology, len(act_ontology)
def load_data(self, data_type: str) -> DataLoader:
    """Load data from the data cache or build it from scratch.

    Args:
        data_type: train, dev or test

    Returns:
        DataLoader, see torch.utils.data.DataLoader
    """
    raw_data_path = os.path.join(
        self.config["raw_data_path"], f"{data_type}.json.zip"
    )
    filename = f"{data_type}.json"
    output_path = os.path.join(self.config["data_path"], filename)
    # Rebuild when there is no cache file or caching is disabled.
    if not os.path.exists(output_path) or not self.config["use_data_cache"]:
        examples = preprocess(raw_data_path, output_path, filename)
    else:
        print(f"Loading {data_type} data from cache ...")
        examples = load_json(output_path)
    if self.config["debug"]:
        # In debug mode keep only a 10% random sample to speed iteration up.
        examples = random.sample(examples, k=int(len(examples) * 0.1))
    examples_dict = self.get_input_ids(examples)
    print(f"Starting building {data_type} dataset ...")
    dataset = PolicyDataset(**examples_dict)
    # Only the training split is shuffled; dev/test order stays stable.
    shuffle = data_type == "train"
    collate = partial(collate_fn, mode=data_type)
    batch_size = self.config[f"{data_type}_batch_size"]
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=self.config["num_workers"],
        shuffle=shuffle,
        pin_memory=True,
        collate_fn=collate,
    )
    return dataloader