def __init__(self, config, device):
    """Set up the test-time environment.

    Copies every entry of ``config.test`` onto ``self``, builds the spaCy
    tokenizer, resolves data/experiment/prediction directories, loads the
    vocabulary and pre-trained word embeddings, builds the model, and
    restores either ``self.model_path`` (if the test config provided one)
    or the best checkpoint from the experiment directory.

    Args:
        config: Box-style config with ``test``, ``train``, ``model_param``
            and ``data`` sections.
        device: device identifier forwarded to ``create_device``.
    """
    # Expose every test-config entry as an attribute (e.g. self.exp).
    for k, v in config.test.items():
        setattr(self, k, v)
    self.dc_gate = config.model_param.dc_gate
    self.multi_value = config.train.multi_value
    self.sch_embed = config.model_param.sch.type == "embed"

    # NOTE(review): spacy.load('en') is the legacy shortcut name; newer
    # spaCy releases require 'en_core_web_sm' — confirm installed version.
    nlp = spacy.load('en')
    self.tokenizer = \
        spacy.lang.en.English().Defaults().create_tokenizer(nlp)

    self.logger = create_logger(name="TEST")

    self.origin_dir = Path(config.data.data_dir)
    self.data_dir = Path(config.data.save_dir)
    self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
    self.pred_dir = self.origin_dir / "prediction"
    # exist_ok=True replaces the racy exists()+mkdir() pair.
    self.pred_dir.mkdir(exist_ok=True)

    self.config = config
    self.device = create_device(device)

    # Fix: close the vocab file (original leaked the handle via
    # pickle.load(open(...))).
    with open(self.data_dir / "vocab.pkl", "rb") as f:
        self.vocab = pickle.load(f)
    self.model = Model(config=config.model_param,
                       vocab=self.vocab,
                       device=self.device)

    # Was a no-op f-string (no placeholders).
    self.logger.info("[-] Reading word vector......")
    self.emb = {}
    with open(config.data.embed_path, 'r') as file:
        for line in tqdm(file,
                         total=get_num_lines(config.data.embed_path),
                         leave=False):
            data = line.strip().split(' ')
            token, emb = data[0], list(map(float, data[1:]))
            self.emb[token] = emb

    if hasattr(self, "model_path"):
        self.model.load_state(self.model_path,
                              save_device=config.train.device,
                              load_device=config.test.device)
    else:
        self.model.load_best_state(self.exp_dir / "ckpt",
                                   save_device=config.train.device,
                                   load_device=config.test.device)

    # Punctuation handled when detokenizing predictions.
    self.trim_front = [',', '.', '?', '!', ':', "'"]
    self.trim_back = ['#']
def __init__(self, config, device):
    """Set up the training environment.

    Copies every entry of ``config.train`` onto ``self``, resolves the
    data and experiment directories, loads the vocabulary, and builds the
    model on the requested device.

    Args:
        config: Box-style config with ``train``, ``model_param`` and
            ``data`` sections.
        device: device identifier forwarded to ``create_device``.
    """
    # Expose every train-config entry as an attribute (e.g. self.exp).
    for k, v in config.train.items():
        setattr(self, k, v)
    self.dc_gate = config.model_param.dc_gate
    self.sch_embed = config.model_param.sch.type == "embed"
    self.logger = create_logger(name="TRAIN")
    self.origin_dir = Path(config.data.data_dir)
    self.data_dir = Path(config.data.save_dir)
    self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
    self.config = config
    self.device = create_device(device)
    # Fix: close the vocab file (original leaked the handle via
    # pickle.load(open(...))).
    with open(self.data_dir / "vocab.pkl", "rb") as f:
        self.vocab = pickle.load(f)
    self.model = Model(
        config=config.model_param, vocab=self.vocab, device=self.device
    )
    # Epoch counter advanced by the training loop.
    self.__cur_epoch = 0
def _pickle_dump(obj, path):
    """Serialize *obj* to *path*, closing the file handle.

    The original code used ``pickle.dump(obj, open(path, "wb"))``, which
    leaks every handle it opens.
    """
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def _process_split(desc_list, model, tokenizer, layer, pooling, device):
    """Encode one data split's schema descriptions.

    Returns a pair ``(embeds, tokens)`` where ``embeds`` is a list of
    stacked per-sentence BERT representations (one tensor per schema) and
    ``tokens`` is the same descriptions lower-cased and tokenized with the
    module-level ``spacy_tokenizer``.
    """
    embeds = []
    for desc in tqdm(desc_list, leave=False):
        sent_reps = [
            get_rep(sent, model, tokenizer, layer, pooling, device)
            for sent in tqdm(desc, leave=False)
        ]
        embeds.append(torch.stack(sent_reps))
    tokens = [[[word.text.lower() for word in spacy_tokenizer(sent)]
               for sent in desc] for desc in desc_list]
    return embeds, tokens


def main(config_path):
    """Pre-compute schema vocabularies, BERT description embeddings and
    tokenized descriptions for the train / valid / (optional) test splits,
    pickling each artifact under ``config.data.save_dir``.

    Args:
        config_path: Path to the YAML config file.
    """
    # read_text() avoids leaving config_path.open()'s handle unclosed.
    config = Box.from_yaml(config_path.read_text())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    # exist_ok=True replaces the racy exists()+mkdir() pair.
    save_dir.mkdir(exist_ok=True)
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(
        str(transfo_dir), do_lower_case=(not config.data.cased))
    # Special-token ids shared with the rest of the module.
    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(
        ["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    model = BertModel(bert_config)
    model.to(device)
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        # No test split shipped with this dataset.
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file,
                                             config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file,
                                             config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file,
                                               config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    _pickle_dump(train_schema_vocab, train_vocab_file)
    _pickle_dump(valid_schema_vocab, valid_vocab_file)
    if test_schema_vocab is not None:
        _pickle_dump(test_schema_vocab, test_vocab_file)

    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    train_embed, train_desc = _process_split(
        train_desc, model, tokenizer, layer, pooling, device)
    _pickle_dump(train_embed, train_embed_file)
    _pickle_dump(train_desc, train_desc_file)

    valid_embed, valid_desc = _process_split(
        valid_desc, model, tokenizer, layer, pooling, device)
    _pickle_dump(valid_embed, valid_embed_file)
    _pickle_dump(valid_desc, valid_desc_file)

    if test_desc is None:
        # Plain return instead of exit(): same outcome for the script,
        # but doesn't raise SystemExit if main() is imported and called.
        return

    test_embed, test_desc = _process_split(
        test_desc, model, tokenizer, layer, pooling, device)
    _pickle_dump(test_embed, test_embed_file)
    _pickle_dump(test_desc, test_desc_file)
def __init__(self, config, device, model_path=None, use_sgd=False,
             epoch=None):
    """Set up the test-time environment.

    Same role as the plain test initializer, with three extensions:
    an explicit checkpoint path, an alternate vocabulary location for
    SGD-style data, and per-epoch prediction sub-directories.

    Args:
        config: Box-style config with ``test``, ``train``, ``model_param``
            and ``data`` sections.
        device: device identifier forwarded to ``create_device``.
        model_path: optional explicit checkpoint; when given it overrides
            the best-checkpoint lookup.
        use_sgd: load the vocabulary from ``../save/`` instead of the
            configured save dir.
        epoch: optional sub-directory name under ``prediction/`` for this
            run's outputs.
    """
    # Expose every test-config entry as an attribute (e.g. self.exp).
    for k, v in config.test.items():
        setattr(self, k, v)
    self.dc_gate = config.model_param.dc_gate
    self.multi_value = config.train.multi_value
    self.sch_embed = config.model_param.sch.type == "embed"

    # NOTE(review): spacy.load("en") is the legacy shortcut name; newer
    # spaCy releases require 'en_core_web_sm' — confirm installed version.
    nlp = spacy.load("en")
    self.tokenizer = spacy.lang.en.English().Defaults().create_tokenizer(
        nlp)

    self.logger = create_logger(name="TEST")

    self.origin_dir = Path(config.data.data_dir)
    self.data_dir = Path(config.data.save_dir)
    self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
    if model_path:
        self.model_path = model_path
    self.pred_dir = (self.origin_dir / "prediction" / epoch
                     if epoch else self.origin_dir / "prediction")
    # parents=True: with an epoch sub-dir, "prediction" itself may not
    # exist yet; exist_ok=True replaces the racy exists()+mkdir() pair.
    self.pred_dir.mkdir(parents=True, exist_ok=True)

    self.config = config
    self.device = create_device(device)

    # Fix: the original conditional bound "/ 'vocab.pkl'" only to the
    # else-arm, so the normal (use_sgd=False) path tried to open the
    # data directory itself. Also closes the handle, which was leaked.
    vocab_dir = Path("../save/") if use_sgd else self.data_dir
    with open(vocab_dir / "vocab.pkl", "rb") as f:
        self.vocab = pickle.load(f)
    self.model = Model(config=config.model_param,
                       vocab=self.vocab,
                       device=self.device)

    # Was a no-op f-string (no placeholders).
    self.logger.info("[-] Reading word vector......")
    self.emb = {}
    with open(config.data.embed_path, "r") as file:
        for line in tqdm(file,
                         total=get_num_lines(config.data.embed_path),
                         leave=False):
            data = line.strip().split(" ")
            token, emb = data[0], list(map(float, data[1:]))
            self.emb[token] = emb

    if hasattr(self, "model_path"):
        self.model.load_state(
            self.model_path,
            save_device=config.train.device,
            load_device=config.test.device,
        )
    else:
        self.model.load_best_state(
            self.exp_dir / "ckpt",
            save_device=config.train.device,
            load_device=config.test.device,
        )

    # Punctuation handled when detokenizing predictions.
    self.trim_front = [",", ".", "?", "!", ":", "'"]
    self.trim_back = ["#"]