Example #1
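    # Test-time setup: copy config.test fields onto the instance, build the
    # tokenizer and model, read pretrained word vectors, and restore a checkpoint.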
    def __init__(self, config, device):
        for k, v in config.test.items():
            setattr(self, k, v)
        self.dc_gate = config.model_param.dc_gate
        self.multi_value = config.train.multi_value
        self.sch_embed = (config.model_param.sch.type == "embed")

        # spaCy 2.x English pipeline; only its tokenizer is used.
        nlp = spacy.load('en')
        self.tokenizer = nlp.tokenizer

        self.logger = create_logger(name="TEST")

        self.origin_dir = Path(config.data.data_dir)
        self.data_dir = Path(config.data.save_dir)
        self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
        self.pred_dir = self.origin_dir / "prediction"
        self.pred_dir.mkdir(exist_ok=True)

        self.config = config
        self.device = create_device(device)

        with open(self.data_dir / "vocab.pkl", 'rb') as f:
            self.vocab = pickle.load(f)
        self.model = Model(config=config.model_param,
                           vocab=self.vocab,
                           device=self.device)
        self.logger.info("[-] Reading word vectors...")
        self.emb = {}
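        # Assumes a GloVe-style text file: each line is a token followed by its
        # space-separated vector components.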
        with open(config.data.embed_path, 'r') as file:
            for line in tqdm(file,
                             total=get_num_lines(config.data.embed_path),
                             leave=False):
                data = line.strip().split(' ')
                token, emb = data[0], list(map(float, data[1:]))
                self.emb[token] = emb

        if hasattr(self, "model_path"):
            self.model.load_state(self.model_path,
                                  save_device=config.train.device,
                                  load_device=config.test.device)
        else:
            self.model.load_best_state(self.exp_dir / "ckpt",
                                       save_device=config.train.device,
                                       load_device=config.test.device)

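        # Punctuation presumably given special whitespace handling when
        # detokenizing predictions.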
        self.trim_front = [',', '.', '?', '!', ':', "'"]
        self.trim_back = ['#']
Example #2
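    # Training-time counterpart of Example #1's constructor: same config and
    # vocab wiring plus an epoch counter; no checkpoint is loaded here.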
    def __init__(self, config, device):
        for k, v in config.train.items():
            setattr(self, k, v)
        self.dc_gate = config.model_param.dc_gate
        self.sch_embed = config.model_param.sch.type == "embed"

        self.logger = create_logger(name="TRAIN")
        self.origin_dir = Path(config.data.data_dir)
        self.data_dir = Path(config.data.save_dir)
        self.exp_dir = self.origin_dir / "exp" / config.model / self.exp

        self.config = config
        self.device = create_device(device)

        with open(self.data_dir / "vocab.pkl", "rb") as f:
            self.vocab = pickle.load(f)
        self.model = Model(
            config=config.model_param, vocab=self.vocab, device=self.device
        )
        self.__cur_epoch = 0
Example #3
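# Precompute schema artifacts: extract per-split vocabularies and description
# sentences, embed each sentence with BERT, and pickle everything for later use.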
def main(config_path):
    config = Box.from_yaml(config_path.read_text())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    save_dir.mkdir(exist_ok=True)
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(
        str(transfo_dir), do_lower_case=(not config.data.cased))

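    # Special-token ids are stored as module-level globals so that helper
    # functions elsewhere in the file can reuse them.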
    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(
        ["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    model = BertModel.from_pretrained(str(transfo_dir), config=bert_config)
    model.to(device)
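    # Inference only: eval() disables dropout so the representations are
    # deterministic.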
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file,
                                             config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file,
                                             config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file,
                                               config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    train_vocab_file.write_bytes(pickle.dumps(train_schema_vocab))
    valid_vocab_file.write_bytes(pickle.dumps(valid_schema_vocab))
    if test_schema_vocab is not None:
        test_vocab_file.write_bytes(pickle.dumps(test_schema_vocab))

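    # Which BERT hidden layer to read and how token vectors are pooled into a
    # sentence vector (both consumed by get_rep below).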
    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    def encode(descs):
        # Stack one BERT sentence vector per description sentence.
        embeds = []
        for desc in tqdm(descs, leave=False):
            embeds.append(torch.stack([
                get_rep(sent, model, tokenizer, layer, pooling, device)
                for sent in tqdm(desc, leave=False)
            ]))
        return embeds

    def lower_tokens(descs):
        # Lowercased token lists for every sentence of every description.
        return [[[word.text.lower() for word in spacy_tokenizer(sent)]
                 for sent in desc] for desc in descs]

    train_embed = encode(train_desc)
    train_desc = lower_tokens(train_desc)
    train_embed_file.write_bytes(pickle.dumps(train_embed))
    train_desc_file.write_bytes(pickle.dumps(train_desc))

    valid_embed = encode(valid_desc)
    valid_desc = lower_tokens(valid_desc)
    valid_embed_file.write_bytes(pickle.dumps(valid_embed))
    valid_desc_file.write_bytes(pickle.dumps(valid_desc))

    if test_desc is None:
        return

    test_embed = encode(test_desc)
    test_desc = lower_tokens(test_desc)
    test_embed_file.write_bytes(pickle.dumps(test_embed))
    test_desc_file.write_bytes(pickle.dumps(test_desc))
Example #4
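    # Variant of Example #1's tester: optionally takes an explicit checkpoint
    # path, an alternate vocab location (use_sgd), and a per-epoch prediction
    # directory.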
    def __init__(self,
                 config,
                 device,
                 model_path=None,
                 use_sgd=False,
                 epoch=None):
        for k, v in config.test.items():
            setattr(self, k, v)
        self.dc_gate = config.model_param.dc_gate
        self.multi_value = config.train.multi_value
        self.sch_embed = config.model_param.sch.type == "embed"

        nlp = spacy.load("en")
        self.tokenizer = spacy.lang.en.English().Defaults().create_tokenizer(
            nlp)

        self.logger = create_logger(name="TEST")

        self.origin_dir = Path(config.data.data_dir)
        self.data_dir = Path(config.data.save_dir)
        self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
        if model_path:
            self.model_path = model_path
        self.pred_dir = self.origin_dir / "prediction"
        if epoch:
            # str() so that an integer epoch can be joined onto the Path.
            self.pred_dir = self.pred_dir / str(epoch)
        self.pred_dir.mkdir(parents=True, exist_ok=True)

        self.config = config
        self.device = create_device(device)

        vocab_dir = Path("../save/") if use_sgd else self.data_dir
        with open(vocab_dir / "vocab.pkl", "rb") as f:
            self.vocab = pickle.load(f)
        self.model = Model(config=config.model_param,
                           vocab=self.vocab,
                           device=self.device)
        self.logger.info("[-] Reading word vectors...")
        self.emb = {}
        with open(config.data.embed_path, "r") as file:
            for line in tqdm(file,
                             total=get_num_lines(config.data.embed_path),
                             leave=False):
                data = line.strip().split(" ")
                token, emb = data[0], list(map(float, data[1:]))
                self.emb[token] = emb

        if hasattr(self, "model_path"):
            self.model.load_state(
                self.model_path,
                save_device=config.train.device,
                load_device=config.test.device,
            )
        else:
            self.model.load_best_state(
                self.exp_dir / "ckpt",
                save_device=config.train.device,
                load_device=config.test.device,
            )

        self.trim_front = [",", ".", "?", "!", ":", "'"]
        self.trim_back = ["#"]