def init_state(logger, config, args):
    logger.log('Loading data...')
    with open(args.data) as f_o:
        data, _ = load_data(f_o)
    limit_passage = config.get('training', {}).get('limit')
    vocab_size = config.get('training', {}).get('vocab_size', None)

    logger.log('Tokenizing data...')
    data, token_to_id, char_to_id = tokenize_data(
        logger, data, vocab_size, True, limit_passage)
    data = get_loader(data, config)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    assert token_to_id[C.SOS_TOKEN] == C.SOS_INDEX
    assert token_to_id[C.UNK_TOKEN] == C.UNK_INDEX
    assert token_to_id[C.EOS_TOKEN] == C.EOS_INDEX
    assert token_to_id[C.PAD_TOKEN] == C.PAD_INDEX

    logger.log('Creating model...')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        logger.log('Loading pre-trained embeddings...')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                f_o,
                set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)
        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        pass  # No pretraining, just keep the random values.
    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, state=None)
    return model, id_to_token, id_to_char, optimizer, data
def init_state(config, args, loading_limit=None):
    token_to_id = {'': 0}
    char_to_id = {'': 0}
    print('Load Data [1/6]')
    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o), span_only=True,
                            answered_only=True, loading_limit=loading_limit)

    print('Tokenize Data [2/6]')
    data = tokenize_data(data, token_to_id, char_to_id)
    data = get_loader(data, config)

    print('Create Inverse Dictionaries [3/6]')
    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    print('Initiate Model [4/6]')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        print('Load pre-trained embeddings [5/6]')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                f_o,
                set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)
        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        print('No pre-trained embeddings given [5/6]')
        # No pretraining, just keep the random values.
    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, state=None)
    print('Done init_state [6/6]')
    return model, id_to_token, id_to_char, optimizer, data
def init_state(config, args):
    token_to_id = {'': 0}
    char_to_id = {'': 0}
    print('Loading data...')
    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)

    print('Tokenizing data...')
    data = tokenize_data(data, token_to_id, char_to_id)
    data = get_loader(data, config)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    print('Creating model...')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        print('Loading pre-trained embeddings...')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                f_o,
                set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)
        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        pass  # No pretraining, just keep the random values.
    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, state=None)
    return model, id_to_token, id_to_char, optimizer, data
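
# Hypothetical usage sketch, not part of the original script: one way a
# training entry point might call the init_state variant directly above.
# The command-line flags and the JSON config file below are assumptions
# mirroring the attributes read from `args` and `config` in init_state.
if __name__ == '__main__':
    import argparse
    import json

    argparser = argparse.ArgumentParser()
    argparser.add_argument('data', help='Training data (JSON)')
    argparser.add_argument('config',
                           help='Config file with a "bidaf" section '
                                '(assumed here to be JSON)')
    argparser.add_argument('--word_rep',
                           help='Optional pre-trained word embeddings (text file)')
    argparser.add_argument('--cuda', action='store_true',
                           help='Use the GPU when available')
    argparser.add_argument('--use_covariance', action='store_true',
                           help='Sample OOV embedding rows with a full covariance matrix')
    args = argparser.parse_args()

    with open(args.config) as f_c:
        config = json.load(f_c)

    model, id_to_token, id_to_char, optimizer, data = init_state(config, args)
    # A training loop over `data` (the batch loader) that steps `optimizer`
    # would follow here; it is outside this excerpt.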
def reload_state(checkpoint, config, args):
    """
    Reload state before predicting.
    """
    print('Loading Model...')
    model, id_to_token, id_to_char = BidafModel.from_checkpoint(
        config['bidaf'], checkpoint)
    token_to_id = {tok: id_ for id_, tok in id_to_token.items()}
    char_to_id = {char: id_ for id_, char in id_to_char.items()}

    # Vocabulary sizes before tokenizing the new data; tokenize_data extends
    # token_to_id and char_to_id in place with any unseen symbols.
    len_tok_voc = len(token_to_id)
    len_char_voc = len(char_to_id)

    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)

    data = tokenize_data(data, token_to_id, char_to_id)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    data = get_loader(data, args)

    if len_tok_voc != len(token_to_id):
        # The prediction data introduced new tokens: fill in embedding rows
        # for them, from the pre-trained file when given, otherwise sampled
        # from the statistics of the existing embedding matrix.
        need = set(tok for id_, tok in id_to_token.items()
                   if id_ >= len_tok_voc)

        if args.word_rep:
            with open(args.word_rep) as f_o:
                pre_trained = SymbolEmbSourceText(f_o, need)
        else:
            pre_trained = SymbolEmbSourceText([], need)

        cur = model.embedder.embeddings[0].embeddings.weight.data.numpy()
        mean = cur.mean(0)
        if args.use_covariance:
            cov = np.cov(cur, rowvar=False)
        else:
            cov = cur.std(0)

        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

        if args.word_rep:
            print('Augmenting with pre-trained embeddings...')
        else:
            print('Augmenting with random embeddings...')

        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, len_tok_voc,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))

    if len_char_voc != len(char_to_id):
        # Same treatment for new characters, always with random vectors.
        print('Augmenting with random char embeddings...')
        pre_trained = SymbolEmbSourceText([], None)

        cur = model.embedder.embeddings[1].embeddings.weight.data.numpy()
        mean = cur.mean(0)
        if args.use_covariance:
            cov = np.cov(cur, rowvar=False)
        else:
            cov = cur.std(0)

        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

        model.embedder.embeddings[1].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_char, len_char_voc,
                model.embedder.embeddings[1].embeddings.weight.data.numpy(),
                pre_trained, oovs))

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.eval()

    return model, id_to_token, id_to_char, data
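
# Hypothetical usage sketch, not part of the original file: how a prediction
# script could restore a trained model with reload_state. Reading the
# checkpoint via torch.load is an assumption; reload_state only needs
# whatever object BidafModel.from_checkpoint expects.
def example_predict_setup(checkpoint_path, config, args):
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model, id_to_token, id_to_char, data = reload_state(checkpoint, config, args)
    # `model` comes back in eval mode (and on the GPU if args.cuda was set and
    # a GPU is available); `data` is the batch loader built over args.data.
    return model, id_to_token, id_to_char, data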