def main(args):
    """Build a SQuAD vocabulary file using a spaCy-based tokenizer."""
    spacy_en = spacy.load('en_core_web_sm',
                          disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])
    # Normalisation applied to each token text (identity unless --lower).
    normalize = str.lower if args.lower else (lambda text: text)

    def tokenizer(text):
        # Tokenise with spaCy, dropping pure-whitespace tokens.
        return [normalize(token.text) for token in spacy_en(text)
                if not token.is_space]

    # Choose which SQuAD fields feed the vocabulary (0 = context, 1 = question).
    if args.only_question:
        indices, desc = [1], 'question'
    elif args.only_context:
        indices, desc = [0], 'context'
    else:
        indices, desc = [0, 1], 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    min_freq = args.min_freq or ''
    max_size = args.max_size or ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'
    squad_tokens = load_squad_tokens(args.train_path, tokenizer, indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size,
                     (PAD_TOKEN, UNK_TOKEN), filename)
def __init__(self, model_path, device="cpu", max_len=50, verbose=1):
    """Load the base predictor and build the categorical encoders together
    with their encode -> one-hot -> tensor transform pipelines."""
    super().__init__(model_path, device)
    self.max_len = max_len
    self.verbose = verbose

    # Categorical encoders shared by the transforms below.
    self.vocab = Vocabulary()
    self.races = Races()
    self.genders = Genders()
    self.to_tensor = ToTensor()

    # Each transform maps a raw value through its encoder, one-hot encodes
    # it against the encoder's size, and converts the result to a tensor.
    def one_hot_pipeline(encoder):
        return Compose([encoder, OneHot(encoder.size), ToTensor()])

    self.name_transform = one_hot_pipeline(self.vocab)
    self.race_transform = one_hot_pipeline(self.races)
    self.gender_transform = one_hot_pipeline(self.genders)
def predict(args, states):
    """Classify each line from ``args.file`` with a trained CNN text model.

    Args:
        args: argparse namespace; ``args.file`` is an iterable of input lines.
        states: checkpoint dict holding the model weights under ``"model"``.
    """
    vocab = Vocabulary(config.vocab_file)
    model = CnnTextClassifier(len(vocab))
    model.load_state_dict(states["model"])
    use_cuda = torch.cuda.is_available()  # hoisted: check once, not per line
    if use_cuda:
        model.cuda()
    model.eval()  # inference mode: disable dropout / batch-norm updates
    with torch.no_grad():  # no autograd graph needed for prediction
        for line in args.file:
            sequence = [vocab.token_to_id(t) for t in line.strip().split()]
            # `autograd.Variable` has been a deprecated no-op wrapper since
            # PyTorch 0.4 — plain tensors carry autograd state themselves.
            sequences = torch.LongTensor([sequence])
            if use_cuda:
                sequences = sequences.cuda()
            probs, classes = model(sequences)
            print(classes[0])
def main(args):
    """Train a FastQA model on SQuAD and plot the loss history."""
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)
    # A pre-trained embedding matrix is expected next to the vocab file as
    # embedding_<vocabname>.npy; it is optional.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None
    model = FastQA(len(token_to_index), args.embed, args.hidden,
                   question_limit=args.q_len, context_limit=args.c_len,
                   dropout=args.dropout, pretrained_embeddings=embeddings,
                   with_feature=not args.without_feature).build()
    opt = Adam()
    # Only the first two outputs (start/end distributions) contribute to the
    # loss; the remaining two outputs are unsupervised (None loss, weight 0).
    model.compile(optimizer=opt, loss_weights=[1, 1, 0, 0],
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy', None, None])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadConverter(token_to_index, PAD_TOKEN, UNK_TOKEN, tokenizer,
                               question_max_len=args.q_len,
                               context_max_len=args.c_len)
    eval_converter = SquadEvalConverter(
        token_to_index, PAD_TOKEN, UNK_TOKEN, tokenizer,
        question_max_len=args.q_len, context_max_len=args.c_len)
    train_generator = Iterator(train_dataset, args.batch, converter)
    # Two dev iterators: one for validation loss, one (non-repeating) for F1.
    dev_generator_loss = Iterator(dev_dataset, args.batch, converter,
                                  shuffle=False)
    dev_generator_f1 = Iterator(dev_dataset, args.batch, eval_converter,
                                repeat=False, shuffle=False)
    trainer = SquadTrainer(model, train_generator, args.epoch,
                           dev_generator_loss,
                           './models/fastqa.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(FastQALRScheduler(
        dev_generator_f1, val_answer_file=args.answer_path, steps=args.steps))
    trainer.add_callback(FastQACheckpoint('./models/fastqa.{steps:06d}.h5',
                                          steps=args.steps))
    if args.use_tensorboard:
        trainer.add_callback(TensorBoard(log_dir='./graph',
                                         batch_size=args.batch))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
def load_vocab(vocab_file, max_word_length=None):
    """Load a word vocabulary; with a max word length, load the
    character-aware variant instead."""
    if not max_word_length:
        return Vocabulary(vocab_file, validate_file=True)
    return UnicodeCharsVocabulary(vocab_file, max_word_length,
                                  validate_file=True)
def create_vocabs(opts, checkpointer):
    """Return vocabularies from a checkpoint when available, otherwise
    build them fresh from the pre-train/train files.

    Args:
        opts: options object providing pretrain_files, trainfile, dim.
        checkpointer: optional mapping with a "vocabs" entry, or None.
    """
    # Removed dead commented-out code and the redundant `vocabs = None`
    # placeholder; each branch now returns directly.
    if checkpointer is not None:
        return checkpointer["vocabs"]
    return Vocabulary(opts.pretrain_files, opts.trainfile, opts.dim, True)
def main(args):
    """Predict SQuAD answer spans with a trained FastQA model and write
    them to a JSON file named after the model checkpoint."""
    token_to_index, _ = Vocabulary.load(args.vocab_file)
    model = FastQA(len(token_to_index), args.embed, args.hidden,
                   question_limit=args.q_len, context_limit=args.c_len,
                   with_feature=not args.without_feature).build()
    model.load_weights(args.model_path)

    test_dataset = SquadReader(args.test_path)
    tokenizer = get_tokenizer(lower=args.lower, as_str=False)
    converter = SquadEvalConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                                   tokenizer, question_max_len=args.q_len,
                                   context_max_len=args.c_len)
    test_generator = Iterator(test_dataset, args.batch, converter, False, False)

    predictions = {}
    for inputs, (contexts, ids) in test_generator:
        _, _, batch_starts, batch_ends = model.predict_on_batch(inputs)
        # Join the context tokens of each predicted [start, end] span.
        for sample_idx, (begin, end) in enumerate(zip(batch_starts, batch_ends)):
            tokens = contexts[sample_idx]
            predictions[ids[sample_idx]] = ' '.join(
                tokens[pos] for pos in range(begin, end + 1))

    basename = osp.splitext(osp.basename(args.model_path))[0]
    save_path = osp.join(args.save_dir, f'predictions_{basename}.json')
    with open(save_path, 'w') as f:
        json.dump(predictions, f, indent=2)
def load_vocab(word_file, char_file=None, max_word_length=None):
    """Load a word vocabulary; with a max word length, load the
    character-augmented variant instead."""
    if not max_word_length:
        return Vocabulary(word_file, validate_file=True)
    return CharsVocabulary(word_file, char_file, max_word_length,
                           validate_file=True)
def tensorize(ctxs: List[str], word_vocab: Vocabulary, path_vocab: Vocabulary):
    """Convert code-context triples ("source,path,target" strings) into three
    (1, MAX_LENGTH) LongTensors of vocabulary indices, padding with '<pad>'.

    Contexts beyond config.MAX_LENGTH are randomly down-sampled.
    """
    if len(ctxs) > config.MAX_LENGTH:
        ctxs = random.sample(ctxs, config.MAX_LENGTH)

    x_s, path, x_t = [], [], []
    for i in range(config.MAX_LENGTH):
        if i < len(ctxs):
            s, p, t = ctxs[i].split(',')
            # Paths are hashed the same way Java's String.hashCode would.
            p = str(java_string_hashcode(p))
        else:
            s = p = t = '<pad>'
        x_s.append(word_vocab.lookup_idx(s))
        path.append(path_vocab.lookup_idx(p))
        x_t.append(word_vocab.lookup_idx(t))

    # unsqueeze(0) adds the leading batch dimension (same as [None, :]).
    return (torch.LongTensor(x_s).unsqueeze(0),
            torch.LongTensor(path).unsqueeze(0),
            torch.LongTensor(x_t).unsqueeze(0))
def __init__(self, task='sentiment', batch_size=32, gaze_data=None,
             et_predictor_model=None, et_predictor_vocab=None,
             use_predictor_vocab=False, filter_vocab=False):
    """Load ZuCo sentences for the given task, optionally replacing the
    recorded ET features with predictions from a trained ET predictor, and
    build (or reuse) the vocabulary used to index the sentences.
    """
    self.batch_size = batch_size
    # BUG FIX: was hard-coded `self.filter_vocab = False`, silently ignoring
    # the caller's argument (the Vocabulary below already honoured it).
    self.filter_vocab = filter_vocab
    self.use_gaze = gaze_data is not None

    _zuco = ZuCo(task=task)
    self.sentences = _zuco.sentences
    # this will be overridden if we're using a trained ET predictor:
    self.sentences_et = np.array(_zuco.sentences_et)
    self.max_seq_len = max([len(s) for s in self.sentences])

    # Initialize the ET features per sentence
    if et_predictor_model and et_predictor_vocab:
        print('\nReceived ET Predictor model and vocab. Vocabulary size:',
              len(et_predictor_vocab))
        print('Running sentences through ET predictor...')
        indexed_sentences = et_predictor_vocab.index_sentences(self.sentences)
        self.sentences_et = et_predictor_model.sentences_to_et(
            indexed_sentences=indexed_sentences,
            max_seq_len=self.max_seq_len)

    # Initialize Vocabulary object
    print('\nuse_predictor_vocab =', use_predictor_vocab)
    if use_predictor_vocab:
        # NOTE(review): assumes et_predictor_model/et_predictor_vocab were
        # provided — otherwise `indexed_sentences` is unbound (NameError).
        self.vocabulary = et_predictor_vocab
        self.indexed_sentences = indexed_sentences
    else:
        self.vocabulary = Vocabulary(self.sentences, filter_vocab)
        self.indexed_sentences = self.vocabulary.index_sentences(
            self.sentences)

    self.task_num = (1 if task == 'sentiment' else
                     2 if task == 'normal' else 3)
    self.load_labels()
    self.num_classes = len(set(self.labels))
def test_load(self):
    """Vocabulary.load should unpickle (token_to_index, index_to_token)
    from the given path, opened in binary-read mode."""
    filename = '/path/to/vocab.pkl'
    open_ = patch('data.open', mock_open()).start()
    pickle_load = patch('data.pickle.load').start()
    # FIX: started patches were never stopped, leaking the mocks into every
    # subsequent test; addCleanup undoes them even if an assertion fails.
    self.addCleanup(patch.stopall)
    pickle_load.return_value = ('token_to_index', 'index_to_token')
    token_to_index, index_to_token = Vocabulary.load(filename)
    self.assertEqual(token_to_index, 'token_to_index')
    self.assertEqual(index_to_token, 'index_to_token')
    open_.assert_called_with(filename, mode='rb')
    pickle_load.assert_called_with(open_.return_value)
def _load_data(self, reverse, chars, bidirectional=False):
    """Build a test dataset over the temp train file, using either a plain
    word vocabulary or the character-aware one."""
    if chars:
        vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
    else:
        vocab = Vocabulary(self._tmp_vocab)

    if bidirectional:
        return BidirectionalLMDataset(self._tmp_train, vocab)
    return LMDataset(self._tmp_train, vocab, reverse=reverse)
def main(args):
    """Train a dependency-prediction model (QANet or LSTM encoder) on SQuAD
    and print the test-set loss/accuracy."""
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)
    # Optional pre-trained embeddings looked up next to the vocab file.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None
    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.
    converter = SquadDepConverter(token_to_index, PAD_TOKEN, UNK_TOKEN)
    if args.model == 'qanet':
        model = DependencyQANet(len(token_to_index), args.embed,
                                len(converter._dep_to_index), args.hidden,
                                args.num_heads, dropout=args.dropout,
                                num_blocks=args.encoder_layer,
                                num_convs=args.encoder_conv,
                                embeddings=embeddings).build()
    elif args.model == 'lstm':
        model = DependencyLSTM(len(token_to_index), args.embed,
                               len(converter._dep_to_index), args.hidden,
                               dropout=args.dropout,
                               embeddings=embeddings).build()
    # NOTE(review): no else branch — an unrecognised --model value leaves
    # `model` unbound and fails below with NameError.
    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    model.compile(optimizer=opt, loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/dep.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(
            TensorBoard(log_dir='./graph', batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
    # Final evaluation on the held-out test split.
    test_dataset = SquadReader(args.test_path)
    test_generator = Iterator(test_dataset, args.batch, converter, False, False)
    print(model.evaluate_generator(test_generator, steps=len(test_generator)))
def main(args):
    """Build a SQuAD vocabulary file from the training data."""
    tokenizer = get_tokenizer(lower=args.lower, as_str=True)

    # Choose which SQuAD fields feed the vocabulary (0 = context, 1 = question).
    if args.only_question:
        indices, desc = [1], 'question'
    elif args.only_context:
        indices, desc = [0], 'context'
    else:
        indices, desc = [0, 1], 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    min_freq = args.min_freq or ''
    max_size = args.max_size or ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'
    squad_tokens = load_squad_tokens(args.train_path, tokenizer, indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size,
                     (PAD_TOKEN, UNK_TOKEN), filename)
def embedding(cls, Config):
    """Create the token embedding layer and its vocabulary.

    When Config.init is set, the layer is initialised from a pre-dumped
    Fasttext state dict under embeddings/dump/.

    Returns:
        (vocab, embedding): the Vocabulary and the nn.Embedding layer.

    Raises:
        Exception: if Config.init is set but the dump file cannot be loaded.
    """
    emb_dim = Config.emb_dim if Config.encoder == 'Recurrent' else Config.d_model
    embedding = nn.Embedding(Config.max_vocab, emb_dim)
    if Config.init:
        # initialize embeddings with pre-dumped Fasttext embeddings
        dump_path = os.path.join(
            os.path.dirname(EMB_PATH),
            f"dump/initial_{Config.dataset}_{Config.max_vocab}_{Config.emb_dim}.pt")
        try:
            embedding.load_state_dict(torch.load(dump_path))
        # FIX: was a bare `except:` that discarded the real error (and would
        # even trap KeyboardInterrupt); chain the cause for debuggability.
        except Exception as err:
            raise Exception(
                f"First store an embedding file "
                f"'initial_{Config.dataset}_{Config.max_vocab}_{Config.emb_dim}.pt' "
                f"under embeddings/dump/") from err
    # should always be trainable, because a considerable number of
    # embeddings is initialized randomly
    embedding.weight.requires_grad = Config.trainable
    vocab = Vocabulary(Config.vocab_path, Config.max_vocab)
    return vocab, embedding
def main(config, local):
    """Seed RNGs, load vocab/features, build datasets, and run training
    (optionally under the NSML platform)."""
    # random seed
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.random.manual_seed(config.random_seed)
    if config.device == 'cuda':
        torch.cuda.manual_seed_all(config.random_seed)

    vocab = Vocabulary(config)
    print(f'Vocabulary loaded')
    feature = Feature(config)
    print(f'Feature data loaded')
    # Model hyper-parameters injected onto the config object.
    setattr(config, 'char_vocab_size', 0)
    setattr(config, 'class_size', 1)

    if config.mode == 'train':
        train_question_file_path = os.path.join(config.data_dir,
                                                config.train_file_name)
        train_label_file_path = os.path.join(config.data_dir,
                                             config.train_label_file_name)
        train_dataset = Dataset(train_question_file_path,
                                train_label_file_path, vocab, feature,
                                mode='train')
        train_data_loader = DataLoader(train_dataset,
                                       batch_size=config.batch_size,
                                       shuffle=True)
        validation_question_file_path = os.path.join(
            config.data_dir, config.validation_file_name)
        validation_label_file_path = os.path.join(
            config.data_dir, config.validation_label_file_name)
        validation_dataset = Dataset(validation_question_file_path,
                                     validation_label_file_path, vocab,
                                     feature, mode='validation')
        validation_data_loader = DataLoader(validation_dataset,
                                            batch_size=config.batch_size)
    else:
        train_data_loader = None
        validation_data_loader = None
    print(f'{config.mode} Dataset loaded')

    trainer = Trainer(config, feature, train_data_loader,
                      validation_data_loader)
    print(f'Trainer loaded')

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, feature, config)
        # NOTE(review): pause-guard nesting under IS_ON_NSML inferred from the
        # collapsed source — confirm against the original file.
        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        print(f'Starting training')
        trainer.train()
        print(f'Finishing training')
def main(args):
    """Train a QANet model on SQuAD and plot the loss history."""
    token_to_index, index_to_token = Vocabulary.load(args.vocab_file)
    # Optional pre-trained embeddings looked up next to the vocab file.
    root, _ = os.path.splitext(args.vocab_file)
    basepath, basename = os.path.split(root)
    embed_path = f'{basepath}/embedding_{basename}.npy'
    embeddings = np.load(embed_path) if os.path.exists(embed_path) else None
    batch_size = args.batch  # Batch size for training.
    epochs = args.epoch  # Number of epochs to train for.
    model = QANet(len(token_to_index), args.embed, args.hidden,
                  args.num_heads,
                  encoder_num_blocks=args.encoder_layer,
                  encoder_num_convs=args.encoder_conv,
                  output_num_blocks=args.output_layer,
                  output_num_convs=args.output_conv,
                  dropout=args.dropout, embeddings=embeddings).build()
    opt = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7, clipnorm=5.)
    # Only the start/end outputs are supervised; the last two outputs carry
    # no loss (None) and zero weight.
    model.compile(optimizer=opt,
                  loss=[
                      'sparse_categorical_crossentropy',
                      'sparse_categorical_crossentropy', None, None
                  ],
                  loss_weights=[1, 1, 0, 0])
    train_dataset = SquadReader(args.train_path)
    dev_dataset = SquadReader(args.dev_path)
    converter = SquadConverter(token_to_index, PAD_TOKEN, UNK_TOKEN,
                               lower=args.lower)
    train_generator = Iterator(train_dataset, batch_size, converter)
    dev_generator = Iterator(dev_dataset, batch_size, converter)
    trainer = SquadTrainer(model, train_generator, epochs, dev_generator,
                           './model/qanet.{epoch:02d}-{val_loss:.2f}.h5')
    trainer.add_callback(BatchLearningRateScheduler())
    # trainer.add_callback(ExponentialMovingAverage(0.999))
    if args.use_tensorboard:
        trainer.add_callback(
            TensorBoard(log_dir='./graph', batch_size=batch_size))
    history = trainer.run()
    dump_graph(history, 'loss_graph.png')
def make_cache(data_path, cache_path, image_dim, validation_only=False, verbose=True):
    """Build the VQA v2 cache: a question-token vocabulary plus, per split,
    an HDF5 tensor of resized images and a pickled image-id -> row map.

    Args:
        data_path: Path with `{split}2014/` image dirs and question JSONs.
        cache_path: Path receiving `{split}_img.hdf5`, `{split}_imgmap.pkl`
            and `vocab.pkl`.
        image_dim: target square image size.
        validation_only: when True, cache only the 'val' split.
        verbose: print progress and timings.
    """
    q_dict = Vocabulary()
    cache_time = time()
    for name in ('train', 'val'):
        if name != 'val' and validation_only:
            continue
        question_time = time()
        if verbose:
            print(f'cache {name}')
        images = data_path / f'{name}2014'
        # FIX: use a context manager so the JSON file handle is closed
        # (was json.load(path.open('r')) with no close).
        with (data_path /
              f'v2_OpenEnded_mscoco_{name}2014_questions.json').open('r') as f:
            questions = json.load(f)['questions']
        for question in questions:
            q_dict.tokenize(question['question'], insert=True)
        if verbose:
            print(f'{len(questions)} questions and annotations cached in '
                  f'{time() - question_time:.2f}s')
        # Skip image caching if both artifacts already exist.
        if (cache_path / f'{name}_img.hdf5').is_file() and \
                (cache_path / f'{name}_imgmap.pkl').is_file():
            continue
        img_cache_time = time()
        img_size = (image_dim, image_dim)
        n_images = len(list(images.glob('*')))
        img_dict = {}
        with h5py.File(cache_path / f'{name}_img.hdf5', 'w') as h5:
            img_data = h5.create_dataset(
                'images', shape=(n_images, 3, image_dim, image_dim), dtype='i')
            for i, image in enumerate(images.glob('*')):
                if i % 10000 == 0 and verbose:
                    print(f'{i} images cached')
                # COCO image ids are the last 12 digits of the file stem.
                img_id = int(image.name.replace('.jpg', '')[-12:])
                img_dict[img_id] = i
                img = numpy.array(
                    Image.open(image).resize(img_size).convert('RGB'))
                img_data[i, :] = img.reshape((3, image_dim, image_dim))
        # FIX: close (and flush) the pickle file — it was opened with
        # .open('wb') and never closed, risking a truncated map on exit.
        with (cache_path / f'{name}_imgmap.pkl').open('wb') as f:
            pickle.dump(img_dict, f)
        if verbose:
            print(f'{n_images} images cached in '
                  f'{time() - img_cache_time:.2f}s')
    q_dict.save(cache_path / 'vocab.pkl')
    if verbose:
        print(f'data cached in {time() - cache_time:.2f}s')
def main(args):
    """Extract (or convert) pre-trained word embeddings for the vocabulary
    and save them as embedding_<vocabname>.npy next to the vocab file."""
    token_to_index, _ = Vocabulary.load(args.vocab_path)

    have_cache = (os.path.exists(args.embed_array_path)
                  and os.path.exists(args.embed_dict_path))
    if have_cache:
        # Reuse the previously converted embedding array and token index.
        with open(args.embed_dict_path, 'rb') as f:
            pretrained_token_to_index = pickle.load(f)
        embeddings = extract_embeddings(token_to_index,
                                        pretrained_token_to_index,
                                        np.load(args.embed_array_path))
    elif os.path.exists(args.embed_path):
        # Convert the raw text embedding file to npy on first use.
        pretrained_token_to_index, embeddings = save_word_embedding_as_npy(
            args.embed_path, args.dim)
    else:
        raise FileNotFoundError('Please download pre-trained embedding file')

    root, _ = os.path.splitext(args.vocab_path)
    basepath, basename = os.path.split(root)
    filename = f'{basepath}/embedding_{basename}.npy'
    np.save(filename, embeddings)
def __init__(self, args, kwargs, root_dir, hidden_size, lr, epochs, batch_size,
             device, logfile, verbose=1):
    """Set up data, model, optimiser and logging from a config dict.

    NOTE(review): every configuration value is read from the `kwargs` dict;
    the explicit parameters (`args`, `root_dir`, `hidden_size`, `lr`,
    `epochs`, `batch_size`, `device`, `logfile`, `verbose`) are accepted but
    ignored. Compare the sibling __init__ that uses the named parameters
    directly — confirm which signature callers rely on before changing this.
    """
    self.root_dir = kwargs['root_dir']
    self.device = kwargs['device']
    self.verbose = kwargs['verbose']
    self.logfile = kwargs['logfile']

    # Training params
    self.lr = kwargs['lr']
    self.epochs = kwargs['epochs']
    self.batch_size = kwargs['batch_size']

    # Model params
    self.hidden_size = kwargs['hidden_size']

    # Data params
    self.vocab = Vocabulary()
    self.races = Races()
    self.genders = Genders()

    # Initialization
    self.dataset = self.init_dataset()
    self.train_loder = self.init_loader()  # NOTE: attribute name 'train_loder' (sic) — external code may rely on it
    self.model = self.init_model()
    self.criterion = self.init_criterion()
    self.optimizer = self.init_optimizer()

    # Initialize logging
    self.logger = Logger(os.path.join(PROJECT_ROOT, self.logfile))
def __init__(self, root_dir, hidden_size, lr, epochs, batch_size, device,
             logfile, verbose=1):
    """Set up data, model, optimiser and logging for training.

    Args:
        root_dir: base directory for the dataset.
        hidden_size: RNN hidden dimension.
        lr: learning rate.
        epochs: number of training epochs.
        batch_size: mini-batch size.
        device: compute device identifier.
        logfile: log file path relative to PROJECT_ROOT.
        verbose: verbosity level (default 1).
    """
    self.root_dir = root_dir
    self.device = device
    self.verbose = verbose
    self.logfile = logfile

    # Training params
    self.lr = lr
    self.epochs = epochs
    self.batch_size = batch_size

    # Model params
    self.hidden_size = hidden_size

    # Data params
    self.vocab = Vocabulary()
    self.races = Races()
    self.genders = Genders()

    # Initialization
    self.dataset = self.init_dataset()
    self.train_loder = self.init_loader()  # NOTE: attribute name 'train_loder' (sic) — external code may rely on it
    self.model = self.init_model()
    self.criterion = self.init_criterion()
    self.optimizer = self.init_optimizer()

    # Initialize logging
    self.logger = Logger(os.path.join(PROJECT_ROOT, logfile))
def init():
    """Resolve CLI flags, build the vocabulary and a GPU-friendly TF config.

    Returns:
        (options, vocab, multisense_vocab, tf_config)
    """
    options = tf.app.flags.FLAGS
    os.environ['CUDA_VISIBLE_DEVICES'] = options.gpus

    if not options.model_dir:
        raise Exception('You need to specify --model_dir')
    # Default the vocab path to <model_dir>/vocab.txt when unset.
    if not options.vocab_path:
        options.vocab_path = os.path.join(options.model_dir, 'vocab.txt')
    # Pick up <model_dir>/n_senses.txt only if it actually exists.
    if not options.n_senses_file:
        candidate = os.path.join(options.model_dir, 'n_senses.txt')
        if os.path.exists(candidate):
            options.n_senses_file = candidate

    vocab = Vocabulary(options.vocab_path,
                       min_occurrences=options.min_occurrences_for_vocab)
    multisense_vocab = get_multisense_vocab(options.n_senses_file, vocab,
                                            options)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    tf_config.allow_soft_placement = True
    return options, vocab, multisense_vocab, tf_config
"model_name": "test", "stop_words_file": None, # use sub-sampling instead "n_epochs": 10, "data_path": "data/hansards/training.en", "use_cuda": False, "batch_size": 500, } ################# locals().update(params) stop_words = None if stop_words_file: stop_words = read_stop_words(stop_words_file) sentences = SentenceIterator(data_path, stop_words=stop_words) vocab = Vocabulary(sentences, max_size = vocab_size) sgm = SkipGramModel(vocab, embedding_dim, use_cuda=use_cuda) optimizer = optim.SparseAdam(sgm.parameters()) tictoc = utils.TicToc() epoch_losses = [] for epoch in np.arange(1, n_epochs + 1): print("Running epoch: ", epoch) epoch_loss = utils.Mean() for batch in batch_iterator(sentences, vocab, batch_size, n_negative): batch_center, batch_context, negative_words = batch optimizer.zero_grad() loss = sgm.forward(batch_center, batch_context, negative_words) epoch_loss.add(loss.item())
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Runs under torch.distributed (DDP): each process handles one local rank;
    only rank 0 checkpoints, logs epoch summaries and writes predictions.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    local_rank = args.local_rank
    # world_size = torch.cuda.device_count() # assume all local GPUs

    # Set up distributed process group
    rank = setup_dist(local_rank)

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    #model = model.to(rank)
    #model = DDP(model, device_ids=[rank], output_device=rank)
    num_pretrained = model.load_pretrained_embeddings(
        vocabulary, args.embedding_path
    )
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(
        f'initialized {num_pretrained}/{len(vocabulary)} '
        f'embeddings ({pct_pretrained}%)'
    )
    print()

    # device = torch.device(f'cuda:{rank}')
    # Move the model to this process's device, then wrap in DDP.
    model = model.to(rank)
    model = DDP(model, device_ids=[rank], output_device=rank)
    # if args.use_gpu:
    #     model = cuda(args, model)

    # Optionally resume from a checkpoint, remapping cuda:0 tensors onto
    # this rank's device.
    if args.resume and args.model_path:
        map_location = {"cuda:0": "cuda:{}".format(rank)}
        model.load_state_dict(
            torch.load(args.model_path, map_location=map_location))

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            # NOTE(review): RuntimeError is treated as an NCCL wait timeout
            # and exits the process — other RuntimeErrors are caught too.
            try:
                train_loss = train(args, epoch, model, train_dataset)
            except RuntimeError:
                print(f'NCCL Wait Timeout, rank: \'{args.local_rank}\' (exit)')
                exit(1)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model. Only rank 0 tracks history and saves.
            if rank == 0:
                eval_history.append(eval_loss < best_eval_loss)
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    torch.save(model.state_dict(), args.model_path)

                print(
                    f'epoch = {epoch} | '
                    f'train loss = {train_loss:.6f} | '
                    f'eval loss = {eval_loss:.6f} | '
                    f"{'saving model!' if eval_history[-1] else ''}"
                )

                # If early stopping conditions are met, stop training.
                # NOTE(review): nesting of this block under the rank-0 guard
                # is inferred from the collapsed source — confirm upstream.
                if _early_stop(args, eval_history):
                    suffix = 's' if args.early_stop > 1 else ''
                    print(
                        f'no improvement after {args.early_stop} epoch{suffix}. '
                        'early stopping...'
                    )
                    print()
                    cleanup_dist()
                    break

    if args.do_test and rank == 0:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = (
            'python3 evaluate.py '
            f'--dataset_path {args.dev_path} '
            f'--output_path {args.output_path}'
        )
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Variant with an optional on-disk vocabulary, a character vocabulary for
    char-level embeddings, tensorboard logging, and fine-tuning support.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    if args.vocab_path != None:
        print("loading vocabulary from file at {}".format(args.vocab_path))
        vocabulary = Vocabulary(train_dataset.samples, args.vocab_size,
                                load_from_file=True, filepath=args.vocab_path)
    else:
        print("constructing the vocab from dataset examples")
        vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    # Character vocabulary size feeds the char-embedding layer.
    args.char_vocab_size = vocabulary.numCharacters()
    print(f'vocab words = {len(vocabulary)}')
    print(f'num characters = {args.char_vocab_size}')

    # Print number of samples.
    num_train_samples = len(train_dataset)
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    # load the model from previous checkpoint
    if args.finetune >= 1:
        print("preparing to load {} as base model".format(args.init_model))
        model.load_state_dict(torch.load(args.init_model,
                                         map_location='cpu'))

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # create tensorboard summary writer
        train_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_train"))
        valid_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_valid"))

        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            train_loss = train(args, epoch, model, train_dataset,
                               train_writer, num_train_samples)
            eval_loss = evaluate(args, epoch, model, dev_dataset)
            # write the loss to tensorboard
            valid_writer.add_scalar("valid_loss", eval_loss,
                                    global_step=epoch)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
def main():
    """Entry point: parse CLI args, set up logging and seeds, prepare the
    poetry dataset and vocabulary, build the LSTM model, and train it."""
    parser = argparse.ArgumentParser(
        description='PyTorch RNNs for Poetry Generation')
    # data arguments
    parser.add_argument('--datadir', default='data', help='path to dataset',
                        type=str)
    parser.add_argument('--rawdir', default=None, help='path to raw dataset',
                        type=str)
    parser.add_argument('--logdir', default='log', help='path to log',
                        type=str)
    parser.add_argument('--tag', default='tang',
                        help='poetry type for the project.', type=str)
    parser.add_argument('--wordnum', default=5,
                        help='The number of poetry words in the sentences.',
                        type=int)
    parser.add_argument('--sentnum', default=4,
                        help='The number of poetry sentences.', type=int)
    parser.add_argument('--max-len', default=20,
                        help='The number of poetry titles.', type=int)
    parser.add_argument('--embedding-dim', default=300,
                        help='The dimension of embedding .', type=int)
    parser.add_argument('--hidden-dim', default=150,
                        help='The dimension of hidden .', type=int)
    parser.add_argument('--num_layers', default=2, help='The rnn layers.',
                        type=int)
    parser.add_argument('--batch-size', default=30,
                        help='The batch-size of the dataset.', type=int)
    parser.add_argument('--data-workers', type=int, default=5,
                        help='Number of subprocesses for data loading')
    parser.add_argument('--epoches', default=50,
                        help='The batch-size of the dataset.', type=int)
    parser.add_argument('--bidirectional', action='store_true',
                        help='Whether using bidirectional RNNs')
    parser.add_argument('--lr', default=0.001, type=float, metavar='LR',
                        help='initial learning rate')
    parser.add_argument('--seed', default=123, type=int,
                        help='random seed (default: 123)')
    # --cuda / --no-cuda toggle the same flag; default is cuda=True.
    cuda_parser = parser.add_mutually_exclusive_group(required=False)
    cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
    cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false')
    parser.set_defaults(cuda=True)
    args = parser.parse_args()

    # preparing log
    # logging defination
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # Model (and log file) name is the start timestamp, e.g. 202401011230.
    model_name = time.strftime("%Y%m%d%H%M", time.localtime(time.time()))
    log_dir = os.path.join(
        os.getcwd(),
        args.logdir,
    )
    if not os.path.exists(log_dir):
        # NOTE(review): os.mkdir fails if intermediate dirs are missing —
        # os.makedirs would be safer; confirm expected layout.
        os.mkdir(log_dir)
    log_file = os.path.join(log_dir, model_name + ".log")
    fh = logging.FileHandler(log_file, mode="w")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
    )
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info(args)

    # Device selection and seeding.
    args.cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    # NOTE(review): raw data dir is mandatory; `assert` is stripped under
    # python -O — an explicit check would be more robust.
    assert (args.rawdir is not None)

    # preparing dataset
    poetry_path = os.path.join(
        args.datadir,
        "poet.%s._%d_%d.json" % (args.tag, args.sentnum, args.wordnum))
    if os.path.exists(poetry_path):
        logger.info("The poetry dataset has been built in path: %s" %
                    poetry_path)
    else:
        logger.info("Preparing poetry...")
        processPoetry(args.rawdir, args.datadir, sentNum=args.sentnum,
                      wordsNum=args.wordnum, max_title_len=args.max_len,
                      tag=args.tag)
        logger.info("Poetry processed!")

    # preparing vocabulary
    vocab_path = os.path.join(args.datadir, "vocab.txt")
    if os.path.exists(vocab_path):
        logger.info("The vocabulary has been built in path: %s" %
                    os.path.join(args.datadir, "vocab.txt"))
    else:
        logger.info("Building vocabulary...")
        build_vocabulary(args.rawdir, args.datadir)
        logger.info("The vocabulary has been built.")
    VocabDataSet = Vocabulary(vocab_path)
    PoetryDataSet = Poetry(VocabDataSet, args.max_len, poetry_path)

    # preparing model
    model = LSTMPoetry(vocab_size=len(VocabDataSet),
                       embedding_dim=args.embedding_dim,
                       hidden_dim=args.hidden_dim,
                       sents_len=args.sentnum,
                       num_layers=args.num_layers,
                       name=model_name)
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device), criterion.to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # training process
    logger.info("Begin training model!")
    train(model, PoetryDataSet, criterion, optimizer, args, device)
    logger.info("End training model!")
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object. Fields read include use_gpu, train_path,
            dev_path, vocab_size, embedding_path, model, model_path,
            do_train, do_test, epochs, early_stop, output_path.
            NOTE: args.vocab_size and args.pad_token_id are mutated below.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()
    # Set up datasets.
    train_dataset = QADataset(args, args.train_path, is_train=True)
    dev_dataset = QADataset(args, args.dev_path, is_train=False)
    print("Start creating vocabulary and tokenizer")
    # Create vocabulary and tokenizer. The vocabulary is built from both the
    # kept and the culled training samples, capped at args.vocab_size entries.
    vocabulary = Vocabulary(
        train_dataset.samples + train_dataset.culled_samples,
        args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    # Both datasets tokenize with the training vocabulary so dev shares ids.
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    # Overwrite args.vocab_size with the realized vocabulary size so the
    # model is constructed with the actual number of embedding rows needed.
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')
    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()
    # Select model and seed its embedding matrix from the pre-trained
    # vectors at args.embedding_path; report the coverage ratio.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(
        vocabulary, args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()
    if args.use_gpu:
        model = cuda(args, model)
    # Count only trainable parameters.
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    if args.do_train:
        # Track training statistics for checkpointing: eval_history[k] is
        # True iff epoch k+1 improved on the best eval loss seen so far
        # (consumed by _early_stop below).
        eval_history = []
        best_eval_loss = float('inf')
        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            train_loss = train(args, epoch, model, train_dataset)
            eval_loss = evaluate(args, epoch, model, dev_dataset)
            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)
            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")
            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break
    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
def main(mode='test', question=None, answers=None): """ This function is used to train, predict or test Args: mode (str): train/preddict/test question (str): this contains the question answers (list): this contains list of answers in string format Returns: index (integer): index of the most likely answer """ # get the train and predict model model vocabulary = Vocabulary("./data/vocab_all.txt") embedding_file = "./data/word2vec_100_dim.embeddings" qa_model = QAModel() train_model, predict_model = qa_model.get_bilstm_model( embedding_file, len(vocabulary)) epoch = 1 if mode == 'train': for i in range(epoch): print('Training epoch', i) # load training data qa_data = QAData() questions, good_answers, bad_answers = qa_data.get_training_data() # train the model Y = np.zeros(shape=(questions.shape[0], )) train_model.fit([questions, good_answers, bad_answers], Y, epochs=1, batch_size=64, validation_split=0.1, verbose=1) # save the trained model train_model.save_weights('model/train_weights_epoch_' + str(epoch) + '.h5', overwrite=True) predict_model.save_weights('model/predict_weights_epoch_' + str(epoch) + '.h5', overwrite=True) elif mode == 'predict': # load the evaluation data data = pickle.load(open("./data/dev.pkl", 'rb')) random.shuffle(data) # load weights from trained model qa_data = QAData() predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5') c = 0 c1 = 0 for i, d in enumerate(data): print(i, len(data)) # pad the data and get it in desired format indices, answers, question = qa_data.process_data(d) # get the similarity score sims = predict_model.predict([question, answers]) n_good = len(d['good']) max_r = np.argmax(sims) max_n = np.argmax(sims[:n_good]) r = rankdata(sims, method='max') c += 1 if max_r == max_n else 0 c1 += 1 / float(r[max_r] - r[max_n] + 1) precision = c / float(len(data)) mrr = c1 / float(len(data)) print("Precision", precision) print("MRR", mrr) elif mode == 'test': # question and answers come from params qa_data = QAData() 
answers, question = qa_data.process_test_data(question, answers) # load weights from the trained model predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5') # get similarity score sims = predict_model.predict([question, answers]) max_r = np.argmax(sims) return max_r
"stopping_loss": 1 } ################# locals().update(params) en_stop_words, fr_stop_words = None, None if en_stop_words_path: en_stop_words = read_stop_words(en_stop_words_path) if fr_stop_words_path: fr_stop_words = read_stop_words(fr_stop_words_path) en_sentences = SentenceIterator(en_data_path, stop_words=en_stop_words) fr_sentences = SentenceIterator(fr_data_path, stop_words=fr_stop_words) en_vocab = Vocabulary(en_sentences, max_size=vocab_x) fr_vocab = Vocabulary(fr_sentences, max_size=vocab_y) eam = EmbedAlignModel(en_vocab, fr_vocab, embedding_dim, random_state=random_state, use_cuda=use_cuda) optimizer = optim.Adam(eam.parameters()) tictoc = utils.TicToc() epoch_losses = [] for epoch in np.arange(1, n_epochs + 1): print("Running epoch: ", epoch) epoch_loss = utils.Mean()
head = '%(asctime)-15s %(message)s' ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] if args.gpus else [mx.gpu()] ngpus = len(ctx) rescale_loss = args.bptt # logging logging.basicConfig(level=logging.INFO, format=head) logging.info(args) logging.debug(sys.argv) # seeding mx.random.seed(args.seed) np.random.seed(args.seed) # data vocab = Vocabulary.from_file(args.vocab) ntokens = vocab.num_tokens train_data = mx.io.PrefetchingIter(MultiSentenceIter(args.data, vocab, args.batch_size * ngpus, args.bptt)) # model model = Model(args, ntokens, rescale_loss) train_loss_and_states = model.train() eval_loss_and_states = model.eval() # training module data_names, label_names = ['data', 'mask'], ['label'] eval_state_names = model.state_names num_sample_names = len(model.sample_names) train_state_names = model.state_names + model.sample_names module = CustomModule(symbol=train_loss_and_states, context=ctx,
class RNNLayerGenerator(Generator):
    """Generate fantasy-style character names with an RNN, conditioned on a
    start letter, a race and a gender.

    Each input is mapped through a Compose pipeline (value -> index ->
    one-hot -> tensor) and fed to ``self.model`` one character at a time
    until the terminator character ``'.'`` is sampled or ``max_len``
    characters have been produced.
    """

    def __init__(self, model_path, device="cpu", max_len=50, verbose=1):
        """
        Args:
            model_path: path to the trained model (loaded by the
                ``Generator`` base class).
            device: device string the input tensors are moved to.
            max_len: maximum number of characters sampled per name.
            verbose: verbosity flag (stored; not used in this class).
        """
        super().__init__(model_path, device)
        self.max_len = max_len
        self.verbose = verbose
        self.vocab = Vocabulary()
        self.races = Races()
        self.genders = Genders()
        self.to_tensor = ToTensor()
        # Each transform pipeline: raw value -> index -> one-hot -> tensor.
        self.name_transform = Compose([self.vocab, OneHot(self.vocab.size), ToTensor()])
        self.race_transform = Compose([self.races, OneHot(self.races.size), ToTensor()])
        self.gender_transform = Compose([self.genders, OneHot(self.genders.size), ToTensor()])

    def _init_random_input(self, skip_random_gen=None):
        """Draw a random (letter, race, gender) triple.

        Args:
            skip_random_gen: optional collection of option names out of
                ('letter', 'race', 'gender') that should NOT be randomized;
                the corresponding return slot stays ''. ``None`` (or an
                empty collection) randomizes all three.

        Returns:
            Tuple ``(letter, race, gender)``; skipped entries are ''.
        """
        # Fix: the original used a mutable default argument ([]) and `is`
        # string comparisons (identity, not equality); both replaced with
        # behaviorally identical safe forms.
        if skip_random_gen is None:
            skip_random_gen = []
        letter = ''
        gender = ''
        race = ''
        if not skip_random_gen:
            letter = np.random.choice(self.vocab.start_letters)
            race = np.random.choice(self.races.available_races)
            gender = np.random.choice(self.genders.available_genders)
        else:
            for opt in ('letter', 'race', 'gender'):
                if opt not in skip_random_gen:
                    if opt == 'letter':
                        letter = np.random.choice(self.vocab.start_letters)
                    elif opt == 'race':
                        race = np.random.choice(self.races.available_races)
                    elif opt == 'gender':
                        gender = np.random.choice(self.genders.available_genders)
        return letter, race, gender

    def _transform_input(self, letter, race, gender):
        """Transform raw inputs into one-hot tensors on ``self.device``."""
        letter_tensor = self.name_transform(letter).to(self.device)
        race_tensor = self.race_transform(race).to(self.device)
        gender_tensor = self.gender_transform(gender).to(self.device)
        return letter_tensor, race_tensor, gender_tensor

    def _expand_dims(self, *tensors):
        """Add a batch dimension along the 0-axis of each tensor."""
        return [torch.unsqueeze(t, 0) for t in tensors]

    def sample(self, letter, race, gender):
        """Sample one name starting with ``letter`` for the given race and
        gender.

        Args:
            letter: start letter; must be in ``self.vocab.start_letters``.
            race: must be in ``self.races.available_races``.
            gender: must be in ``self.genders.available_genders``.

        Returns:
            The generated name as a string (terminator not included).
        """
        with torch.no_grad():
            # NOTE(review): `assert` is stripped under `python -O`; kept
            # as-is so callers expecting AssertionError are unaffected.
            assert letter in self.vocab.start_letters, "Invalid letter"
            assert race in self.races.available_races, "Invalid race"
            assert gender in self.genders.available_genders, "Invalid gender"
            # Prepare inputs.
            letter_t, race_t, gender_t = self._transform_input(letter, race, gender)
            letter_t, race_t, gender_t = self._expand_dims(letter_t, race_t, gender_t)
            # Merge all input tensors along the feature axis.
            input = torch.cat([letter_t, race_t, gender_t], 2)
            outputs = [letter]
            # Initialize hidden states.
            hx, cx = self.model.init_states(batch_size=1, device=self.device)
            while True:
                output, hx, cx = self.model(input, hx, cx, lengths=torch.tensor([1]))
                # Sample the next character from the model's distribution.
                sample = OneHotCategorical(logits=output).sample()
                index = torch.argmax(sample)
                char = self.vocab.get_char(index.item())
                # '.' acts as the end-of-name terminator.
                if char == '.' or len(outputs) == self.max_len:
                    break
                outputs.append(char)
                # Feed the sampled one-hot back in with the same
                # race/gender conditioning.
                input = torch.cat([sample, race_t, gender_t], 2)
            name = ''.join(map(str, outputs))
            return name

    def generate(self, num_samples, in_race, in_gender):
        """Sample ``num_samples`` random names.

        Args:
            num_samples: number of names to generate.
            in_race: fixed race for every name, or '' to randomize per name.
            in_gender: fixed gender for every name, or '' to randomize.

        Returns:
            List of ``[name, race, gender]`` lists.
        """
        gen_names = []
        ran_gen_names = []
        # Fix: the original compared strings with `is not ''` (identity);
        # replaced with equality, which is what was intended.
        if in_race != '':
            ran_gen_names.append('race')
        if in_gender != '':
            ran_gen_names.append('gender')
        for _ in range(num_samples):
            letter, race, gender = self._init_random_input(ran_gen_names)
            # A skipped option comes back as '', so this concatenation
            # selects the caller-provided value when one was given.
            race = race + in_race
            gender = gender + in_gender
            gen_name = self.sample(letter, race, gender)
            gen_names.append([gen_name, race, gender])
        return gen_names