def main(args):
    # with open(args.output_dir / 'config.json') as f:
    #     config = json.load(f)

    # loading datasets from jsonl files
    # with open(config['train']) as f:
    #     train = [json.loads(line) for line in f]
    with open(args.valid_data_path) as f:
        valid = [json.loads(line) for line in f]
    # with open(config['test']) as f:
    #     test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open('embedding2.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'valid_seq2seq.pkl',
                           tokenizer.pad_token_id)
def __init__(self, topics_path, min_depth=0, max_depth=None, metadata=True,
             lemmatization=True, use_stop=True, pattern=None,
             exclude_pattern=None, **kwargs):
    super(TrecTopics, self).__init__(topics_path,
                                     dictionary={},
                                     metadata=metadata,
                                     min_depth=min_depth,
                                     max_depth=max_depth,
                                     pattern=pattern,
                                     exclude_pattern=exclude_pattern,
                                     lines_are_documents=True,
                                     **kwargs)
    self.topics = {}
    self.topics_vecs = None
    self.topic_row_maps = {}
    self.oov = {}
    self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                               maximum_len=TOKEN_MAX_LEN,
                               lowercase=True,
                               output_lemma=lemmatization,
                               use_stopwords=use_stop,
                               extra_stopwords=EXTRA_STOPWORDS)
def main(args):
    train_df = pd.read_pickle(args.train_data)
    valid_df = pd.read_pickle(args.valid_data)

    tokenizer = Tokenizer()
    tokenizer.fit_word(train_df.repl_words.tolist())

    train_sentences_idx = sentence_preprocessing(train_df, tokenizer)
    valid_sentences_idx = sentence_preprocessing(valid_df, tokenizer)

    bi_lm_model = BiLM(args.word_emb_size, args.lstm_unit_size,
                       len(tokenizer.vocab_word))
    if torch.cuda.device_count() > 1:
        print("Use", torch.cuda.device_count(), "GPUs.")
        bi_lm_model = torch.nn.DataParallel(bi_lm_model)
    elif torch.cuda.device_count() == 1:
        print("Use single GPU.")
    else:
        print("Use CPU.")
    bi_lm_model.to(device)

    bi_lm_model = train(bi_lm_model, train_sentences_idx, valid_sentences_idx,
                        args.epochs, args.batch_size, args.early_stopping)
    torch.save(bi_lm_model.state_dict(), args.output)
def inference_random():
    # Load the validation set and check the model's predictions on it
    model = ClassificationModel(len(cfg.char2idx))
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)
    tokenizer = Tokenizer(cfg.char2idx)

    error = 0
    with open(cfg.test_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        pairs = line.split('\t')
        label, text = pairs[0], pairs[1]
        input_index, _ = tokenizer.encode(text, max_length=cfg.max_seq_len)
        inputs = torch.tensor(input_index).unsqueeze(0)
        inputs_mask = (inputs > 0).to(torch.float32)
        with torch.no_grad():
            scores = model(inputs, inputs_mask)
        prediction = scores.argmax(-1).item()
        if prediction != int(label):
            print(scores[:, int(label)].item())
            print(label)
            print(text)
            print('-' * 50)
            error += 1
    print(error)
def main(args):
    # loading datasets from jsonl files
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    """
    embedding = Embedding("./glove.6B.300d.txt", words=words)
    with open('./embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    """
    with open('./embedding.pkl', 'rb') as file:
        embedding = pickle.load(file)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'data.pkl',
                           tokenizer.pad_token_id)
def preProcess():
    print('PreProcess Reuters Corpus')
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()

    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)

    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print("Normalised %d documents" % docs)
                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")
                    except UnicodeDecodeError:
                        bad += 1

    print("Normalised %d documents" % docs)
    print("Skipped %d bad documents" % bad)
    print('Finished building train file ' + Paths.texts_clean)

    end_time = time.time()
    print('(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time))
def get_dataloaders(args):
    model_prefix = '{}_{}'.format(args.model_type, args.train_id)

    log_path = args.LOG_DIR + model_prefix + '/'
    checkpoint_path = args.CHK_DIR + model_prefix + '/'
    result_path = args.RESULT_DIR + model_prefix + '/'
    cp_file = checkpoint_path + "best_model.pth.tar"
    init_epoch = 0

    if not os.path.exists(log_path):
        os.makedirs(log_path)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    ## set up the logger
    set_logger(os.path.join(log_path, 'train.log'))

    ## save argparse parameters
    with open(log_path + 'args.yaml', 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))

    logging.info('Training model: {}'.format(model_prefix))

    ## set up vocab txt
    # create txt here
    print('running setup')
    setup(args, clear=True)
    print(args.__dict__)

    # indicate src and tgt language
    if args.source_language == 'en':
        src, tgt = 'en', 'zh'
    else:
        src, tgt = 'zh', 'en'

    maps = {'en': args.TRAIN_VOCAB_EN, 'zh': args.TRAIN_VOCAB_ZH}
    vocab_src = read_vocab(maps[src])
    tok_src = Tokenizer(language=src, vocab=vocab_src,
                        encoding_length=args.MAX_INPUT_LENGTH)
    vocab_tgt = read_vocab(maps[tgt])
    tok_tgt = Tokenizer(language=tgt, vocab=vocab_tgt,
                        encoding_length=args.MAX_INPUT_LENGTH)
    logging.info('Vocab size src/tgt:{}/{}'.format(len(vocab_src), len(vocab_tgt)))

    ## Setup the training, validation, and testing dataloaders
    train_loader, val_loader, test_loader = create_split_loaders(
        args.DATA_DIR, (tok_src, tok_tgt), args.batch_size,
        args.MAX_VID_LENGTH, (src, tgt), num_workers=4, pin_memory=True)
    logging.info('train/val/test size: {}/{}/{}'.format(
        len(train_loader), len(val_loader), len(test_loader)))

    return (train_loader, val_loader, test_loader, tok_src, tok_tgt,
            len(vocab_src), len(vocab_tgt))
def main(args):
    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in test]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.test_output,
        tokenizer.pad_token_id
    )
def main(path):
    with open(path) as f:
        test = [json.loads(line) for line in f]

    with open("./datasets/seq_tag/embedding.pkl", "rb") as f:
        embedding = pickle.load(f)

    tokenizer = Tokenizer(embedding.vocab, lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           './datasets/seq_tag/test.pkl')
def main(args):
    with open(args.output_dir / 'config.json') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    with open(config['train']) as f:
        train = [json.loads(line) for line in f]
    with open(config['valid']) as f:
        valid = [json.loads(line) for line in f]
    with open(config['test']) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = (
        [sample['text'] for sample in train]
        + [sample['summary'] for sample in train]
        + [sample['text'] for sample in valid]
        + [sample['text'] for sample in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=config['lower_case'])
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    embedding = Embedding(config['embedding'], words=words)
    with open(args.output_dir / 'embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating train dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, train),
        args.output_dir / 'train.pkl',
        config,
        tokenizer.pad_token_id
    )
    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        args.output_dir / 'valid.pkl',
        config,
        tokenizer.pad_token_id
    )
    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.output_dir / 'test.pkl',
        config,
        tokenizer.pad_token_id
    )
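# A minimal, illustrative entry point for the main() above. This is a sketch,
# assuming output_dir is the only required argument and that it must be a
# pathlib.Path (main() joins it with '/'); the real script may parse more
# arguments or configure logging differently.
if __name__ == '__main__':
    import argparse
    import logging
    from pathlib import Path

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Preprocess seq2seq datasets listed in config.json')
    parser.add_argument('output_dir', type=Path,
                        help='directory containing config.json; also receives the *.pkl outputs')
    main(parser.parse_args())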
def main(args):
    # Read test file
    with open(args.input_dataname) as f:
        test = [json.loads(line) for line in f]

    # Read embedding
    with open(str(args.output_dir) + '/embedding_tag.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           args.output_dir / 'test_tag.pkl',
                           tokenizer.pad_token_id)
def tokenize_sentence(train, test, w2v_model, max_len=6):
    data = pd.concat([train["sentence"], test["sentence"]]).values
    tok = Tokenizer(max_features=15000, max_len=max_len)
    tokens = tok.fit_transform(data)
    # n = len(train)
    # train_tokens = tokens[:n]
    # test_tokens = tokens[n:]

    vocab_len = tok.vocabulary_size()
    idx_to_word = {v: k for k, v in tok.vocab_idx.items()}
    embedding_matrix = np.zeros((vocab_len + 1, W2V_CONFIG["vector_size"]))
    for i in range(1, vocab_len):  # index 0 is left as the all-zero row
        embedding_matrix[i] = w2v_model[idx_to_word[i]]

    return tok, embedding_matrix
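# A hedged sketch of a more defensive way to build the embedding matrix above:
# words missing from the word-vector model keep an all-zero row instead of
# raising a KeyError. It assumes the model supports `in` and `[]` lookups (as
# gensim KeyedVectors does) and skips index 0 exactly as tokenize_sentence()
# does; the function name and signature are illustrative, not from the original.
import numpy as np

def build_embedding_matrix(idx_to_word, w2v_model, vocab_len, vector_size):
    matrix = np.zeros((vocab_len + 1, vector_size))
    for i in range(1, vocab_len):
        word = idx_to_word[i]
        if word in w2v_model:
            matrix[i] = w2v_model[word]
        # out-of-vocabulary words keep the zero vector
    return matrix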
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
def test_submission(path_type, max_episode_len, history, MAX_INPUT_LENGTH,
                    feedback_method, n_iters, model_prefix, blind):
    ''' Train on combined training and validation sets, and generate test
    submission. '''
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAINVAL_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size,
                         splits=['train', 'val_seen', 'val_unseen'],
                         tokenizer=tok, path_type=path_type, history=history,
                         blind=blind)

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size,
                          padding_idx, dropout_ratio,
                          bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(action_embedding_size, hidden_size,
                              dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, path_type, history,
          feedback_method, max_episode_len, MAX_INPUT_LENGTH, model_prefix)

    # Generate test submission
    test_env = R2RBatch(features, batch_size=batch_size, splits=['test'],
                        tokenizer=tok, path_type=path_type, history=history,
                        blind=blind)
    agent = Seq2SeqAgent(test_env, "", encoder, decoder, max_episode_len)
    agent.results_path = '%s%s_%s_iter_%d.json' % (RESULT_DIR, model_prefix,
                                                   'test', 5000)
    agent.test(use_dropout=False, feedback='argmax')
    agent.write_results()
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train'],
                         tokenizer=tok)

    # Create validation environments
    val_envs = {
        split: (R2RBatch(features, batch_size=batch_size, splits=[split],
                         tokenizer=tok),
                Evaluation([split]))
        for split in ['val_seen', 'val_unseen']
    }

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size,
                          padding_idx, dropout_ratio,
                          bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(),
                              Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size,
                              dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, val_envs=val_envs)
def exact_adaptor(parser):
    """ exact matching """
    tokenizer = Tokenizer()

    def method(_, queries):
        nearests = list()
        for query in tqdm(queries):
            protocol = query[0]
            toked_protocol = tokenizer.tokenize(protocol[0])
            catprotocol = ' '.join(toked_protocol)
            toked_candidates = [q[0] for q in query[1:]]
            if not len(toked_protocol):
                nearests.append(None)
                continue
            max_matched = 0
            max_idx = None
            for (idx, toked_can) in enumerate(toked_candidates):
                if ' '.join(toked_can) in catprotocol and len(toked_can) > max_matched:
                    max_idx = idx
                    max_matched = len(toked_can)
            nearests.append(max_idx)
        return nearests

    return method
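# A small, self-contained illustration of the matching rule used by method()
# above: among the candidate token lists, pick the longest one whose
# space-joined form appears as a substring of the space-joined protocol
# tokens. The tokens below are made up for the example; the real pipeline
# obtains them from Tokenizer.tokenize().
protocol_tokens = ['mix', 'the', 'sample', 'with', 'buffer']
candidates = [['the', 'sample'], ['sample', 'with', 'buffer'], ['vortex']]

catprotocol = ' '.join(protocol_tokens)
best_idx, best_len = None, 0
for idx, cand in enumerate(candidates):
    if ' '.join(cand) in catprotocol and len(cand) > best_len:
        best_idx, best_len = idx, len(cand)

print(best_idx)  # 1 -> ['sample', 'with', 'buffer'] is the longest match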
def create_train_data(data_dir, config):
    from utils import Tokenizer, get_logger
    logger = get_logger('log', './log/log.txt')
    t = Tokenizer(logger)
    model = Data.pre_process_data(data_dir, t, config, logger)
    model.create_tf_record_file(model.sample_file)
    return model
def finetune():
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    if args.fast_train:
        feat_dict = read_img_features(features_fast)
    else:
        feat_dict = read_img_features(features)
    candidate_dict = utils.read_candidates(CANDIDATE_FEATURES)
    featurized_scans = set(
        [key.split("_")[0] for key in list(feat_dict.keys())])

    train_env = R2RBatch(feat_dict, candidate_dict, batch_size=args.batchSize,
                         splits=['train'], tokenizer=tok)
    print("The finetune data_size is : %d\n" % train_env.size())
    val_envs = {
        split: (R2RBatch(feat_dict, candidate_dict, batch_size=args.batchSize,
                         splits=[split], tokenizer=tok),
                Evaluation([split], featurized_scans, tok))
        for split in ['train', 'val_seen', 'val_unseen']
    }

    train(train_env, tok, args.iters, val_envs=val_envs)
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
def train():
    # Load the data
    char2idx, keep_tokens = load_chinese_base_vocab(cfg.vocab_path)
    tokenizer = Tokenizer(char2idx)
    # train_data = glob(cfg.train_data_path + '*')[16 * 1000 * 35:16 * 1000 * 40]
    train_data = glob(cfg.train_data_path + '*')[8 * 5000 * 5:8 * 5000 * 10]
    train_dataset = CustomDataset(train_data, tokenizer, cfg.max_seq_len)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.batch_size,
                                  collate_fn=padding,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True)
    # # debug
    # train_data = glob(cfg.test_data_path + '*')[:8 * 5000 * 5]
    # train_dataset = CustomDataset(train_data, tokenizer, cfg.max_seq_len)
    # train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, collate_fn=padding)
    # # debug

    # Load the model
    model = CustomUnilmModel(len(char2idx))
    # model = load_pretrained_bert(model, cfg.pretrained_model_path, keep_tokens=keep_tokens).to(cfg.device)
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)
    loss_function = nn.CrossEntropyLoss(ignore_index=0).to(cfg.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learn_rate)

    # Training loop
    iteration, train_loss = 0, 0
    model.train()
    for inputs, token_type, targets in tqdm(train_dataloader, position=0, leave=True):
        attention_mask = unilm_mask(inputs, token_type).to(cfg.device)
        inputs, token_type, targets = (inputs.to(cfg.device),
                                       token_type.to(cfg.device),
                                       targets.to(cfg.device))

        prediction = model(inputs, token_type, attention_mask)
        loss = loss_function(
            prediction[:, :-1, :].reshape(-1, prediction.shape[-1]),
            targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        iteration += 1
        if iteration % cfg.print_loss_steps == 0:
            eval_loss = evaluate(model, tokenizer, loss_function)
            print('')
            print('train_loss:{}'.format(train_loss / cfg.print_loss_steps))
            print('evalu_loss:{}'.format(eval_loss))
            test_string(s1, tokenizer, model)
            test_string(s2, tokenizer, model)
            model.train()
            train_loss = 0
        if iteration % cfg.save_model_steps == 0:
            torch.save(model.state_dict(), cfg.save_model_path)
def eval(args):
    batch_size = 32
    train_on_gpu = torch.cuda.is_available()

    enc = RNNEncoder(300, args.embedding_file)
    dec = RNNDecoder(300, args.embedding_file)
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    model = Seq2Seq(enc, dec, device).to(device)
    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    embedding_matrix = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=batch_size, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    val_losses = []
    prediction = {}
    for batch in tqdm(eval_loader):
        pred = model(batch, 0)
        pred = torch.argmax(pred, dim=2)  # batch, seq_len
        for i in range(len(pred)):
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]

    pred_output = [
        json.dumps({'id': key, 'predict': value})
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
def handle(self, *args, **options):
    tok_tag = self._parse_arg('tokenizer', 'jieba', options)
    sample = self._parse_arg('sample', 1, options)
    w2v = self._parse_arg('w2v', 0, options) > 0
    jtag = self._parse_arg('jtag', 0, options) > 0
    print(tok_tag, sample, w2v, jtag)

    print('Loading doc2vec model...')
    d2v_model = Doc2Vec.load(D2V_PATH)
    print('Model loaded.')

    w2v_model = None
    if w2v:
        print('Loading word2vec model...')
        w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
            W2V_PATH, binary=True, unicode_errors='ignore')
        print('Model loaded.')

    jiebatag_weight = {}
    if jtag:
        jtagweight = JiebaTagWeight.objects.all()
        for jt in jtagweight:
            jiebatag_weight[jt.name] = {
                'weight': jt.weight,
                'punish': jt.punish_factor
            }

    evaluator = Evaluator()
    for _ in range(sample):
        evaluator.draw()
        raw_push = evaluator.get_predict_push(
            tokenizer=tok_tag, w2v_model=w2v_model,
            jiebatag_weight=jiebatag_weight)
        topic = evaluator.get_topic_field('tokenized')
        topic_words = Tokenizer(tok_tag).cut(topic, pos=False)
        predict_words_ls = [
            Tokenizer(tok_tag).cut(push, pos=False)
            for push in raw_push if 'http' not in push
        ]
        print(topic_words)
        print(len(predict_words_ls))
        score = doc2vec_ndcg(topic_words, predict_words_ls, d2v_model)
        print(score)
def make_more_train_env(args, train_vocab_path, train_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok)
    return train_env
def train_all(eval_type, seed, max_episode_len, max_input_length, feedback,
              n_iters, prefix, blind, debug, train_vocab, trainval_vocab,
              batch_size, action_embedding_size, target_embedding_size,
              bidirectional, dropout_ratio, weight_decay, feature_size,
              hidden_size, word_embedding_size, lr, result_dir, snapshot_dir,
              plot_dir, train_splits, test_splits):
    ''' Train on the training set, and validate on the test split. '''
    setup(seed, train_vocab, trainval_vocab)

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(train_vocab if eval_type == 'val' else trainval_vocab)
    tok = Tokenizer(vocab=vocab, encoding_length=max_input_length)
    train_env = R2RBatch(batch_size=batch_size, splits=train_splits,
                         tokenizer=tok, seed=seed, blind=blind)

    # Create validation environments
    val_envs = {
        split: (R2RBatch(batch_size=batch_size, splits=[split], tokenizer=tok,
                         seed=seed, blind=blind),
                Evaluation([split], seed=seed))
        for split in test_splits
    }

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size,
                          padding_idx, dropout_ratio,
                          bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(),
                              Seq2SeqAgent.n_outputs(),
                              action_embedding_size, hidden_size,
                              dropout_ratio, feature_size).cuda()
    train(eval_type, train_env, encoder, decoder, n_iters, seed, feedback,
          max_episode_len, max_input_length, prefix, blind, lr, weight_decay,
          result_dir, snapshot_dir, plot_dir, val_envs=val_envs, debug=debug)
def train_val_augment():
    """ Train the listener with the augmented data """
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    # Load the env img features
    feat_dict = read_img_features(features)
    featurized_scans = set(
        [key.split("_")[0] for key in list(feat_dict.keys())])

    # Load the augmentation data
    aug_path = args.aug

    # Create the training environment
    aug_env = R2RBatch(feat_dict, batch_size=args.batchSize,
                       splits=[aug_path], tokenizer=tok, name='aug')
    # import sys
    # sys.exit()
    train_env = R2RBatch(feat_dict, batch_size=args.batchSize,
                         splits=['train'], tokenizer=tok)

    # Printing out the statistics of the dataset
    stats = train_env.get_statistics()
    print("The training data_size is : %d" % train_env.size())
    print("The average instruction length of the dataset is %0.4f." % (stats['length']))
    print("The average action length of the dataset is %0.4f." % (stats['path']))
    stats = aug_env.get_statistics()
    print("The augmentation data size is %d" % aug_env.size())
    print("The average instruction length of the dataset is %0.4f." % (stats['length']))
    print("The average action length of the dataset is %0.4f." % (stats['path']))

    # Setup the validation data
    val_envs = {
        split: (R2RBatch(feat_dict, batch_size=args.batchSize,
                         splits=[split], tokenizer=tok),
                Evaluation([split], featurized_scans, tok))
        for split in ['train', 'val_seen', 'val_unseen']
    }

    # Start training
    train(train_env, tok, args.iters, val_envs=val_envs, aug_env=aug_env)
def main(args):
    with open(args.output_dir / 'config.json', 'r') as f:
        config = json.load(f)

    with open(args.input_data) as f:
        test = [json.loads(line) for line in f]

    with open(os.path.join(args.output_dir, "embedding.pkl"), 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           args.output_dir / 'test_seq.pkl',
                           config,
                           tokenizer.pad_token_id)
def __init__(self, data_base_dir, label_path, max_aspect_ratio,
             max_encoder_l_h, max_encoder_l_w, max_decoder_l, max_vocab_size,
             initial_id2voc, initial_voc2id):
    # folder with processed images
    self.data_base_dir = data_base_dir
    # .lst file with formulas
    self.label_path = label_path
    self.max_width = 10000
    self.max_aspect_ratio = max_aspect_ratio
    self.max_encoder_l_h = max_encoder_l_h
    self.max_encoder_l_w = max_encoder_l_w
    self.max_decoder_l = max_decoder_l
    self.min_aspect_ratio = 0.5
    self.vocab_size = max_vocab_size
    self.tokenizer = Tokenizer(initial_id2voc, initial_voc2id)
    # buffer to save groups of batches with same width and height
    self.buffer = defaultdict(lambda: defaultdict(list))
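# A minimal sketch (not from the original project) of how the nested
# defaultdict buffer above groups samples that share the same image height
# and width before batching; the (h, w) keys and sample names are illustrative.
from collections import defaultdict

buffer = defaultdict(lambda: defaultdict(list))
for name, h, w in [('img_0', 32, 128), ('img_1', 32, 128), ('img_2', 64, 256)]:
    buffer[h][w].append(name)

print(buffer[32][128])  # ['img_0', 'img_1'] -- ready to be emitted as one batch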
def train_val(test_only=False):
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup()
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)
    feat_dict = read_img_features(features, test_only=test_only)

    if test_only:
        featurized_scans = None
        val_env_names = ['val_train_seen']
    else:
        featurized_scans = set(
            [key.split("_")[0] for key in list(feat_dict.keys())])
        val_env_names = ['val_train_seen', 'val_seen', 'val_unseen']

    if not args.test_obj:
        print('Loading compact pano-caffe object features ... (~3 seconds)')
        import pickle as pkl
        with open('img_features/objects/pano_object_class.pkl', 'rb') as f_pc:
            pano_caffe = pkl.load(f_pc)
    else:
        pano_caffe = None

    train_env = R2RBatch(feat_dict, pano_caffe, batch_size=args.batchSize,
                         splits=['train'], tokenizer=tok)
    from collections import OrderedDict

    if args.submit:
        val_env_names.append('test')

    val_envs = OrderedDict(
        ((split,
          (R2RBatch(feat_dict, pano_caffe, batch_size=args.batchSize,
                    splits=[split], tokenizer=tok),
           Evaluation([split], featurized_scans, tok)))
         for split in val_env_names))

    if args.train == 'listener':
        train(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validlistener':
        if args.beam:
            beam_valid(train_env, tok, val_envs=val_envs)
        else:
            valid(train_env, tok, val_envs=val_envs)
    elif args.train == 'speaker':
        train_speaker(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validspeaker':
        valid_speaker(tok, val_envs)
    else:
        assert False
def __init__(self, f_abs, n_best=1, min_length=1, max_length=50, beam_size=4,
             bert_model='bert-base-uncased'):
    self.n_best = n_best
    self.min_length = min_length
    self.max_length = max_length
    self.beam_size = beam_size

    self.abs_model = self.load_abs_model(f_abs)
    self.eval()

    logger.info(f'Loading BERT Tokenizer [{bert_model}]...')
    self.tokenizerB = BertTokenizer.from_pretrained(bert_model)
    self.spt_ids_B, self.spt_ids_C, self.eos_mapping = get_special_tokens()

    logger.info('Loading custom Tokenizer for using WBMET embeddings')
    self.tokenizerC = Tokenizer(self.abs_model.args.vocab_size)
    self.tokenizerC.from_pretrained(self.abs_model.args.file_dec_emb)
def __init__(self, input, dictionary=None, merge_title=True,
             spacy_tokenizer=True, lines_are_documents=True, min_depth=0,
             max_depth=None, **kwargs):
    """
    Parameters
    ----------
    input : str
        Path to input file/folder.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        If a dictionary is provided, it will not be updated with the given corpus on initialization.
        If None - new dictionary will be built for the given corpus.
        If `input` is None, the dictionary will remain uninitialized.
    metadata : bool, optional
        If True - yield metadata with each document.
    merge_title : bool, optional
        If True - merge document's title into body text, if title exists.
    min_depth : int, optional
        Minimum depth in directory tree at which to begin searching for files.
    max_depth : int, optional
        Max depth in directory tree at which files will no longer be considered.
        If None - not limited.
    pattern : str, optional
        Regex to use for file name inclusion, all those files *not* matching this pattern will be ignored.
    exclude_pattern : str, optional
        Regex to use for file name exclusion, all files matching this pattern will be ignored.
    lines_are_documents : bool, optional
        If True - each line is considered a document, otherwise - each file is one document.
    kwargs : keyword arguments passed through to the `TextCorpus` constructor.
        See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.
    """
    super(TrecCorpus, self).__init__(input=input,
                                     dictionary=dictionary,
                                     metadata=True,
                                     lines_are_documents=lines_are_documents,
                                     min_depth=min_depth,
                                     max_depth=max_depth,
                                     **kwargs)
    self.merge_title = merge_title
    self.line_feeder = None
    self.spacy_tokenizer = spacy_tokenizer
    # if self.spacy_tokenizer:
    self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                               maximum_len=TOKEN_MAX_LEN,
                               lowercase=True,
                               output_lemma=True,
                               use_stopwords=True)
def main(argv):
    with open(CONFIG, 'r') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    testName = argv[1]
    with open(testName, 'r') as f:
        test = [json.loads(line) for line in f]

    tokenizer = Tokenizer(lower=config['lower_case'])

    logging.info('Loading embedding...')
    with open(ENBEDDINT_NAME, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           'testSeq2Seq.pkl',
                           config,
                           tokenizer.pad_token_id)