def make_dict_vocab_to_tensor(bert_config):
    """Map every dictionary vocabulary entry to its BERT output tensor."""
    # Read the vocab file and build the vocabulary list.
    vocab = tokenization.load_vocab(flags_vocab_file)
    # Build input_ids: one single-token sequence per vocabulary entry,
    # covering ids 0 .. len(vocab) - 1.
    input_ids = tf.constant([[i] for i in range(len(vocab))], dtype=tf.int32)
    # input_mask is 1 for every position (each sequence is one real token).
    input_mask = tf.ones_like(input_ids)
    # segment_ids are all 0 (single-segment input).
    segment_ids = tf.zeros_like(input_ids)
    # Build the BERT model once and feed the whole vocabulary as a batch,
    # instead of constructing a new graph for every token id.
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False)
    # Final hidden layer of BERT: [len(vocab), 1, hidden_size].
    final_hidden = model.get_sequence_output()
    # For debugging; remove when done.
    # final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    # batch_size = final_hidden_shape[0]
    # seq_length = final_hidden_shape[1]
    # hidden_size = final_hidden_shape[2]
    return final_hidden
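A minimal sketch of driving the function above, assuming TF1-style graph execution and a hypothetical local `bert_config.json`; restoring pretrained weights from a checkpoint is omitted for brevity.

import tensorflow as tf

import modeling

bert_config = modeling.BertConfig.from_json_file('bert_config.json')
embeddings_op = make_dict_vocab_to_tensor(bert_config)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One row per vocabulary id: shape [vocab_size, 1, hidden_size].
    vocab_embeddings = sess.run(embeddings_op)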
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    output_files = FLAGS.output_file.split(",")
    writers = [tf.python_io.TFRecordWriter(out) for out in output_files]
    rng = random.Random(FLAGS.random_seed)
    tokenizer = tokenization.WordpieceTokenizer(
        vocab=tokenization.load_vocab(FLAGS.vocab_file))
    estimator = get_embedding_estimator()
    sample = get_sample(FLAGS.input_sentence_file, FLAGS.input_mapping_file,
                        rng, FLAGS.sample_size)
    # Consecutive boundaries [0, 3000, 6000, ..., len(sample)] define batches.
    batches = list(range(0, len(sample), 3000)) + [len(sample)]
    for brange in zip(batches, batches[1:]):
        batch_sample = sample[brange[0]:brange[1]]
        instances = create_training_instances(
            FLAGS.input_sentence_file, FLAGS.input_mapping_file, tokenizer,
            FLAGS.max_seq_length, rng, FLAGS.do_lower_case, batch_sample)
        write_instance_to_example_files(instances, tokenizer,
                                        FLAGS.max_seq_length, writers,
                                        estimator)
    for writer in writers:
        writer.close()
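The batching above pairs consecutive boundaries with `zip`; a tiny self-contained sketch of the same idiom, with a hypothetical step of 4:

sample = list(range(10))
boundaries = list(range(0, len(sample), 4)) + [len(sample)]
for start, end in zip(boundaries, boundaries[1:]):
    print(sample[start:end])
# -> [0, 1, 2, 3]  [4, 5, 6, 7]  [8, 9]

Note that when len(sample) is an exact multiple of the step, the final pair degenerates to an empty slice, which is harmless here since an empty batch produces no instances.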
def is_ontology_id(id):
    vocab = tokenization.load_vocab(flags_vocab_file)
    inv_vocab = {v: k for k, v in vocab.items()}
    ontology_tokens = []
    for token in vocab.keys():
        if len(token) >= 2:
            # Ontology tokens are wrapped in '$...$' or '%...%'.
            if (token[0] == '$' and token[-1] == '$') or \
                    (token[0] == '%' and token[-1] == '%'):
                ontology_tokens.append(token)
    return tokenization.convert_ids_to_tokens(inv_vocab, [id])[0] in ontology_tokens
def main(args):
    vocabs = tokenization.load_vocab(args.vocab_file)
    # tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    phases = ['train']
    if args.need_dev:
        phases.append('dev')
    if not args.no_test:
        phases.append('test')
    for phase in phases:
        print('phase:', phase)
        prepare_ner(args, vocabs, phase)
def is_special_id(id):
    vocab = tokenization.load_vocab(vocab_pass)
    inv_vocab = {v: k for k, v in vocab.items()}
    special_tokens = []
    for token in vocab.keys():
        if len(token) >= 2:
            # Special tokens are wrapped in brackets, e.g. '[CLS]', '[SEP]'.
            if token[0] == '[' and token[-1] == ']':
                special_tokens.append(token)
    return tokenization.convert_ids_to_tokens(inv_vocab, [id])[0] in special_tokens
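A quick hypothetical check of the helper; ids 101 and 1996 map to '[CLS]' and 'the' in the standard uncased English BERT vocab, and the actual ids depend on the file behind vocab_pass.

print(is_special_id(101))   # True:  id 101 -> '[CLS]' (bracket-wrapped)
print(is_special_id(1996))  # False: id 1996 -> 'the' (ordinary wordpiece)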
def Memory_level_model_init(init_checkpoint):
    vocab_file = 'vocab.txt'
    embedding_dim = 128
    dropout_prob = 0.1
    processor = MemoryProcessor()
    label_list = processor.get_labels()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab_dim = len(tokenization.load_vocab(vocab_file))
    model = SequenceClassification(vocab_dim, embedding_dim, dropout_prob,
                                   len(label_list), device)
    if init_checkpoint is not None:
        model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    return model
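A minimal sketch of driving the initializer; the checkpoint path is hypothetical.

model = Memory_level_model_init('model/Memory_model.bin')
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.eval()  # disable dropout for inference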
def __init__(self, vocab_file, do_lower_case=False):
    self.vocab = tokenization.load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
    self.do_lower_case = do_lower_case
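Assuming this __init__ belongs to a thin tokenizer wrapper (called Tokenizer below purely for illustration), it can be exercised like this:

tok = Tokenizer('vocab.txt', do_lower_case=True)
# WordpieceTokenizer expects pre-whitespace-split input; with the standard
# English BERT vocab, 'unaffable' splits into ['un', '##aff', '##able'].
pieces = tok.wordpiece_tokenizer.tokenize('unaffable')
ids = [tok.vocab[p] for p in pieces]
words = [tok.inv_vocab[i] for i in ids]  # round-trip via inv_vocab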
parser.add_argument("--input_tokens", type=str, default=None, help="path to possibly incorrect token file") parser.add_argument("--edit_ids", type=str, default=None, help="path to edit ids to be applied on input_tokens") parser.add_argument("--output_tokens", type=str, default=None, help="path to edited (hopefully corrected) file") parser.add_argument("--infer_mode", type=str, default="conll", help="post processing mode bea or conll") parser.add_argument("--path_common_inserts",type=str,default=None,help="path of common unigram inserts") parser.add_argument("--path_common_multitoken_inserts",type=str,default=None,help="path of common bigram inserts") parser.add_argument("--path_common_deletes",type=str,default=None,help="path to common deletions observed in train data") parser = argparse.ArgumentParser() add_arguments(parser) FLAGS, unparsed = parser.parse_known_args() DO_PARALLEL = False INFER_MODE=FLAGS.infer_mode vocab = tokenization.load_vocab(FLAGS.vocab_path) basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False,vocab=vocab) vocab_words = set(x for x in vocab) common_deletes = pickle.load(open(FLAGS.path_common_deletes,"rb")) path_common_inserts = FLAGS.path_common_inserts path_common_multitoken_inserts = FLAGS.path_common_multitoken_inserts opcodes = opcodes.Opcodes(path_common_inserts, path_common_multitoken_inserts) if __name__ == '__main__': class config: INPUT_UNCORRECTED_WORDS = FLAGS.input_tokens INPUT_EDITS = FLAGS.edit_ids OUTPUT_CORRECTED_WORDS = FLAGS.output_tokens
# counts = pair_counts(lines, maps)
# most_frequent_pairs(counts)
pairs = [
    ('year', 'ano'),
    ('wanted', 'queria'),
    ('question', 'questão'),
    ('I', 'eu'),
    ('opportunity', 'oportunidade'),
    ('problem', 'problema'),
    ('love', 'amor'),
]
K = 10
tokenizer = tokenization.WordpieceTokenizer(vocab=tokenization.load_vocab(
    "/home/arthur/Projects/bert/models/multi_aligned_cased_L-12_H-768_A-12/vocab.txt"
))
rng = random.Random(1234)
sample = set(get_sample(sent_path, map_path, rng, 50000))
lines = [l for i, l in enumerate(lines) if i in sample]
sents = get_sentences(lines, pairs, tokenizer, k=K)
embs = get_embeddings(sents)

from sklearn.manifold import TSNE

# Flatten to [n_points, hidden_size] before projecting to 2-D.
X = embs.reshape((-1, embs.shape[-1]))
X_embedded = TSNE(n_components=2, perplexity=20,
                  metric='cosine').fit_transform(X)
X_embedded.shape
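The 2-D projection is typically inspected with a scatter plot; a minimal sketch, assuming matplotlib is available and embs is laid out pair-major with K contexts per pair (adjust the stride if get_sentences returns a different layout):

import matplotlib.pyplot as plt

# One colour per word pair; each pair contributed K sampled contexts.
for p, (en, pt) in enumerate(pairs):
    pts = X_embedded[p * K:(p + 1) * K]
    plt.scatter(pts[:, 0], pts[:, 1], label='{}/{}'.format(en, pt))
plt.legend()
plt.show()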
def __init__(self):
    vocab_file = 'vocab.txt'
    vocab = tokenization.load_vocab(vocab_file=vocab_file)
    tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

    path = 'train_processed.txt'
    train_file = open(path, 'r', encoding='utf-8')
    lines = train_file.read().split('\n')
    # Character length of the sentence serves as an upper bound on the
    # wordpiece count (each wordpiece covers at least one character).
    max_length = 0
    for i in range(len(lines)):
        TK = lines[i].split(' \t')
        if max_length < len(TK[0]):
            max_length = len(TK[0])
    max_length += 1  # room for the leading [CLS] token
    self.input_ids = np.zeros(shape=[len(lines), max_length], dtype=np.int32)
    self.input_mask = np.zeros(shape=[len(lines), max_length], dtype=np.int32)
    self.label = np.zeros(shape=[len(lines)], dtype=np.int32)
    for i in range(len(lines) - 1):
        TK = lines[i].split(' \t')
        if len(TK) != 2:
            TK = lines[i].split('\t')
        sentence = TK[0]
        token = tokenizer.tokenize(sentence)
        tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab, tokens=token)
        for j in range(len(tk_ids)):
            self.input_ids[i, j + 1] = tk_ids[j]
            self.input_mask[i, j + 1] = 1
        self.input_ids[i, 0] = tokenization.convert_tokens_to_ids(
            vocab=vocab, tokens=['[CLS]'])[0]
        self.input_mask[i, 0] = 1
        self.label[i] = int(TK[1])

    path = 'test_processed.txt'
    test_file = open(path, 'r', encoding='utf-8')
    lines = test_file.read().split('\n')
    max_length = 0
    for i in range(len(lines)):
        TK = lines[i].split(' \t')
        if max_length < len(TK[0]):
            max_length = len(TK[0])
    print(max_length)
    max_length += 1
    self.test_input_ids = np.zeros(shape=[len(lines), max_length],
                                   dtype=np.int32)
    self.test_input_ids_masking = np.zeros(shape=[len(lines), max_length],
                                           dtype=np.int32)
    self.test_label = np.zeros(shape=[len(lines)], dtype=np.int32)
    for i in range(len(lines) - 1):
        TK = lines[i].split(' \t')
        if len(TK) != 2:
            TK = lines[i].split('\t')
        sentence = TK[0]
        token = tokenizer.tokenize(sentence)
        tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab, tokens=token)
        for j in range(len(tk_ids)):
            self.test_input_ids[i, j + 1] = tk_ids[j]
            self.test_input_ids_masking[i, j + 1] = 1
        self.test_input_ids[i, 0] = tokenization.convert_tokens_to_ids(
            vocab=vocab, tokens=['[CLS]'])[0]
        self.test_input_ids_masking[i, 0] = 1
        self.test_label[i] = int(TK[1])

    self.Batch_Size = 8
    self.random_idx = np.array(range(self.label.shape[0]), dtype=np.int32)
    np.random.shuffle(self.random_idx)
    self.Batch_Idx = 0
    self.Test_Batch_Idx = 0
def main():
    parser = argparse.ArgumentParser()
    # model
    parser.add_argument('--model', type=str, default='wordrnn')
    parser.add_argument('--dir', type=str, default=None)
    parser.add_argument('--tokenizer', type=str, default='nltk',
                        help='Only effective when model set to wordrnn')
    parser.add_argument('--criterion', type=str, default='full')
    # data
    parser.add_argument('--set', type=str, default='msr')
    parser.add_argument('--partition', type=str, default='va')
    parser.add_argument('--no-move-cached', action='store_true')
    parser.add_argument('--log-dir', type=str, default='train/noname')
    parser.add_argument('--save-pred', action='store_true')
    args = parser.parse_args()

    problem_set = ProblemSet.load(args.set)
    examples = problem_set.get_examples(args.partition)
    logger.info("Evaluating models saved in {} on {}-{}".format(
        args.dir, args.set, args.partition))
    if not os.path.exists(args.log_dir):
        logger.info("Creating directory at {}".format(args.log_dir))
        os.makedirs(args.log_dir)
    args_path = os.path.join(args.log_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.info("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)
    log_path = os.path.join(args.log_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    model_type = args.model.lower()
    if model_type == 'wordrnn':
        args_path = osp.join(args.dir, 'args.json')
        with open(args_path, 'r') as f:
            arg_dict = json.load(f)
        vocab_path = osp.join(args.dir, 'vocab.txt')
        vocab = load_vocab(vocab_path)
        if args.tokenizer.lower() == 'nltk':
            tokenizer = NLTKTokenizer(vocab, arg_dict['lower'])
        elif args.tokenizer.lower() == 'wordpiece':
            tokenizer = BertTokenizer(vocab_path, arg_dict['lower'])
        model = WordRNN(len(vocab), len(vocab), arg_dict['rnncell'],
                        arg_dict['emsize'], arg_dict['outsize'],
                        arg_dict['nhid'], arg_dict['nlayers'],
                        arg_dict['bidirec'], arg_dict.get('autoenc', False),
                        arg_dict['decoder_bias'])
        logger.info(model)
        ckpt_paths = glob.glob(osp.join(args.dir, '*.pt'))
        ckpt_paths.sort(key=osp.getmtime)
        for path in ckpt_paths:
            model.load_state_dict(torch.load(path))
            direction = 'autoenc' if model.autoenc else (
                'bidirec' if model.bidirec else 'forward')
            evaluate(examples, model, tokenizer, direction, args.criterion,
                     str(osp.basename(path.split('.')[0])))
            if args.save_pred:
                save_fn = osp.basename(path).replace('.pt', '.csv')
                save_preds(examples, osp.join(args.log_dir, save_fn))
    elif model_type == 'lm1b':
        lm1b_dir = settings['lm1b_dir']
        for e in examples:
            e.context[0] = ' '.join(['<S>', e.context[0]])
            e.context[-1] = ' '.join([e.context[-1], '</S>'])
        vocab = load_vocab(osp.join(lm1b_dir, 'vocab-2016-09-10.txt'))
        special_tokens = ['<S>', '</S>', '<UNK>']
        tokenizer = BaseTokenizer(vocab, False, '<UNK>', special_tokens)
        in_vocab = load_vocab(osp.join(lm1b_dir, args.dir, 'vocab.txt'))
        out_to_in = [in_vocab['<UNK>']] * 800000
        for i, token in tokenizer.ids_to_tokens.items():
            out_to_in[i] = in_vocab.get(token, in_vocab['<UNK>'])
        tf_path = osp.join(lm1b_dir, 'ckpt-*')
        npy_path = osp.join(lm1b_dir, args.dir, 'embeddings.npy')
        model = LM1B.from_tf(tf_path, npy_path, out_to_in, 8)
        logger.info(model)
        evaluate(examples, model, tokenizer, 'forward', args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))
    else:
        cache_dir = settings['pretrans_dir']
        bert_dir = osp.join(settings['pretrans_dir'], args.dir)
        model_or_dir = bert_dir if osp.exists(bert_dir) else args.dir
        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        config = config_class.from_pretrained(model_or_dir,
                                              cache_dir=cache_dir)
        tokenizer = tokenizer_class.from_pretrained(
            model_or_dir, cache_dir=cache_dir,
            max_len=config.max_position_embeddings,
            do_lower_case='-uncased' in model_or_dir)
        model = model_class.from_pretrained(model_or_dir,
                                            cache_dir=cache_dir,
                                            config=config)
        direction = 'forward'
        if model_type == 'bert':
            direction = 'autoenc'
        evaluate(examples, model, tokenizer, direction, args.criterion)
        if args.save_pred:
            save_preds(examples, osp.join(args.log_dir, 'preds.csv'))
        if not args.no_move_cached and not osp.exists(bert_dir):
            logger.info("Creating directory at {}".format(bert_dir))
            os.mkdir(bert_dir)
            model_url = model.pretrained_model_archive_map[model_or_dir]
            model_path = osp.join(bert_dir, WEIGHTS_NAME)
            move_cached(model_url, cache_dir, model_path)
            config_url = model.config.pretrained_config_archive_map[
                model_or_dir]
            config_path = osp.join(bert_dir, CONFIG_NAME)
            move_cached(config_url, cache_dir, config_path)
            for k, url_map in tokenizer.pretrained_vocab_files_map.items():
                vocab_path = osp.join(bert_dir,
                                      tokenizer.vocab_files_names[k])
                move_cached(url_map[model_or_dir], cache_dir, vocab_path)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--dev_batch_size", default=8, type=int,
                        help="Total batch size for dev.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=3000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate).")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--model_path', type=str, default='./model',
                        help='save model path')
    parser.add_argument('--load_model', type=str, default=None)
    parser.add_argument('--embedding_dim', type=int, default=300)
    parser.add_argument('--dropout_prob', type=float, default=0.2)
    args = parser.parse_args()

    processors = {"memory": MemoryProcessor, "logic": LogicalProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = "cuda"
        n_gpu = torch.cuda.device_count()
        # Initializes the distributed backend, which will take care of
        # synchronizing nodes/GPUs.
        dist.init_process_group(backend='nccl')
    torch.backends.cudnn.benchmark = True

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                args.accumulate_gradients))
    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    vocab_dim = len(tokenization.load_vocab(args.vocab_file))
    model = SequenceClassification(vocab_dim, args.embedding_dim,
                                   args.dropout_prob, len(label_list),
                                   device)
    if args.load_model is not None:
        model.load_state_dict(torch.load(args.load_model,
                                         map_location='cpu'))
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    global_step = 0
    if args.local_rank != -1:
        model = DDP(model)
        optimizer = FP16_Optimizer(optimizer)
        '''
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)
        '''
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # train features
        train_features = convert_to_ids(train_examples, label_list,
                                        args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_q_ids = torch.tensor([f.que_ids for f in train_features],
                                 dtype=torch.long)
        all_d_ids = torch.tensor([f.des_ids for f in train_features],
                                 dtype=torch.long)
        all_sd_ids = torch.tensor([f.scene_ids for f in train_features],
                                  dtype=torch.long)
        #all_Ld_ids = torch.tensor([f.local_scene_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_q_ids, all_d_ids, all_sd_ids,
                                   all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, num_workers=1,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # dev-set features
        dev_examples = processor.get_dev_examples(args.data_dir)
        dev_features = convert_to_ids(dev_examples, label_list,
                                      args.max_seq_length, tokenizer)
        all_dev_q_ids = torch.tensor([f.que_ids for f in dev_features],
                                     dtype=torch.long)
        all_dev_d_ids = torch.tensor([f.des_ids for f in dev_features],
                                     dtype=torch.long)
        all_dev_sd_ids = torch.tensor([f.scene_ids for f in dev_features],
                                      dtype=torch.long)
        #all_dev_Ld_ids = torch.tensor([f.local_scene_ids for f in dev_features], dtype=torch.long)
        all_dev_label_ids = torch.tensor([f.label_id for f in dev_features],
                                         dtype=torch.long)
        dev_data = TensorDataset(all_dev_q_ids, all_dev_d_ids,
                                 all_dev_sd_ids, all_dev_label_ids)
        if args.local_rank == -1:
            dev_sampler = RandomSampler(dev_data)
        else:
            dev_sampler = DistributedSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, num_workers=1,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)

        model.train()
        losses = []
        dev_accuracy_list = []
        dev_losses = []
        for epoch in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for (q_ids, d_ids, sd_ids, label_ids) in train_dataloader:
                optimizer.zero_grad()
                q_ids = q_ids.to(device)
                d_ids = d_ids.to(device)
                sd_ids = sd_ids.to(device)
                #Ld_ids = Ld_ids.to(device)
                label_ids = label_ids.to(device)
                loss, _ = model.forward(q_ids, d_ids, sd_ids, label_ids)
                tr_loss += loss.item()
                nb_tr_examples += q_ids.size(0)
                nb_tr_steps += 1
                loss.backward()
                optimizer.step()
                global_step += 1
            if (epoch + 1) % 10 == 0:
                if args.task_name == 'memory':
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            args.model_path,
                            'non_crossPassage_res_memory_model' +
                            str(epoch + 1) + '.bin'))
                else:
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            args.model_path,
                            'non_crossPassage_res_logic_model' +
                            str(epoch + 1) + '.bin'))
            losses.append(tr_loss / nb_tr_steps)

            # dev-set evaluation
            dev_accuracy, nb_dev_examples = 0, 0
            for q_ids, d_ids, sd_ids, label_ids in dev_dataloader:
                q_ids = q_ids.to(device)
                d_ids = d_ids.to(device)
                sd_ids = sd_ids.to(device)
                #Ld_ids = Ld_ids.to(device)
                label_ids = label_ids.to(device)
                dev_loss, logits = model.forward(q_ids, d_ids, sd_ids,
                                                 label_ids)
                label_ids = label_ids.to('cpu').numpy()
                logits = logits.to('cpu').detach().numpy()
                tmp_dev_accuracy = accuracy(logits, label_ids)
                dev_accuracy += tmp_dev_accuracy
                nb_dev_examples += q_ids.size(0)
            print('-' * 20)
            print("Epochs : {}".format(epoch + 1))
            print("dev_accuracy : {}".format(dev_accuracy / nb_dev_examples))
            print("train Loss : {}".format(tr_loss / nb_tr_steps))
            print("validation Loss : {}".format(dev_loss.item()))
            dev_losses.append(dev_loss.item())
            print('-' * 20)

    if args.do_eval:
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_to_ids(eval_examples, label_list,
                                       args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_q_vectors = torch.tensor([f.que_ids for f in eval_features],
                                     dtype=torch.long)
        all_d_vectors = torch.tensor([f.des_ids for f in eval_features],
                                     dtype=torch.long)
        all_sd_vectors = torch.tensor([f.scene_ids for f in eval_features],
                                      dtype=torch.long)
        #all_Ld_vectors = torch.tensor([f.local_scene_ids for f in eval_features], dtype=torch.long)
        all_label_vectors = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        eval_data = TensorDataset(all_q_vectors, all_d_vectors,
                                  all_sd_vectors, all_label_vectors)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, num_workers=1,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logit_label_list = []
        for step, (q_vec, d_vec, sd_vec, label_vec) in enumerate(
                tqdm(eval_dataloader, desc="Iteration")):
            q_vec = q_vec.to(device)
            d_vec = d_vec.to(device)
            sd_vec = sd_vec.to(device)
            #Ld_vec = Ld_vec.to(device)
            label_vec = label_vec.to(device)
            tmp_eval_loss, logits = model.forward(q_vec, d_vec, sd_vec,
                                                  label_vec)
            label_ids = label_vec.to('cpu').numpy()
            logits = logits.to('cpu').detach().numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            output = np.argmax(logits, axis=1)
            logit_label_list.append([output, label_ids])
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += q_vec.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps  # == len(eval_dataloader)
        eval_accuracy = eval_accuracy / nb_eval_examples
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step
        }
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open('[memory]align_epoch20_output', 'w') as f:
            logit_output_list = []
            Gold_output_list = []
            for labels in logit_label_list:
                for logit in labels[0]:
                    logit_output = convert_id_to_label(logit, label_list)
                    logit_output_list.append(logit_output)
                for Gold in labels[1]:
                    Gold_output = convert_id_to_label(Gold, label_list)
                    Gold_output_list.append(Gold_output)
            for logit, gold in zip(logit_output_list, Gold_output_list):
                f.write(str(logit) + '\t' + str(gold) + '\n')
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def create_embedding_table(self, embedding_type, name):
    oov_vocab, in_vocab = set(), set()
    if embedding_type == 'random':
        embedding_table = tf.Variable(
            tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0),
            name='embed_w')
        return embedding_table
    elif re.search('word2vec', embedding_type) is not None:
        # https://github.com/Embedding/Chinese-Word-Vectors
        embedding_file = self.params['word2vec_file']
        embedding_vocab = gensim.models.KeyedVectors.load_word2vec_format(
            embedding_file, binary=True, encoding='utf-8',
            unicode_errors='ignore')
        embedding_table = np.zeros((self.vocab_size, self.embed_size))
        self.vocab, index_vocab = tokenization.load_vocab(
            vocab_file=os.path.join(self.params['data_dir'],
                                    'vocab_word.txt'),
            params=self.params)
        for word, i in self.vocab.items():
            if word in embedding_vocab.vocab:
                embedding_table[i] = embedding_vocab[word]
                in_vocab.add(word)
            else:
                # Out-of-vocabulary words get a random vector.
                embedding_table[i] = np.random.random(self.embed_size)
                oov_vocab.add(word)
        tf.logging.info('OOV:%f' %
                        (len(oov_vocab) / (len(oov_vocab) + len(in_vocab))))
        if embedding_type == 'word2vec_finetune':
            trainable = True
        elif embedding_type == 'word2vec_static':
            trainable = False
        else:
            trainable = False
            print("word2vec embedding type: please choose 'static' or 'finetune'")
        embedding_table2 = tf.get_variable(
            name='embedding_w',
            shape=[self.vocab_size, self.embed_size],
            initializer=tf.constant_initializer(embedding_table),
            trainable=trainable)
        return embedding_table2
    elif re.search('fasttext', embedding_type) is not None:
        # https://fasttext.cc/docs/en/crawl-vectors.html
        embedding_vocab = self._load_embedding_pretrained(
            embedding_file=self.params['fasttext_file'])
        embedding_table = np.zeros((self.vocab_size, self.embed_size))
        self.vocab, index_vocab = tokenization.load_vocab(
            vocab_file=os.path.join(self.params['data_dir'],
                                    'vocab_word.txt'),
            params=self.params)
        for word, i in self.vocab.items():
            if word in embedding_vocab.keys():
                embedding_table[i] = embedding_vocab[word]
                in_vocab.add(word)
            else:
                embedding_table[i] = np.random.random(self.embed_size)
                oov_vocab.add(word)
        if embedding_type == 'fasttext_finetune':
            trainable = True
        elif embedding_type == 'fasttext_static':
            trainable = False
        else:
            trainable = False
            print("fasttext embedding type: please choose 'static' or 'finetune'")
        embedding_table2 = tf.get_variable(
            name=name + 'embedding_w',
            shape=[self.vocab_size, self.embed_size],
            initializer=tf.constant_initializer(embedding_table),
            trainable=trainable)
        tf.logging.info('OOV:%f' %
                        (len(oov_vocab) / (len(oov_vocab) + len(in_vocab))))
        return embedding_table2
    elif re.search('glove', embedding_type) is not None:
        pass
    elif re.search('elmo', embedding_type) is not None:
        print('Invalid embedding type: %s' % self.params['embedding_type'])
        print('For ELMo, please refer to the GitHub repository '
              'HIT-SCIR/ELMoForManyLangs')
def Logic_level_model(question, clip_description, scene_description):
    # environment
    max_sequence_length = 128
    vocab_file = 'vocab.txt'
    embedding_dim = 200
    dropout_prob = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab_dim = len(tokenization.load_vocab(vocab_file))
    processor = LogicalProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=False)
    model = SequenceClassification(vocab_dim, embedding_dim, dropout_prob,
                                   len(label_list), device)
    init_checkpoint = 'model/Logic_model.bin'  # future: saved-model load code
    if init_checkpoint is not None:
        model.load_state_dict(torch.load(init_checkpoint,
                                         map_location='cpu'))
    model.to(device)

    eval_example = processor._create_examples(question, clip_description,
                                              scene_description)
    eval_feature = convert_to_ids(eval_example, label_list,
                                  max_sequence_length, tokenizer)
    que_ids = torch.tensor([f.que_ids for f in eval_feature],
                           dtype=torch.long)
    des_ids = torch.tensor([f.des_ids for f in eval_feature],
                           dtype=torch.long)
    scene_ids = torch.tensor([f.scene_ids for f in eval_feature],
                             dtype=torch.long)
    # convert_to_ids returns a list of features, so check the first one.
    if eval_feature[0].label_id is None:
        label_ids = None
    else:
        label_ids = torch.tensor([f.label_id for f in eval_feature],
                                 dtype=torch.long)
    #eval_data = TensorDataset(input_ids, input_mask, segment_ids, label_ids)
    #eval_dataloader = DataLoader(eval_data)

    model.eval()
    que_ids = que_ids.to(device)
    des_ids = des_ids.to(device)
    scene_ids = scene_ids.to(device)
    if label_ids is not None:
        label_ids = label_ids.to(device)
        loss, logits = model(que_ids, des_ids, scene_ids, label_ids)
    else:
        logits = model(que_ids, des_ids, scene_ids, label_ids)
    logits = logits.detach().cpu().numpy()
    output = np.argmax(logits, axis=0)
    output = convert_id_to_label(output, label_list)
    return output
import tensorflow as tf

import tokenization

vocab_pass = '******'

p_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'genius', '[when]',
    'morning'
]
predict_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), p_tokens)
r_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'smart', '[when]',
    'afternoon', 'and', 'evening'
]
real_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), r_tokens)

predict_tensor = tf.constant([[1, 2, 3], [98, 1, 6], [1, 2, 4], [22, 1, 6],
                              [3, 2, 3], [7, 1, 6], [0, 2, 3], [11, 1, 9]],
                             dtype=float)
real_tensor = tf.constant([[1, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1],
                           [3, 2, 3], [12, 8, 1], [0, 2, 3], [12, 8, 1],
                           [1, 2, 4], [12, 8, 1]],
                          dtype=float)


def is_special_id(id):
    vocab = tokenization.load_vocab(vocab_pass)
    inv_vocab = {v: k for k, v in vocab.items()}
    special_tokens = []
    for token in vocab.keys():
        if len(token) >= 2:
            if token[0] == '[' and token[-1] == ']':
                special_tokens.append(token)
    return tokenization.convert_ids_to_tokens(inv_vocab, [id])[0] in special_tokens
def main():
    parser = argparse.ArgumentParser()
    # model structure
    parser.add_argument('--rnncell', type=str, default='LSTM')
    parser.add_argument('--emsize', type=int, default=200)
    parser.add_argument('--nhid', type=int, default=600)
    parser.add_argument('--outsize', type=int, default=400)
    parser.add_argument('--nlayers', type=int, default=2)
    parser.add_argument('--bidirec', action='store_true')
    parser.add_argument('--autoenc', action='store_true')
    parser.add_argument('--forget-bias', type=float, default=False)
    parser.add_argument('--decoder-bias', action='store_true')
    # data
    parser.add_argument('--corpus', type=str, default='guten')
    parser.add_argument('--min-len', type=int, default=10)
    parser.add_argument('--max-len', type=int, default=80)
    # vocabulary
    parser.add_argument('--vocab', type=str, default=None)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--min-cnt', type=int, default=6)
    # training
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=3333)
    parser.add_argument('--batch-size', type=int, default=20)
    parser.add_argument('--eval-batch-size', type=int, default=10)
    # optimizer
    parser.add_argument('--optim', type=str, default='SGD')
    parser.add_argument('--lr', type=float, default=.5)
    parser.add_argument('--clip', type=float, default=5.0)
    parser.add_argument('--decay-after', type=int, default=5)
    parser.add_argument('--decay-rate', type=float, default=0.5)
    parser.add_argument('--decay-period', type=int, default=1)
    parser.add_argument('--epochs', type=int, default=10)
    # save and log
    parser.add_argument('--save-dir', type=str, default='train/noname')
    parser.add_argument('--log-interval', type=int, default=10000)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--save-all', action='store_false')
    parser.add_argument('--save-period', type=int, default=1)
    args = parser.parse_args()

    logger.debug("Running {}".format(__file__))
    if not os.path.exists(args.save_dir):
        logger.debug("Creating directory at {}".format(args.save_dir))
        os.makedirs(args.save_dir)
    args_path = os.path.join(args.save_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.debug("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)
    log_path = os.path.join(args.save_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Use a pre-built vocabulary if it exists.
    if args.vocab and os.path.exists(args.vocab):
        vocab = load_vocab(args.vocab)
        update = False
    else:
        vocab = Vocabulary()
        update = True
    tokenizer = Tokenizer(vocab, args.lower)
    tr_txts = get_txts(args.corpus, 'train')
    va_txts = get_txts(args.corpus, 'valid')
    tr_input = LineInput(tr_txts, tokenizer, update, args.min_len,
                         args.max_len)
    va_input = LineInput(va_txts, tokenizer, update, args.min_len,
                         args.max_len)
    va_batches = va_input.batchify(args.eval_batch_size, False)
    if update:
        vocab.build_from_counter(args.min_cnt)
        logger.debug("Built vocab of size {}".format(len(vocab)))

    # Build the model.
    model = WordRNN(len(vocab), len(vocab), args.rnncell, args.emsize,
                    args.outsize, args.nhid, args.nlayers, args.bidirec,
                    args.autoenc, args.decoder_bias, args.forget_bias,
                    args.dropout)
    logger.debug(model)
    model.to(device)
    learnables = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = getattr(optim, args.optim)(learnables, lr=args.lr)
    save_vocab(vocab, os.path.join(args.save_dir, 'vocab.txt'))
    model_path = os.path.join(args.save_dir, 'model.pt')

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Loop over epochs.
        best_val_loss = None
        logger.info('-' * 79)
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            tr_batches = tr_input.batchify(args.batch_size, True)
            train(model, tr_batches, learnables, optimizer, device, args)
            val_loss = evaluate(model, va_batches, device)
            logger.info('-' * 79)
            logger.info('| end of epoch {:2d} | time: {:5.2f}s '
                        '| valid loss {:5.2f} | valid ppl {:8.2f} |'.format(
                            epoch, (time.time() - epoch_start_time),
                            val_loss, math.exp(val_loss)))
            logger.info('-' * 79)
            updated_best = not best_val_loss or val_loss < best_val_loss
            if epoch >= args.decay_after > 0:
                if (epoch - args.decay_after) % args.decay_period == 0:
                    for group in optimizer.param_groups:
                        group['lr'] *= args.decay_rate
            if (epoch % args.save_period == 0) and (updated_best or
                                                    args.save_all):
                if args.save_all:
                    model_path = os.path.join(args.save_dir,
                                              'ep{}.pt'.format(epoch))
                torch.save(model.state_dict(), model_path)
                if updated_best:
                    best_val_loss = val_loss
        logger.debug("Completed training and saved to {}".format(
            args.save_dir))
    except KeyboardInterrupt:
        logger.debug('-' * 79)
        logger.debug("Exiting from training early")