def test_saving_and_loading_works_with_byte_encoding(self):
    """Round-trip a byte-encoded character vocabulary through disk.

    Index a TextField built with a byte-encoding character tokenizer,
    save the resulting vocabulary to files, reload it, re-index the same
    tokens, and check that both passes produce identical indices.
    """
    char_tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    indexer = TokenCharactersIndexer(character_tokenizer=char_tokenizer)
    # Non-ASCII tokens exercise the multi-byte encoding path.
    raw_tokens = [Token(text) for text in ["Øyvind", "für", "汉字"]]

    field_before = TextField(raw_tokens, {"characters": indexer})
    batch = Batch([Instance({"sentence": field_before})])
    vocab_before = Vocabulary.from_instances(batch)
    field_before.index(vocab_before)
    before = deepcopy(field_before._indexed_tokens)  # pylint: disable=protected-access

    save_dir = self.TEST_DIR / 'vocab_save'
    vocab_before.save_to_files(save_dir)
    vocab_after = Vocabulary.from_files(save_dir)

    field_after = TextField(raw_tokens, {"characters": indexer})
    field_after.index(vocab_after)
    after = deepcopy(field_after._indexed_tokens)  # pylint: disable=protected-access

    assert before == after
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    """Save a vocabulary with mixed padded/non-padded namespaces, reload it,
    and verify every token/index mapping survives the round trip."""
    save_dir = self.TEST_DIR / 'vocab_save'

    original = Vocabulary(non_padded_namespaces=["a", "c"])
    # Namespace "a" is non-padded, so its tokens index from 0.
    for token in ("a0", "a1", "a2"):
        original.add_token_to_namespace(token, namespace="a")
    # Namespace "b" is padded, so padding/OOV occupy indices 0 and 1.
    for token in ("b2", "b3"):
        original.add_token_to_namespace(token, namespace="b")

    original.save_to_files(save_dir)
    reloaded = Vocabulary.from_files(save_dir)

    assert reloaded._non_padded_namespaces == {"a", "c"}

    # Namespace "a": three tokens, indexed from zero.
    assert reloaded.get_vocab_size(namespace='a') == 3
    for index, token in enumerate(["a0", "a1", "a2"]):
        assert reloaded.get_token_from_index(index, namespace='a') == token
        assert reloaded.get_token_index(token, namespace='a') == index

    # Namespace "b": padding + OOV + the two added tokens.
    assert reloaded.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    expected_b = [original._padding_token, original._oov_token, 'b2', 'b3']
    for index, token in enumerate(expected_b):
        assert reloaded.get_token_from_index(index, namespace='b') == token
        assert reloaded.get_token_index(token, namespace='b') == index

    # The reverse mappings must be identical in both vocabularies.
    assert original.get_index_to_token_vocabulary("a") == reloaded.get_index_to_token_vocabulary("a")
    assert original.get_index_to_token_vocabulary("b") == reloaded.get_index_to_token_vocabulary("b")
test_shortest_responses_labels = test_data[6].tolist() print("Shortest responses count:", sum(test_shortest_responses_labels)) print("bucket indices len:", len(test_bucket_indices)) model_file = os.path.join( "saved_softmax_models", "decomposable_attention{}{}_model_{}.th".format( LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) vocabulary_filepath = os.path.join( "saved_softmax_models", "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) print("LOADING VOCABULARY") # Load vocabulary vocab = Vocabulary.from_files(vocabulary_filepath) EMBEDDING_DIM = 300 PROJECT_DIM = 200 DROPOUT = 0.2 NUM_LAYERS = 2 if EMBEDDING_TYPE == "": token_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_glove": token_embedding = Embedding.from_params(vocab=vocab, params=Params({ 'pretrained_file': glove_embeddings_file,
parser.add_argument('--qrel', action='store', dest='qrel', help='qrel, to only check judged queries', required=False) args = parser.parse_args() # # load data & create vocab # ------------------------------- # loader = IrTupleDatasetReader(lazy=True, lowercase=True) vocab = Vocabulary.from_files(args.vocab) if args.qrel: qrels = load_reference(args.qrel) not_judged = 0 oov_queries = 0 non_oov_queries = 0 oov_count_list = [] instances = loader.read(args.query) with open(args.out_file_oov, "w", encoding="utf8") as out_file_oov: with open(args.out_file_no_oov, "w", encoding="utf8") as out_file_non_oov: for i in Tqdm.tqdm(instances): id_str = i["source_tokens"].tokens[0].text if args.qrel and int(id_str) not in qrels:
def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    """Build everything needed for training from a ``Params`` config.

    Reads datasets, builds (or, on recover, reloads) the vocabulary,
    constructs the model and data iterators, freezes any parameters
    matched by the trainer's ``no_grad`` regexes, and bundles it all
    into a ``TrainerPieces``.

    Note: this consumes ``params`` destructively via ``pop``; the order
    of the pops below matters.
    """
    all_datasets = training_util.datasets_from_params(params)
    # By default every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(
            os.path.join(serialization_dir, "vocabulary")):
        # Recovering a previous run: reuse the serialized vocabulary so
        # token/index mappings stay consistent with the saved model.
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"))
    else:
        # Fresh run: build the vocabulary from the selected datasets only.
        vocab = Vocabulary.from_params(params.pop(
            "vocabulary", {}), (instance
                                for key, dataset in all_datasets.items()
                                for instance in dataset
                                if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    # Index with the model's vocab (possibly expanded), not the pre-model one.
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    # Freeze every parameter whose name matches a no_grad regex.
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return TrainerPieces(model, iterator, train_data, validation_data,
                         test_data, validation_iterator, trainer_params)
def load_decomposable_attention_elmo_softmax_model():
    """Reconstruct the decomposable-attention softmax model with ELMo
    embeddings and load its saved weights from disk.

    Returns a ``(model, predictor)`` pair. The active configuration is
    hard-coded below (MSE loss, ELMo embeddings, 100% negatives); the
    commented alternatives document other configurations the author ran.
    """
    NEGATIVE_PERCENTAGE = 100
    # EMBEDDING_TYPE = ""
    # LOSS_TYPE = ""  # NLL
    # LOSS_TYPE = "_nll"  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    # ELMo variants need the character-level indexer.
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150  # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    # File names encode the loss / embedding / negative-percentage config.
    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(
            LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))
    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                   NEGATIVE_PERCENTAGE))
    print("LOADING VOCABULARY")
    # Load vocabulary
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2
    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_DIM,
            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file': glove_embeddings_file,
                                                    'embedding_dim': EMBEDDING_DIM,
                                                    'projection_dim': PROJECT_DIM,
                                                    'trainable': False
                                                }))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join(
            "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using Small size as medium size gave CUDA out of memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                    "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weights_file,
                                            dropout=DROPOUT,
                                            projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        # BERT output dimension overrides the projection size.
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 200
    # Attend feedforward operates on single projected vectors.
    params = Params({
        'input_dim': PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    attend_feedforward = FeedForward.from_params(params)
    similarity_function = DotProductSimilarity()
    # Compare feedforward sees concatenated (token, aligned-context) pairs.
    params = Params({
        'input_dim': 2 * PROJECT_DIM,
        'hidden_dims': HIDDEN_DIM,
        'activations': 'relu',
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT
    })
    compare_feedforward = FeedForward.from_params(params)
    # Aggregate feedforward produces a single scalar score.
    params = Params({
        'input_dim': 2 * HIDDEN_DIM,
        'hidden_dims': 1,
        'activations': 'linear',
        'num_layers': 1
    })
    aggregate_feedforward = FeedForward.from_params(params)
    model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                         attend_feedforward,
                                         similarity_function,
                                         compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")
    # Load model state
    with open(model_file, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cuda:0'))
    print("MODEL LOADED!")
    if torch.cuda.is_available():
        # cuda_device = 3
        # model = model.cuda(cuda_device)
        cuda_device = -1  # GPU deliberately disabled here despite availability
    else:
        cuda_device = -1
    predictor = DecomposableAttentionSoftmaxPredictor(model,
                                                      dataset_reader=reader)
    return model, predictor
# Predict tags for the input sentence and print the label strings.
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict(prediction_sentence)['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

# Here's how to save the model.
with open("/tmp/model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("/tmp/vocabulary")

# And here's how to reload the model.
vocab2 = Vocabulary.from_files("/tmp/vocabulary")
model2 = LstmTagger(word_embeddings, lstm, vocab2)
with open("/tmp/model.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))
if cuda_device > -1:
    model2.cuda(cuda_device)

# Sanity check: the reloaded model must reproduce the original predictions
# (almost_equal guards against floating-point noise).
predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
def train():
    """Train the Seq2Seq sentence-simplification model on the PWKP data.

    Builds (or reloads) the vocabulary, wires up a bucketed training
    iterator and a basic validation iterator, and runs the project
    ``Trainer`` with BLEU as the validation metric.

    NOTE(review): relies on module-level configuration (``opt``,
    ``train_path``/``dev_path``/``test_path``, ``vocab_dir``,
    ``save_dir``, ``ner_path``) defined elsewhere in the file.
    """
    reader = PWKPReader()
    train_dataset = reader.read(train_path)
    valid_dataset = reader.read(dev_path)
    # Reuse a previously saved vocabulary when available so token ids stay
    # stable across runs; otherwise build one from the training data.
    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(instances=train_dataset,
                                          max_vocab_size=opt.vocab_size)
        vocab.save_to_files(vocab_dir)
    # Bucket by length on both source and target to reduce padding.
    iterator = BucketIterator(batch_size=opt.batch_size,
                              sorting_keys=[("src", "num_tokens"),
                                            ("tgt", "num_tokens")])
    iterator.index_with(vocab)
    model = Seq2Seq(emb_size=opt.emb_size,
                    hidden_size=opt.hidden_size,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    label_smoothing=opt.label_smoothing,
                    vocab=vocab)
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    #learning_rate_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=opt.lr_decay)
    # Validation uses a plain (unsorted) iterator.
    val_iterator = BasicIterator(batch_size=opt.batch_size)
    val_iterator.index_with(vocab)
    predictor = Predictor(iterator=val_iterator,
                          max_decoding_step=opt.max_step,
                          vocab=vocab,
                          reader=reader,
                          data_path=test_path,
                          log_dir=save_dir,
                          map_path=ner_path,
                          cuda_device=opt.gpu)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        #learning_rate_scheduler=learning_rate_scheduler,
        learning_rate_decay=opt.lr_decay,
        ema_decay=opt.ema_decay,
        predictor=predictor,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        validation_metric='+bleu',
        cuda_device=opt.gpu,
        num_epochs=opt.epoch,
        serialization_dir=save_dir,
        num_serialized_models_to_keep=5,
        #model_save_interval=60,
        #summary_interval=500,
        should_log_parameter_statistics=False,
        grad_norm=10)
    trainer.train()
# Train the tagger, show a sample prediction, then save and reload the
# model to verify serialization round-trips correctly.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)
trainer.train()

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

# Save model weights and the vocabulary.
with open('models/tagger.th', 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files('models/vocabulary')

# Reload into a fresh model instance.
vocab2 = Vocabulary.from_files('models/vocabulary')
model2 = LstmTagger(word_embeddings, lstm, vocab2)
with open('models/tagger.th', 'rb') as f:
    model2.load_state_dict(torch.load(f))
predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict("The dog ate the apple")["tag_logits"]
# BUGFIX: was `assert tag_logits == tag_logits2` — exact equality on lists
# of floats is fragile and can fail on harmless rounding differences.
# Use the tolerant numpy comparison, matching the other save/reload
# checks in this codebase.
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
def main(args):
    """Rebuild the multi-task CRF tagger from a serialization directory
    and prepare it for test-set evaluation.

    Loads the saved vocabulary and ``best.th`` weights from
    ``args.model_dir``, constructs per-task dataset readers and an
    ELMo-based embedder, and returns
    ``(TASKS, vocab, model, readers, test_iterator)``.
    """
    ALL_DATASET_PATHS = get_all_dataset_paths(
        args.dataset_paths_file, args.dataset_path_prefix
    )
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENTCE = args.patience  # NOTE(review): typo for "patience"; kept as-is
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")
    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }
    # Stable integer id per tag namespace, passed into each reader.
    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    # One CoNLL reader per task, keyed by tag namespace.
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }
    # Frozen ELMo embeddings projected down to PROJECTION_DIM.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)
    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()
    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES]
    )
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)
    # Vocabulary must come from the serialized run so indices match best.th.
    vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model.load_state_dict(torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
    model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    logger.info("Evaluating on test data")
    # Double the batch size at test time (no gradients held in memory).
    test_iterator = CustomHomogeneousBatchIterator(
        partition_key="dataset", batch_size=BATCH_SIZE * 2
    )
    test_iterator.index_with(vocab)
    model = model.eval()
    model.set_inference_mode(True)
    return TASKS, vocab, model, readers, test_iterator
from allennlp.modules.similarity_functions import BilinearSimilarity, CosineSimilarity, DotProductSimilarity, LinearSimilarity, MultiHeadedSimilarity
from allennlp.modules.feedforward import FeedForward
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor
from allennlp.nn import Activation

# Fixed seed for reproducible runs.
torch.manual_seed(1)

if __name__ == '__main__':
    # Reload the previously trained WikiQA tagger and verify it reproduces
    # the in-memory model's predictions.
    # NOTE(review): "wikiqavucabulary" looks like a misspelling of
    # "wikiqavocabulary" — confirm it matches the directory actually
    # written at save time before renaming anything.
    vocab2 = Vocabulary.from_files("./wikiqavucabulary")
    # NOTE(review): depends on `word_embeddings`, `esim`, `reader`,
    # `cuda_device` and `tag_logits` defined earlier in this file.
    model2 = LstmTagger(word_embeddings, esim, vocab2)
    with open("./wikiqamodel.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))
    if cuda_device > -1:
        model2.cuda(cuda_device)
    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
    np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
def run(args):
    """Train (or, in test mode, reload) the multi-task CRF tagger and
    evaluate it on every task's test split.

    When ``args.test_mode`` is False: reads train/dev data, builds the
    vocabulary, trains on round-robin-combined batches, and serializes
    the model, vocabulary, arguments and stats into ``args.model_dir``.
    When True: restores the vocabulary and ``best.th`` weights from
    ``args.model_dir`` instead. Test stats are written either way.
    """
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENTCE = args.patience  # NOTE(review): typo for "patience"; kept as-is
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")
    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }
    # Stable integer id per tag namespace, shared by all readers.
    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    # One CoNLL reader per task, keyed by tag namespace.
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }
    # Frozen ELMo embeddings projected down to PROJECTION_DIM.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)
    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()
    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES])
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)
    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths, readers, data_split="train")
        validation_dataset = read_datasets(dataset_paths, readers, data_split="dev")
        vocab = create_vocab([train_dataset, validation_dataset])
        # Special case for CCG
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    # Tags that can appear at test time but may be missing
                    # from the train/dev splits.
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
    else:
        # Test mode: indices must match the serialized model's vocabulary.
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))
    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)
    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)
        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)
        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"), "w+") as fp:
            json.dump(vars(args), fp, indent=2)
        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))
        # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1)
        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)
        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENTCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
            # model_save_interval=600
        )
        stats = trainer.train()
        training_stats.append(stats)
        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)
    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()
    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }
    logger.info("Evaluating on test data")
    # Double the batch size at test time (no gradients held in memory).
    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)
def main(): # language model, fixed pop swap = True vocab = Vocabulary.from_files("saved_models/vocabulary-linzen") model = StackRNNLanguageModel(vocab, rnn_dim=100, stack_dim=16, num_embeddings=10030, swap_push_pop=swap) with open("saved_models/stack-linzen-swap.th", "rb") as fh: model.load_state_dict(torch.load(fh)) # classification model, fixed pop """swap = True vocab = Vocabulary.from_files("saved_models/vocabulary-linzen") print(vocab.get_vocab_size()) model = StackRNNAgreementPredictor(vocab, rnn_dim=100, stack_dim=16, rnn_cell_type=torch.nn.GRUCell, num_embeddings=10000, swap_push_pop=swap) with open("saved_models/stack-linzen-class.th", "rb") as fh: model.load_state_dict(torch.load(fh))""" # language model fixed push """swap = False vocab = Vocabulary.from_files("saved_models/vocabulary-linzen") model = StackRNNLanguageModel(vocab, rnn_dim=100, stack_dim=16, num_embeddings=10030, swap_push_pop=swap) with open("saved_models/stack-linzen.th", "rb") as fh: model.load_state_dict(torch.load(fh))""" """swap=False vocab = Vocabulary.from_files("saved_models/vocabulary-linzen") model = StackRNNAgreementPredictor(vocab, rnn_dim=100, stack_dim=16, rnn_cell_type=torch.nn.GRUCell, num_embeddings=9968, push_ones=False, swap_push_pop=swap) with open("saved_models/stack-linzen-class-nopushpop.th", "rb") as fh: model.load_state_dict(torch.load(fh))""" """swap=True vocab = Vocabulary.from_files("saved_models/vocabulary-linzen") model = StackRNNAgreementPredictor(vocab, rnn_dim=100, stack_dim=16, rnn_cell_type=torch.nn.GRUCell, num_embeddings=9968, swap_push_pop=swap) with open("saved_models/stack-linzen-class-pop.th", "rb") as fh: model.load_state_dict(torch.load(fh))""" dataset_reader = BrownDatasetReader(labels=False) # true? 
predictor = TreePredictor(model, dataset_reader) sentence = "the man in the hospitals eats an apple" prediction = predictor.predict(sentence) fig = plt.figure() #one_hist_wonder(prediction, sentence.split(" "), fig) profile_sentence(prediction, sentence.split(" "), fig, swap) plt.show() sentence = "the cat that dogs chase eats apples" prediction = predictor.predict(sentence) fig = plt.figure() #one_hist_wonder(prediction, sentence.split(" "), fig) profile_sentence(prediction, sentence.split(" "), fig, swap) plt.show() sentence = "the man who likes eating apples is full" prediction = predictor.predict(sentence) fig = plt.figure() #one_hist_wonder(prediction, sentence.split(" "), fig) profile_sentence(prediction, sentence.split(" "), fig, swap) plt.show() """sentence = "dogs chase the cat"
def train(args):
    """Train the LSTM POS tagger end-to-end, then verify save/reload.

    Reads train/validation data from ``args.data_dir``, trains with a
    bucket iterator, prints validation metrics and a sample prediction,
    saves the model weights and vocabulary into ``args.model_dir``,
    reloads them into a fresh model, and checks that both models make
    (almost) identical predictions.
    """
    _train_data_path = os.path.join(args.data_dir, args.train_file_name)
    _validation_data_path = os.path.join(args.data_dir,
                                         args.validation_file_name)
    print(_train_data_path)
    reader = PosDatasetReader()
    train_dataset = reader.read(_train_data_path)
    validation_dataset = reader.read(_validation_data_path)
    # Vocabulary covers both splits so validation tokens are not OOV.
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=args.embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(args.embedding_dim, args.hidden_dim, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    # Bucket by sentence length to reduce padding inside each batch.
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=args.epochs,
                      cuda_device=cuda_device)
    metrics = trainer.train()
    # Report only the validation-side metrics.
    for m in metrics:
        if m.startswith("validation"):
            print("{}={}".format(m, metrics[m]))
    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])
    # Here's how to save the model.
    model_path = os.path.join(args.model_dir, "model.th")
    vocab_path = os.path.join(args.model_dir, "vocabulary")
    with open(model_path, 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files(vocab_path)
    # And here's how to reload the model.
    vocab2 = Vocabulary.from_files(vocab_path)
    model2 = LstmTagger(word_embeddings, lstm, vocab2)
    with open(model_path, 'rb') as f:
        model2.load_state_dict(torch.load(f))
    if cuda_device > -1:
        model2.cuda(cuda_device)
    # The reloaded model must reproduce the original predictions
    # (almost_equal guards against floating-point noise).
    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
    np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
def __init__(
        self,
        vocab_path=None,
        model_paths=None,
        weigths=None,
        max_len=50,
        min_len=3,
        lowercase_tokens=False,
        log=False,
        iterations=3,
        min_probability=0.0,
        model_name='roberta',
        special_tokens_fix=1,
        is_ensemble=True,
        # is_ensemble=False,
        min_error_probability=0.0,
        confidence=0,
        resolve_cycles=False,
        prune_amount=0.,
        num_layers_to_keep=12):
    """Build the (possibly ensembled) sequence-labeling corrector.

    For each path in ``model_paths`` this loads a ``Seq2Labels`` model,
    truncates its BERT encoder to the first ``num_layers_to_keep``
    layers, L1-prunes every ``torch.nn.Linear`` weight by
    ``prune_amount``, and stores the model (in eval mode) together with
    a matching token indexer in ``self.models`` / ``self.indexers``.

    NOTE(review): ``weigths`` is a misspelling of "weights" in the
    public signature; it is kept as-is so existing callers don't break.
    """
    # Per-model ensemble weights; default to 1 for each model when absent.
    self.model_weights = list(map(
        float, weigths)) if weigths else [1] * len(model_paths)
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    self.max_len = max_len
    self.min_len = min_len
    self.lowercase_tokens = lowercase_tokens
    self.min_probability = min_probability
    self.min_error_probability = min_error_probability
    self.vocab = Vocabulary.from_files(vocab_path)
    self.log = log
    self.iterations = iterations
    self.confidence = confidence
    self.resolve_cycles = resolve_cycles
    # set training parameters and operations

    self.indexers = []
    self.models = []
    for model_path in model_paths:
        if is_ensemble:
            # Each ensemble member carries its own model name and
            # special-tokens fix, derived from its file path.
            model_name, special_tokens_fix = self._get_model_data(
                model_path)
        weights_name = get_weights_name(model_name, lowercase_tokens)
        self.indexers.append(
            self._get_indexer(weights_name, special_tokens_fix))
        # token_embs = get_token_embedders(model_name, tune_bert=1, special_tokens_fix=special_tokens_fix)
        model = Seq2Labels(
            vocab=self.vocab,
            text_field_embedder=self._get_embbeder(weights_name,
                                                   special_tokens_fix),
            # text_field_embedder= token_embs,
            confidence=self.confidence).to(self.device)

        # count number of params
        pytorch_total_params = sum(p.numel() for p in model.parameters())
        print('total params:', pytorch_total_params)
        print('type:', type(model))
        if torch.cuda.is_available():
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(
                torch.load(model_path, map_location=torch.device('cpu')))

        # Report on-disk model size by round-tripping the state dict
        # through a temporary file.
        def print_size_of_model(model):
            torch.save(model.state_dict(), "temp.p")
            print('Size (MB):', os.path.getsize("temp.p") / 1e6)
            os.remove('temp.p')

        print_size_of_model(model)
        print('type:', type(model))

        def deleteEncodingLayers(
                model,
                num_layers_to_keep):  # must pass in the full bert model
            # Keep only the first `num_layers_to_keep` BERT encoder layers.
            oldModuleList = model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer
            newModuleList = nn.ModuleList()

            # Now iterate over all layers, only keeping only the relevant layers.
            for i in range(0, num_layers_to_keep):
                newModuleList.append(oldModuleList[i])

            # create a copy of the model, modify it with the new list, and return
            copyOfModel = copy.deepcopy(model)
            copyOfModel.text_field_embedder.token_embedder_bert.bert_model.encoder.layer = newModuleList

            return copyOfModel

        print('before model 12:', model)
        model = deleteEncodingLayers(model, num_layers_to_keep)
        print('after', num_layers_to_keep, ' :', model)
        print_size_of_model(model)

        # NOTE(review): large experimental blocks the original author left
        # commented out here (model re-saving, ONNX export with dummy
        # inputs, dynamic int8 quantization, alternative pruning modes)
        # have been dropped for readability; recover them from version
        # control history if those experiments are revisited.

        # Prune every Linear layer's weights, then make the pruning
        # permanent so the mask is folded into the weight tensor.
        print_size_of_model(model)
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                prune.l1_unstructured(module,
                                      name='weight',
                                      amount=prune_amount)
                prune.remove(module, name='weight')
        print('About to return')
        print_size_of_model(model)
        model.eval()
        self.models.append(model)
def save_top_results(process_no, start_index, end_index):
    """Score rule-generated answer candidates for SQuAD dev questions and save the top 3.

    Loads a saved ``DecomposableAttentionSoftmax`` model (embedding type selected by the
    hard-coded ``EMBEDDING_TYPE`` flag below), reads questions / generated responses /
    generation rules from three parallel files, and for the examples in the inclusive
    range ``[start_index, end_index]`` writes the question, answer and the 3 highest-scoring
    responses to a per-range output file.

    Parameters
    ----------
    process_no : worker id, used only in progress log lines.
    start_index : index of the first example this worker should process.
    end_index : index of the last example this worker should process (inclusive;
        reading stops once ``examples_count > end_index``).
    """
    print("Starting process {} with start at {} and end at {}".format(
        process_no, start_index, end_index))
    DATA_FOLDER = "train_data"
    # Experiment switches: exactly one LOSS_TYPE / EMBEDDING_TYPE pair is active;
    # the commented alternatives document previously-run configurations.
    # EMBEDDING_TYPE = ""
    LOSS_TYPE = ""  # NLL
    LOSS_TYPE = "_mse"  # MSE (overrides the NLL setting above)
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    # ELMo variants need character-level indexing; other embeddings use the reader default.
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150  # for bert and elmo
    # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt")
    # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt")
    # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt")
    # NOTE: Squad dev test set
    q_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt")
    r_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt")
    rules_file = os.path.join(
        "squad_seq2seq_dev_moses_tokenized",
        "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt")
    reader = QuestionResponseSoftmaxReader(q_file,
                                           r_file,
                                           token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    glove_embeddings_file = os.path.join("data", "glove", "glove.840B.300d.txt")
    # RESULTS_DIR = "squad_seq2seq_train2"
    # NOTE: All other experiments
    # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized"
    # make_dir_if_not_exists(RESULTS_DIR)
    # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index))
    # NOTE: Squad dev test set
    RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized"
    make_dir_if_not_exists(RESULTS_DIR)
    all_results_save_file = os.path.join(
        RESULTS_DIR,
        "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format(
            start_index, end_index))
    with open(all_results_save_file, "w") as all_writer:
        print("Testing out model with", EMBEDDING_TYPE, "embeddings")
        print("Testing out model with", LOSS_TYPE, "loss")
        # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]:
        for NEGATIVE_PERCENTAGE in [100]:
            model_file = os.path.join(
                "saved_softmax_models",
                "decomposable_attention{}{}_model_{}.th".format(
                    LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))
            vocabulary_filepath = os.path.join(
                "saved_softmax_models",
                "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE,
                                           NEGATIVE_PERCENTAGE))
            print("LOADING VOCABULARY")
            # Load vocabulary
            vocab = Vocabulary.from_files(vocabulary_filepath)
            EMBEDDING_DIM = 300
            PROJECT_DIM = 200
            DROPOUT = 0.2
            NUM_LAYERS = 2
            # Build the token embedder matching the chosen EMBEDDING_TYPE.
            if EMBEDDING_TYPE == "":
                token_embedding = Embedding(
                    num_embeddings=vocab.get_vocab_size('tokens'),
                    embedding_dim=EMBEDDING_DIM,
                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_glove":
                token_embedding = Embedding.from_params(
                    vocab=vocab,
                    params=Params({
                        'pretrained_file': glove_embeddings_file,
                        'embedding_dim': EMBEDDING_DIM,
                        'projection_dim': PROJECT_DIM,
                        'trainable': False
                    }))
            elif EMBEDDING_TYPE == "_elmo":
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
                options_file = os.path.join(
                    "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
                weights_file = os.path.join(
                    "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
                # NOTE: using Small size as medium size gave CUDA out of memory error
                # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
                # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
                # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
                # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained":
                options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                            "options.json")
                weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                            "weights.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_elmo_retrained_2":
                options_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                            "options_2.json")
                weights_file = os.path.join("data", "bilm-tf", "elmo_retrained",
                                            "weights_2.hdf5")
                token_embedding = ElmoTokenEmbedder(options_file,
                                                    weights_file,
                                                    dropout=DROPOUT,
                                                    projection_dim=PROJECT_DIM)
            elif EMBEDDING_TYPE == "_bert":
                print("Loading bert model")
                model = BertModel.from_pretrained('bert-base-uncased')
                token_embedding = BertEmbedder(model)
                PROJECT_DIM = 768  # BERT-base hidden size replaces the projection dim
            else:
                print("Error: Some weird Embedding type", EMBEDDING_TYPE)
                exit()
            word_embeddings = BasicTextFieldEmbedder(
                {"tokens": token_embedding})
            HIDDEN_DIM = 200
            # Attend / compare / aggregate feed-forward stacks of the
            # decomposable-attention architecture.
            params = Params({
                'input_dim': PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            attend_feedforward = FeedForward.from_params(params)
            similarity_function = DotProductSimilarity()
            params = Params({
                'input_dim': 2 * PROJECT_DIM,
                'hidden_dims': HIDDEN_DIM,
                'activations': 'relu',
                'num_layers': NUM_LAYERS,
                'dropout': DROPOUT
            })
            compare_feedforward = FeedForward.from_params(params)
            params = Params({
                'input_dim': 2 * HIDDEN_DIM,
                'hidden_dims': 1,
                'activations': 'linear',
                'num_layers': 1
            })
            aggregate_feedforward = FeedForward.from_params(params)
            model = DecomposableAttentionSoftmax(vocab, word_embeddings,
                                                 attend_feedforward,
                                                 similarity_function,
                                                 compare_feedforward,
                                                 aggregate_feedforward)
            print("MODEL CREATED")
            # Load model state (always onto CPU).
            with open(model_file, 'rb') as f:
                device = torch.device('cpu')
                model.load_state_dict(torch.load(f, map_location=device))
            print("MODEL LOADED!")
            if torch.cuda.is_available():
                # cuda_device = 3
                # model = model.cuda(cuda_device)
                cuda_device = -1  # GPU deliberately disabled even when available
            else:
                cuda_device = -1
            predictor = DecomposableAttentionSoftmaxPredictor(
                model, dataset_reader=reader)
            # Read test file and get predictions
            gold = list()
            predicted_labels = list()
            probs = list()
            total_time = avg_time = 0.0
            print("Started Testing:", NEGATIVE_PERCENTAGE)
            # Phase 1: before scoring anything, gather all (question, answer) pairs with
            # their candidate (response, rule) lists into memory.
            all_data = list()
            examples_count = processed_examples_count = 0
            with open(q_file, 'r') as q_reader, open(r_file, "r") as r_reader, open(
                    rules_file, "r") as rule_reader:
                logger.info("Reading questions from : %s", q_file)
                logger.info("Reading responses from : %s", r_file)
                q = next(q_reader).lower().strip()
                q = mt.tokenize(q, return_str=True, escape=False)
                current_qa = (q, "")
                current_rules_and_responses = list()
                # r_reader and rule_reader are parallel: a blank pair of lines marks the
                # end of one question's candidate block.
                for i, (response, rule) in enumerate(zip(r_reader, rule_reader)):
                    response = response.strip()
                    rule = rule.strip()
                    if response and rule:
                        # get current_answer from response
                        a = get_answer_from_response(response)
                        if not current_qa[1]:
                            current_qa = (q, a)
                        else:
                            # verify if the a is same as the one in current_qa
                            if a != current_qa[1]:
                                # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response)
                                current_qa = (current_qa[0], a)
                                # print(current_rules_and_responses)
                                # exit()
                        # Add it to the current responses
                        current_rules_and_responses.append((response, rule))
                    elif len(current_rules_and_responses) > 0:
                        # Blank separator reached: finalize the current example.
                        # print(current_qa)
                        # print(current_rules_and_responses)
                        # exit()
                        if rule or response:
                            # Exactly one of the two files had a blank line: inputs are
                            # out of sync, so bail out loudly.
                            print("Rule Response mismatch")
                            print(current_qa)
                            print(response)
                            print(rule)
                            print(examples_count)
                            print(i)
                            exit()
                        if examples_count < start_index:
                            # Not yet in this worker's range: skip ahead to the next question.
                            examples_count += 1
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                            current_qa = (q, "")
                            current_rules_and_responses = list()
                            continue
                        elif examples_count > end_index:
                            break
                        all_data.append(
                            (current_qa, current_rules_and_responses))
                        try:
                            q = next(q_reader).lower().strip()
                            q = mt.tokenize(q, return_str=True, escape=False)
                        except StopIteration:
                            # previous one was the last question
                            q = ""
                        current_qa = (q, "")
                        current_rules_and_responses = list()
                        examples_count += 1
                        # if(examples_count%100 == 0):
                        #     print(examples_count)
                    else:
                        # Blank pair with no accumulated candidates: should be impossible.
                        # Serious Bug
                        print("Serious BUG!!")
                        print(current_qa)
                        print(response)
                        print(rule)
                        print(examples_count)
                        print(i)
                        exit()
            print("{}:\tFINISHED IO".format(process_no))
            # Phase 2: score every example's candidates and write the top 3.
            examples_count = start_index
            processed_examples_count = 0
            for current_qa, responses_and_rules in all_data:
                start_time = time.time()
                # Tokenize and preprocess the responses
                preprocessed_responses = [
                    mt.tokenize(remove_answer_brackets(response),
                                return_str=True,
                                escape=False)
                    for response, rule in responses_and_rules
                ]
                # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules])
                predictions = predictor.predict(current_qa[0],
                                                preprocessed_responses)
                label_probs = predictions["label_probs"]
                tuples = zip(responses_and_rules, label_probs)
                sorted_by_score = sorted(tuples,
                                         key=lambda tup: tup[1],
                                         reverse=True)
                count = 0
                all_writer.write("{}\n".format(current_qa[0]))
                all_writer.write("{}\n".format(current_qa[1]))
                # Keep only the 3 highest-probability candidates.
                for index, ((response, rule),
                            label_prob) in enumerate(sorted_by_score):
                    if index == 3:
                        break
                    all_writer.write("{}\t{}\t{}\t{}\n".format(
                        response,
                        mt.tokenize(remove_answer_brackets(response),
                                    return_str=True,
                                    escape=False), rule, label_prob))
                all_writer.write("\n")
                all_writer.flush()
                end_time = time.time()
                processed_examples_count += 1
                examples_count += 1
                total_time += end_time - start_time
                avg_time = total_time / float(processed_examples_count)
                print(
                    "{}:\ttime to write {} with {} responses is {} secs. {} avg time"
                    .format(process_no, examples_count,
                            len(responses_and_rules), end_time - start_time,
                            avg_time))
parser.add_argument('--learning_rate', type=float, default=2.0) parser.add_argument('--max_steps', type=int, default=30) parser.add_argument('--num_updates', type=int, default=1) parser.add_argument('--beam_size', type=int, default=1) parser.add_argument('--maskers', type=str, default=IDENTITY_TOKEN, help='string with comma-separated values') parser.add_argument('--early_stopping', action='store_true') parser.add_argument('--sample', type=int, default=None) if __name__ == '__main__': args = parser.parse_args() class_reader = ClassificationReader(skip_start_end=True) class_vocab = Vocabulary.from_files(Path(args.classifier_path) / 'vocab') class_model_args = get_args_from_path( Path(args.classifier_path) / 'args.json') class_model = get_model_by_name(**class_model_args, vocab=class_vocab) load_weights(class_model, Path(args.classifier_path) / 'best.th') reader = CopyNetReader(masker=None) copynet_vocab = Vocabulary.from_files(Path(args.copynet_path) / 'vocab') copynet_model_args = get_args_from_path( Path(args.copynet_path) / 'args.json') copynet_model = get_model_by_name(**copynet_model_args, vocab=copynet_vocab, beam_size=args.beam_size) load_weights(copynet_model, Path(args.copynet_path) / 'best.th') class_model_copynet_args = get_args_from_path(
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map
    cuda_device : Any
        device passed through to task creation.

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.
    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name", task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    # BUGFIX: the f-prefix was previously on the placeholder-free literal, so the
    # "{tokenizer_names:s}" part was never interpolated into the error message.
    assert len(set(tokenizer_names.values())) <= 1, (
        "Error: mixing tasks with different tokenizers!"
        f" Tokenizations: {tokenizer_names!s}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    # Per-tokenizer vocab path, to allow e.g. roberta and albert (with different
    # vocabs) to share one experiment folder.
    vocab_path = os.path.join(args.exp_dir, "vocab", input_module_tokenizer_name(args.input_module))
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, "rb"))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            # To store preprocessed data for models that use different indexers
            # in the same exp directory.
            indexer = input_module_tokenizer_name(args.input_module)
            relative_path = _get_serialized_record_path(task.name, split, "preproc", indexer)
            # TODO: change the global read-only dir to point to arwen, and the local
            # one to be one exp folder with different runs.
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path, log_prefix=log_prefix
            )
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir, indexer)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)
                _index_split(
                    task, split, indexers, vocab, record_file, model_preprocessing_interface
                )
        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        indexer = input_module_tokenizer_name(args.input_module)
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir, indexer=indexer)
        task.test_data = _get_instance_generator(task.name, "test", preproc_dir, indexer=indexer)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir, fraction=args.pretrain_data_fraction, indexer=indexer
            )
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir, fraction=args.target_train_data_fraction, indexer=indexer
            )
            target_tasks.append(task)

    log.info("\t Training on %s", ", ".join(pretrain_task_names))
    log.info("\t Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
# change paths to your data directory config = { "vocab_directory": "../data/allen_vocab_lower_10", "pre_trained_embedding": "../data/glove.42B.300d.txt", "model": "knrm", "train_data": "../data/triples.train.tsv", "validation_data": "../data/tuples.validation.tsv", "test_data": "../data/tuples.test.tsv", } # # data loading # vocab = Vocabulary.from_files(config["vocab_directory"]) tokens_embedder = Embedding.from_params( vocab, Params({ "pretrained_file": config["pre_trained_embedding"], "embedding_dim": 300, "trainable": True, "padding_index": 0 })) word_embedder = BasicTextFieldEmbedder({"tokens": tokens_embedder}) # recommended default params for the models (but you may change them if you want) if config["model"] == "knrm": model = KNRM(word_embedder, n_kernels=11) elif config["model"] == "conv_knrm":
def from_params(
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
) -> "TrainerPieces":
    """Build all training pieces (model, vocab, iterators, datasets, trainer params)
    from a ``Params`` configuration.

    Note: the order of ``params.pop(...)`` calls below matters — each key may only
    be consumed once from the configuration.

    Parameters
    ----------
    params : Params
        Full experiment configuration; consumed destructively via ``pop``.
    serialization_dir : str
        Directory where the vocabulary is saved/recovered.
    recover : bool
        If True and a saved vocabulary exists, reuse it instead of rebuilding.
    cache_directory, cache_prefix : str
        Forwarded to ``training_util.meta_dataset_from_params``.
    """
    all_datasets = training_util.meta_dataset_from_params(params, cache_directory, cache_prefix)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        # Reuse the vocabulary from a previous run rather than rebuilding it.
        vocab_params = params.pop("vocabulary", {})
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"),
            vocab_params.get("padding_token", None),
            vocab_params.get("oov_token", None),
        )
    else:
        # 'train' datasets are nested one level deeper (a dataset of sub-datasets)
        # than validation/test, hence the extra loop level here.
        instance_train = (
            instance
            for key, dataset in all_datasets.items()
            if key == 'train'
            for subdata in dataset
            for instance in subdata
        )
        instance_valid_test = (
            instance
            for key, dataset in all_datasets.items()
            if key != 'train'
            for instance in dataset
        )
        instances = chain(instance_train, instance_valid_test)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            # Using a generator comprehension here is important
            # because, being lazy, it allows us to not iterate over the
            # dataset when directory_path is specified.
            # (
            #     instance
            #     for key, dataset in all_datasets.items()
            #     if (key in datasets_for_vocab_creation)
            #     for instance in dataset
            # ),
            instances
        )

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()

    # Initializing the model can have side effect of expanding the vocabulary
    # Save the vocab only in the master
    if not is_distributed() or is_master():
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    # print('[info] iterator in meta_pieces is:{}'.format(params.pop("iterator")))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets["train"]
    validation_data = all_datasets.get("validation")
    test_data = all_datasets.get("test")

    trainer_params = params.pop("trainer")
    # Freeze any parameters matched by the "no_grad" regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(
        model
    )
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return cls(
        model,
        iterator,
        train_data,
        validation_data,
        test_data,
        validation_iterator,
        trainer_params,
    )
def test_from_files_with_model_archive(self):
    """Loading a vocabulary directly from a model archive should expose the
    archived namespaces and token mappings."""
    vocab = Vocabulary.from_files(str(self.model_archive))
    # BUGFIX: this comparison was previously a bare expression (no `assert`),
    # so the namespace check silently did nothing.
    assert vocab.get_namespaces() == {"tokens", "labels"}
    assert vocab.get_token_from_index(3, namespace="tokens") == "u.n."
validation_dataset=validation_dataset, patience=10, num_epochs=1000, cuda_device=cuda_device) trainer.train() predictor = SentenceTaggerPredictor(model, dataset_reader=reader) tag_logits = predictor.predict("The dog ate the apple")['tag_logits'] tag_ids = np.argmax(tag_logits, axis=-1) print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids]) with open('/tmp/model.th', 'wb') as f: torch.save(model.state_dict(), f) vocab.save_to_files('/tmp/vocabulary') vocab2 = Vocabulary.from_files('/tmp/vocabulary') model2 = LstmTagger(word_embeddings, lstm, vocab2) with open('/tmp/model.th', 'rb') as f: model2.load_state_dict(torch.load(f)) if cuda_device > -1: model2.cuda(cuda_device) predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader) tag_logits2 = predictor2.predict('The dog ate the apple')['tag_logits'] np.testing.assert_array_almost_equal(tag_logits, tag_logits2)
overrides = overrides = json.dumps( {"trainer": { "cuda_device": cuda_device }}) params = Params.from_file(model_config, overrides) model_file = 'checkpoint/%s%s/' % (model_name, attn) iterator = DataIterator.from_params(params.pop("iterator")) torch.manual_seed(0) numpy.random.seed(0) if write_file: wf = Write_outfile(Wfile_name) print("Loading vocabulary") vocab = Vocabulary.from_files(model_file + 'vocabulary') print('Initialing model') model = Model.from_params(vocab=vocab, params=params.pop('model')) print("Loading Model file from %s" % (model_file + 'best.th')) with open(model_file + 'best.th', 'rb') as f: model.load_state_dict(torch.load(f, encoding='utf-8')) iterator.index_with(vocab) dataset_reader_params = params.pop('dataset_reader') datareader = DatasetReader.from_params(dataset_reader_params) model.eval() #读取文件数据 for file in files: dom = xml.dom.minidom.parse(file)
@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    # Turn a {"sentence": ...} JSON payload into a dataset Instance by
    # word-tokenizing the sentence and delegating to the reader.
    sentence = json_dict["sentence"]
    tokens = self._tokenizer.split_words(sentence)
    return self._dataset_reader.text_to_instance(
        [Token(t) for t in tokens])


# --- Script: rebuild the trained LSTM classifier from disk and evaluate it. ---
# NOTE(review): `word_embeddings`, `out_dir`, `cuda_device` and `reader` are
# defined earlier in this file (outside this excerpt).
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
# Load the saved vocabulary and model weights.
vocab = Vocabulary.from_files("{}/vocabulary".format(out_dir))
model = LstmClassifier(word_embeddings, lstm, vocab)
with open("{}/model.th".format(out_dir), 'rb') as f:
    model.load_state_dict(torch.load(f))
if cuda_device > -1:
    model.cuda(cuda_device)
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

# EVALUATION: precision/recall counters over the held-out test file.
true_pos = 0
false_pos = 0
false_neg = 0
fo = open('{}/test.txt'.format(out_dir), 'r')
lines = fo.readlines()
fo.close()
# Script: time one full pass of a BucketIterator over an IR triples dataset,
# indexed with a vocabulary loaded from disk.
parser.add_argument('--vocab-file',
                    action='store',
                    dest='vocab_file',
                    help='vocab directory path',
                    required=True)

args = parser.parse_args()

#
# load data & create vocab
# -------------------------------
#

# Previously-tried token indexers, kept for reference:
#_token_indexers = {"tokens": FastTextNGramIndexer(20)}
#_token_indexers = {"tokens": FastTextNGramIndexer(20)}
#_token_indexers = {"tokens": ELMoTokenCharactersIndexer()}

loader = IrTripleDatasetReader(lazy=True,  #token_indexers=_token_indexers,
                               tokenizer=BlingFireTokenizer())
#BlingFireTokenizer()) #WordTokenizer(word_splitter=JustSpacesWordSplitter()))
#,max_doc_length=200,max_query_length=20,min_doc_length=200,min_query_length=20)

instances = loader.read(args.dataset_file)
_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                         ("doc_neg_tokens", "num_tokens")])

#vocab_map,vocab_data = FastTextVocab.load_ids(args.vocab_file,20)
#vocab = FastTextVocab(vocab_map, vocab_data,20)
_iterator.index_with(Vocabulary.from_files(args.vocab_file))

with Timer("iterate over all"):
    for i in _iterator(instances, num_epochs=1):
        # NOTE(review): exits on the very first batch, which contradicts the
        # "iterate over all" timer label — confirm whether a full pass was intended.
        exit()
def main(args):
    """Train a sequence-tagging grammar-correction model.

    Reads train/dev sets, builds (or extends) a vocabulary, constructs the model,
    and runs the Trainer, saving the final weights to ``<model_dir>/model.th``.
    """
    # fix_seed()  # deliberately NOT fixed: results are better, and a fixed seed
    # would defeat shuffling.
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)
    # list(train_data)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}

    # build vocab — vocabulary-construction logic below.
    # NOTE(review): if args.vocab_path is empty, `old_vocab` is never bound and
    # `vocab = old_vocab` below raises NameError — confirm vocab_path is mandatory.
    if args.vocab_path:
        old_vocab = Vocabulary.from_files(args.vocab_path)
    # Modified so that, whether or not a vocab path is passed, a vocabulary is
    # rebuilt from the dataset, after which the two vocabularies are merged.
    if 1:
        # Build a vocabulary from the training data (this is what ends up in the
        # output_vocabulary directory). Uses AllenNLP's packaged from_instances;
        # to change the result, change the data fed in above.
        new_vocab = Vocabulary.from_instances(train_data,
                                              max_vocab_size={
                                                  'tokens': 30000,
                                                  'labels': args.target_vocab_size,
                                                  'd_tags': 2
                                              },
                                              tokens_to_add=tokens_to_add)
    from allennlp.common.params import Params
    params = Params({"non_padded_namespaces": set(namespaces)})
    # Extend the loaded vocabulary in-place with the training instances.
    vocab = old_vocab
    old_vocab.extend_from_instances(params, train_data)
    old_vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))
    from pathlib import Path
    vocabdir = Path(__file__).resolve().parent.parent / os.path.join(
        args.model_dir, 'vocabulary', 'labels.txt')

    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        # We do not normally load weights from here.
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    model = model.to(device)
    print("Model is set", '模型加载完毕')

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        iterator=iterator,
        train_dataset=train_data,
        validation_dataset=dev_data,
        serialization_dir=args.model_dir,
        patience=args.patience,
        num_epochs=args.n_epoch,
        cuda_device=cuda_device,
        shuffle=True,  # changed here to True
        accumulated_batch_count=args.accumulation_size,
        cold_step_count=args.cold_steps_count,
        cold_lr=args.cold_lr,
        cuda_verbose_step=int(args.cuda_verbose_steps)
        if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train(args.oldmodel)

    # Here's how to save the model.
    # Save the best model once more, so this directory only needs model.th —
    # the epoch-numbered checkpoints can be ignored.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped", "训练全部结束,model存在了",
          args.model_dir + ' / model.th')
def from_params(cls, params: Params, instances: Iterable['adi.Instance'] = None):
    """
    There are two possible ways to build a vocabulary; from a collection of instances, using
    :func:`Vocabulary.from_instances`, or from a pre-saved vocabulary, using
    :func:`Vocabulary.from_files`. You can also extend pre-saved vocabulary with collection
    of instances using this method. This method wraps these options, allowing their
    specification from a ``Params`` object, generated from a JSON configuration file.

    Parameters
    ----------
    params: Params, required.
    instances: Iterable['adi.Instance'], optional
        If ``params`` doesn't contain a ``directory_path`` key, the ``Vocabulary`` can be
        built directly from a collection of instances (i.e. a dataset). If ``extend`` key
        is set False, dataset instances will be ignored and final vocabulary will be one
        loaded from ``directory_path``. If ``extend`` key is set True, dataset instances
        will be used to extend the vocabulary loaded from ``directory_path`` and that
        will be final vocabulary used.

    Returns
    -------
    A ``Vocabulary``.
    """
    # pylint: disable=arguments-differ
    # Vocabulary is ``Registrable`` so that you can configure a custom subclass,
    # but (unlike most of our registrables) almost everyone will want to use the
    # base implementation. So instead of having an abstract ``VocabularyBase`` or
    # such, we just add the logic for instantiating a registered subclass here,
    # so that most users can continue doing what they were doing.
    vocab_type = params.pop("type", None)
    if vocab_type is not None:
        return cls.by_name(vocab_type).from_params(params=params, instances=instances)

    extend = params.pop("extend", False)
    vocabulary_directory = params.pop("directory_path", None)
    # Validate the three-way interaction between directory_path, extend and instances
    # before doing any work.
    if not vocabulary_directory and not instances:
        raise ConfigurationError(
            "You must provide either a Params object containing a "
            "vocab_directory key or a Dataset to build a vocabulary from.")
    if extend and not instances:
        raise ConfigurationError(
            "'extend' is true but there are not instances passed to extend."
        )
    if extend and not vocabulary_directory:
        raise ConfigurationError(
            "'extend' is true but there is not 'directory_path' to extend from."
        )

    if vocabulary_directory and instances:
        if extend:
            logger.info(
                "Loading Vocab from files and extending it with dataset.")
        else:
            logger.info("Loading Vocab from files instead of dataset.")

    if vocabulary_directory:
        vocab = Vocabulary.from_files(vocabulary_directory)
        if not extend:
            # Loading only: all params must have been consumed at this point.
            params.assert_empty("Vocabulary - from files")
            return vocab
    if extend:
        # Remaining params (min_count etc.) are consumed by extend_from_instances.
        vocab.extend_from_instances(params, instances=instances)
        return vocab
    # Neither a directory nor extension: build from scratch out of the instances.
    min_count = params.pop("min_count", None)
    max_vocab_size = pop_max_vocab_size(params)
    non_padded_namespaces = params.pop("non_padded_namespaces",
                                       EXTENDED_NON_PADDED_NAMESPACES)
    pretrained_files = params.pop("pretrained_files", {})
    min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None)
    only_include_pretrained_words = params.pop_bool(
        "only_include_pretrained_words", False)
    tokens_to_add = params.pop("tokens_to_add", None)
    params.assert_empty("Vocabulary - from dataset")
    return ExtendedVocabulary.from_instances(
        instances=instances,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        non_padded_namespaces=non_padded_namespaces,
        pretrained_files=pretrained_files,
        only_include_pretrained_words=only_include_pretrained_words,
        tokens_to_add=tokens_to_add,
        min_pretrained_embeddings=min_pretrained_embeddings)
def main():
    """Train (or load) an LSTM sentiment classifier on binary SST, then run a
    universal-trigger attack: learn a short token prefix that flips the model's
    predictions on the targeted label subset of the dev data.

    Side effects: downloads SST data, trains/saves a model and vocabulary under
    /tmp, and prints accuracies via ``utils.get_accuracy``. Requires a CUDA GPU
    (``model.cuda()`` / ``cuda_device=0``).
    """
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    # Dev reader does NOT use subtrees: evaluation is on whole sentences only.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300)
        word_embedding_dim = 300
    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300
    # NOTE(review): any other EMBEDDING_TYPE value falls through and leaves
    # token_embedding/word_embedding_dim unbound -> NameError below. Confirm
    # EMBEDDING_TYPE is restricted to {"None", "w2v"} at the call site.

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in train mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in train mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in train mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
] vocab = Vocabulary() for ns in ["tokens", "token_in", "token_out"]: for chord in itertools.product(note_list, accidental_list, chord_type_list): vocab.add_token_to_namespace("".join(chord), namespace=ns) vocab.add_token_to_namespace(START_SYMBOL, namespace=ns) vocab.add_token_to_namespace(END_SYMBOL, namespace=ns) key_list = [ "".join(x) for x in itertools.product(note_list, accidental_list) ] form_list = ["m", "+", "o", "M", "%", "It", "Ger", "Fr"] figbass_list = ["7", "6"] for char in (key_list + form_list + figbass_list): vocab.add_token_to_namespace(char, namespace="token_characters") note_number_list = [str(x) for x in range(12)] for note_number in note_number_list: vocab.add_token_to_namespace(note_number, namespace="notes") vocab.save_to_files("data/vocabulary") if __name__ == "__main__": generate_vocab() vocab = Vocabulary.from_files("data/vocabulary") print(vocab.get_token_to_index_vocabulary())
def multiprocess_single_sequence_loader(process_number: int, _config,
                                        _queue: mp.Queue, _wait_for_exit: mp.Event,
                                        _local_file,
                                        _fasttext_vocab_cached_mapping,
                                        _fasttext_vocab_cached_data):
    """Worker entry point: read ``_local_file`` with an IrSingleSequenceDatasetReader,
    batch it with a BucketIterator, and push each batch into ``_queue``.

    A ``None`` sentinel is enqueued when the file is exhausted; the process then
    blocks on ``_wait_for_exit`` so tensors placed in shared memory stay alive
    until the consumer is done with them.

    Parameters
    ----------
    process_number : int
        Index of this worker (currently unused in the body — presumably for
        logging/debugging by callers; verify).
    _config : dict-like
        Run configuration; keys read here: random_seed, token_embedder_type,
        bert_pretrained_model, max_doc_length, min_doc_length, batch_size_eval,
        vocab_directory, fasttext_max_subwords.
    _queue : mp.Queue
        Output queue for indexed training batches.
    _wait_for_exit : mp.Event
        Event the worker waits on after finishing, to keep shared memory valid.
    """
    # Seed all RNGs from the config so every worker is deterministic.
    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"],
                                     do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True,
                                                      tokenizer=_tokenizer,
                                                      token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"],
                                                      min_seq_length=_config["min_doc_length"], )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        # Pick the indexer/vocab pair matching the embedder type. ELMo is
        # character-based, so it needs no vocabulary (index_with(None)).
        # NOTE(review): an unrecognized token_embedder_type leaves
        # _token_indexers/_vocab unbound -> NameError below; confirm the
        # config is validated upstream.
        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                                   _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])

        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True,
                                                      tokenizer=_tokenizer,
                                                      token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"],
                                                      min_seq_length=_config["min_doc_length"], )

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(_vocab)

    # Stream the file once; each put() moves the batch tensors into shared memory.
    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
def main(args):
    """Train a GEC sequence-tagging model end to end.

    Reads train/dev sets, builds (or loads) the vocabulary, constructs the
    transformer-based tagger, optionally warm-starts from pretrained weights,
    trains with an AllenNLP Trainer, and dumps the final weights to
    ``<model_dir>/model.th``.

    Parameters
    ----------
    args : argparse.Namespace
        Command-line options (paths, model/training hyperparameters).
    """
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name, args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    # Make sure OOV/padding tokens exist in the label namespaces.
    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        # Freshly built vocab is capped per namespace and saved for reuse.
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={'tokens': 30000,
                                                          'labels': args.target_vocab_size,
                                                          'd_tags': 2},
                                          tokens_to_add=tokens_to_add)
        vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name, vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    # Pick single-GPU, multi-GPU (list of ids), or CPU (-1) for the Trainer.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        # Warm start: load on CPU first, then move to the target device below.
        model.load_state_dict(
            torch.load(os.path.join(args.pretrain_folder, args.pretrain + '.th'),
                       map_location=torch.device('cpu')))

    model = model.to(device)
    print("Model is set")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10)

    # updates_per_epoch is expressed in optimizer steps; convert to instances.
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")],
                              biggest_batch_first=True,
                              max_instances_in_memory=args.batch_size * 20000,
                              instances_per_epoch=instances_per_epoch,
                              )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")
def main(args):
    """Train a GEC sequence-tagging model, optionally replacing the BERT
    encoder layers with a truncated (``args.keep``-layer) stack taken from a
    previously trained GecBERTModel before training.

    Reads train/dev sets, builds (or loads) the vocabulary, constructs the
    tagger, optionally warm-starts from pretrained weights, performs the
    layer-keep surgery when ``args.keep != 12``, trains with an AllenNLP
    Trainer, and dumps the final weights to ``<model_dir>/model.th``.

    Parameters
    ----------
    args : argparse.Namespace
        Command-line options (paths, model/training hyperparameters).
    """
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name, args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    # Make sure OOV/padding tokens exist in the label namespaces.
    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={'tokens': 30000,
                                                          'labels': args.target_vocab_size,
                                                          'd_tags': 2},
                                          tokens_to_add=tokens_to_add)
        vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name, vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    # model = GecBERTModel(vocab_path=args.vocab_path,
    #                      model_paths=args.model_path,
    #                      max_len=args.max_len, min_len=args.min_len,
    #                      iterations=args.iteration_count,
    #                      min_error_probability=args.min_error_probability,
    #                      min_probability=args.min_error_probability,
    #                      lowercase_tokens=args.lowercase_tokens,
    #                      model_name=args.transformer_model,
    #                      special_tokens_fix=args.special_tokens_fix,
    #                      log=False,
    #                      confidence=args.additional_confidence,
    #                      is_ensemble=args.is_ensemble,
    #                      weigths=args.weights)

    # Pick single-GPU, multi-GPU (list of ids), or CPU (-1) for the Trainer.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        # NOTE(review): unlike the sibling training script, this load has no
        # map_location — it will fail on a CPU-only host if the checkpoint
        # was saved from GPU; confirm intended.
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    print('cuda_device:', cuda_device)
    #exit(0)
    model = model.to(device)
    print("Model is set")
    # print('model:', model)

    def print_size_of_model(model):
        # Rough on-disk size estimate: serialize to a temp file and stat it.
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
        os.remove('temp.p')

    if args.keep != 12:
        # Layer-keep surgery: load a reference model built with only
        # ``args.keep`` encoder layers and graft its (truncated) layer stack
        # into the freshly constructed model.
        prev_model = GecBERTModel(vocab_path=args.vocab_path,
                                  model_paths=args.model_path,
                                  max_len=args.max_len, min_len=args.min_len,
                                  iterations=args.iteration_count,
                                  min_error_probability=args.min_error_probability,
                                  min_probability=args.min_error_probability,
                                  lowercase_tokens=args.lowercase_tokens,
                                  model_name=args.transformer_model,
                                  special_tokens_fix=args.special_tokens_fix,
                                  log=False,
                                  confidence=args.additional_confidence,
                                  is_ensemble=args.is_ensemble,
                                  weigths=args.weights,
                                  num_layers_to_keep=args.keep)
        # print('prev_model:', prev_model.models)
        # print(model)
        # Sizes before/after the graft, to confirm the truncation took effect.
        print_size_of_model(model)
        print_size_of_model(prev_model.models[0])
        model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer = \
            prev_model.models[0].text_field_embedder.token_embedder_bert.bert_model.encoder.layer
        print_size_of_model(model)
        # exit(0)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10)

    # updates_per_epoch is expressed in optimizer steps; convert to instances.
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")],
                              biggest_batch_first=True,
                              max_instances_in_memory=args.batch_size * 20000,
                              instances_per_epoch=instances_per_epoch,
                              )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)
    GPUtil.showUtilization()
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")