def _get_model(self) -> nn.Module:
    word_embedder = TrainableWordEmbedder(
        embedding_type=self.hparams.get("emb_type"),
        datasets_manager=self.data_manager,
    )
    elmo_embedder = BowElmoEmbedder(
        datasets_manager=self.data_manager, layer_aggregation="sum"
    )
    embedder = ConcatEmbedders([word_embedder, elmo_embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        bidirectional=self.hparams.get("bidirectional"),
        combine_strategy=self.hparams.get("combine_strategy"),
        rnn_bias=True,
        dropout_value=self.hparams.get("lstm2seq_dropout", 0.0),
        add_projection_layer=False,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim"),
        datasets_manager=self.data_manager,
    )
    return model
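# The encoding_dim ternary above recurs throughout this section: a bidirectional
# LSTM whose forward and backward states are combined with "concat" produces twice
# the hidden size, otherwise the hidden size is unchanged. A minimal, hypothetical
# helper (not part of the original code) that captures that arithmetic:
def _encoding_dim(hidden_dim: int, bidirectional: bool, combine_strategy: str) -> int:
    """Output size of an Lstm2SeqEncoder that has no projection layer."""
    if bidirectional and combine_strategy == "concat":
        return 2 * hidden_dim
    return hidden_dim

# e.g. _encoding_dim(100, True, "concat") == 200 and _encoding_dim(100, True, "sum") == 100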
def build_model(self):
    embedder = TrainableWordEmbedder(
        embedding_type=self.hparams.get("emb_type"),
        datasets_manager=self.data_manager,
        device=self.hparams.get("device"),
    )
    embedder = ConcatEmbedders([embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=self.hparams.get("dropout"),
        hidden_dim=self.hparams.get("hidden_dim"),
        bidirectional=self.hparams.get("bidirectional"),
        combine_strategy=self.hparams.get("combine_strategy"),
        rnn_bias=True,
        device=self.hparams.get("device"),
        num_layers=self.hparams.get("num_layers"),
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim"),
        device=self.hparams.get("device"),
        tagging_type="IOB1",
        datasets_manager=self.data_manager,
    )
    return model
def setup_seq2seq_model(abs_sum_dataset_manager):
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    NUM_LAYERS = 1
    MAX_LENGTH = 4
    datasets_manager = abs_sum_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=datasets_manager.namespace_to_vocab["tokens"],
        max_length=MAX_LENGTH,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        num_layers=NUM_LAYERS,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )
    model = Seq2SeqModel(
        rnn2seqencoder=encoder,
        rnn2seqdecoder=decoder,
        datasets_manager=datasets_manager,
        enc_hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
    )
    return (
        model,
        datasets_manager,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "MAX_LENGTH": MAX_LENGTH,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
        },
    )
def setup_lstm2seqencoder(request):
    EMBEDDING_DIM = 100
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    NUM_TIME_STEPS = 10
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]

    EMBEDDING = nn.Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    tokens = np.random.randint(0, VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TIME_STEPS))
    tokens = torch.LongTensor(tokens)
    embedder = VanillaEmbedder(embedding=EMBEDDING, embedding_dim=EMBEDDING_DIM)
    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )
    return (
        encoder,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_LAYERS": NUM_LAYERS,
        },
    )
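# setup_lstm2seqencoder reads request.param, so it is presumably exposed through a
# parametrized pytest fixture. A minimal sketch of such a registration; the fixture
# name "lstm2seqencoder_setup" and the parameter tuples of
# (BIDIRECTIONAL, COMBINE_STRATEGY, NUM_LAYERS) are illustrative assumptions,
# not taken from the original test suite.
import pytest

@pytest.fixture(params=[(True, "concat", 1), (True, "sum", 1), (False, "concat", 2)])
def lstm2seqencoder_setup(request):
    # Delegates to the setup helper above; each test then unpacks (encoder, options).
    return setup_lstm2seqencoder(request)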
def setup_parscit_tagger(seq_dataset_manager):
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )
    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )
    return (
        tagger,
        dataset_manager,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
        },
    )
def build_science_ie_model(dirname: str):
    exp_dirpath = pathlib.Path(dirname)
    data_dir = pathlib.Path(DATA_DIR)
    train_filename = data_dir.joinpath("train_science_ie_conll.txt")
    dev_filename = data_dir.joinpath("dev_science_ie_conll.txt")
    test_filename = data_dir.joinpath("dev_science_ie_conll.txt")

    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["TASK", "PROCESS", "MATERIAL"],
    )
    word_embedder = TrainableWordEmbedder(
        embedding_type="glove_6B_100", datasets_manager=data_manager
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=20, hidden_dimension=25, datasets_manager=data_manager
    )
    embedder = ConcatEmbedders([word_embedder, char_embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=350,
        bidirectional=True,
        combine_strategy="concat",
        rnn_bias=True,
        device=torch.device("cpu"),
        num_layers=2,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=700,
        datasets_manager=data_manager,
        namespace_to_constraints=None,
        tagging_type="BIOUL",
    )
    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )
    return infer
def setup_lstm2seqencoder(request):
    HIDDEN_DIM = 1024
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]
    ADD_PROJECTION_LAYER = request.param[3]

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
        add_projection_layer=ADD_PROJECTION_LAYER,
    )
    lines = []
    texts = ["First sentence", "second sentence"]
    for text in texts:
        line = Line(text=text)
        lines.append(line)
    return (
        encoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL and not ADD_PROJECTION_LAYER
            else HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "LINES": lines,
            "TIME_STEPS": 2,
        },
    )
def encoder():
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    hidden_dim = 50
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder, hidden_dim=hidden_dim, bidirectional=False
    )
    attn_module = DotProductAttention()
    context_embedder = WordEmbedder(
        embedding_type="glove_6B_50", word_tokens_namespace="tokens"
    )
    encoder = Lstm2SeqAttnContextEncoder(
        rnn2seqencoder=lstm2seqencoder,
        attn_module=attn_module,
        context_embedder=context_embedder,
    )
    return encoder
def build_model(self):
    word_embedder = TrainableWordEmbedder(
        embedding_type=self.hparams.get("emb_type"),
        datasets_manager=self.data_manager,
    )
    char_embedder = CharEmbedder(
        char_embedding_dimension=self.hparams.get("char_emb_dim"),
        hidden_dimension=self.hparams.get("char_encoder_hidden_dim"),
        datasets_manager=self.data_manager,
    )
    elmo_embedder = BowElmoEmbedder(datasets_manager=self.data_manager)
    embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])
    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        hidden_dim=self.hparams.get("hidden_dim"),
        bidirectional=self.hparams.get("bidirectional"),
        combine_strategy=self.hparams.get("combine_strategy"),
        rnn_bias=True,
    )
    model = RnnSeqCrfTagger(
        rnn2seqencoder=lstm2seqencoder,
        encoding_dim=2 * self.hparams.get("hidden_dim")
        if self.hparams.get("bidirectional")
        and self.hparams.get("combine_strategy") == "concat"
        else self.hparams.get("hidden_dim"),
        datasets_manager=self.data_manager,
    )
    self.printer.good("Finished Loading the Model")
    return model
    device=args.device)
char_embedder = CharEmbedder(
    char_embedding_dimension=args.char_emb_dim,
    hidden_dimension=args.char_encoder_hidden_dim,
    datasets_manager=data_manager,
    device=args.device,
)
embedder = ConcatEmbedders([embedder, char_embedder])
lstm2seqencoder = Lstm2SeqEncoder(
    embedder=embedder,
    dropout_value=args.dropout,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    rnn_bias=True,
    device=torch.device(args.device),
    num_layers=args.num_layers,
    add_projection_layer=False,
)
model = RnnSeqCrfTagger(
    rnn2seqencoder=lstm2seqencoder,
    encoding_dim=2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim,
    device=torch.device(args.device),
    tagging_type="BIOUL",
    datasets_manager=data_manager,
)
optimizer = optim.Adam(params=model.parameters(),
    char_emb_dim=CHAR_EMBEDDING_DIMENSION,
    char_embedder=char_embedder,
    bidirectional=True,
    hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
    combine_strategy="concat",
    device=torch.device(DEVICE),
)
embedder = ConcatEmbedders([embedder, char_encoder])
EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM
lstm2seqencoder = Lstm2SeqEncoder(
    emb_dim=EMBEDDING_DIMENSION,
    embedder=embedder,
    dropout_value=DROPOUT,
    hidden_dim=HIDDEN_DIM,
    bidirectional=BIDIRECTIONAL,
    combine_strategy=COMBINE_STRATEGY,
    num_layers=NUM_LAYERS,
    rnn_bias=True,
    device=torch.device(DEVICE),
)
model = ScienceIETagger(
    rnn2seqencoder=lstm2seqencoder,
    num_classes=NUM_CLASSES,
    hid_dim=2 * HIDDEN_DIM
    if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
    else HIDDEN_DIM,
    task_constraints=task_constraints,
    process_constraints=process_constraints,
    material_constraints=material_constraints,
    device=torch.device(DEVICE),
)
    hidden_dimension=args.char_encoder_hidden_dim,
    datasets_manager=data_manager,
    device=args.device,
)
elmo_embedder = BowElmoEmbedder(
    datasets_manager=data_manager, layer_aggregation="sum", device=args.device
)
embedder = ConcatEmbedders([word_embedder, char_embedder, elmo_embedder])
lstm2seqencoder = Lstm2SeqEncoder(
    embedder=embedder,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    rnn_bias=True,
    device=torch.device(args.device),
    dropout_value=0.1,
)
model = RnnSeqCrfTagger(
    rnn2seqencoder=lstm2seqencoder,
    encoding_dim=2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim,
    device=torch.device(args.device),
    datasets_manager=data_manager,
)
optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
train_metric = TokenClassificationAccuracy(datasets_manager=data_manager)
def setup_engine_once(
    config_dict: Dict[str, str],
    experiment_name: str,
    train_data_filepath: pathlib.Path,
    test_data_filepath: pathlib.Path,
):
    DEBUG = config_dict["DEBUG"]
    DEBUG_DATASET_PROPORTION = config_dict["DEBUG_DATASET_PROPORTION"]
    BATCH_SIZE = config_dict["BATCH_SIZE"]
    LEARNING_RATE = config_dict["LEARNING_RATE"]
    NUM_EPOCHS = config_dict["NUM_EPOCHS"]
    SAVE_EVERY = config_dict["SAVE_EVERY"]
    LOG_TRAIN_METRICS_EVERY = config_dict["LOG_TRAIN_METRICS_EVERY"]
    EMBEDDING_DIMENSION = config_dict["EMBEDDING_DIMENSION"]
    CHAR_EMBEDDING_DIMENSION = config_dict["CHAR_EMBEDDING_DIMENSION"]
    EMBEDDING_TYPE = config_dict["EMBEDDING_TYPE"]
    MAX_NUM_WORDS = config_dict["MAX_NUM_WORDS"]
    MAX_LENGTH = config_dict["MAX_LENGTH"]
    DEVICE = config_dict["DEVICE"]
    HIDDEN_DIM = config_dict["HIDDEN_DIM"]
    BIDIRECTIONAL = config_dict["BIDIRECTIONAL"]
    COMBINE_STRATEGY = config_dict["COMBINE_STRATEGY"]
    MAX_CHAR_LENGTH = config_dict["MAX_CHAR_LENGTH"]
    USE_CHAR_ENCODER = config_dict["USE_CHAR_ENCODER"]
    CHAR_ENCODER_HIDDEN_DIM = config_dict["CHAR_ENCODER_HIDDEN_DIM"]
    DROPOUT = config_dict["DROPOUT"]

    EXP_NAME = experiment_name
    EXP_DIR_PATH = os.path.join(OUTPUT_DIR, EXP_NAME)
    if not os.path.isdir(EXP_DIR_PATH):
        os.mkdir(EXP_DIR_PATH)
    MODEL_SAVE_DIR = os.path.join(EXP_DIR_PATH, "checkpoints")
    if not os.path.isdir(MODEL_SAVE_DIR):
        os.mkdir(MODEL_SAVE_DIR)
    VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "vocab.json")
    CHAR_VOCAB_STORE_LOCATION = os.path.join(EXP_DIR_PATH, "char_vocab.json")
    CAPITALIZATION_VOCAB_STORE_LOCATION = os.path.join(
        EXP_DIR_PATH, "capitalization_vocab.json"
    )
    CAPITALIZATION_EMBEDDING_DIMENSION = 10
    TENSORBOARD_LOGDIR = os.path.join(".", "runs", EXP_NAME)

    train_dataset = ParscitDataset(
        filename=str(train_data_filepath),
        dataset_type="train",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    validation_dataset = ParscitDataset(
        filename=str(test_data_filepath),
        dataset_type="valid",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    test_dataset = ParscitDataset(
        filename=str(test_data_filepath),
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        captialization_vocab_store_location=CAPITALIZATION_VOCAB_STORE_LOCATION,
        capitalization_emb_dim=CAPITALIZATION_EMBEDDING_DIMENSION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    train_dataset.print_stats()
    validation_dataset.print_stats()
    test_dataset.print_stats()

    VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
    NUM_CLASSES = train_dataset.get_num_classes()
    embedding = train_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding, freeze=False)
    char_embedding = train_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding, freeze=False)
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
    )
    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    metric = TokenClassificationAccuracy(
        idx2labelname_mapping=train_dataset.idx2classname
    )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer, mode="max", factor=0.1, patience=2
    )
    engine = Engine(
        model=model,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        save_dir=MODEL_SAVE_DIR,
        num_epochs=NUM_EPOCHS,
        save_every=SAVE_EVERY,
        log_train_metrics_every=LOG_TRAIN_METRICS_EVERY,
        tensorboard_logdir=TENSORBOARD_LOGDIR,
        device=torch.device(DEVICE),
        metric=metric,
        track_for_best="macro_fscore",
        lr_scheduler=scheduler,
    )
    config_dict["VOCAB_STORE_LOCATION"] = VOCAB_STORE_LOCATION
    config_dict["CHAR_VOCAB_STORE_LOCATION"] = CHAR_VOCAB_STORE_LOCATION
    config_dict["MODEL_SAVE_DIR"] = MODEL_SAVE_DIR
    config_dict["VOCAB_SIZE"] = VOCAB_SIZE
    config_dict["NUM_CLASSES"] = NUM_CLASSES
    with open(os.path.join(f"{EXP_DIR_PATH}", "config.json"), "w") as fp:
        json.dump(config_dict, fp)
    return engine
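# A minimal sketch of driving setup_engine_once end to end. The config values and
# file names below are placeholders chosen for illustration (cora_conll.txt is reused
# for the test split only as an example), not the original experiment settings;
# engine.run() follows the Engine usage shown later in this section.
example_config = {
    "DEBUG": False, "DEBUG_DATASET_PROPORTION": 0.01,
    "BATCH_SIZE": 32, "LEARNING_RATE": 1e-3, "NUM_EPOCHS": 10,
    "SAVE_EVERY": 1, "LOG_TRAIN_METRICS_EVERY": 10,
    "EMBEDDING_DIMENSION": 100, "CHAR_EMBEDDING_DIMENSION": 25,
    "EMBEDDING_TYPE": "glove_6B_100", "MAX_NUM_WORDS": 10000, "MAX_LENGTH": 10,
    "DEVICE": "cpu", "HIDDEN_DIM": 256, "BIDIRECTIONAL": True,
    "COMBINE_STRATEGY": "concat", "MAX_CHAR_LENGTH": 25,
    "USE_CHAR_ENCODER": True, "CHAR_ENCODER_HIDDEN_DIM": 100, "DROPOUT": 0.5,
}
engine = setup_engine_once(
    config_dict=example_config,
    experiment_name="parscit_bilstm_crf_demo",
    train_data_filepath=pathlib.Path(DATA_DIR, "cora_conll.txt"),
    test_data_filepath=pathlib.Path(DATA_DIR, "cora_conll.txt"),
)
engine.run()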
def get_science_ie_infer(dirname: str):
    model_folder = pathlib.Path(dirname)
    hyperparam_config_filename = model_folder.joinpath("config.json")
    with open(hyperparam_config_filename, "r") as fp:
        config = json.load(fp)

    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    NUM_LAYERS = config.get("NUM_LAYERS", 1)
    DROPOUT = config.get("DROPOUT", 0.0)
    print(f"NUM_LAYERS: {NUM_LAYERS}")

    test_science_ie_conll_filepath = pathlib.Path(DATA_DIR, "dev_science_ie_conll.txt")
    test_dataset = ScienceIEDataset(
        filename=test_science_ie_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)
    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {idx: classname for classname, idx in classnames2idx.items()}
    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items()
        if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items()
        if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items()
        if idx in range(16, 24)
    }
    task_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames
    )
    process_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames
    )
    material_constraints = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames
    )

    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION)
    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            bidirectional=True,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            combine_strategy="concat",
            device=torch.device(DEVICE),
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        num_layers=NUM_LAYERS,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ScienceIETagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIMENSION
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIMENSION,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=torch.device(DEVICE),
    )
    inference_client = ScienceIEInference(
        model=model, model_filepath=str(model_filepath), dataset=test_dataset
    )
    return inference_client
def setup_science_ie_tagger(request):
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    DEVICE = torch.device("cpu")
    NUM_CLASSES = 8

    EMBEDDING = nn.Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM])
    )

    tokens = np.random.randint(0, VOCAB_SIZE, size=(BATCH_SIZE, NUM_TIME_STEPS))
    task_labels = np.random.randint(0, 8, size=(BATCH_SIZE, NUM_TIME_STEPS))
    process_labels = np.random.randint(8, 16, size=(BATCH_SIZE, NUM_TIME_STEPS))
    material_labels = np.random.randint(16, 24, size=(BATCH_SIZE, NUM_TIME_STEPS))
    task_labels = torch.LongTensor(task_labels)
    process_labels = torch.LongTensor(process_labels)
    material_labels = torch.LongTensor(material_labels)
    labels = torch.cat([task_labels, process_labels, material_labels], dim=1)
    char_tokens = np.random.randint(
        0, CHAR_VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TIME_STEPS, MAX_CHAR_LENGTH)
    )
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    classnames2idx = ScienceIEDataset.get_classname2idx()
    idx2classnames = {idx: classname for classname, idx in classnames2idx.items()}
    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items()
        if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items()
        if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items()
        if idx in range(16, 24)
    }
    task_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=task_idx2classnames
    )
    process_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=process_idx2classnames
    )
    material_constraints: List[Tuple[int, int]] = allowed_transitions(
        constraint_type="BIOUL", labels=material_idx2classnames
    )

    embedder = VanillaEmbedder(embedding=EMBEDDING, embedding_dim=EMBEDDING_DIM)
    if HAVE_CHARACTER_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=CHARACTER_EMBEDDING, embedding_dim=CHARACTER_EMBEDDING_DIM
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM += 2 * CHARACTER_ENCODER_HIDDEN_DIM

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )
    tagger = ScienceIETagger(
        rnn2seqencoder=encoder,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
        task_constraints=task_constraints,
        process_constraints=process_constraints,
        material_constraints=material_constraints,
        device=DEVICE,
    )
    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
# # instantiate the elmo embedder
# elmo_embedder = BowElmoEmbedder(layer_aggregation="sum", device=args.device)
#
# # instantiate the vanilla embedder
# vanilla_embedder = WordEmbedder(embedding_type=args.emb_type, device=args.device)
#
# # concat the embeddings
# embedder = ConcatEmbedders([vanilla_embedder, elmo_embedder])

embedder = WordEmbedder(embedding_type=args.emb_type, device=args.device)
encoder = Lstm2SeqEncoder(
    embedder=embedder,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    device=torch.device(args.device),
)
encoding_dim = (
    2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim
)
decoder = Lstm2SeqDecoder(
    embedder=embedder,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    device=torch.device(args.device),
    max_length=args.pred_max_length,
    vocab=vocab,
    device=args.device)
char_embedder = CharEmbedder(
    char_embedding_dimension=args.char_emb_dim,
    hidden_dimension=args.char_encoder_hidden_dim,
    datasets_manager=data_manager,
    device=args.device,
)
embedder = ConcatEmbedders([embedder, char_embedder])
lstm2seqencoder = Lstm2SeqEncoder(
    embedder=embedder,
    dropout_value=args.dropout,
    hidden_dim=2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    rnn_bias=True,
    device=torch.device(args.device),
    num_layers=args.num_layers,
)
model = RnnSeqCrfTagger(
    rnn2seqencoder=lstm2seqencoder,
    encoding_dim=2 * args.hidden_dim
    if args.bidirectional and args.combine_strategy == "concat"
    else args.hidden_dim,
    device=torch.device(args.device),
    tagging_type="BIOUL",
    datasets_manager=data_manager,
)
optimizer = optim.Adam(params=model.parameters(),
def get_bilstm_crf_infer(dirname: str):
    hyperparam_config_filepath = pathlib.Path(dirname, "config.json")
    with open(hyperparam_config_filepath, "r") as fp:
        config = json.load(fp)

    MAX_NUM_WORDS = config.get("MAX_NUM_WORDS", None)
    MAX_LENGTH = config.get("MAX_LENGTH", None)
    MAX_CHAR_LENGTH = config.get("MAX_CHAR_LENGTH", None)
    VOCAB_STORE_LOCATION = config.get("VOCAB_STORE_LOCATION", None)
    DEBUG = config.get("DEBUG", None)
    DEBUG_DATASET_PROPORTION = config.get("DEBUG_DATASET_PROPORTION", None)
    EMBEDDING_TYPE = config.get("EMBEDDING_TYPE", None)
    EMBEDDING_DIMENSION = config.get("EMBEDDING_DIMENSION", None)
    HIDDEN_DIMENSION = config.get("HIDDEN_DIM", None)
    BIDIRECTIONAL = config.get("BIDIRECTIONAL", None)
    COMBINE_STRATEGY = config.get("COMBINE_STRATEGY", None)
    DEVICE = config.get("DEVICE", "cpu")
    NUM_CLASSES = config.get("NUM_CLASSES", None)
    MODEL_SAVE_DIR = config.get("MODEL_SAVE_DIR", None)
    model_filepath = pathlib.Path(MODEL_SAVE_DIR, "best_model.pt")
    CHAR_VOCAB_STORE_LOCATION = config.get("CHAR_VOCAB_STORE_LOCATION", None)
    CHAR_EMBEDDING_DIMENSION = config.get("CHAR_EMBEDDING_DIMENSION", None)
    USE_CHAR_ENCODER = config.get("USE_CHAR_ENCODER", None)
    CHAR_ENCODER_HIDDEN_DIM = config.get("CHAR_ENCODER_HIDDEN_DIM", None)
    DROPOUT = config.get("DROPOUT", 0.0)

    test_conll_filepath = pathlib.Path(DATA_DIR, "cora_conll.txt")
    test_dataset = ParscitDataset(
        filename=test_conll_filepath,
        dataset_type="test",
        max_num_words=MAX_NUM_WORDS,
        max_instance_length=MAX_LENGTH,
        max_char_length=MAX_CHAR_LENGTH,
        word_vocab_store_location=VOCAB_STORE_LOCATION,
        char_vocab_store_location=CHAR_VOCAB_STORE_LOCATION,
        debug=DEBUG,
        debug_dataset_proportion=DEBUG_DATASET_PROPORTION,
        word_embedding_type=EMBEDDING_TYPE,
        word_embedding_dimension=EMBEDDING_DIMENSION,
        char_embedding_dimension=CHAR_EMBEDDING_DIMENSION,
        word_start_token="<SOS>",
        word_end_token="<EOS>",
        word_pad_token="<PAD>",
        word_unk_token="<UNK>",
        word_add_start_end_token=False,
    )
    embedding = test_dataset.word_vocab.load_embedding()
    embedding = nn.Embedding.from_pretrained(embedding)
    embedder = VanillaEmbedder(embedding=embedding, embedding_dim=EMBEDDING_DIMENSION)
    char_embedding = test_dataset.char_vocab.load_embedding()
    char_embedding = nn.Embedding.from_pretrained(char_embedding)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=char_embedding, embedding_dim=CHAR_EMBEDDING_DIMENSION
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            hidden_dim=CHAR_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIMENSION += 2 * CHAR_ENCODER_HIDDEN_DIM

    lstm2seqencoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIMENSION,
        embedder=embedder,
        dropout_value=DROPOUT,
        hidden_dim=HIDDEN_DIMENSION,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=True,
        device=torch.device(DEVICE),
    )
    model = ParscitTagger(
        rnn2seqencoder=lstm2seqencoder,
        num_classes=NUM_CLASSES,
        hid_dim=2 * HIDDEN_DIMENSION
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIMENSION,
    )
    inference_client = ParscitInference(
        model=model, model_filepath=str(model_filepath), dataset=test_dataset
    )
    return inference_client
def setup_parscit_tagger(request):
    EMBEDDING_DIM = 100
    CHARACTER_EMBEDDING_DIM = 25
    VOCAB_SIZE = 1000
    BATCH_SIZE = 2
    HIDDEN_DIM = 1024
    CHARACTER_ENCODER_HIDDEN_DIM = 100
    NUM_TIME_STEPS = 10
    MAX_CHAR_LENGTH = 25
    CHAR_VOCAB_SIZE = 100
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    HAVE_CHARACTER_ENCODER = request.param[2]
    NUM_CLASSES = 5

    EMBEDDING = nn.Embedding.from_pretrained(torch.zeros([VOCAB_SIZE, EMBEDDING_DIM]))
    CHARACTER_EMBEDDING = nn.Embedding.from_pretrained(
        torch.zeros([CHAR_VOCAB_SIZE, CHARACTER_EMBEDDING_DIM])
    )
    tokens = np.random.randint(0, VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TIME_STEPS))
    labels = np.random.randint(0, NUM_CLASSES - 1, size=(BATCH_SIZE, NUM_TIME_STEPS))
    char_tokens = np.random.randint(
        0, CHAR_VOCAB_SIZE - 1, size=(BATCH_SIZE, NUM_TIME_STEPS, MAX_CHAR_LENGTH)
    )
    tokens = torch.LongTensor(tokens)
    labels = torch.LongTensor(labels)
    char_tokens = torch.LongTensor(char_tokens)

    embedder = VanillaEmbedder(embedding=EMBEDDING, embedding_dim=EMBEDDING_DIM)
    if HAVE_CHARACTER_ENCODER:
        char_embedder = VanillaEmbedder(
            embedding=CHARACTER_EMBEDDING, embedding_dim=CHARACTER_EMBEDDING_DIM
        )
        char_encoder = CharLSTMEncoder(
            char_embedder=char_embedder,
            char_emb_dim=CHARACTER_EMBEDDING_DIM,
            hidden_dim=CHARACTER_ENCODER_HIDDEN_DIM,
            bidirectional=True,
            combine_strategy="concat",
        )
        embedder = ConcatEmbedders([embedder, char_encoder])
        EMBEDDING_DIM = EMBEDDING_DIM + (2 * CHARACTER_ENCODER_HIDDEN_DIM)

    encoder = Lstm2SeqEncoder(
        emb_dim=EMBEDDING_DIM,
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
    )
    tagger = ParscitTagger(
        rnn2seqencoder=encoder,
        hid_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        num_classes=NUM_CLASSES,
    )
    return (
        tagger,
        {
            "EMBEDDING_DIM": EMBEDDING_DIM,
            "VOCAB_SIZE": VOCAB_SIZE,
            "BATCH_SIZE": BATCH_SIZE,
            "HIDDEN_DIM": 2 * HIDDEN_DIM
            if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
            else HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "tokens": tokens,
            "labels": labels,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL
            else HIDDEN_DIM,
            "TIME_STEPS": NUM_TIME_STEPS,
            "NUM_CLASSES": NUM_CLASSES,
            "HAVE_CHAR_ENCODER": HAVE_CHARACTER_ENCODER,
            "char_tokens": char_tokens,
        },
    )
def setup_parscit_inference(seq_dataset_manager, tmpdir_factory):
    HIDDEN_DIM = 100
    BIDIRECTIONAL = True
    COMBINE_STRATEGY = "concat"
    dataset_manager = seq_dataset_manager

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    char_embedder = CharEmbedder(
        char_embedding_dimension=10,
        hidden_dimension=20,
        datasets_manager=dataset_manager,
    )
    embedder = ConcatEmbedders([embedder, char_embedder])
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        add_projection_layer=False,
    )
    tagger = RnnSeqCrfTagger(
        rnn2seqencoder=encoder,
        encoding_dim=2 * HIDDEN_DIM
        if BIDIRECTIONAL and COMBINE_STRATEGY == "concat"
        else HIDDEN_DIM,
        datasets_manager=dataset_manager,
    )
    train_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    dev_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    test_metric = TokenClassificationAccuracy(datasets_manager=dataset_manager)
    optimizer = torch.optim.Adam(params=tagger.parameters())

    batch_size = 1
    save_dir = tmpdir_factory.mktemp("experiment_1")
    num_epochs = 1
    save_every = 1
    log_train_metrics_every = 10

    engine = Engine(
        model=tagger,
        datasets_manager=dataset_manager,
        optimizer=optimizer,
        batch_size=batch_size,
        save_dir=save_dir,
        num_epochs=num_epochs,
        save_every=save_every,
        log_train_metrics_every=log_train_metrics_every,
        train_metric=train_metric,
        validation_metric=dev_metric,
        test_metric=test_metric,
        track_for_best="macro_fscore",
    )
    engine.run()

    model_filepath = pathlib.Path(save_dir).joinpath("best_model.pt")
    inference_client = SequenceLabellingInference(
        model=tagger, model_filepath=model_filepath, datasets_manager=dataset_manager
    )
    return inference_client