def setup_bert_embedder_large(request):
    """Create a 1024-dimensional ``BertEmbedder`` together with a raw-text batch.

    Args:
        request: Object exposing ``param``, an indexable whose item ``0`` is
            passed as ``bert_type`` and whose item ``1`` is passed as
            ``aggregation_type`` (presumably a pytest parametrized fixture
            request -- confirm against the decorator in the full file).

    Returns:
        A ``(embedder, iter_dict)`` pair where ``iter_dict`` maps
        ``"raw_instance"`` to the two sample sentences.
    """
    # Read the parameters in the same order the constructor call consumed them.
    aggregation = request.param[1]
    bert_variant = request.param[0]

    embedder = BertEmbedder(
        emb_dim=1024,
        dropout_value=0.0,
        aggregation_type=aggregation,
        bert_type=bert_variant,
    )

    sample_sentences = [
        "Lets start by talking politics",
        "there are radical ways to test your code",
    ]
    return embedder, {"raw_instance": sample_sentences}
def get_bow_bert_emb_lc_gensect_infer(dirname: str):
    """Assemble a ``ClassificationInference`` client from a saved experiment folder.

    Loads ``config.json`` and ``test_dataset_params.json`` from ``dirname``,
    rebuilds the BERT bag-of-words classifier described by the config and
    pairs it with a ``GenericSectDataset`` created from the stored test
    dataset arguments.

    Args:
        dirname: Directory that contains ``config.json`` and
            ``test_dataset_params.json``.

    Returns:
        The ``ClassificationInference`` instance pointing at ``best_model.pt``
        inside the config's ``MODEL_SAVE_DIR``.
    """
    experiment_dir = pathlib.Path(dirname)

    with open(experiment_dir.joinpath("config.json"), "r") as config_file:
        hyperparams = json.load(config_file)

    with open(experiment_dir.joinpath("test_dataset_params.json"), "r") as params_file:
        dataset_kwargs = json.load(params_file)

    # Pull every required setting up front so a missing key fails early.
    embedding_dim = hyperparams["EMBEDDING_DIMENSION"]
    class_count = hyperparams["NUM_CLASSES"]
    bert_variant = hyperparams["BERT_TYPE"]
    device_name = hyperparams["DEVICE"]
    checkpoint_path = os.path.join(hyperparams["MODEL_SAVE_DIR"], "best_model.pt")

    bow_encoder = BOW_Encoder(
        embedder=BertEmbedder(
            emb_dim=embedding_dim,
            dropout_value=0.0,
            aggregation_type="average",
            bert_type=bert_variant,
            device=torch.device(device_name),
        ),
        emb_dim=embedding_dim,
        aggregation_type="average",
    )
    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=embedding_dim,
        num_classes=class_count,
        classification_layer_bias=True,
    )

    return ClassificationInference(
        model=classifier,
        model_filepath=checkpoint_path,
        dataset=GenericSectDataset(**dataset_kwargs),
    )
def setup_bert_embedder(request):
    """Create a ``BertEmbedder`` and two sample ``Line`` objects.

    Args:
        request: Object whose ``param`` unpacks into exactly
            ``(bert_type, aggregation_type)`` (presumably a pytest
            parametrized fixture request -- confirm in the full file).

    Returns:
        A ``(bert_embedder, lines)`` pair; ``lines`` holds one ``Line`` per
        sample sentence, in order.
    """
    bert_variant, aggregation = request.param

    embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type=aggregation,
        bert_type=bert_variant,
    )

    sample_sentences = (
        "Lets start by talking politics",
        "there are radical ways to test your code",
    )
    sample_lines = [Line(text=sentence) for sentence in sample_sentences]
    return embedder, sample_lines
def build_sectlabel_bow_bert(dirname: str):
    """Build the inference client for the SectLabel BERT bag-of-words model.

    The train/dev/test files are taken from ``DATA_DIR`` and the weights from
    ``<dirname>/checkpoints/best_model.pt``. The model is configured for
    ``bert-base-uncased`` on CPU with a 768-wide encoding and 23 classes.

    Args:
        dirname: Experiment directory that contains the ``checkpoints`` folder.

    Returns:
        A ``ClassificationInference`` instance bound to the model and the
        dataset manager.
    """
    experiment_dir = pathlib.Path(dirname)
    data_root = pathlib.Path(DATA_DIR)

    datasets = TextClassificationDatasetManager(
        train_filename=str(data_root / "sectLabel.train"),
        dev_filename=str(data_root / "sectLabel.dev"),
        test_filename=str(data_root / "sectLabel.test"),
    )

    bert_embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type="bert-base-uncased",
        device=torch.device("cpu"),
    )
    classifier = SimpleClassifier(
        encoder=BOW_Encoder(embedder=bert_embedder, aggregation_type="average"),
        encoding_dim=768,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=datasets,
    )

    checkpoint_path = experiment_dir / "checkpoints" / "best_model.pt"
    return ClassificationInference(
        model=classifier,
        model_filepath=str(checkpoint_path),
        datasets_manager=datasets,
    )
# Script section: parse the command line, locate the SectLabel splits under
# DATA_DIR and build the BERT bag-of-words classifier.
args = parser.parse_args()

# The three split files live side by side in DATA_DIR.
DATA_PATH = pathlib.Path(DATA_DIR)
train_file = DATA_PATH.joinpath("sectLabel.train")
dev_file = DATA_PATH.joinpath("sectLabel.dev")
test_file = DATA_PATH.joinpath("sectLabel.test")

# The dataset manager is given plain string paths, hence the str() conversion.
data_manager = TextClassificationDatasetManager(
    train_filename=str(train_file),
    dev_filename=str(dev_file),
    test_filename=str(test_file),
)

# BERT variant and device come from the command line; dropout is disabled.
embedder = BertEmbedder(
    dropout_value=0.0,
    aggregation_type="average",
    bert_type=args.bert_type,
    device=torch.device(args.device),
)

# The embedder receives a torch.device, the encoder and model the raw CLI value.
encoder = BOW_Encoder(embedder=embedder, aggregation_type="average", device=args.device)

# NOTE(review): encoding_dim is fixed at 768 while bert_type is user-selectable;
# presumably this must equal the embedder's output width -- confirm for
# non-base BERT variants.
# NOTE(review): num_classes=23 is hard-coded; presumably the SectLabel label
# count -- confirm against the dataset.
model = SimpleClassifier(
    encoder=encoder,
    encoding_dim=768,
    num_classes=23,
    classification_layer_bias=True,
    datasets_manager=data_manager,
    device=args.device,
)
# Script section: persist the experiment metadata, then build the model and
# its optimizer.

# Save the test dataset arguments as JSON inside the experiment directory.
with open(os.path.join(EXP_DIR_PATH, "test_dataset_params.json"), "w") as fp:
    json.dump(test_dataset_params, fp)

# Vocabulary size and class count are taken from the training dataset.
VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
NUM_CLASSES = train_dataset.get_num_classes()

# Record both values in the config before it is written, so the saved
# config.json contains them.
config["VOCAB_SIZE"] = VOCAB_SIZE
config["NUM_CLASSES"] = NUM_CLASSES
with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp:
    json.dump(config, fp)

# BERT embedder with dropout disabled and "average" aggregation.
embedder = BertEmbedder(
    emb_dim=EMBEDDING_DIMENSION,
    dropout_value=0.0,
    aggregation_type="average",
    bert_type=BERT_TYPE,
    device=torch.device(DEVICE),
)

# Encoder and classifier are given the same EMBEDDING_DIMENSION as the embedder.
encoder = BOW_Encoder(emb_dim=EMBEDDING_DIMENSION, embedder=embedder, aggregation_type="average")
model = SimpleClassifier(
    encoder=encoder,
    encoding_dim=EMBEDDING_DIMENSION,
    num_classes=NUM_CLASSES,
    classification_layer_bias=True,
)

# Adam is handed whatever model.parameters() yields, with the configured
# learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# NOTE(review): everything down to the first closing parenthesis is the tail of
# a call whose opening lies outside this view (its result is presumably bound
# to `data_manager`, which is used below -- confirm in the full file).
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=test_filename,
    column_names=["POS", "DEP", "NER"],
    train_only="ner",
    # The "tokens" namespace gets a lowercase step in its preprocessing pipeline.
    namespace_vocab_options={
        "tokens": {"preprocessing_pipeline": [instance_preprocessing.lowercase]}
    },
)

# Word-level embedder; embedding type and device come from the CLI arguments.
embedder = WordEmbedder(
    embedding_type=args.emb_type, datasets_manager=data_manager, device=args.device
)

# BERT embedder built over the same dataset manager and device.
bert_embedder = BertEmbedder(
    datasets_manager=data_manager, device=args.device, bert_type=args.bert_type
)

# `embedder` is rebound here: from this point on it is the combination of the
# word embedder and the BERT embedder.
embedder = ConcatEmbedders([embedder, bert_embedder])

# Sequence encoder over the combined embedder; all hyperparameters except
# rnn_bias are taken from the CLI arguments.
lstm2seqencoder = Lstm2SeqEncoder(
    embedder=embedder,
    dropout_value=args.dropout,
    hidden_dim=args.hidden_dim,
    bidirectional=args.bidirectional,
    combine_strategy=args.combine_strategy,
    rnn_bias=True,
    device=args.device,
    num_layers=args.num_layers,
    add_projection_layer=args.add_projection_layer,
)