# Pipeline step: build pretrained embeddings for the QA vocabulary under DVC.
from pathlib import Path
from pipeline.init import experiment_config
from src.utils.dvc import dvc_run
from src.utils.helpers import filter_config
from src.data_preprocessing.pretrained_embeddings import create_embeddings

# Detach the data section from the shared experiment configuration.
data_params = experiment_config.pop("data")

# Forward only the config entries that create_embeddings actually accepts.
embedding_args = filter_config(data_params.as_dict(), create_embeddings)

# The .dvc stage file is named after this script.
stage_file = f"{Path(__file__).stem}.dvc"

dvc_run(
    "src/data_preprocessing/pretrained_embeddings.py",
    script_arguments=embedding_args,
    dvc_file_name=stage_file,
    input_deps=[data_params.get("vocab_result_file")],
    output_deps=data_params.get("embeddings_result_file"),
)
# Pipeline step: create the QA token vocabulary under DVC.
from pathlib import Path
from pipeline.init import experiment_config
from src.utils.dvc import dvc_run
from src.utils.helpers import filter_config
from src.data_preprocessing.vocab_creation import create_vocab

# Detach the data section from the shared experiment configuration.
data_params = experiment_config.pop("data")

# Forward only the config entries that create_vocab actually accepts.
vocab_args = filter_config(data_params.as_dict(), create_vocab)

# Vocabulary creation depends on both preprocessed QA splits.
qa_inputs = [
    data_params.get("train_qa_result_file"),
    data_params.get("val_qa_result_file"),
]

dvc_run(
    "src/data_preprocessing/vocab_creation.py",
    script_arguments=vocab_args,
    dvc_file_name=f"{Path(__file__).stem}.dvc",
    input_deps=qa_inputs,
    output_deps=data_params.get("vocab_result_file"),
)
from itertools import chain
from pathlib import Path

import pandas as pd
from allennlp.common import Params

from src.utils.helpers import filter_config, create_parent_dir_if_not_exists


def create_vocab(train_qa_result_file, val_qa_result_file, vocab_result_file):
    """Build a newline-separated token vocabulary from preprocessed QA frames.

    Reads the pickled train and validation QA DataFrames, collects all
    whitespace-separated tokens from ``preprocessed_question`` of both splits
    and from ``answer`` of the train split only, prepends the special tokens
    ``[UNK]`` and ``[EOS]``, and writes the sorted result to
    ``vocab_result_file`` (parent directories are created if needed).

    :param train_qa_result_file: path to the pickled train QA DataFrame
    :param val_qa_result_file: path to the pickled validation QA DataFrame
    :param vocab_result_file: path of the vocabulary file to write
    """
    train_df = pd.read_pickle(train_qa_result_file)
    val_df = pd.read_pickle(val_qa_result_file)

    # Question tokens come from both splits combined.
    questions = pd.concat([train_df, val_df]).preprocessed_question.values
    question_tokens = set(" ".join(questions).split())

    # Answer tokens come from the train split only.
    # NOTE(review): validation answers are deliberately excluded here; the
    # original code carried a disabled variant using
    # chain(*qa_val.answer.values), suggesting val answers are token lists —
    # confirm before re-enabling.
    answer_tokens = set((" ".join(train_df.answer)).split())

    # Special tokens first, then the sorted union of the corpus tokens.
    vocab = ["[UNK]", "[EOS]"] + sorted(question_tokens.union(answer_tokens))

    target = Path(vocab_result_file)
    create_parent_dir_if_not_exists(target)
    with target.open("w") as out:
        # One token per line, with a trailing newline (as print() would emit).
        out.write("\n".join(vocab) + "\n")


if __name__ == "__main__":
    # Standalone entry point: pull the data section from the experiment
    # config and call create_vocab with only the parameters it accepts.
    raw_params = Params.from_file("config.jsonnet").pop("data").as_dict()
    vocab_params = filter_config(raw_params, create_vocab)
    create_vocab(**vocab_params)
    # NOTE(review): incomplete fragment — it opens inside an `if` whose
    # condition is outside this view (only the `else:` branch header is
    # visible) and it is truncated mid-call at the end. Code tokens are kept
    # byte-identical; only formatting and comments are added.
    logging.info(
        "Run was started in debugging mode: no info will be stored in mlflow or tensorboard"
    )
else:
    logging.info(
        "Run was started in normal mode: info will be stored in mlflow and tensorboard"
    )

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")

# Load the experiment configuration and split off the sections this
# script consumes; `pop` removes them from the shared config object.
experiment_config = init_config()
data_config = experiment_config.pop("data")
training_config = experiment_config.pop("training")

# Build the train dataset, forwarding only the config keys its
# constructor accepts.
train_dataset = VisualQATrainDataset(
    **filter_config(data_config, VisualQATrainDataset.__init__))
# The validation dataset reuses the vocabulary (and answer vocabulary)
# derived from the training data so token/answer ids stay consistent.
vocab = train_dataset.vocab
val_dataset = VisualQAValDataset(
    **filter_config(data_config, VisualQAValDataset.__init__),
    vocab=vocab,
    answer_vocabulary=train_dataset.answer_vocabulary)

# Training loader: shuffled; batches are assembled by my_collate, which
# is given the vocabulary via partial application.
train_loader = DataLoader(train_dataset,
                          batch_size=training_config.pop("train_batch_size"),
                          shuffle=True,
                          collate_fn=partial(my_collate, vocab=vocab),
                          num_workers=6)
# Validation loader: deterministic order (no shuffling).
# NOTE(review): the call is truncated here in this chunk — remaining
# arguments continue past the visible source.
val_loader = DataLoader(val_dataset,
                        batch_size=training_config.pop("val_batch_size"),
                        shuffle=False,
# Pipeline step: load and preprocess the VQA images under DVC.
from pathlib import Path
from pipeline.init import experiment_config
from src.utils.dvc import dvc_run
from src.utils.helpers import filter_config
from src.data_preprocessing.load_images import preprocess_images

# Detach the data section from the shared experiment configuration.
data_params = experiment_config.pop("data")

# Forward only the config entries that preprocess_images actually accepts.
image_args = filter_config(data_params.as_dict(), preprocess_images)

# This stage produces four artifacts: image tensors and filename lists
# for each of the train and validation splits. It has no tracked inputs.
image_outputs = [
    data_params.get("train_images_result_file"),
    data_params.get("val_images_result_file"),
    data_params.get("train_filenames_result_file"),
    data_params.get("val_filenames_result_file"),
]

dvc_run(
    "src/data_preprocessing/load_images.py",
    script_arguments=image_args,
    dvc_file_name=f"{Path(__file__).stem}.dvc",
    output_deps=image_outputs,
)