예제 #1
0
# DVC stage: run src/data_preprocessing/pretrained_embeddings.py
from pathlib import Path

from pipeline.init import experiment_config
from src.utils.dvc import dvc_run
from src.utils.helpers import filter_config
from src.data_preprocessing.pretrained_embeddings import create_embeddings

# Take the "data" section out of the experiment configuration (``pop`` removes
# it from ``experiment_config``).
data_params = experiment_config.pop("data")

# Arguments forwarded to the script; filter_config presumably keeps only the
# entries that create_embeddings accepts -- confirm in src.utils.helpers.
script_arguments = filter_config(data_params.as_dict(), create_embeddings)

# The .dvc file is named after this pipeline script.
stage_file_name = f"{Path(__file__).stem}.dvc"

vocab_file = data_params.get("vocab_result_file")
embeddings_file = data_params.get("embeddings_result_file")

# Register/run the embeddings script with the vocabulary file as its input
# dependency and the embeddings file as its output.
dvc_run(
    "src/data_preprocessing/pretrained_embeddings.py",
    script_arguments=script_arguments,
    dvc_file_name=stage_file_name,
    input_deps=[vocab_file],
    output_deps=embeddings_file,
)
예제 #2
0
# DVC stage: run src/data_preprocessing/vocab_creation.py
from pathlib import Path

from pipeline.init import experiment_config
from src.utils.dvc import dvc_run
from src.utils.helpers import filter_config
from src.data_preprocessing.vocab_creation import create_vocab

# Take the "data" section out of the experiment configuration (``pop`` removes
# it from ``experiment_config``).
data_params = experiment_config.pop("data")

# Arguments forwarded to the script; filter_config presumably keeps only the
# entries that create_vocab accepts -- confirm in src.utils.helpers.
script_arguments = filter_config(data_params.as_dict(), create_vocab)

# The .dvc file is named after this pipeline script.
stage_file_name = f"{Path(__file__).stem}.dvc"

# Both QA result files are inputs of the vocabulary stage.
input_keys = ("train_qa_result_file", "val_qa_result_file")
qa_files = [data_params.get(key) for key in input_keys]

vocab_file = data_params.get("vocab_result_file")

dvc_run(
    "src/data_preprocessing/vocab_creation.py",
    script_arguments=script_arguments,
    dvc_file_name=stage_file_name,
    input_deps=qa_files,
    output_deps=vocab_file,
)
예제 #3
0
from itertools import chain
from pathlib import Path

import pandas as pd
from allennlp.common import Params

from src.utils.helpers import filter_config, create_parent_dir_if_not_exists


def create_vocab(train_qa_result_file, val_qa_result_file, vocab_result_file):
    """Build the token vocabulary and write it to a text file, one token per line.

    The file starts with the special tokens ``[UNK]`` and ``[EOS]``, followed
    by the sorted union of:

    * the whitespace-separated tokens of the ``preprocessed_question`` column
      of both the training and the validation frame, and
    * the whitespace-separated tokens of the ``answer`` column of the training
      frame only (validation answers are not part of the vocabulary).

    Args:
        train_qa_result_file: Path of the pickled training QA frame. It must
            provide ``preprocessed_question`` and ``answer`` columns holding
            strings.
        val_qa_result_file: Path of the pickled validation QA frame. It must
            provide a ``preprocessed_question`` column holding strings.
        vocab_result_file: Destination path of the vocabulary file.

    Returns:
        None. The result is the file written to ``vocab_result_file``.
    """
    special_tokens = ["[UNK]", "[EOS]"]

    # NOTE: unpickling can execute arbitrary code; these files are expected to
    # be artefacts produced by this pipeline, not untrusted input.
    qa_train = pd.read_pickle(train_qa_result_file)
    qa_val = pd.read_pickle(val_qa_result_file)

    combined = pd.concat([qa_train, qa_val])
    question_tokens = set(
        " ".join(combined.preprocessed_question.values).split())
    train_ans_tokens = set(" ".join(qa_train.answer).split())

    # Drop the special tokens from the data-derived set so that they cannot
    # appear twice in the file if they also occur in the text.
    data_tokens = (question_tokens | train_ans_tokens) - set(special_tokens)
    vocab = special_tokens + sorted(data_tokens)

    vocab_result_file = Path(vocab_result_file)
    create_parent_dir_if_not_exists(vocab_result_file)
    # Tokens come from natural-language text, so write with an explicit
    # encoding instead of the platform-dependent default.
    with vocab_result_file.open("w", encoding="utf-8") as f:
        f.write("\n".join(vocab) + "\n")


if __name__ == "__main__":
    # Script entry point: read the "data" section of config.jsonnet and call
    # create_vocab with the filtered entries as keyword arguments.
    # filter_config presumably keeps only the keys that create_vocab accepts
    # -- confirm in src.utils.helpers.
    config = Params.from_file("config.jsonnet")
    vocab_kwargs = filter_config(config.pop("data").as_dict(), create_vocab)
    create_vocab(**vocab_kwargs)
예제 #4
0
    logging.info(
        "Run was started in debugging mode: no info will be stored in mlflow or tensorboard"
    )
else:
    logging.info(
        "Run was started in normal mode: info will be stored in mlflow and tensorboard"
    )

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logging.info(f"Using device: {device}")

# Load the experiment configuration and split off the sections used below.
experiment_config = init_config()
data_config = experiment_config.pop("data")
training_config = experiment_config.pop("training")

# Training dataset; filter_config presumably keeps only the entries accepted
# by the given constructor -- confirm in the helper's definition.
train_dataset_kwargs = filter_config(data_config,
                                     VisualQATrainDataset.__init__)
train_dataset = VisualQATrainDataset(**train_dataset_kwargs)
vocab = train_dataset.vocab

# The validation dataset reuses the vocabulary and the answer vocabulary of
# the training dataset.
val_dataset_kwargs = filter_config(data_config, VisualQAValDataset.__init__)
val_dataset = VisualQAValDataset(
    **val_dataset_kwargs,
    vocab=vocab,
    answer_vocabulary=train_dataset.answer_vocabulary,
)

# Shuffled training loader; batches are collated by my_collate with the
# vocabulary bound as a keyword argument.
train_batch_size = training_config.pop("train_batch_size")
train_collate_fn = partial(my_collate, vocab=vocab)
train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=train_collate_fn,
    num_workers=6,
)
val_loader = DataLoader(val_dataset,
                        batch_size=training_config.pop("val_batch_size"),
                        shuffle=False,
예제 #5
0
# DVC stage: run src/data_preprocessing/load_images.py
from pathlib import Path

from pipeline.init import experiment_config
from src.utils.dvc import dvc_run
from src.utils.helpers import filter_config
from src.data_preprocessing.load_images import preprocess_images

# Take the "data" section out of the experiment configuration (``pop`` removes
# it from ``experiment_config``).
data_params = experiment_config.pop("data")

# Arguments forwarded to the script; filter_config presumably keeps only the
# entries that preprocess_images accepts -- confirm in src.utils.helpers.
script_arguments = filter_config(data_params.as_dict(), preprocess_images)

# The .dvc file is named after this pipeline script.
stage_file_name = f"{Path(__file__).stem}.dvc"

# This stage declares four outputs and no input dependencies.
output_keys = (
    "train_images_result_file",
    "val_images_result_file",
    "train_filenames_result_file",
    "val_filenames_result_file",
)
result_files = [data_params.get(key) for key in output_keys]

dvc_run(
    "src/data_preprocessing/load_images.py",
    script_arguments=script_arguments,
    dvc_file_name=stage_file_name,
    output_deps=result_files,
)