Пример #1
0
def load_artifacts(run_id: str, device: torch.device = torch.device("cpu")) -> Dict:
    """Collect the inference artifacts stored for an MLflow run.

    Args:
        run_id (str): ID of the model run whose artifacts are loaded.
        device (torch.device): Device the saved model state is mapped to
            when it is read from disk. Defaults to CPU.

    Returns:
        Dict with the keys "params", "label_encoder", "tokenizer",
        "model" and "performance".
    """
    # Keep only what follows a "file://" prefix so the URI can be used as a path
    run_info = mlflow.get_run(run_id=run_id).info
    artifact_dir = Path(run_info.artifact_uri.split("file://")[-1])

    # Deserialize each stored artifact
    raw_params = utils.load_dict(filepath=artifact_dir / "params.json")
    params = Namespace(**raw_params)
    label_encoder = data.MultiLabelLabelEncoder.load(fp=artifact_dir / "label_encoder.json")
    tokenizer = data.Tokenizer.load(fp=artifact_dir / "tokenizer.json")
    model_state = torch.load(artifact_dir / "model.pt", map_location=device)
    performance = utils.load_dict(filepath=artifact_dir / "performance.json")

    # Rebuild the architecture from the stored params, then restore the weights
    vocab_size = len(tokenizer)
    num_classes = len(label_encoder)
    model = models.initialize_model(params=params, vocab_size=vocab_size, num_classes=num_classes)
    model.load_state_dict(model_state)

    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
    return artifacts
Пример #2
0
def load_artifacts(
    model_dir: Path = config.MODEL_DIR,
    device: torch.device = torch.device("cpu")
) -> Dict:
    """Collect the inference artifacts stored in a model directory.

    Args:
        model_dir (Path): location of model artifacts. Defaults to config.MODEL_DIR.
        device (torch.device): Device the saved model state is mapped to
            when it is read from disk. Defaults to CPU.

    Returns:
        Dict with the keys "params", "label_encoder", "tokenizer",
        "model" and "performance".
    """
    artifact_dir = Path(model_dir)

    # Deserialize each stored artifact
    raw_params = utils.load_dict(filepath=artifact_dir / "params.json")
    params = Namespace(**raw_params)
    label_encoder = data.MultiLabelLabelEncoder.load(fp=artifact_dir / "label_encoder.json")
    tokenizer = data.Tokenizer.load(fp=artifact_dir / "tokenizer.json")
    model_state = torch.load(artifact_dir / "model.pt", map_location=device)
    performance = utils.load_dict(filepath=artifact_dir / "performance.json")

    # Rebuild the architecture from the stored params, then restore the weights
    vocab_size = len(tokenizer)
    num_classes = len(label_encoder)
    model = models.initialize_model(params=params, vocab_size=vocab_size, num_classes=num_classes)
    model.load_state_dict(model_state)

    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
    return artifacts
Пример #3
0
def load_artifacts(
        run_id: str,
        device: torch.device = torch.device("cpu"),
) -> Dict:
    """Load artifacts for a particular `run_id`.

    The run's artifacts are downloaded into a temporary directory,
    deserialized, and the model is rebuilt from the parameters logged
    on the run.

    Args:
        run_id (str): ID of the run to load model artifacts from.
        device (torch.device): Device the saved model state is mapped to
            when it is loaded. Defaults to CPU.

    Returns:
        Dict with the keys "args", "label_encoder", "tokenizer", "model"
        and "performance".
    """
    # Load artifacts; everything must be read before the temporary
    # directory is cleaned up at the end of the `with` block.
    client = mlflow.tracking.MlflowClient()
    with tempfile.TemporaryDirectory() as tmp_dir:
        client.download_artifacts(run_id=run_id, path="", dst_path=tmp_dir)
        label_encoder = data.LabelEncoder.load(
            fp=Path(tmp_dir, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(tmp_dir, "tokenizer.json"))
        # `device` is used exactly as passed by the caller.
        model_state = torch.load(Path(tmp_dir, "model.pt"),
                                 map_location=device)
        performance = utils.load_dict(
            filepath=Path(tmp_dir, "performance.json"))

    # Initialize model from the parameters logged on the run
    run = mlflow.get_run(run_id=run_id)
    args = Namespace(**run.data.params)
    model = models.initialize_model(args=args,
                                    vocab_size=len(tokenizer),
                                    num_classes=len(label_encoder))
    model.load_state_dict(model_state)

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }
Пример #4
0
    def setup_method(self):
        """Called before every method; builds a fresh CNN fixture."""
        # Hyperparameters: stored on the instance and forwarded to the model
        hyperparams = {
            "max_filter_size": 4,
            "embedding_dim": 128,
            "num_filters": 100,
            "hidden_dim": 128,
            "dropout_p": 0.5,
        }
        for name, value in hyperparams.items():
            setattr(self, name, value)
        args = Namespace(**hyperparams)

        # Model dimensions
        self.vocab_size, self.num_classes = 1000, 10

        # Seed is set immediately before the model is initialized
        utils.set_seed()
        self.cnn = models.initialize_model(
            args=args,
            vocab_size=self.vocab_size,
            num_classes=self.num_classes,
        )
Пример #5
0
def run(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Run the end-to-end training operations.

    Steps, in order: set seed, set device, load data, clean data,
    preprocess data, encode labels, split data, tokenize inputs, create
    dataloaders, initialize model, train model, evaluate model.

    Args:
        args (Namespace): Input arguments for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later: a dict with the keys "args",
        "label_encoder", "tokenizer", "model", "loss" and "performance".
    """
    # Seed and device
    utils.set_seed(seed=args.seed)
    device = utils.set_device(cuda=args.cuda)

    # Load, then clean the data
    df, _projects_dict, tags_dict = data.load(shuffle=args.shuffle, num_samples=args.num_samples)
    df, tags_dict, _tags_above_frequency = data.clean(
        df=df,
        tags_dict=tags_dict,
        min_tag_freq=args.min_tag_freq,
    )

    # Preprocess the text column
    df.text = df.text.apply(data.preprocess, lower=args.lower, stem=args.stem)

    # Encode labels
    y, class_weights, label_encoder = data.encode_labels(labels=df.tags)

    # Split data
    utils.set_seed(seed=args.seed)  # needed for skmultilearn
    features = df.text.to_numpy()
    X_train, X_val, X_test, y_train, y_val, y_test = data.split(
        X=features,
        y=y,
        train_size=args.train_size,
    )

    # Tokenize inputs; the tokenizer returned for the train split is reused for val/test
    X_train, tokenizer = data.tokenize_text(X=X_train, char_level=args.char_level)
    X_val, _ = data.tokenize_text(X=X_val, char_level=args.char_level, tokenizer=tokenizer)
    X_test, _ = data.tokenize_text(X=X_test, char_level=args.char_level, tokenizer=tokenizer)

    # Create dataloaders (train, val, test — built in that order)
    splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    train_dataloader, val_dataloader, test_dataloader = [
        data.get_dataloader(
            data=[X_split, y_split],
            max_filter_size=args.max_filter_size,
            batch_size=args.batch_size,
        )
        for X_split, y_split in splits
    ]

    # Initialize model
    vocab_size = len(tokenizer)
    num_classes = len(label_encoder)
    model = models.initialize_model(
        args=args,
        vocab_size=vocab_size,
        num_classes=num_classes,
        device=device,
    )

    # Train model
    logger.info(f"Arguments: {json.dumps(args.__dict__, indent=2, cls=NumpyEncoder)}")
    args, model, loss = train(
        args=args,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # Evaluate model on CPU, regardless of the training device
    device = torch.device("cpu")
    performance = evaluate(
        dataloader=test_dataloader,
        model=model.to(device),
        device=device,
        threshold=args.threshold,
        classes=label_encoder.classes,
    )

    artifacts = {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
        "performance": performance,
    }
    return artifacts
Пример #6
0
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Run the end-to-end training operations.

    Sets `params.num_samples` to the number of rows left after data
    preparation; `params` is also replaced by whatever `train.train`
    returns.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later: a dict with the keys "params",
        "label_encoder", "tokenizer", "model", "loss" and "performance".
    """
    # 1-2. Seed and device
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    data_dir = Path(config.DATA_DIR)
    projects = utils.load_dict(filepath=data_dir / "projects.json")
    tags = utils.load_dict(filepath=data_dir / "tags.json")
    tags_dict = utils.list_to_dict(tags, key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, _tags_above_freq, _tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights: inverse of how often each class index occurs across all samples
    tag_indices = [
        label_encoder.class_to_index[tag]
        for tag in list(itertools.chain.from_iterable(labels.values))
    ]
    counts = np.bincount(tag_indices)
    class_weights = {index: 1.0 / count for index, count in enumerate(counts)}

    # 7. Split data: train vs. remainder, then the remainder halved into val/test
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_rest, y_train, y_rest = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_rest, y=y_rest, train_size=0.5)
    # Raw test text and decoded tags are what the evaluation step consumes
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # 8. Tokenize inputs (tokenizer is fit on the train split only)
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)

    def _to_sequences(texts):
        return np.array(tokenizer.texts_to_sequences(texts), dtype=object)

    X_train = _to_sequences(X_train)
    X_val = _to_sequences(X_val)
    X_test = _to_sequences(X_test)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(
        X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(
        X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = dict(
        params=params,
        label_encoder=label_encoder,
        tokenizer=tokenizer,
        model=model,
        loss=loss,
    )
    # NOTE(review): `device` is rebound here but not read again in this function.
    device = torch.device("cpu")
    _y_true, _y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
Пример #7
0
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Loads the features, prepares and preprocesses them, encodes labels,
    splits the data, tokenizes the text, trains a model with `Trainer`,
    picks a decision threshold and evaluates on the held-out test split.

    Side effects on `params`: `params.num_samples` is set to the number of
    rows left after data preparation and `params.threshold` is set to the
    threshold found after training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later: a dict with the keys "params",
        "label_encoder", "tokenizer", "model", "loss" and "performance".
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights: inverse of how often each class index occurs across all samples
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data: train vs. remainder, then the remainder halved into val/test
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    # Raw test text and decoded tags are what the evaluation step consumes
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # Tokenize inputs (tokenizer is fit on the train split only)
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    # NOTE(review): the tokenized X_test is not read below (evaluation uses
    # the raw-text test_df) — confirm whether this line can be dropped.
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    # The loss weights are placed on the same device the model is
    # initialized with, so the loss does not mix a CPU weight tensor with
    # tensors on another device.
    class_weights_tensor = torch.Tensor(
        np.array(list(class_weights.values()))).to(device)
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=0.05,
                                                           patience=5)

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(params.num_epochs,
                                              params.patience,
                                              train_dataloader, val_dataloader)

    # Find best threshold (computed from predictions on the training dataloader)
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts