Example #1
def compute_features(params: Namespace) -> Tuple[pd.DataFrame, List[Dict]]:
    """Compute features to use for training.

    Args:
        params (Namespace): Input parameters for operations.

    Returns:
        The feature DataFrame and the list of feature records.
    """
    # Set up
    utils.set_seed(seed=params.seed)

    # Load data
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
    )
    projects = utils.load_json_from_url(url=projects_url)
    df = pd.DataFrame(projects)

    # Compute features
    df["text"] = df.title + " " + df.description
    df.drop(columns=["title", "description"], inplace=True)
    df = df[["id", "created_on", "text", "tags"]]

    # Save
    features = df.to_dict(orient="records")
    df_dict_fp = Path(config.DATA_DIR, "features.json")
    utils.save_dict(d=features, filepath=df_dict_fp)

    return df, features
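
A minimal invocation sketch: only the seed field of params is read directly in the snippet above, so a bare Namespace is enough here (the seed value is a placeholder and the surrounding tagifai project context is assumed).

from argparse import Namespace

params = Namespace(seed=1234)  # hypothetical parameter set; real runs load it from a config file
df, features = compute_features(params=params)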
Example #2
def test_set_seed():
    utils.set_seed()
    a = np.random.randn(2, 3)
    b = np.random.randn(2, 3)
    utils.set_seed()
    x = np.random.randn(2, 3)
    y = np.random.randn(2, 3)
    assert np.array_equal(a, x)
    assert np.array_equal(b, y)
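
The body of utils.set_seed is not shown in these snippets. A minimal sketch consistent with the test above (identical NumPy draws after re-seeding) might look like this; the random and torch calls are assumptions about what else the project seeds.

import random

import numpy as np
import torch


def set_seed(seed: int = 1234) -> None:
    """Seed all random number generators for reproducible results (sketch)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)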
Example #3
    def test_initialize_model(self):
        utils.set_seed()
        model = models.CNN(
            embedding_dim=self.embedding_dim,
            vocab_size=self.vocab_size,
            num_filters=self.num_filters,
            filter_sizes=[1, 2, 3, 4],
            hidden_dim=self.hidden_dim,
            dropout_p=self.dropout_p,
            num_classes=self.num_classes,
        )
        for param1, param2 in zip(self.cnn.parameters(), model.parameters()):
            assert not param1.data.ne(param2.data).sum() > 0
        assert self.cnn.filter_sizes == model.filter_sizes
Example #4
    def setup_method(self):
        """Called before every method."""
        # Args
        self.max_filter_size = 4
        self.embedding_dim = 128
        self.num_filters = 100
        self.hidden_dim = 128
        self.dropout_p = 0.5
        args = Namespace(
            max_filter_size=self.max_filter_size,
            embedding_dim=self.embedding_dim,
            num_filters=self.num_filters,
            hidden_dim=self.hidden_dim,
            dropout_p=self.dropout_p,
        )

        # Model
        self.vocab_size = 1000
        self.num_classes = 10
        utils.set_seed()
        self.cnn = models.initialize_model(args=args,
                                           vocab_size=self.vocab_size,
                                           num_classes=self.num_classes)
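
models.initialize_model itself is not shown here, but Example #3 compares its output against a CNN built with filter_sizes=[1, 2, 3, 4] while setup_method only passes max_filter_size=4, so a plausible sketch is that it expands max_filter_size into the list of filter sizes. The mapping and the device handling below are assumptions, and models.CNN is taken from the surrounding module as used in Example #3.

def initialize_model(args, vocab_size, num_classes, device=None):
    """Sketch: build a CNN from the shared args (filter-size expansion assumed)."""
    filter_sizes = list(range(1, args.max_filter_size + 1))  # e.g. 4 -> [1, 2, 3, 4]
    model = models.CNN(
        embedding_dim=args.embedding_dim,
        vocab_size=vocab_size,
        num_filters=args.num_filters,
        filter_sizes=filter_sizes,
        hidden_dim=args.hidden_dim,
        dropout_p=args.dropout_p,
        num_classes=num_classes,
    )
    return model.to(device) if device is not None else model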
Example #5
import json
from argparse import Namespace
from pathlib import Path

import numpy as np
import pandas as pd

from tagifai import config, data, main, utils
from tagifai.config import logger

# Load arguments
args_fp = Path(config.CONFIG_DIR, "args.json")
args = Namespace(**utils.load_dict(filepath=args_fp))

# 1. Set seed
utils.set_seed(seed=args.seed)

# 2. Set device
device = utils.set_device(cuda=args.cuda)

# 3. Load data
projects_fp = Path(config.DATA_DIR, "projects.json")
tags_fp = Path(config.DATA_DIR, "tags.json")
projects = utils.load_dict(filepath=projects_fp)
tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
df = pd.DataFrame(projects)
if args.shuffle:
    df = df.sample(frac=1).reset_index(drop=True)
df = df[:args.num_samples]  # None = all samples
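
The utils helpers used above are not defined in these snippets; based on how they are called, a plausible sketch of the two loaders (an assumption, not the project's actual code) is:

import json
from pathlib import Path
from typing import Dict, List


def load_dict(filepath: Path):
    """Load a JSON file into a Python object (sketch)."""
    with open(filepath) as fp:
        return json.load(fp)


def list_to_dict(list_of_dicts: List[Dict], key: str) -> Dict:
    """Re-key a list of dicts by one of their fields, e.g. key="tag" (sketch)."""
    return {item[key]: item for item in list_of_dicts}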
Example #6
def run(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.
    1. Set seed
    2. Set device
    3. Load data
    4. Clean data
    5. Preprocess data
    6. Encode labels
    7. Split data
    8. Tokenize inputs
    9. Create dataloaders
    10. Initialize model
    11. Train model
    12. Evaluate model
    Args:
        args (Namespace): Input arguments for operations.
        trial (optuna.trial._trial.Trail, optional): Optuna optimization trial. Defaults to None.
    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    df, projects_dict, tags_dict = data.load(
        shuffle=args.shuffle, num_samples=args.num_samples
    )

    # 4. Clean data
    df, tags_dict, tags_above_frequency = data.clean(
        df=df, tags_dict=tags_dict, min_tag_freq=args.min_tag_freq
    )

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=args.lower, stem=args.stem)

    # 6. Encode labels
    y, class_weights, label_encoder = data.encode_labels(labels=df.tags)

    # 7. Split data
    utils.set_seed(seed=args.seed)  # needed for skmultilearn
    X_train, X_val, X_test, y_train, y_val, y_test = data.split(
        X=df.text.to_numpy(), y=y, train_size=args.train_size
    )

    # 8. Tokenize inputs
    X_train, tokenizer = data.tokenize_text(
        X=X_train, char_level=args.char_level
    )
    X_val, _ = data.tokenize_text(
        X=X_val, char_level=args.char_level, tokenizer=tokenizer
    )
    X_test, _ = data.tokenize_text(
        X=X_test, char_level=args.char_level, tokenizer=tokenizer
    )

    # 9. Create dataloaders
    train_dataloader = data.get_dataloader(
        data=[X_train, y_train],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    val_dataloader = data.get_dataloader(
        data=[X_val, y_val],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    test_dataloader = data.get_dataloader(
        data=[X_test, y_test],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )

    # 10. Initialize model
    model = models.initialize_model(
        args=args,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Arguments: {json.dumps(args.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    args, model, loss = train(
        args=args,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    device = torch.device("cpu")
    performance = evaluate(
        dataloader=test_dataloader,
        model=model.to(device),
        device=device,
        threshold=args.threshold,
        classes=label_encoder.classes,
    )

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
        "performance": performance,
    }
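
Because run accepts an optional Optuna trial, it can be wrapped in an objective function for hyperparameter optimization. A hedged usage sketch, assuming args has been loaded as in Example #5 and that dropout_p and lr are among the tuned parameters:

import optuna


def objective(trial: optuna.trial.Trial) -> float:
    # Hypothetical search space; the tuned parameter names are assumptions.
    args.dropout_p = trial.suggest_float("dropout_p", 0.3, 0.8)
    args.lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    artifacts = run(args=args, trial=trial)
    return artifacts["loss"]


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)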
Example #7
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
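
The inverse-frequency class weights computed under step 6 above can be illustrated with a tiny worked example on made-up tags:

import itertools

import numpy as np

# Hypothetical multi-label tags for three samples.
labels = [["computer-vision", "transformers"], ["computer-vision"], ["nlp", "transformers"]]
class_to_index = {"computer-vision": 0, "nlp": 1, "transformers": 2}

all_tags = list(itertools.chain.from_iterable(labels))
counts = np.bincount([class_to_index[tag] for tag in all_tags])     # [2, 1, 2]
class_weights = {i: 1.0 / count for i, count in enumerate(counts)}  # {0: 0.5, 1: 1.0, 2: 0.5}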
Example #8
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp),
                                   key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[:params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess,
                            lower=params.lower,
                            stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount(
        [label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size)
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5)
    test_df = pd.DataFrame({
        "text": X_test,
        "tags": label_encoder.decode(y_test)
    })

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train,
                                        y=y_train,
                                        max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val,
                                      y=y_val,
                                      max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(
        batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(
        batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(
        f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}"
    )
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=0.05,
                                                           patience=5)

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(params.num_epochs,
                                              params.patience,
                                              train_dataloader, val_dataloader)

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df,
                                                artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
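
find_best_threshold is not shown in these snippets; a plausible sketch, assuming it sweeps candidate probability cutoffs and keeps the one with the best micro-averaged F1 (both the metric and the search grid are assumptions):

import numpy as np
from sklearn.metrics import f1_score


def find_best_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    """Sketch: pick the probability cutoff that maximizes micro-averaged F1."""
    best_threshold, best_f1 = 0.5, 0.0
    for threshold in np.arange(0.1, 0.9, 0.05):
        y_pred = (y_prob >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred, average="micro")
        if f1 > best_f1:
            best_threshold, best_f1 = threshold, f1
    return best_threshold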