def compute_features(params: Namespace):
    """Compute features to use for training.

    Args:
        params (Namespace): Input parameters for operations.

    Returns:
        The features DataFrame and the list of feature records that were saved.
    """
    # Set up
    utils.set_seed(seed=params.seed)

    # Load data
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
    )
    projects = utils.load_json_from_url(url=projects_url)
    df = pd.DataFrame(projects)

    # Compute features
    df["text"] = df.title + " " + df.description
    df.drop(columns=["title", "description"], inplace=True)
    df = df[["id", "created_on", "text", "tags"]]

    # Save
    features = df.to_dict(orient="records")
    df_dict_fp = Path(config.DATA_DIR, "features.json")
    utils.save_dict(d=features, filepath=df_dict_fp)

    return df, features
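# A minimal driver for compute_features(); the params.json filename and the
# module location of compute_features are assumptions for illustration (only
# the `seed` attribute is required by the function above).
from argparse import Namespace
from pathlib import Path

from tagifai import config, utils
from tagifai.main import compute_features  # assumed module location

params = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "params.json")))
df, features = compute_features(params=params)
print(df.columns.tolist())  # ["id", "created_on", "text", "tags"]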
def test_set_seed():
    utils.set_seed()
    a = np.random.randn(2, 3)
    b = np.random.randn(2, 3)
    utils.set_seed()
    x = np.random.randn(2, 3)
    y = np.random.randn(2, 3)
    assert np.array_equal(a, x)
    assert np.array_equal(b, y)
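# utils.set_seed() itself is not shown in this section; a minimal sketch that
# would satisfy the test above (the default seed of 1234 and the PyTorch calls
# are assumptions) could look like:
import random

import numpy as np
import torch


def set_seed(seed: int = 1234) -> None:
    """Seed the random number generators for reproducibility (sketch)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)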
def test_initialize_model(self):
    utils.set_seed()
    model = models.CNN(
        embedding_dim=self.embedding_dim,
        vocab_size=self.vocab_size,
        num_filters=self.num_filters,
        filter_sizes=[1, 2, 3, 4],
        hidden_dim=self.hidden_dim,
        dropout_p=self.dropout_p,
        num_classes=self.num_classes,
    )
    for param1, param2 in zip(self.cnn.parameters(), model.parameters()):
        assert not param1.data.ne(param2.data).sum() > 0
    assert self.cnn.filter_sizes == model.filter_sizes
def setup_method(self):
    """Called before every method."""
    # Args
    self.max_filter_size = 4
    self.embedding_dim = 128
    self.num_filters = 100
    self.hidden_dim = 128
    self.dropout_p = 0.5
    args = Namespace(
        max_filter_size=self.max_filter_size,
        embedding_dim=self.embedding_dim,
        num_filters=self.num_filters,
        hidden_dim=self.hidden_dim,
        dropout_p=self.dropout_p,
    )

    # Model
    self.vocab_size = 1000
    self.num_classes = 10
    utils.set_seed()
    self.cnn = models.initialize_model(
        args=args, vocab_size=self.vocab_size, num_classes=self.num_classes
    )
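# models.initialize_model() is exercised above but not shown; a plausible sketch,
# assuming it builds a models.CNN with filter sizes 1..max_filter_size (consistent
# with filter_sizes=[1, 2, 3, 4] in test_initialize_model), could look like:
from tagifai.models import CNN  # assumes CNN as used in the test above


def initialize_model(args, vocab_size: int, num_classes: int, device=None):
    """Build a CNN from the given arguments (sketch of assumed behavior)."""
    model = CNN(
        embedding_dim=args.embedding_dim,
        vocab_size=vocab_size,
        num_filters=args.num_filters,
        filter_sizes=list(range(1, args.max_filter_size + 1)),
        hidden_dim=args.hidden_dim,
        dropout_p=args.dropout_p,
        num_classes=num_classes,
    )
    return model.to(device) if device is not None else model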
import json
from argparse import Namespace
from pathlib import Path

import numpy as np
import pandas as pd

from tagifai import config, data, main, utils
from tagifai.config import logger

# Set experiment and start run
args_fp = Path(config.CONFIG_DIR, "args.json")
args = Namespace(**utils.load_dict(filepath=args_fp))

# 1. Set seed
utils.set_seed(seed=args.seed)

# 2. Set device
device = utils.set_device(cuda=args.cuda)

# 3. Load data
projects_fp = Path(config.DATA_DIR, "projects.json")
tags_fp = Path(config.DATA_DIR, "tags.json")
projects = utils.load_dict(filepath=projects_fp)
tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
df = pd.DataFrame(projects)
if args.shuffle:
    df = df.sample(frac=1).reset_index(drop=True)
df = df[: args.num_samples]  # None = all samples
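# utils.list_to_dict() above indexes the tags list by its "tag" field; a minimal
# sketch of such a helper (an assumption, not necessarily the project's exact code):
from typing import Dict, List


def list_to_dict(list_of_dicts: List[Dict], key: str) -> Dict:
    """Index a list of dicts by one of their keys (sketch)."""
    return {item[key]: item for item in list_of_dicts}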
def run(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    1. Set seed
    2. Set device
    3. Load data
    4. Clean data
    5. Preprocess data
    6. Encode labels
    7. Split data
    8. Tokenize inputs
    9. Create dataloaders
    10. Initialize model
    11. Train model
    12. Evaluate model

    Args:
        args (Namespace): Input arguments for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=args.seed)

    # 2. Set device
    device = utils.set_device(cuda=args.cuda)

    # 3. Load data
    df, projects_dict, tags_dict = data.load(
        shuffle=args.shuffle, num_samples=args.num_samples
    )

    # 4. Clean data
    df, tags_dict, tags_above_frequency = data.clean(
        df=df, tags_dict=tags_dict, min_tag_freq=args.min_tag_freq
    )

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=args.lower, stem=args.stem)

    # 6. Encode labels
    y, class_weights, label_encoder = data.encode_labels(labels=df.tags)

    # 7. Split data
    utils.set_seed(seed=args.seed)  # needed for skmultilearn
    X_train, X_val, X_test, y_train, y_val, y_test = data.split(
        X=df.text.to_numpy(), y=y, train_size=args.train_size
    )

    # 8. Tokenize inputs
    X_train, tokenizer = data.tokenize_text(X=X_train, char_level=args.char_level)
    X_val, _ = data.tokenize_text(X=X_val, char_level=args.char_level, tokenizer=tokenizer)
    X_test, _ = data.tokenize_text(X=X_test, char_level=args.char_level, tokenizer=tokenizer)

    # 9. Create dataloaders
    train_dataloader = data.get_dataloader(
        data=[X_train, y_train],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    val_dataloader = data.get_dataloader(
        data=[X_val, y_val],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )
    test_dataloader = data.get_dataloader(
        data=[X_test, y_test],
        max_filter_size=args.max_filter_size,
        batch_size=args.batch_size,
    )

    # 10. Initialize model
    model = models.initialize_model(
        args=args,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Arguments: {json.dumps(args.__dict__, indent=2, cls=NumpyEncoder)}")
    args, model, loss = train(
        args=args,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    device = torch.device("cpu")
    performance = evaluate(
        dataloader=test_dataloader,
        model=model.to(device),
        device=device,
        threshold=args.threshold,
        classes=label_encoder.classes,
    )

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
        "performance": performance,
    }
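# Example driver for the run() defined above, reusing the args.json loading
# pattern from the earlier script (printing the performance report is purely
# illustrative):
import json
from argparse import Namespace
from pathlib import Path

from tagifai import config, utils

args = Namespace(**utils.load_dict(filepath=Path(config.CONFIG_DIR, "args.json")))
artifacts = run(args=args)  # run() as defined above
print(json.dumps(artifacts["performance"], indent=2))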
def run(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # 1. Set seed
    utils.set_seed(seed=params.seed)

    # 2. Set device
    device = utils.set_device(cuda=params.cuda)

    # 3. Load data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    projects = utils.load_dict(filepath=projects_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(projects)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[: params.subset]  # None = all samples

    # 4. Prepare data (feature engineering, filter, clean)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # 5. Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # 6. Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # 7. Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size
    )
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5
    )
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # 8. Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # 9. Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # 10. Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # 11. Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    params, model, loss = train.train(
        params=params,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        model=model,
        device=device,
        class_weights=class_weights,
        trial=trial,
    )

    # 12. Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "loss": loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
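# The trial argument is what lets run() participate in an Optuna study; a minimal
# objective/study sketch (the suggested hyperparameters and the
# performance["overall"]["f1"] key are assumptions for illustration):
from argparse import Namespace

import optuna


def objective(trial: optuna.Trial, params: Namespace) -> float:
    # Assumed search space; adjust to the parameters you actually want to tune.
    params.dropout_p = trial.suggest_float("dropout_p", 0.3, 0.8)
    params.lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    artifacts = run(params=params, trial=trial)
    return artifacts["performance"]["overall"]["f1"]


# params loaded from a config file as in the scripts above
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, params), n_trials=10)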
def train(params: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Operations for training.

    Args:
        params (Namespace): Input parameters for operations.
        trial (optuna.trial._trial.Trial, optional): Optuna optimization trial. Defaults to None.

    Returns:
        Artifacts to save and load for later.
    """
    # Set up
    utils.set_seed(seed=params.seed)
    device = utils.set_device(cuda=params.cuda)

    # Load features
    features_fp = Path(config.DATA_DIR, "features.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    features = utils.load_dict(filepath=features_fp)
    tags_dict = utils.list_to_dict(utils.load_dict(filepath=tags_fp), key="tag")
    df = pd.DataFrame(features)
    if params.shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    df = df[: params.subset]  # None = all samples

    # Prepare data (filter, clean, etc.)
    df, tags_above_freq, tags_below_freq = data.prepare(
        df=df,
        include=list(tags_dict.keys()),
        exclude=config.EXCLUDED_TAGS,
        min_tag_freq=params.min_tag_freq,
    )
    params.num_samples = len(df)

    # Preprocess data
    df.text = df.text.apply(data.preprocess, lower=params.lower, stem=params.stem)

    # Encode labels
    labels = df.tags
    label_encoder = data.MultiLabelLabelEncoder()
    label_encoder.fit(labels)
    y = label_encoder.encode(labels)

    # Class weights
    all_tags = list(itertools.chain.from_iterable(labels.values))
    counts = np.bincount([label_encoder.class_to_index[class_] for class_ in all_tags])
    class_weights = {i: 1.0 / count for i, count in enumerate(counts)}

    # Split data
    utils.set_seed(seed=params.seed)  # needed for skmultilearn
    X = df.text.to_numpy()
    X_train, X_, y_train, y_ = data.iterative_train_test_split(
        X=X, y=y, train_size=params.train_size
    )
    X_val, X_test, y_val, y_test = data.iterative_train_test_split(
        X=X_, y=y_, train_size=0.5
    )
    test_df = pd.DataFrame({"text": X_test, "tags": label_encoder.decode(y_test)})

    # Tokenize inputs
    tokenizer = data.Tokenizer(char_level=params.char_level)
    tokenizer.fit_on_texts(texts=X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_val = np.array(tokenizer.texts_to_sequences(X_val), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)

    # Create dataloaders
    train_dataset = data.CNNTextDataset(X=X_train, y=y_train, max_filter_size=params.max_filter_size)
    val_dataset = data.CNNTextDataset(X=X_val, y=y_val, max_filter_size=params.max_filter_size)
    train_dataloader = train_dataset.create_dataloader(batch_size=params.batch_size)
    val_dataloader = val_dataset.create_dataloader(batch_size=params.batch_size)

    # Initialize model
    model = models.initialize_model(
        params=params,
        vocab_size=len(tokenizer),
        num_classes=len(label_encoder),
        device=device,
    )

    # Train model
    logger.info(f"Parameters: {json.dumps(params.__dict__, indent=2, cls=NumpyEncoder)}")
    class_weights_tensor = torch.Tensor(np.array(list(class_weights.values())))
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.05, patience=5
    )

    # Trainer module
    trainer = Trainer(
        model=model,
        device=device,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        trial=trial,
    )

    # Train
    best_val_loss, best_model = trainer.train(
        params.num_epochs, params.patience, train_dataloader, val_dataloader
    )

    # Find best threshold
    _, y_true, y_prob = trainer.eval_step(dataloader=train_dataloader)
    params.threshold = find_best_threshold(y_true=y_true, y_prob=y_prob)

    # Evaluate model
    artifacts = {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": best_model,
        "loss": best_val_loss,
    }
    device = torch.device("cpu")
    y_true, y_pred, performance = eval.evaluate(df=test_df, artifacts=artifacts)
    artifacts["performance"] = performance

    return artifacts
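# find_best_threshold() used above is not shown in this section; a common
# approach (sketched here as an assumption, not the project's exact
# implementation) is to sweep candidate thresholds on the training split and
# keep the one that maximizes micro-averaged F1:
import numpy as np
from sklearn.metrics import f1_score


def find_best_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    """Return the probability threshold with the best micro-F1 (sketch)."""
    best_threshold, best_f1 = 0.5, 0.0
    for threshold in np.arange(0.05, 1.0, 0.05):
        y_pred = (y_prob >= threshold).astype(int)
        score = f1_score(y_true, y_pred, average="micro")
        if score > best_f1:
            best_threshold, best_f1 = threshold, score
    return best_threshold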