def main(num_epochs: int = 100, batch_size: int = 128):
    """Train a CNN news-title classifier initialized with GloVe embeddings.

    Loads the news CSV, builds a vectorizer, constructs an embedding matrix
    from the GloVe file, trains the classifier, and returns all artifacts.

    Args:
        num_epochs: Number of training epochs.
        batch_size: Minibatch size.

    Returns:
        dict with keys ``train_state``, ``args``, ``dataset``,
        ``classifier``, ``loss_func``, ``optimizer``.
    """
    args = {
        "news_csv": "data/news_with_splits.csv",
        # NOTE(review): save_dir says "yelp" but this is the news task — confirm intended path.
        "save_dir": "model_storage/yelp/",
        "model_state_file": "model.pth",
        "glove_filepath": "data/glove.6B.100d.txt",
        "vectorizer_file": "vectorizer.json",
        # NOTE(review): this flag is never consulted below — GloVe embeddings
        # are always loaded and passed to the classifier. TODO confirm intent.
        "use_glove": False,
        "embedding_size": 100,
        "hidden_dim": 100,
        "num_channels": 100,
        "learning_rate": 0.001,
        "num_epochs": num_epochs,
        "batch_size": batch_size,
        "early_stopping_criteria": 5,
        "frequency_cutoff": 25,
        "dropout_p": 0.1,
        "cuda": False,
    }
    train_state = make_train_state()
    if torch.cuda.is_available():
        args["cuda"] = True
    args["device"] = torch.device("cuda:0" if args["cuda"] else "cpu")
    print(args)

    dataset = NewsDataset.load_dataset_and_make_vectorizer(args["news_csv"])
    vectorizer = dataset.vectorizer
    words = vectorizer.title_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args["glove_filepath"],
                                       words=words)
    classifier = NewsClassifier(
        embedding_size=args["embedding_size"],
        num_embeddings=len(vectorizer.title_vocab),
        num_channels=args["num_channels"],
        hidden_dim=args["hidden_dim"],
        # BUG FIX: the output-class count must come from the label vocabulary,
        # not the title (input) vocabulary — mirrors the surname script, which
        # uses nationality_vocab for num_classes. Assumes the vectorizer
        # exposes category_vocab — confirm against NewsVectorizer.
        num_classes=len(vectorizer.category_vocab),
        dropout_p=args["dropout_p"],
        pretrained_embeddings=torch.from_numpy(embeddings),
    )
    classifier = classifier.to(args["device"])
    # Embedding matrix is float64 (numpy default), so the whole model runs in double.
    classifier.double()

    loss_func = CrossEntropyLoss()
    optimizer = Adam(classifier.parameters(), lr=args["learning_rate"])
    train(args, train_state, dataset, classifier, optimizer, loss_func,
          compute_accuracy)
    return {
        "train_state": train_state,
        "args": args,
        "dataset": dataset,
        "classifier": classifier,
        "loss_func": loss_func,
        "optimizer": optimizer,
    }
def main(batch_size: int = 128, num_epochs: int = 100, hidden_dim: int = 100):
    """Train a CNN surname classifier and return the training artifacts.

    Args:
        batch_size: Minibatch size.
        num_epochs: Number of training epochs.
        hidden_dim: Hidden-layer width passed through in the config dict.

    Returns:
        dict with keys ``train_state``, ``args``, ``dataset``,
        ``classifier``, ``loss_func``, ``optimizer``.
    """
    # Key order matters: the dict is printed and returned as-is.
    config = {
        "hidden_dim": hidden_dim,
        "num_channels": 256,
        "surname_csv": "data/surnames_with_splits.csv",
        "save_dir": "model_storage/yelp/",
        "model_state_file": "model.pth",
        "vectorizer_file": "vectorizer.json",
        "learning_rate": 0.001,
        "num_epochs": num_epochs,
        "batch_size": batch_size,
        "early_stopping_criteria": 5,
        "frequency_cutoff": 25,
        "cuda": False,
    }
    state = make_train_state()
    config["cuda"] = torch.cuda.is_available()
    config["device"] = torch.device("cuda:0" if config["cuda"] else "cpu")
    print(config)

    ds = SurnameDataset.load_dataset_and_make_vectorizer(
        config["surname_csv"], SurnameVectorizer.from_dataframe)
    vec = ds.vectorizer

    # Input channels = one-hot surname vocab size; outputs = nationalities.
    model = SurnameCnnClassifier(
        initial_num_channels=len(vec.surname_vocab),
        num_classes=len(vec.nationality_vocab),
        num_channels=config["num_channels"],
    ).to(config["device"])

    criterion = CrossEntropyLoss(ds.class_weights)
    opt = Adam(model.parameters(), lr=config["learning_rate"])
    train(config, state, ds, model, opt, criterion, compute_accuracy)

    return {
        "train_state": state,
        "args": config,
        "dataset": ds,
        "classifier": model,
        "loss_func": criterion,
        "optimizer": opt,
    }
def main(num_epochs: int = 100, batch_size: int = 128):
    """Train a CBOW word-embedding model and return the training artifacts.

    Args:
        num_epochs: Number of training epochs.
        batch_size: Minibatch size.

    Returns:
        dict with keys ``train_state``, ``args``, ``dataset``,
        ``classifier``, ``loss_func``, ``optimizer``.
    """
    # Key order matters: the dict is printed and returned as-is.
    config = {
        "cbow_csv": "data/frankenstein_with_splits.csv",
        "save_dir": "model_storage/yelp/",
        "model_state_file": "model.pth",
        "vectorizer_file": "vectorizer.json",
        "embedding_size": 300,
        "learning_rate": 0.001,
        "num_epochs": num_epochs,
        "batch_size": batch_size,
        "early_stopping_criteria": 5,
        "frequency_cutoff": 25,
        "cuda": False,
    }
    state = make_train_state()
    config["cuda"] = torch.cuda.is_available()
    config["device"] = torch.device("cuda:0" if config["cuda"] else "cpu")
    print(config)

    ds = CbowDataset.load_dataset_and_make_vectorizer(config["cbow_csv"])
    vec = ds.vectorizer
    model = CbowClassifier(
        vocabulary_size=len(vec.cbow_vocab),
        embedding_size=config["embedding_size"],
    ).to(config["device"])

    criterion = CrossEntropyLoss()
    opt = Adam(model.parameters(), lr=config["learning_rate"])
    train(config, state, ds, model, opt, criterion, compute_accuracy)

    return {
        "train_state": state,
        "args": config,
        "dataset": ds,
        "classifier": model,
        "loss_func": criterion,
        "optimizer": opt,
    }
def main(batch_size: int = 128, num_epochs: int = 100):
    """Train a bag-of-words Yelp review classifier (binary, logit output).

    Args:
        batch_size: Minibatch size.
        num_epochs: Number of training epochs.

    Returns:
        dict with keys ``train_state``, ``args``, ``dataset``,
        ``classifier``, ``loss_func``, ``optimizer``.
    """
    args = {
        "review_csv": "data/yelp_reviews_lite.json",
        "save_dir": "model_storage/yelp/",
        "model_state_file": "model.pth",
        "vectorizer_file": "vectorizer.json",
        "learning_rate": 0.001,
        "num_epochs": num_epochs,
        "batch_size": batch_size,
        "early_stopping_criteria": 5,
        "frequency_cutoff": 25,
        "cuda": False,
    }
    train_state = make_train_state()
    if torch.cuda.is_available():
        args["cuda"] = True
    args["device"] = torch.device("cuda:0" if args["cuda"] else "cpu")
    print(args)

    dataset = ReviewDataset.load_dataset_and_make_vectorizer(
        args["review_csv"])
    vectorizer = dataset.vectorizer
    classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
    classifier = classifier.to(args["device"])

    # Binary task: BCE-with-logits instead of the CrossEntropyLoss the
    # multi-class scripts use (classifier presumably emits a raw logit —
    # confirm against ReviewClassifier).
    loss_func = nn.BCEWithLogitsLoss()
    # CONSISTENCY FIX: sibling training scripts use the bare `Adam` name
    # (imported at module level); `optim.Adam` is the same class.
    optimizer = Adam(classifier.parameters(), lr=args["learning_rate"])
    train(args, train_state, dataset, classifier, optimizer, loss_func,
          compute_accuracy)
    return {
        "train_state": train_state,
        "args": args,
        "dataset": dataset,
        "classifier": classifier,
        "loss_func": loss_func,
        "optimizer": optimizer,
    }