def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id = entity_vocab["[MASK]"]["id"],
        pad_id  = entity_vocab["[PAD]"]["id"],
    )

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size = ent_embed_size,
        bert_attention        = args["bert_attention"],
        dropout               = args["dropout"],
    )

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for i, r in enumerate(cv_results):
        r.save(os.path.join(args["location"], f"res-cv{i}"))
    log("Micro avg. F1 estimate", np.mean([r.statistics["micro avg"]["f1-score"] for r in cv_results]))
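
# Hedged usage sketch: reloading the saved folds and recomputing the micro-F1
# estimate afterwards. NER_Results.load taking a path is an assumption based on
# the .save/.load pattern GeometryResults uses elsewhere in this code base.
def mean_micro_f1(location: str, k: int) -> float:
    results = [NER_Results.load(os.path.join(location, f"res-cv{i}")) for i in range(k)]
    return float(np.mean([r.statistics["micro avg"]["f1-score"] for r in results]))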
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id = entity_vocab["[MASK]"]["id"],
        pad_id  = entity_vocab["[PAD]"]["id"],
    )

    log("Setting up sampler")
    with open(args["params"], "r") as f:
        param_lists = json.load(f)
    sampler = SAMPLERS[args["sampler"]](param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)
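
# Hedged sketch of the parameter-list file read into `param_lists` above. The
# structure (hyperparameter name -> list of candidate values) is inferred from
# the SAMPLERS call; the specific hyperparameter names are assumptions.
EXAMPLE_PARAM_LISTS = {
    "lr":           [1e-5, 5e-5, 1e-4],
    "dropout":      [0.1, 0.2, 0.3],
    "weight_decay": [0.01, 0.1],
}
# Writing this to disk would yield a file usable as args["params"]:
# with open("params.json", "w") as f:
#     json.dump(EXAMPLE_PARAM_LISTS, f)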
def main(path: str, n: int):
    log.configure(
        os.path.join(path, "geometry-examples.log"),
        "daLUKE examples",
        print_level = Levels.DEBUG,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hardcoded to train
    data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA, device).data[Split.TRAIN]
    set_seeds()
    GeometryResults.subfolder = ""
    res = GeometryResults.load(path)
    for field, axis in OF_INTEREST.items():
        log.section(field)
        X = getattr(res, field)
        order = X[:, axis].argsort()

        log(f"Examples where dim. {axis} is high")
        _show_examples(res, X, order[::-1][:n], data)
        log(f"Examples where dim. {axis} is low")
        _show_examples(res, X, order[:n], data)
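
# Hedged sketch of the OF_INTEREST mapping iterated above: each key must name a
# GeometryResults attribute holding a 2D array, since main() does getattr and
# indexes column `axis`. Which fields and axes are actually inspected is an
# assumption.
OF_INTEREST_EXAMPLE = {
    "pca_transformed":  0,  # rank examples by the first principal component
    "umap_transformed": 1,  # rank examples by the second UMAP dimension
}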
def main(
    path: str,
    model: str,
    n_components: int,
    reducer_subsample: Optional[int],
    tsne_perplexity: float,
    umap_neighbours: int,
    umap_min_dist: float,
    only_positives: bool,
    fine_tuned: bool,
):
    set_seeds()
    log.configure(
        os.path.join(path, "geometry-analysis.log"),
        "daLUKE embedding geometry analysis",
        print_level = Levels.DEBUG,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        representations, labels, content = collect_representations(
            model, device, torch.device("cpu"), only_positives, fine_tuned
        )
    log(f"Acquired representations of shape {representations.shape}")

    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)

    if reducer_subsample is not None:
        log.debug(f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE")
        representations = representations[:reducer_subsample]

    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)

    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    log(
        "Saved analysis results to",
        GeometryResults(
            pca_transformed      = pca_transformed,
            umap_transformed     = umap_transformed,
            tsne_transformed     = tsne_transformed,
            labels               = labels,
            principal_components = principal_components,
            content              = content,
        ).save(path),
    )
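
import numpy as np

# Minimal sketch of what the pca helper above presumably computes, assuming it
# returns (projected data, principal components) in that order, as the unpacking
# in main() suggests. Plain SVD-based PCA; not necessarily the repo's version.
def pca_sketch(representations: np.ndarray, n_components: int) -> tuple[np.ndarray, np.ndarray]:
    centered = representations - representations.mean(axis=0)
    # Compact SVD: rows of Vt are the principal directions, ordered by variance
    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
    components = Vt[:n_components]        # (n_components, dim)
    projected = centered @ components.T   # (n_samples, n_components)
    return projected, components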
import numpy as np
import pytest
import torch
import torch.nn as nn
from pelutils import set_seeds
from pelutils.ds import unique, no_grad

set_seeds(sum(ord(c) for c in "GME TO THE MOON! 🚀🚀🚀🚀🚀🚀🚀🚀"))


def test_unique():
    # Simple case: ordered numbers from 0 to 99
    n = 100
    a = np.arange(n, dtype=np.uint32)
    u, index, inverse, counts = unique(a, return_index=True, return_inverse=True, return_counts=True)
    assert np.all(a == u)
    assert np.all(a == index)
    assert np.all(a == inverse)
    assert np.all(counts == 1)

    # Slightly more complex case with some non-unique values
    a[2:4] = 50
    a[[5, 16, 3]] = 69
    a = a.astype(np.float16)
    u, index, inverse, counts = unique(a, return_index=True, return_inverse=True, return_counts=True)
    argsort = np.argsort(u)
    npu, npindex, npcounts = np.unique(a, return_index=True, return_counts=True)
    assert np.all(u[argsort] == npu)
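
def test_unique_order_sketch():
    # Hedged illustration of the apparent contract tested above: the argsort
    # comparison suggests pelutils' unique preserves first-appearance order
    # rather than sorting like np.unique. That reading, and unique returning
    # only the array when no flags are passed, are assumptions.
    a = np.array([3, 1, 3, 2])
    u = unique(a)
    assert np.all(u == np.array([3, 1, 2]))    # first-appearance order
    assert np.all(np.sort(u) == np.unique(a))  # same value set as np.unique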
def run_experiment(args: dict[str, Any]):
    log.configure(
        os.path.join(args["location"], "daluke-train-ner.log"),
        args["name"] + " Fine-tuning",
        logger = args["name"] + "-fine-tune",
        print_level = Levels.INFO if args["quieter"] else Levels.DEBUG,
    )
    set_seeds(seed=args["seed"])
    assert not (args["words_only"] and args["entities_only"]), "--words-only and --entities-only cannot be used together"
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id = entity_vocab["[MASK]"]["id"],
        pad_id  = entity_vocab["[PAD]"]["id"],
    )

    # Add new NER-specific fields to metadata
    metadata["NER-words-only"]    = args["words_only"]
    metadata["NER-entities-only"] = args["entities_only"]

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, args["batch_size"]) if args["eval"] else None

    # Remember the dimensionality that the model will be trained with
    metadata["output-size"] = len(dataset.all_labels)

    log("Loading model ...")
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        bert_attention        = args["bert_attention"],
        entity_embedding_size = ent_embed_size,
        dropout               = args["dropout"],
    )

    log(f"Starting training of DaLUKE for NER on {args['dataset']}")
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["loss_weight"],
    )
    # Log important information out
    log.debug(training.model)
    log.debug(training.scheduler)
    log.debug(training.optimizer)

    dataset.document(dataloader, Split.TRAIN)
    type_distribution(dataset.data[Split.TRAIN].annotations)

    results = training.run()

    log("Saving results and model to %s" % args["location"])
    save_to_archive(os.path.join(args["location"], TRAIN_OUT), entity_vocab, metadata, model, token_map)

    if args["eval"]:
        log("True dev. set distributions")
        results.dev_true_type_distribution = type_distribution(dataset.data[Split.DEV].annotations)
        log("True train set distributions")
        results.train_true_type_distribution = type_distribution(dataset.data[Split.TRAIN].annotations)
        log("Saving best model")
        save_to_archive(os.path.join(args["location"], TRAIN_OUT_BEST), entity_vocab, metadata, training.best_model, token_map)

    results.save(args["location"])
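
# Hedged sketch of an args dict for the fine-tuning experiment above. The keys
# mirror the lookups run_experiment performs; every value is a placeholder
# assumption, not a recommended configuration.
example_args = {
    "location":       "local_results",
    "name":           "daLUKE-NER",
    "model":          "daluke.tar.gz",  # archive consumed by load_from_archive
    "dataset":        "DaNE",
    "quieter":        False,
    "seed":           0,
    "words_only":     False,
    "entities_only":  False,
    "batch_size":     16,
    "eval":           True,             # enables dev evaluation and best-model saving
    "bert_attention": False,
    "dropout":        0.1,
    "epochs":         5,
    "lr":             1e-5,
    "warmup_prop":    0.06,
    "weight_decay":   0.01,
    "loss_weight":    False,
}
# run_experiment(example_args)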