Example #1
def word_preds(datadir: str, ff_size: int):
    log.configure(os.path.join(datadir, "dabert-word-preds.log"), "daBERT word predictions")
    log("Loading metadata")
    with open(os.path.join(datadir, DatasetBuilder.metadata_file)) as f:
        metadata = json.load(f)
    log("Loading model")
    dabert = AutoModelForPreTraining.from_pretrained(daBERT).to(device)
    log("Loading data")
    dataloader = DataLoader(
        datadir,
        metadata,
        dict(),
        device,
    )
    loader = dataloader.get_dataloader(ff_size, None)
    log("Forward passing")
    correct_preds = np.zeros(len(loader))
    for i, batch in tqdm(enumerate(loader), total=len(loader)):
        logits = dabert(batch.words.ids).prediction_logits
        masked_logits = logits[batch.word_mask]
        preds = masked_logits.argmax(dim=1)
        correct_preds[i] = (preds == batch.word_mask_labels).float().mean().cpu()
    log(
        "MLM token prediction accuracy",
        "  Mean: %.4f %%" % (100 * correct_preds.mean()),
        "  Std.: %.4f %%" % (100 * correct_preds.std(ddof=1)),
    )
Example #2
def type_distribution(seqs: list[list[str]]):
    dist = defaultdict(lambda: 0)
    for seq in seqs:
        for pred in seq:
            dist[pred if "-" not in pred else pred.split("-")[-1]] += 1
    log("Type distribution:", json.dumps(dist, indent=4))
    return dist
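
For context, the conditional expression above collapses IOB-style tags such as "B-PER" or "I-PER" to their bare entity type before counting. A minimal standalone check of the same idea (the tag list below is made up purely for illustration):

preds = ["B-PER", "I-PER", "O", "B-LOC"]
print([p.split("-")[-1] if "-" in p else p for p in preds])
# ['PER', 'PER', 'O', 'LOC']
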
Example #3
    def build(self):
        log("Saving tokenizer config and word token config to '%s'" % self.out_dir)
        with open(path := os.path.join(self.out_dir, self.entity_vocab_file), "w", encoding="utf-8") as ev:
            log("Saving entity vocab to '%s'" % path)
            ujson.dump(self.entity_vocab, ev, indent=2)
Example #4
def _show_examples(res: GeometryResults, X: np.ndarray, I: np.ndarray,
                   data: Sequences):
    for i, idx in enumerate(I):
        num, span = res.content[idx]["text_num"], res.content[idx]["span"]
        t, a = [*data.texts[num]], data.annotations[num]
        t.insert(span[0], "{")
        t.insert(span[1] + 1, "}")
        t = " ".join(t)
        log(f"{i} ({X[idx]}) {a[span[0]].split('-')[1] if '-' in a[span[0]] else a[span[0]]}: {t}\n",
            with_info=False)
Example #5
    def _reduce_tokens(self) -> tuple[np.ndarray, int]:
        token_counts = np.zeros(self.tokenizer.vocab_size, dtype=np.int32)

        log("Counting tokens in dataset")
        for example in tqdm(self.examples):
            word_ids = np.array(example["word_ids"])
            word_ids, counts = unique(word_ids, return_counts=True)
            token_counts[word_ids] += counts

        log("%i of %i tokens in the vocab are used" %
            ((token_counts > 0).sum(), self.tokenizer.vocab_size))
        *ids, unk_id = get_special_ids(self.tokenizer)
        unk_count = token_counts[unk_id]
        token_counts[[*ids, unk_id]] = -1
        sort_idx = np.argsort(token_counts)[::-1]
        keep_idx = sort_idx[:self.vocab_size]
        keep = np.zeros_like(token_counts, dtype=bool)
        keep[keep_idx] = True
        keep[[*ids, unk_id]] = True  # Always keep special tokens
        token_map = np.arange(self.tokenizer.vocab_size)
        token_map[~keep] = unk_id
        for i, j in enumerate(np.where(keep)[0]):
            token_map[j] = i
        log(
            "Reduced token vocabulary to %i tokens" % keep.sum(),
            "%.6f %% of word tokens in the dataset are now %s" % (
                100 * (unk_count + 1 + token_counts[~keep].sum()) /
                (unk_count + 1 + token_counts.sum()),
                self.tokenizer.unk_token,
            ),
        )
        np.save(self.token_map_file, token_map)
        log("Saved token map to '%s'" % self.token_map_file)

        return token_map, int(keep.sum())
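
The returned token map is a plain lookup array: remapping a sequence of original token ids is a single NumPy fancy-indexing operation, which is how the dataset is rewritten in a later example. A small sketch with an entirely made-up map:

import numpy as np

# Hypothetical map over a 6-token vocab: old id -> new id, where 2 plays the role of the unknown token
token_map = np.array([0, 1, 2, 2, 2, 3])
word_ids = np.array([5, 2, 0, 4])
print(token_map[word_ids].tolist())  # [3, 2, 0, 2]
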
Example #6
    def document(self, loader: DataLoader, split: Split) -> dict[str, int]:
        """
        To be run after _build_examples to document the resulting data.
        """
        examples = [ex for _, ex in loader.dataset]
        non_zeros = [
            (ex.entities.labels[ex.entities.labels != -1] != self.label_to_idx[self.null_label]).float().mean().item()
            for ex in examples
        ]
        log(f"Built dataset of {len(self.data[split].texts)} documents divided into {len(examples)} examples to be forward passed")
        log(f"Average proportion of spans in each example that have positive labels: {np.mean(non_zeros)*100:.2f}%")
Example #7
def main(daluke_path: str, other_path: str, show: bool):
    other_name = os.path.split(other_path)[-1]
    log.configure(os.path.join(daluke_path,
                               f"comparison_with_{other_name}.log"),
                  print_level=Levels.DEBUG)

    daluke_res = NER_Results.load(daluke_path)
    other_res = NER_TestResults.load(other_path)
    if show:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                            device).data[Split.TEST]
        for da_preds, ot_preds, truths, text in zip(daluke_res.preds,
                                                    other_res.predictions,
                                                    data.annotations,
                                                    data.texts):
            if da_preds != ot_preds:
                t = Table()
                t.add_row(["Text:"] + text)
                t.add_row(["Truth:"] + truths)
                t.add_row(["DaLUKE pred:"] + da_preds)
                t.add_row([f"{other_name} pred:"] + ot_preds)
                log(str(t).replace("|", ""), with_info=False)

    log(f"Confusion matrix with DaLUKE results ↓ and results from {other_name} →"
        )
    log(
        _format_confmat(
            confusion_matrix(daluke_res.preds, other_res.predictions,
                             ["LOC", "PER", "ORG", "MISC", "O"])))
    log(f"Covar. {sequence_covar(daluke_res.preds, other_res.predictions)}")
Example #8
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(
        args["model"])
    state_dict, ent_embed_size = mutate_for_ner(
        state_dict,
        mask_id=entity_vocab["[MASK]"]["id"],
        pad_id=entity_vocab["[PAD]"]["id"])

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict,
                       dataset,
                       metadata,
                       device,
                       entity_embedding_size=ent_embed_size,
                       bert_attention=args["bert_attention"],
                       dropout=args["dropout"])

    cv_results = cross_validate(model, dataset, args["k"], args)

    log(f"Saving results to {args['location']}")
    for i, r in enumerate(cv_results):
        r.save(os.path.join(args["location"], f"res-cv{i}"))
    log("Micro avg. F1 estimate",
        np.mean([r.statistics["micro avg"]["f1-score"] for r in cv_results]))
Example #9
def optimize(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any], sampler: Sampler):
    results, tried_params = list(), list()
    best = None
    i = 0
    while (sampled_params := sampler.sample()) is not None:
        log.section(f"Sampling #{i}: chose", f(sampled_params))
        result = objective_function(deepcopy(model), dataset, {**args, **sampled_params})
        score = result.statistics["micro avg"]["f1-score"]
        if best is None or score > results[best].statistics["micro avg"]["f1-score"]:
            log(f"Found new best at F1 of {score}")
            best = i
        result.save(out := os.path.join(args['location'], f"res-optim{i}"))
        log.debug(f"Saved results to {out}")
        results.append(result)
        tried_params.append(sampled_params)
        i += 1
Example #10
def run_experiment(args: dict[str, str]):
    if args["models"] == "all":
        args["models"] = ALL_MODEL_NAMES
    if args["datasets"] == "all":
        args["datasets"] = ALL_DATASET_NAMES

    models = setup_models(args["models"].split(), args["location"], daner_path=args["daner"])
    log(f"Succesfully set up {len(models)} models")

    datasets = setup_datasets(args["datasets"].split(), wikiann_path=args["wikiann"], plank_path=args["plank"])
    log(f"Sucessfully acquired {len(datasets)} NER datasets")

    for model in models:
        for dataset in datasets:
            e = Evaluator(model, dataset)
            res = e.run()
            res.save(os.path.join(args["location"], "-".join((model.name, dataset.name))))
Example #11
def setup_datasets(names_to_setup: list[str], wikiann_path: str="wikiann", plank_path: str="plank", split="test") -> list[TestDataset]:
    datasets = []
    for name in names_to_setup:
        try:
            datasets.append(
                next(d for d in ALL_DATASETS if d.name == name)
            )
        except StopIteration as ie:
            raise ValueError(f"Dataset with given name {name} not found, see --help for options") from ie
    for d in datasets:
        log(f"Setting up dataset \"{d.name}\" ...")
        kwargs = dict()
        if isinstance(d, Wikiann):
            kwargs["data_path"] = wikiann_path
        elif isinstance(d, Plank):
            kwargs["data_path"] = plank_path
        d.setup(**kwargs, split=split)
    return datasets
Example #12
def main(path: str, pred: str, truth: str):
    log.configure(os.path.join(path,
                               f"prediction-examples-{pred}-{truth}.log"),
                  print_level=Levels.DEBUG)
    log(f"Looking for examples where model predicted {pred}, but the truth was {truth}"
        )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    res = NER_Results.load(path)
    data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                        device).data[Split.TEST]
    for preds, truths, text in zip(res.preds, data.annotations, data.texts):
        if any(p != t and cla(p) == pred and cla(t) == truth
               for p, t in zip(preds, truths)):
            t = Table()
            t.add_row(["Text:"] + text)
            t.add_row(["Truth:"] + truths)
            t.add_row(["Pred:"] + preds)
            log(str(t).replace("|", ""), with_info=False)
Example #13
def setup_models(names_to_setup: list[str],
                 location: str,
                 daner_path: str = "daner") -> list[NER_TestModel]:
    models = []
    for name in names_to_setup:
        try:
            models.append([m for m in ALL_MODELS if m.name == name][0])
        except IndexError as ie:
            raise ValueError(
                f"Model with given name {name} not found, see --help for options"
            ) from ie
    for m in models:
        log(f"Setting up model \"{m.name}\" ... ")
        kwargs = dict()
        if isinstance(m, Daner):
            kwargs["repo_path"] = daner_path
            kwargs["data_path"] = location
        m.setup(**kwargs)
    return models
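
Note that the lookup idioms in setup_datasets and setup_models fail with different exceptions, which is what the except clauses must match; a quick throwaway illustration:

items = ["a", "b"]
try:
    [x for x in items if x == "c"][0]       # empty list -> IndexError
except IndexError:
    print("list indexing raises IndexError")
try:
    next(x for x in items if x == "c")      # exhausted generator -> StopIteration
except StopIteration:
    print("next() raises StopIteration")
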
Example #14
def ner(filepath: str, text: str):
    if not filepath and not text:
        raise ValueError("Either filepath or text must be given")
    elif filepath and text:
        raise ValueError("Filepath and text cannot both be given")
    elif filepath:
        with open(filepath) as f:
            text = f.read()

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_ner = AutoNERDaLUKE()
        preds = predict_ner(text, daluke_ner)

    t = Table()
    t.add_header(["Word", "IOB NER Prediction"])
    for word, pred in zip(text.split(), preds):
        t.add_row([word, pred])
    log(t)
Example #15
def main(path: str, n: int):
    log.configure(os.path.join(path, "geometry-examples.log"),
                  "daLUKE examples",
                  print_level=Levels.DEBUG)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hardcoded to train
    data = load_dataset(dict(dataset="DaNE"), DUMMY_METADATA,
                        device).data[Split.TRAIN]
    set_seeds()
    GeometryResults.subfolder = ""
    res = GeometryResults.load(path)
    for field, axis in OF_INTEREST.items():
        log.section(field)
        X = getattr(res, field)
        order = X[:, axis].argsort()

        log(f"Examples where dim. {axis} is high")
        _show_examples(res, X, order[::-1][:n], data)
        log(f"Examples where dim. {axis} is low")
        _show_examples(res, X, order[:n], data)
Example #16
def objective_function(model: NERDaLUKE, dataset: NERDataset, args: dict[str, Any]) -> NER_Results:
    dataloader = dataset.build(Split.TRAIN, args["batch_size"])
    dev_dataloader = dataset.build(Split.DEV, EVAL_BATCH)
    device = next(model.parameters()).device
    training = TrainNER(
        model,
        dataloader,
        dataset,
        device         = device,
        epochs         = args["epochs"],
        lr             = args["lr"],
        warmup_prop    = args["warmup_prop"],
        weight_decay   = args["weight_decay"],
        dev_dataloader = dev_dataloader,
        loss_weight    = args["batch_size"]
    )
    res = training.run()

    log.debug("Evaluating")
    best_res = res.running_dev_evaluations[res.best_epoch]
    log(f"Best model achieved {best_res.statistics['micro avg']['f1-score']} in mic-F1")
    return best_res
Example #17
    def _calculate_stats(self, preds: list[list[str]], truth: list[list[str]]) -> NER_TestResults:
        # Convert to python numericals to avoid json serialization problems
        # Set divide by zero cases to 0 to avoid warnings for models that can't see "MISC"
        stats = self._stats_to_py_nums(
                    classification_report(truth, preds, output_dict=True, zero_division=0)
                )
        # If the dataset includes the MISC category, a version of the result without this is computed
        stats_nomisc = self._stats_to_py_nums(
                        classification_report(self._rm_misc(truth), self._rm_misc(preds), output_dict=True)
                    ) if any(any("MISC" in ent for ent in sent) for sent in truth) else stats

        #FIXME: Do this manually instead of rerunning everything
        log(classification_report(truth, preds, zero_division=0, digits=4))
        if stats != stats_nomisc:
            log(classification_report(self._rm_misc(truth), self._rm_misc(preds), digits=4))

        return NER_TestResults(
                modelname   = self.model.name,
                dataname    = self.dataset.name,
                predictions = preds,
                statistics  = stats,
                statistics_nomisc = stats_nomisc,
        )
Example #18
def masked(filepath: str, text: str, entity_spans: list[str]):
    """ Entities are given as 'start1,end1;start2,end2 ...'
    Ends are optional. If not given, they will be set to start+1
    Spans are 1-indexed with inclusive ends """
    if not filepath and not text:
        raise ValueError("Either filepath or text must be given")
    elif filepath and text:
        raise ValueError("Filepath and text cannot both be given")
    elif filepath:
        with open(filepath) as f:
            text = f.read()

    entity_spans = [(int(x.split(",")[0]) - 1,
                     int(x.split(",")[1])) if "," in x else
                    (int(x) - 1, int(x)) for x in entity_spans.split(";") if x]

    log.debug("Loading model and predicting")
    with _no_log():
        daluke_mlm = AutoMLMDaLUKE()
        text, top_preds = predict_mlm(text, entity_spans, daluke_mlm)

    log("The top 5 predictions with likelihoods for each [MASK] were",
        top_preds)
    log("DaLUKE's best predictions were", text)
Example #19
def collect_representations(
    modelpath: str, device: torch.device, target_device: torch.device,
    only_positives: bool, fine_tuned: bool
) -> tuple[np.ndarray, np.ndarray, list[dict[str, int | list[tuple[int, int]]]]]:
    entity_vocab, metadata, state_dict, token_map = load_from_archive(modelpath)
    log("Loading dataset")
    # Note: We don't fill out the dict, as we don't allow changing max-entities and max-entity-span here.
    # If this results in an error for any dataset, we must change this.
    dataset = load_dataset(dict(dataset="DaNE"), metadata, device, token_map)
    dataloader = dataset.build(Split.TRAIN, FP_SIZE, shuffle=False)
    log("Loading model")
    if not fine_tuned:
        state_dict, ent_embed_size = mutate_for_ner(
            state_dict,
            mask_id=entity_vocab["[MASK]"]["id"],
            pad_id=entity_vocab["[PAD]"]["id"],
        )
    model = load_model(
        state_dict,
        dataset,
        metadata,
        device,
        entity_embedding_size=ent_embed_size if not fine_tuned else None,
    )
    model.eval()

    log("Forward passing examples")
    batch_representations, labels, content = list(), list(), list()
    for batch in tqdm(dataloader):
        # Use the super class, as we want the representations
        word_representations, entity_representations = super(type(model), model).forward(batch)
        start_word_representations, end_word_representations = model.collect_start_and_ends(word_representations, batch)
        representations = torch.cat(
            [start_word_representations, end_word_representations, entity_representations],
            dim=2,
        )
        # We don't want padding
        mask = batch.entities.attention_mask.bool()
        if only_positives:
            mask &= (batch.entities.labels != 0)
        batch_representations.append(representations[mask].contiguous().to(target_device))
        labels.append(batch.entities.labels[mask].contiguous().to(target_device))
        for i, text_num in enumerate(batch.text_nums):
            for j in range(batch.entities.N[i]):
                if mask[i, j]:
                    content.append(dict(
                        text_num=text_num,
                        span=batch.entities.fullword_spans[i][j],
                    ))
    return torch.cat(batch_representations).numpy(), torch.cat(labels).numpy(), content
Example #20
def main(path: str, model: str, n_components: int,
         reducer_subsample: Optional[int], tsne_perplexity: float,
         umap_neighbours: int, umap_min_dist: float, only_positives: bool,
         fine_tuned: bool):
    set_seeds()
    log.configure(os.path.join(path, "geometry-analysis.log"),
                  "daLUKE embedding geometry analysis",
                  print_level=Levels.DEBUG)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        representations, labels, content = collect_representations(
            model, device, torch.device("cpu"), only_positives, fine_tuned)
    log(f"Acquired representations of shape {representations.shape}")
    log("Performing principal component analysis")
    pca_transformed, principal_components = pca(representations, n_components)
    if reducer_subsample is not None:
        log.debug(
            f"Reducing dataset to {reducer_subsample} examples for UMAP and t-SNE"
        )
        representations = representations[:reducer_subsample]
    log("Running the UMAP algorithm")
    umap_transformed = umap(representations, umap_neighbours, umap_min_dist)
    log("Running the t-SNE algorithm")
    tsne_transformed = tsne(representations, tsne_perplexity)

    log(
        "Saved analysis results to",
        GeometryResults(
            pca_transformed=pca_transformed,
            umap_transformed=umap_transformed,
            tsne_transformed=tsne_transformed,
            labels=labels,
            principal_components=principal_components,
            content=content,
        ).save(path),
    )
Example #21
def cross_validate(model: NERDaLUKE, dataset: NERDataset, k: int,
                   train_args: dict[str, Any]) -> list[NER_Results]:
    cv_splits = random_divide(merge_data(list(dataset.data.values())), k)
    results = list()
    log(f"Split into {k} subdatasets with lengths {[len(c.texts) for c in cv_splits]}"
        )
    for i, test_data in enumerate(cv_splits):
        log.section(f"Cross-validation split {i}")
        train_data = merge_data([s for j, s in enumerate(cv_splits) if j != i])
        # Create split specific model and data
        split_model = deepcopy(model)
        split_dataset = deepcopy(dataset)
        split_dataset.data[Split.TRAIN] = train_data
        split_dataloader = split_dataset.build(Split.TRAIN,
                                               train_args["batch_size"])

        log("Training")
        split_dataset.document(split_dataloader, Split.TRAIN)
        type_distribution(split_dataset.data[Split.TRAIN].annotations)
        trainer = TrainNER(
            split_model,
            split_dataloader,
            split_dataset,
            device=next(split_model.parameters()).device,
            epochs=train_args["epochs"],
            lr=train_args["lr"],
            warmup_prop=train_args["warmup_prop"],
            weight_decay=train_args["weight_decay"],
            dev_dataloader=None,  # Don't eval
            loss_weight=train_args["loss_weight"])
        trainer.run()

        split_dataset.data[Split.TEST] = test_data
        split_test_dataloader = split_dataset.build(Split.TEST, EVAL_BATCH)

        log("Evaluation")
        split_dataset.document(split_test_dataloader, Split.TEST)
        type_distribution(split_dataset.data[Split.TEST].annotations)
        results.append(
            evaluate_ner(split_model,
                         split_test_dataloader,
                         split_dataset,
                         trainer.device,
                         Split.TEST,
                         also_no_misc=False))
    return results
Example #22
    def run(self) -> TrainResults:
        res = TrainResults(
            epoch                        = 0,
            losses                       = list(),
            best_epoch                   = None,
            running_train_statistics     = list(),
            running_dev_evaluations      = list(),
            dev_pred_distributions       = list(),
            dev_true_type_distribution   = dict(),
            train_pred_distributions     = list(),
            train_true_type_distribution = dict()
        )
        for i in range(self.epochs):
            res.epoch = i
            self.model.train()
            for j, batch in enumerate(self.dataloader):
                scores = self.model(batch)
                loss = self.criterion(scores.view(-1, self.model.output_shape), batch.entities.labels.view(-1))
                loss.backward()

                self.optimizer.step()
                self.scheduler.step()
                self.model.zero_grad()

                res.losses.append(loss.item())
                log.debug(f"Epoch {i} / {self.epochs-1}, batch: {j} / {len(self.dataloader)-1}. LR: {self.scheduler.get_last_lr()[0]:.2e} Loss: {loss.item():.5f}.")

            # Perform running evaluation
            if self.dev_dataloader is not None:
                log("Evaluating on development set ...")
                dev_results = evaluate_ner(self.model, self.dev_dataloader, self.dataset, self.device, Split.DEV, also_no_misc=False)
                res.running_dev_evaluations.append(dev_results)
                res.dev_pred_distributions.append(type_distribution(dev_results.preds))

                log("Evaluating on training set ...")
                train_results = evaluate_ner(self.model, self.dataloader, self.dataset, self.device, Split.TRAIN, also_no_misc=False)
                res.running_train_statistics.append(train_results.statistics)
                res.train_pred_distributions.append(type_distribution(train_results.preds))
                if res.best_epoch is None or\
                        (dev_results.statistics["micro avg"]["f1-score"]) > res.running_dev_evaluations[res.best_epoch].statistics["micro avg"]["f1-score"]:
                    log(f"Found new best model at epoch {i}")
                    self.best_model = deepcopy(self.model)
                    res.best_epoch = i
        return res
Example #23
def run_experiment(args: dict[str, Any]):
    set_seeds(seed=0)
    # Remove subfolder so we can control location directly
    NER_Results.subfolder = ""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entity_vocab, metadata, state_dict, token_map = load_from_archive(args["model"])
    state_dict, ent_embed_size = mutate_for_ner(state_dict, mask_id=entity_vocab["[MASK]"]["id"], pad_id=entity_vocab["[PAD]"]["id"])

    log("Setting up sampler")
    with open(args["params"], "r") as f:
        param_lists = json.load(f)
    sampler = SAMPLERS[args["sampler"]](param_lists)

    log(f"Loading dataset {args['dataset']} ...")
    dataset = load_dataset(args, metadata, device, token_map)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device, entity_embedding_size=ent_embed_size)

    optimize(model, dataset, args, sampler)
Example #24
def run_experiment(args: dict[str, Any]):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, metadata, state_dict, token_map = load_from_archive(args["model"])

    log("Loading dataset ...")
    dataset = load_dataset(args, metadata, device, token_map)
    dataloader = dataset.build(Split.TEST, FP_SIZE)

    log("Loading model ...")
    model = load_model(state_dict, dataset, metadata, device)

    # Print some important information to stdout
    log.debug(model)
    dataset.document(dataloader, Split.TEST)
    type_distribution(dataset.data[Split.TEST].annotations)

    log("Starting evaluation of daLUKE for NER")
    results = evaluate_ner(model, dataloader, dataset, device, Split.TEST)

    results.save(args["location"])
    type_distribution(results.preds)
Example #25
    def __init__(
        self,
        dump_db_file: str,          # Location of file built by build-dump-db
        tokenizer_name: str,        # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
        entity_vocab_file: str,     # Built by build-entity-vocab
        out_dir: str,               # Where to put finished dataset. All contents will be removed before saving dataset
        validation_prob: float,     # Chance of each finished document to be marked as part of validation set
        max_entities: int,          # Only up to this many entities are included in each sequence
        max_entity_span: int,       # Maximum number of tokens an entity can span before sequence is discarded
        min_sentence_length: int,   # Minimum number of tokens a sentence must span to be included
        max_articles: int | None,
        max_vocab_size: int,
    ):
        if not wikipedia2vec_available:
            raise ModuleNotFoundError(
                "Pretrain data generation requires installation of the optional requirement `wikipedia2vec`"
            )
        log("Reading dump database at %s" % dump_db_file)
        self.dump_db = DumpDB(dump_db_file)
        log("Building tokeninizer: %s" % tokenizer_name)
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        log("Building sentence tokenizer: %s" % self.tokenizer_language)
        self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)
        log("Loading entity vocab at %s" % entity_vocab_file)
        self.entity_vocab = load_entity_vocab(entity_vocab_file)
        # Make sure IDs on non-ignored entities are contiguous
        num = 0
        for entity_info in self.entity_vocab.values():
            entity_info["id"] = num
            num += 1
        log("Entity vocab has size %i" % num)

        self.out_dir = out_dir
        self.data_file = os.path.join(self.out_dir, self.data_file)
        self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
        self.max_seq_length = self.tokenizer.model_max_length
        self.validation_prob = validation_prob
        self.max_entities = max_entities
        self.max_entity_span = max_entity_span
        self.min_sentence_length = min_sentence_length
        # Get maximum number of tokens in a sequence excluding start and end tokens
        self.max_num_tokens = self.max_seq_length - 2
        self.max_articles = max_articles
        self.vocab_size = self.tokenizer.vocab_size if max_vocab_size == -1 else min(max_vocab_size, self.tokenizer.vocab_size)

        # Filter titles so only real articles are included
        self.target_titles = list(self.dump_db.titles())

        # Remove old datafile if it exists
        if os.path.isfile(self.data_file):
            log.debug("Removing old datafile '%s'" % self.data_file)
            os.remove(self.data_file)

        self.examples = list()
Example #26
class DatasetBuilder:

    tokenizer_language = "da"

    # Files saved by the build method
    metadata_file = "metadata.json"
    entity_vocab_file = "entity-vocab.json"
    data_file = "data.jsonl"
    token_map_file = "token-map.npy"

    def __init__(
        self,
        dump_db_file: str,          # Location of file built by build-dump-db
        tokenizer_name: str,        # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
        entity_vocab_file: str,     # Built by build-entity-vocab
        out_dir: str,               # Where to put finished dataset. All contents will be removed before saving dataset
        validation_prob: float,     # Chance of each finished document to be marked as part of validation set
        max_entities: int,          # Only up to this many entities are included in each sequence
        max_entity_span: int,       # Maximum number of tokens an entity can span before sequence is discarded
        min_sentence_length: int,   # Minimum number of tokens a sentence must span to be included
        max_articles: int | None,
        max_vocab_size: int,
    ):
        if not wikipedia2vec_available:
            raise ModuleNotFoundError(
                "Pretrain data generation requires installation of the optional requirement `wikipedia2vec`"
            )
        log("Reading dump database at %s" % dump_db_file)
        self.dump_db = DumpDB(dump_db_file)
        log("Building tokeninizer: %s" % tokenizer_name)
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        log("Building sentence tokenizer: %s" % self.tokenizer_language)
        self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)
        log("Loading entity vocab at %s" % entity_vocab_file)
        self.entity_vocab = load_entity_vocab(entity_vocab_file)
        # Make sure IDs on non-ignored entities are contiguous
        num = 0
        for entity_info in self.entity_vocab.values():
            entity_info["id"] = num
            num += 1
        log("Entity vocab has size %i" % num)

        self.out_dir = out_dir
        self.data_file = os.path.join(self.out_dir, self.data_file)
        self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
        self.max_seq_length = self.tokenizer.model_max_length
        self.validation_prob = validation_prob
        self.max_entities = max_entities
        self.max_entity_span = max_entity_span
        self.min_sentence_length = min_sentence_length
        # Get maximum number of tokens in a sequence excluding start and end tokens
        self.max_num_tokens = self.max_seq_length - 2
        self.max_articles = max_articles
        self.vocab_size = self.tokenizer.vocab_size if max_vocab_size == -1 else min(max_vocab_size, self.tokenizer.vocab_size)

        # Filter titles so only real articles are included
        self.target_titles = list(self.dump_db.titles())

        # Remove old datafile if it exists
        if os.path.isfile(self.data_file):
            log.debug("Removing old datafile '%s'" % self.data_file)
            os.remove(self.data_file)

        self.examples = list()

    def _tokenize(self, text: str, paragraph_text: str, idx: int) -> list[str]:
        if not text:
            return list()
        try:
            if isinstance(self.tokenizer, RobertaTokenizer):
                tokens = self.tokenizer.tokenize(
                    text,
                    add_prefix_space=idx == 0 or text.startswith(" ")
                    or paragraph_text[idx - 1] == " ",
                )
            else:
                tokens = self.tokenizer.tokenize(text)
        except KeyboardInterrupt:
            # Make sure program can be keyboard interrupted despite needing to catch BaseException
            raise
        except BaseException as e:
            # Catch an exception caused by rust panicking in the tokenizer
            log.warning(
                "Failed to tokenize text with exception '%s'\nText: '%s'" %
                (e, text))
            return list()

        return tokens

    def build(self):
        log("Saving tokenizer config and word token config to '%s'" % self.out_dir)
        with open(path := os.path.join(self.out_dir, self.entity_vocab_file), "w", encoding="utf-8") as ev:
            log("Saving entity vocab to '%s'" % path)
            ujson.dump(self.entity_vocab, ev, indent=2)

        log.section("Processing %i pages" %
                    len(self.target_titles[:self.max_articles]))
        n_seqs, n_ents, n_word_toks, n_words = 0, 0, 0, 0
        for title in log.tqdm(tqdm(self.target_titles[:self.max_articles])):
            log("Processing %s" % title)
            with TT.profile("Process page"):
                s, e, nt, nw = self._process_page(title)
                n_seqs += s
                n_ents += e
                n_word_toks += nt
                n_words += nw

        log("Shuffling data")
        random.shuffle(self.examples)
        n_vals = int(self.validation_prob * len(self.examples))
        for i in range(n_vals):
            self.examples[i]["is_validation"] = True

        # Save metadata
        metadata = {
            "number-of-items": n_seqs,
            "number-of-word-tokens": n_word_toks,
            "number-of-words": n_words,
            "number-of-entities": n_ents,
            "number-of-val-items": n_vals,
            "max-seq-length": self.max_seq_length,
            "max-entities": self.max_entities,
            "max-entity-span": self.max_entity_span,
            "min-sentence-length": self.min_sentence_length,
            "base-model": self.tokenizer_name,
            "tokenizer-class": self.tokenizer.__class__.__name__,
            "language": self.dump_db.language,
            "reduced-vocab": self.vocab_size < self.tokenizer.vocab_size,
            "vocab-size": self.vocab_size,
        }

        if self.vocab_size < self.tokenizer.vocab_size:
            log.section("Reducing token number")
            with TT.profile("Reduce token vocab"):
                token_map, metadata["vocab-size"] = self._reduce_tokens()
            with TT.profile("Rewrite dataset with new tokens"):
                self._update_tokens(token_map)

        with open(path := os.path.join(self.out_dir, self.metadata_file),
                  "w") as f:
            log.section("Saving metadata to %s" % path)
            ujson.dump(metadata, f, indent=4)
Example #27
    def _update_tokens(self, token_map: np.ndarray):
        log("Updating dataset with kept tokens")
        for example in tqdm(self.examples):
            example["word_ids"] = token_map[example["word_ids"]].tolist()
Example #28
def main():
    parser = ArgumentParser(description=\
        "Standalone convenience script used to collect the results from the pretraining of daLUKE "\
        "performed by the pretraining module")
    parser.add_argument("inpath", type=str,
        help= "Path to the output folder of the pretraining containing the model file. "\
            "Entity vocab. and metadata are assumed to be in parent folder of this."\
            "Can also be path to an exact model file, in which case this will be used instead of the newest."
    )
    parser.add_argument("outpath",
                        type=str,
                        help="File path to the compressed model")
    parser.add_argument("--tmpdir",
                        type=str,
                        help="Where to create temporary folder",
                        default="")
    args = parser.parse_args()
    log.configure(
        os.path.join(args.outpath if os.path.isdir(args.outpath) else os.path.dirname(args.outpath), "collect.log"),
        "Collector",
        print_level=Levels.DEBUG,
    )

    modelpath = args.inpath if os.path.isdir(args.inpath) else os.path.dirname(args.inpath)
    vocabfile, metafile = os.path.join(modelpath, "..", VOCAB_FILE), os.path.join(modelpath, "..", METADATA_FILE)
    modelfile = os.path.join(args.inpath, _get_newest_model(args.inpath)) if os.path.isdir(args.inpath) else args.inpath

    os.makedirs(os.path.split(args.outpath)[0], exist_ok=True)

    ins, outs = [vocabfile, metafile, modelfile], [VOCAB_FILE, METADATA_FILE, MODEL_OUT]
    # If reduction is used, also collect the token map
    with open(metafile, "r") as f:
        is_reduced = json.load(f).get("reduced-vocab")
    if is_reduced:
        ins.append(os.path.join(modelpath, "..",
                                DatasetBuilder.token_map_file))
        outs.append(TOKEN_MAP_FILE)
    tmpdir = os.path.join(args.tmpdir, "tmpdir")
    log.debug(f"Using:", *ins)

    # Operate directly on disk as opposed to serialize.save_to_archive which requires us to load the data into mem.
    if shutil.which("tar"):
        log.debug(f"Compressing to {args.outpath} using system tar tool...")
        try:
            os.makedirs(tmpdir, exist_ok=True)
            for f, n in zip(ins, outs):
                shutil.copy2(f, os.path.join(tmpdir, n))
            p = subprocess.Popen(
                ["tar", "-czvf", args.outpath, "-C", tmpdir] + outs,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            p.wait()
        finally:
            shutil.rmtree(tmpdir)
    else:
        with tarfile.open(args.outpath, "w:gz") as tar:
            for f, n in zip(ins, outs):
                log.debug(f"Compressing {f} as {n} using built-in tar module (may take a while)...")
                tar.add(f, arcname=n)
    log("Succesfully compressed file saved to", args.outpath)
Example #29
def preprocess(
    dump_db_file: str,
    function: str,
    entity_vocab_file: str | None,
    dagw_sections: str | None,
    min_entity_length: int,
    max_entity_length: int,
    max_articles: int | None,
):
    if not entity_vocab_file:
        raise RuntimeError("entity-vocab-file must be given")

    log.configure(
        os.path.join(os.path.split(dump_db_file)[0], "preprocessing.log"),
        "Preprocessing",
        log_commit=True,
    )

    log.section("Collecting data")
    log(
        "Wikidump path: %s" % dump_db_file,
        "Function:      %s" % function,
    )

    log("Loading entity vocab")
    entity_vocab = {
        _insert_xml_special_characters(e.lower())
        for e in load_entity_vocab(entity_vocab_file)
    }

    dagw_files = list()
    if dagw_sections:
        n_words = 0
        log("Finding gigaword data files and counting words")
        dagw_files = list(_get_dagw_files(dagw_sections))
        for dagw_file in tqdm(dagw_files):
            with open(dagw_file) as f:
                n_words += len(f.read().split())
        log("Found %i dagw files containing %i words" %
            (len(dagw_files), n_words))

    # tempdir is not used, as the temporary files can take up more space than what temporary
    # directories usually allow
    tmpdir = os.path.join(os.path.split(dump_db_file)[0], "tmpdir")
    os.makedirs(tmpdir, exist_ok=True)
    log("Saving all articles to temporary directory %s" % tmpdir)
    for dagw_file in tqdm(dagw_files):
        shutil.copy2(
            dagw_file,
            os.path.join(tmpdir, fix_filename(os.path.split(dagw_file)[-1])))
    log("Saving Wikipedia files to temporary directory")
    for is_text, text, title in tqdm(_get_lineblocks(dump_db_file),
                                     unit=" blocks"):
        if is_text and not ignore_title(title):
            text_start = text.index(">") + 1
            text_end = -len("</text>\n")
            with open(os.path.join(tmpdir, fix_filename(title)[:100] + ".wiki"), "w") as f:
                f.write(text[text_start:text_end])

    files = [
        os.path.join(tmpdir, x) for x in os.listdir(tmpdir)[:max_articles]
    ]
    log("Saved a total of %i articles to %s" % (len(files), tmpdir))

    log.section("Beginning preprocessing on %i threads" % os.cpu_count())
    process_map(
        func,
        [(function, f, entity_vocab, min_entity_length, max_entity_length)
         for f in files],
        max_workers=os.cpu_count(),
        chunksize=1024,
    )

    dump_file = os.path.splitext(dump_db_file)[0] + ".%s.bz2" % function
    log.info("Saving preprocessed files to %s" % dump_file)
    with bz2.BZ2File(dump_file, "w") as dump:
        with bz2.BZ2File(dump_db_file) as old_dump:
            line = b""
            while not line.strip().startswith(b"<page>"):
                dump.write(line)
                line = old_dump.readline()
        for i, fname in tqdm(enumerate(files), total=len(files)):
            with open(fname) as f:
                text = f.read()
            s = """
            <page>
                <title>{title}</title>
                <id>{id}</id>
                <revision>
                    <text bytes="{bytes}" xml:space="preserve">{text}</text>
                </revision>
            </page>""".format(
                title=fname,
                id=i + 1,
                bytes=len(text),
                text=text,
            )
            if i == 0:
                s = s[1:]
            dump.write(s.encode("utf-8"))
        dump.write(b"\n</mediawiki>")

    log.info("Removing temporary files")
    shutil.rmtree(tmpdir)
    log.info("Done preprocessing data")
Example #30
def setup_datasets(names_to_setup: list[str], wikiann_path: str="wikiann", plank_path: str="plank", split="test") -> list[TestDataset]:
    datasets = []
    for name in names_to_setup:
        try:
            datasets.append(
                next(d for d in ALL_DATASETS if d.name == name)
            )
        except StopIteration as ie:
            raise ValueError(f"Dataset with given name {name} not found, see --help for options") from ie
    for d in datasets:
        log(f"Setting up dataset \"{d.name}\" ...")
        kwargs = dict()
        if isinstance(d, Wikiann):
            kwargs["data_path"] = wikiann_path
        elif isinstance(d, Plank):
            kwargs["data_path"] = plank_path
        d.setup(**kwargs, split=split)
    return datasets

if __name__ == '__main__':
    """ Shows some Data stats """
    localdata = "../../local_data"

    localdata = os.path.join(sys.path[0], localdata)
    wikiann_p, plank_p = os.path.join(localdata, "wikiann"), os.path.join(localdata, "plank")
    log.configure(os.path.join(localdata, "data.log"), "data")

    for split in ("train", "dev", "test"):
        ds = setup_datasets(("DaNE", "Plank", "WikiANN"), wikiann_path=wikiann_p, plank_path=plank_p, split=split)
        for d in ds:
            log(f"{d.name} {split} sentences:", len(d.get_data()[0]))
    # now for better test statistics
    for d in ds:
        for ann in ("ORG", "PER", "LOC", "MISC"):
            log(f"#{ann} in {d.name}", sum(len([w for w in s if ann in w]) for s in d.get_data()[1]))