Example #1
def train_model(
    model,
    train_path,
    eval_path,
    n_iter=10,
    output=None,
    tok2vec=None,
):
    """
    Train a model from Prodigy annotations and optionally save out the best
    model to disk.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(
        component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
Example #2
def generate_meta(model_path, existing_meta, msg):
    meta = existing_meta or {}
    settings = [
        ("lang", "Model language", meta.get("lang", "en")),
        ("name", "Model name", meta.get("name", "model")),
        ("version", "Model version", meta.get("version", "0.0.0")),
        ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
        ("description", "Model description", meta.get("description", False)),
        ("author", "Author", meta.get("author", False)),
        ("email", "Author email", meta.get("email", False)),
        ("url", "Author website", meta.get("url", False)),
        ("license", "License", meta.get("license", "CC BY-SA 3.0")),
    ]
    nlp = util.load_model_from_path(Path(model_path))
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
        "vectors": len(nlp.vocab.vectors),
        "keys": nlp.vocab.vectors.n_keys,
        "name": nlp.vocab.vectors.name,
    }
    msg.divider("Generating meta.json")
    msg.text(
        "Enter the package settings for your model. The following information "
        "will be read from your model data: pipeline, vectors."
    )
    for setting, desc, default in settings:
        response = get_raw_input(desc, default)
        meta[setting] = default if response == "" and default else response
    if about.__title__ != "spacy":
        meta["parent_package"] = about.__title__
    return meta
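A short usage sketch, assuming a trained model directory at a hypothetical path. generate_meta prompts on the command line via get_raw_input, and the returned dict can be written back with srsly.

from pathlib import Path
import srsly
from wasabi import msg

model_dir = Path("./model-best")          # hypothetical model directory
meta = generate_meta(model_dir, {}, msg)  # interactive prompts for lang, name, version, ...
srsly.write_json(model_dir / "meta.json", meta)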
Example #3
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc_bin.add(doc)
    msg.good(f"Processed {len(doc_bin)} docs")
    doc_bin_bytes = doc_bin.to_bytes()
    output_file = output_path / f"{input_path.stem}.spacy"
    with output_file.open("wb") as f:
        f.write(doc_bin_bytes)
    msg.good(f"Saved parsed docs to file", output_file.resolve())
Example #4
 def eval_dataset(set_id):
     DB = connect()
     data = DB.get_dataset(set_id)
     accepted = [
         eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
     ]
     rejected = [eg for eg in data if eg["answer"] == "reject"]
     ignored = [eg for eg in data if eg["answer"] == "ignore"]
     if not accepted and not rejected:
         msg.warn("No annotations collected", exits=1)
     counts = Counter()
     for eg in accepted:
         for model_id in eg["accept"]:
             counts[model_id] += 1
     preference, _ = counts.most_common(1)[0]
     ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
     msg.info(f"Evaluating data from '{set_id}'")
     msg.text(
         f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
     if counts["A"] == counts["B"]:
         msg.warn(f"No preference ({ratio})")
     else:
         pc = counts[preference] / sum(counts.values())
         msg.good(
             f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
         msg.text(mapping[preference])
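For reference, the preference lookup above relies on Counter.most_common returning (key, count) pairs; a tiny sketch with made-up vote counts:

from collections import Counter

counts = Counter({"A": 12, "B": 8})                    # hypothetical counts
preference, n_pref = counts.most_common(1)[0]          # ("A", 12)
ratio = f"{n_pref} / {sum(counts.values()) - n_pref}"  # "12 / 8"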
Example #5
File: nbmodel.py (project: zxlzr/spikex)
 def train(self, corpus: List[Fragment], verbose: bool = None):
     if not corpus:
         raise ValueError
     msg.no_print = not verbose
     with msg.loading("setting things up..."):
         self._setup_training(corpus)
     msg.text("train Naive Bayes model")
     feats = defaultdict(Counter)  # per-label feature counts (assumes: from collections import defaultdict, Counter)
     totals = Counter()            # per-label totals of feature occurrences
     for frag in corpus:
         for feat, val in frag.features.items():
             feats[frag.label][feat + "_" + val] += 1
         totals[frag.label] += len(frag.features)
     # add-1 smoothing and normalization
     with msg.loading("smoothing... "):
         smooth_inc = 0.1
         all_feat_names = set(feats[True].keys()).union(
             set(feats[False].keys()))
         for label in [0, 1]:
             totals[label] += len(all_feat_names) * smooth_inc
             for feat in all_feat_names:
                 feats[label][feat] += smooth_inc
                 feats[label][feat] /= totals[label]
                 self.feats[(label, feat)] = feats[label][feat]
              feats[label][self._PRIOR_FEAT] = (
                  totals[label] / sum(totals.values()))
             self.feats[(label,
                         self._PRIOR_FEAT)] = feats[label][self._PRIOR_FEAT]
     msg.good("done")
Example #6
def main(name: ("模型名称", "positional", None, None, trf_list),
         make_cache_dir: (" 创建缓存文件夹", "flag", "mk"),
         use_local_class: ("不使用网络读取", "flag", "local")):
    if make_cache_dir:
        c_path = ensure_path(f"{cache_path + name}")
        if c_path.exists():
            msg.warn(f"{cache_path + name} already exists")
        else:
            c_path.mkdir()
            msg.good(f" 缓存文件夹已创建:\t{cache_path}{name}")

    msg.warn("\n================url================\n")

    config_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[name]

    model_file = ALL_PRETRAINED_MODEL_ARCHIVE_MAP[name]
    msg.text(f"{config_file}\n{model_file}\n")

    vocab = get_tokenizer(name, use_local_class)
    pretrained_vocab_files_map = vocab.pretrained_vocab_files_map
    for vocab_file in pretrained_vocab_files_map.values():
        msg.text(f"{vocab_file[name]}\n")

    msg.warn("\n================url================\n")
    msg.good("\n使用下载工具下载后,将模型文件放入缓存文件夹中。")
Example #7
def get_model_row(compat, name, data, msg, model_type="package"):
    if data["compat"]:
        comp = msg.text("", color="green", icon="good", no_print=True)
        version = msg.text(data["version"], color="green", no_print=True)
    else:
        version = msg.text(data["version"], color="red", no_print=True)
        comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
    return (model_type, name, data["name"], version, comp)
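The tuples returned here are rows for wasabi's table printer; a sketch with an illustrative package dict (names and versions are made up):

from wasabi import msg

compat = {}                                   # compatibility table keyed by package name
pkgs = {
    "en_core_web_sm": {"name": "en_core_web_sm", "version": "2.3.1", "compat": True},
}
header = ("TYPE", "NAME", "PACKAGE", "VERSION", "")
rows = [get_model_row(compat, name, data, msg) for name, data in pkgs.items()]
msg.table(rows, header=header, divider=True)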
Example #8
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm",
                                      help="Name of spaCy model to use"),
    n_process: int = typer.Option(
        1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6,
                                 "--max-docs",
                                 "-m",
                                 help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(
                    attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                # Start the new batch with the doc that triggered the flush so
                # it isn't silently dropped.
                doc_bin.add(doc)
                count = 1
        batch_num += 1
        output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
        with output_file.open("wb") as f:
            doc_bin_bytes = doc_bin.to_bytes()
            f.write(doc_bin_bytes)
            msg.good(f"Complete. Saved final parsed docs to file",
                     output_file.resolve())
Example #9
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if meta_path.is_file():
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good("Loaded meta.json from file", meta_path)
        else:
            meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "textClassifications.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
Example #10
File: __init__.py (project: uliang/Pig)
def create_player(i, decision_function=None):

    if decision_function:
        p_name = "PigMachine"
    else:
        msg.text(f"Player {i} name (Press <Enter> to accept default): ")
        p_name = input()
        p_name = p_name if p_name else f"P{i}"

    return Player(p_name, decision_function)
Example #11
 def main(self, args: BaseArgumentParser) -> int:
     list_devices_response = self.get_client().list_devices()
     msg.divider("Registered Devices")
     for device in list_devices_response.devices:
         if device.is_available:
             msg.good(f"{device.name}")
         else:
             msg.fail(f"{device.name}:")
             msg.text(
                 f"  {color(device.error_type, bold=True)}: {device.error_message}"
             )
     return 0
Example #12
def train_model(model,
                train_path,
                eval_path,
                n_iter=10,
                output="./model2/",
                tok2vec=None):
    spacy.util.fix_random_seed(0)

    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat")

        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")
    for label in labels:
        textcat.add_label(label)
    optimizer = nlp.begin_training(component_cfg={"exclusive_classes": True})
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8)
    msg.row(("#", "L", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            scorer = nlp.evaluate(eval_data)
            if scorer.textcat_score > best_acc:
                best_acc = scorer.textcat_score
                if output:
                    best_model = nlp.to_bytes()
        acc = f"{scorer.textcat_score:.3f}"
        msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
Example #13
def _parse_wiki_sql_dump(wiki_sql_dump_url, parse_fx, **kwargs):
    _kwargs = {**config, **kwargs}
    dumps_path = _kwargs["dumps_path"]
    max_workers = _kwargs["max_workers"]
    verbose = _kwargs["verbose"]
    compress_bytes_read = 0
    dump_name = wiki_sql_dump_url.name
    msg.text(f"-> {dump_name}", show=verbose)
    tqdm_disable = not verbose
    tqdm_kwargs = {
        "unit": "B",
        "unit_scale": True,
        "unit_divisor": 1024,
        "disable": tqdm_disable,
    }
    compress_obj, content_len = _get_wiki_dump_obj(wiki_sql_dump_url, verbose)
    should_reopen_compress_obj = False
    if dumps_path is not None:
        if not dumps_path.exists():
            dumps_path.mkdir()
        dump_filepath = dumps_path.joinpath(dump_name)
        if not dump_filepath.exists() or dump_filepath.stat().st_size == 0:
            with tqdm(
                desc="download to disk",
                total=content_len,
                **tqdm_kwargs,
            ) as pbar, dump_filepath.open("wb") as fd:
                bytes_read = 0
                for chunk in compress_obj:
                    fd.write(chunk)
                    compress_bytes = compress_obj.tell()
                    pbar.update(compress_bytes - bytes_read)
                    bytes_read = compress_bytes
            compress_obj.close()
            should_reopen_compress_obj = True
            wiki_sql_dump_url = dump_filepath
    if should_reopen_compress_obj:
        compress_obj, content_len = _get_wiki_dump_obj(wiki_sql_dump_url)
    with tqdm(
        desc="parse",
        total=content_len,
        **tqdm_kwargs,
    ) as pbar, compression_wrapper(compress_obj, "rb") as decompress_obj:
        compress_bytes_read = 0
        with closing(Pool(max_workers)) as pool:
            task = partial(_parsing_task, parse_fx=parse_fx)
            for res in pool.imap_unordered(task, decompress_obj, chunksize=10):
                compress_bytes = compress_obj.tell()
                pbar.update(compress_bytes - compress_bytes_read)
                compress_bytes_read = compress_bytes
                yield from pickle_loads(zlib.decompress(res))
    msg.good(dump_name, show=verbose)
Example #14
def package_wikigraph(input_path: Path, output_path: Path, force: bool = None):
    """
    Generate an installable Python package for a `WikiGraph`.

    After packaging, "python setup.py sdist" must be run in the package directory,
    which will create a .tar.gz archive that can be installed via "pip install".

    Parameters
    ----------
    input_path : Path
        [description]
    output_path : Path
        [description]
    force : bool, optional
        [description], by default None
    """
    if not input_path or not input_path.exists():
        msg.fail("Can't locate graph data", input_path, exits=1)
    if not output_path:
        msg.fail("Output directory is missing", output_path, exits=1)
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))
    meta_path = input_path / "meta.json"
    if not meta_path.exists():
        msg.fail("Can't find graph meta.json", meta_path, exits=1)
    meta = json_loads(meta_path.read_text())
    graph_fullname = meta["fullname"]
    package_path = output_path / graph_fullname
    if package_path.exists():
        if not force:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
        shutil.rmtree(package_path)
    package_path.mkdir()
    shutil.copy(meta_path, package_path)
    copy_tree(str(pkg_path), str(package_path))
    graph_name = meta["name"]
    rename(package_path / "graph-name", package_path / graph_name)
    module_path = package_path / graph_name
    copy_tree(str(input_path), str(module_path / graph_fullname))
    msg.good("Successfully created package {}".format(graph_name),
             package_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
Example #15
def run_test(command, directory):
    """Execute a command that runs a test"""
    msg.text("RUNNING  " + command)
    wrapped_command = f"cd {directory} && {command}"
    pipe = subprocess.Popen(
        wrapped_command, shell=True,
    )
    pipe.wait()
    if pipe.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    msg.text('')
    return pipe.returncode
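A sketch of driving this helper over several commands and folding the results into a single process exit code (commands and directory are hypothetical):

import sys

commands = ["pytest -q", "flake8 ."]   # hypothetical test commands
failures = sum(run_test(cmd, "tests/") != 0 for cmd in commands)
sys.exit(1 if failures else 0)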
Example #16
def _get_wiki_dump_obj(wiki_sql_dump_url, verbose=None):
    if isinstance(wiki_sql_dump_url, Path):
        if not wiki_sql_dump_url.exists():
            raise FileNotFoundError
        compress_obj = wiki_sql_dump_url.open("rb")
        content_len = wiki_sql_dump_url.stat().st_size
        if content_len == 0:
            raise FileNotFoundError
    elif isinstance(wiki_sql_dump_url, URL):
        compress_obj = http_open(str(wiki_sql_dump_url), mode="rb")
        content_len = int(compress_obj.response.headers.get("content-length"))
    else:
        raise ValueError
    msg.text(f"from: {wiki_sql_dump_url}", show=verbose)
    return compress_obj, content_len
Example #17
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as f_in:
        texts = [line.rstrip() for line in f_in]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # print(doc)
            spans = get_phrases(doc, wn_lemmas)
            spans = filter_spans(spans)
            # print('NOUN SPAN', str(spans))
            doc = merge_phrases(doc, spans)
            spans = get_adjective_phrases(doc)
            spans = filter_spans(spans)
            # print('ADJ SPAN', str(spans))
            # print('*-----------------------------------------*')
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #18
    def main(self, args: BaseArgumentParser) -> int:
        response: ExperimentStatusResponse = self.get_client(
        ).experiment_status()
        sequence_status = response.sequence_status

        if sequence_status is None:
            print("There are no experiments running.")
            return 0

        for experiment in sequence_status.experiments:
            if experiment.state == ExperimentState.FINISHED:
                msg.good(experiment.name)
            elif experiment.state == ExperimentState.RUNNING:
                progress = round(experiment.progress * 100)
                msg.text(f"\u25b6 {experiment.name} ({progress}%)")
            elif experiment.state == ExperimentState.NOT_STARTED:
                msg.text(f"  {experiment.name}", color="grey")

        return 0
Example #19
File: server.py (project: motte/labby)
    def main(self, args: ServerArgumentParser) -> int:
        if args.command == "start":
            server = Server(self.config)
            server.start()
            return 0

        if args.command == "status":
            response = self.get_client().hello()
            if response == "Hello world":
                msg.good("Active")
                return 0
            msg.fail("Invalid response")
            msg.text(
                "Server replied with an invalid response. This is probably a bug."
            )
            return 1

        if args.command == "stop":
            self.get_client().halt()
            return 0

        raise Exception(f"Unknown server command {args.command}")
Example #20
File: core.py (project: motte/labby)
    def run(cls, trigger: str, argv: Sequence[str]) -> int:
        try:
            command_klass = ALL_COMMANDS[trigger]
            # pyre-ignore[16]: command_klass has no __orig_bases__ attribute
            args_klass = get_args(command_klass.__orig_bases__[0])[0]
            args = args_klass(prog=f"labby {trigger}").parse_args(argv)

            auto_discover_drivers()
            with open(args.config, "r") as config_file:
                config = Config(config_file.read())

            # pyre-ignore[45]: cannot instantiate Command with abstract method
            command = command_klass(config)
            return command.main(args)
        except pynng.exceptions.Timeout:
            # this had to be an inline import so the tests would use the
            # WASABI_LOG_FRIENDLY env variable correctly ¯\_(ツ)_/¯
            from wasabi import msg

            msg.fail("Timeout")
            msg.text("The labby server did not respond. Are you sure it is started?")
            return 1
Example #21
# Used as a context manager (note the bare yield below), so it needs the decorator.
@contextlib.contextmanager
def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: Optional[str] = None,
    desc: str = "",
    show_config: Optional[bool] = None,
    hint_fill: bool = True,
):
    """Helper to show custom config validation errors on the CLI.

    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Override title of custom formatted error.
    desc (str): Override description of custom formatted error.
    show_config (bool): Whether to output the config the error refers to.
    hint_fill (bool): Show hint about filling config.
    """
    try:
        yield
    except ConfigValidationError as e:
        title = title if title is not None else e.title
        if e.desc:
            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
        # Re-generate a new error object with overrides
        err = e.from_error(e, title="", desc=desc, show_config=show_config)
        msg.fail(title)
        print(err.text.strip())
        if hint_fill and "value_error.missing" in err.error_types:
            config_path = (file_path if file_path is not None
                           and str(file_path) != "-" else "config.cfg")
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "fill-config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
        sys.exit(1)
    except InterpolationError as e:
        msg.fail("Config validation error", e, exits=1)
Example #22
    def main(self, args: DeviceInfoArguments) -> int:
        device_info = self.get_client().device_info(args.device_name)
        if device_info.device_type is None:
            msg.fail(
                f"Unknown device {args.device_name}",
                text="See `labby devices` for a list of available devices.",
            )
            return 1

        msg.divider(
            f"{args.device_name} ({device_info.device_type.friendly_name})")

        if device_info.is_connected:
            msg.table([
                ("Connection", render.good("OK")),
                *self._render_device_info(device_info),
            ])
        else:
            msg.table([("Connection", render.fail("Error"))])
            msg.text(f"{color(device_info.error_type, bold=True)}: " +
                     f"{device_info.error_message}")

        return 0
Example #23
File: recipes.py (project: kabirkhan/dstl)
def ner_translate(
    in_sets: List[str],
    out_set: str,
    model_name_or_path: str,
    source_lang: str,
    target_lang: str,
    dry: bool = False,
) -> None:
    translator = TransformersMarianTranslator(
        model_name_or_path, source_lang=source_lang, target_lang=target_lang
    )

    DB = connect()
    for set_id in in_sets:
        if set_id not in DB:
            msg.fail(f"Can't find dataset '{set_id}' in database", exits=1)
    if out_set in DB and len(DB.get_dataset(out_set)):
        msg.fail(
            f"Output dataset '{out_set}' already exists and includes examples",
            f"This can lead to unexpected results. Please use a new dataset.",
            exits=1,
        )
    if out_set not in DB:
        if not dry:
            DB.add_dataset(out_set)
        msg.good(f"Created dataset '{out_set}'")

    matched_examples_t = []
    mismatched_examples_t = []

    for set_id in in_sets:
        msg.text(f"RECIPE: Translating and merging examples from '{set_id}'")
        raw_examples = DB.get_dataset(set_id)
        examples = [Example(**e) for e in raw_examples]
        examples_t = translate_ner_batch(
            examples, translate_f=translator.pipe, target_lang=target_lang
        )
        for e, e_t in zip(examples, examples_t):
            if len(e.spans) != len(e_t.spans):
                mismatched_examples_t.append(e_t)
            else:
                matched_examples_t.append(e_t)

        msg.text(f"RECIPE: Translated {len(matched_examples_t)} examples from '{set_id}'")
        msg.text(
            f"RECIPE: Found {len(mismatched_examples_t)} examples with mismatched spans after translation from '{set_id}'"
        )

    matched_examples_t = set_hashes(matched_examples_t)

    if not dry:
        DB.add_examples(matched_examples_t, datasets=[out_set])
    msg.good(
        f"Translated and merged {len(matched_examples_t)} examples from {len(in_sets)} datasets",
        f"Created translated and merged dataset '{out_set}'",
    )
Example #24
 def eval_dataset(set_id):
     DB = connect()
     data = DB.get_dataset(set_id)
     accepted = [
         eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
     ]
     rejected = [eg for eg in data if eg["answer"] == "reject"]
     ignored = [eg for eg in data if eg["answer"] == "ignore"]
     if not accepted and not rejected:
         msg.warn("No annotations collected", exits=1)
     total_count = 0
     agree_count = 0
     for eg in accepted:
         total_count += len(eg.get("options", []))
         agree_count += len(eg.get("accept", []))
     msg.info(f"Evaluating data from '{set_id}'")
     msg.text(
         f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
     pc = agree_count / total_count
     text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
     if pc > 0.5:
         msg.good(text)
     else:
         msg.fail(text)
Example #25
 def eval_dataset(set_id):
     """Output summary about user agreement with the model."""
     DB = connect()
     data = DB.get_dataset(set_id)
     accepted = [
         eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
     ]
     rejected = [eg for eg in data if eg["answer"] == "reject"]
     if not accepted and not rejected:
         msg.warn("No annotations collected", exits=1)
     high_conf = 0.8
     agree_count = 0
     disagree_high_conf = len(
         [e for e in rejected if e["confidence"] > high_conf])
     for eg in accepted:
         choice = eg["accept"][0]
         score_choice = [
             o["score"] for o in eg["options"] if o["id"] == choice
         ][0]
         score_other = [
             o["score"] for o in eg["options"] if o["id"] != choice
         ][0]
         if score_choice > score_other:
             agree_count += 1
         elif eg["confidence"] > high_conf:
             disagree_high_conf += 1
     pc = agree_count / (len(accepted) + len(rejected))
     text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})"
     msg.info(f"Evaluating data from '{set_id}'")
     if pc > 0.5:
         msg.good(text)
     else:
         msg.fail(text)
     msg.text(
         f"You disagreed on {disagree_high_conf} high confidence scores")
     msg.text(f"You rejected {len(rejected)} suggestions as not similar")
Example #26
def validate() -> None:
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
    incompat_models = {
        d["name"]
        for _, d in model_pkgs.items() if not d["compat"]
    }
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if model_pkgs:
        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            if data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(data["version"],
                                   color="green",
                                   no_print=True)
            else:
                version = msg.text(data["version"],
                                   color="yellow",
                                   no_print=True)
                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)
    else:
        msg.text("No pipeline packages found in your current environment.",
                 exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
Example #27
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text(
                    "Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
                msg.text(
                    "Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline])
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec,
                                              base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err,
                                 "Original error message: {}".format(e),
                                 exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(
                                    epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg[
                                            "beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    ))
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs,
                                                             verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[
                                    metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[
                                    metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(iter_current -
                                                 iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (final_meta["speed"]["cpu"] is None
                    and final_meta["speed"]["gpu"] is None):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(
                    meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path,
                                                  best_pipes)
        msg.good("Created best model", best_model_path)
Example #28
File: run.py  Project: PhillCli/wheelwright
def build(
    # fmt: off
    repo: str,
    commit: str,
    package_name: str = Option(None,
                               help="Package name (if different from repo)"),
    py35: bool = Option(False, "--py35", help="Build wheels for Python 3.5"),
    llvm: bool = Option(False, "--llvm", help="Requires LLVM to be installed"),
    rust: bool = Option(False, "--rust", help="Requires Rust to be installed"),
    universal: bool = Option(
        False,
        "--universal",
        help="Build universal (pure Python) wheel and sdist"),
    skip_tests: bool = Option(
        False,
        "--skip-tests",
        help="Don't run tests (e.g. if package doesn't have any)"),
    build_constraints: bool = Option(
        False,
        "--build-constraints",
        help="Use build constraints for build requirements"),
    # fmt: on
):
    """Build wheels for a given repo and commit / tag."""
    print(LOGO)
    repo_id = get_repo_id()
    user, package = repo.lower().split("/", 1)
    if package_name is None:
        package_name = package.replace("-", "_")
    msg.info(f"Building in repo {repo_id}")
    msg.info(f"Building wheels for {user}/{package}\n")
    if universal:
        msg.warn(
            "Building only universal sdist and wheel, no cross-platform wheels"
        )
    if skip_tests:
        msg.warn("Not running any tests")
    clone_url = DEFAULT_CLONE_TEMPLATE.format(f"{user}/{package}")
    repo = get_gh().get_repo(repo_id)
    with msg.loading("Finding a unique name for this release..."):
        # Pick the release_name by finding an unused one
        i = 1
        while True:
            release_name = f"{package_name}-{commit}"
            if i > 1:
                release_name += f"-{i}"
            try:
                repo.get_release(release_name)
            except github.UnknownObjectException:
                break
            i += 1
    branch_name = f"branch-for-{release_name}"
    bs = {
        "clone-url": clone_url,
        "package-name": package_name,
        "commit": commit,
        "options": {
            "llvm": llvm,
            "rust": rust,
            "py35": py35,
            "universal": universal,
            "skip_tests": skip_tests,
            "build_constraints": build_constraints,
        },
        "upload-to": {
            "type": "github-release",
            "repo-id": repo_id,
            "release-id": release_name,
        },
    }
    bs_json = json.dumps(bs)
    bs_json_formatted = json.dumps(bs, indent=4)
    msg.text(f"Creating release {release_name} to collect assets")
    release_text = f"https://github.com/{user}/{package}\n\n### Build spec\n\n```json\n{bs_json_formatted}\n```"
    release = repo.create_git_release(release_name, release_name, release_text)
    with msg.loading("Creating build branch..."):
        # 'master' is a 'Commit'. 'master.commit' is a 'GitCommit'. These are
        # different types that are mostly *not* interchangeable:
        #   https://pygithub.readthedocs.io/en/latest/github_objects/Commit.html
        #   https://pygithub.readthedocs.io/en/latest/github_objects/GitCommit.html
        master = repo.get_commit("master")
        master_gitcommit = master.commit
        patch = github.InputGitTreeElement(
            "build-spec.json",
            "100644",
            "blob",
            content=bs_json,
        )
        tree = repo.create_git_tree([patch], master_gitcommit.tree)
        our_gitcommit = repo.create_git_commit(f"Building: {release_name}",
                                               tree, [master_gitcommit])
        repo.create_git_ref(f"refs/heads/{branch_name}", our_gitcommit.sha)
    msg.good(f"Commit is {our_gitcommit.sha[:8]} in branch {branch_name}")
    msg.text(f"Release: {release.html_url}")
    msg.text(
        f"Checks:  https://github.com/{repo_id}/commit/{our_gitcommit.sha}/checks"
    )
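
The release-name loop in build() keeps probing GitHub until repo.get_release() raises UnknownObjectException for a name that is still free. The same logic can be exercised offline with a small stand-in repository; FakeRepo and pick_release_name below are purely illustrative and not part of wheelwright:

class UnknownObjectException(Exception):
    """Stand-in for github.UnknownObjectException."""

class FakeRepo:
    def __init__(self, existing_releases):
        self.existing_releases = set(existing_releases)

    def get_release(self, name):
        # Mimic PyGithub: raise if the release does not exist.
        if name not in self.existing_releases:
            raise UnknownObjectException(name)
        return name

def pick_release_name(repo, package_name, commit):
    # Mirror of the loop in build(): append "-2", "-3", ... until unused.
    i = 1
    while True:
        release_name = f"{package_name}-{commit}"
        if i > 1:
            release_name += f"-{i}"
        try:
            repo.get_release(release_name)
        except UnknownObjectException:
            return release_name
        i += 1

repo = FakeRepo({"spacy-abc123", "spacy-abc123-2"})
print(pick_release_name(repo, "spacy", "abc123"))  # -> spacy-abc123-3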
Example #29
File: debug_data.py  Project: DuyguA/spaCy
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    msg = Printer(no_print=silent,
                  pretty=not no_format,
                  ignore_warnings=ignore_warnings)
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [
        p for p in sourced_components if p not in frozen_components
    ]
    pipeline = nlp.pipe_names
    factory_names = [
        nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names
    ]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset,
                                    factory_names,
                                    nlp,
                                    make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(train_dataset,
                                                   factory_names,
                                                   nlp,
                                                   make_proj=False)
    gold_dev_data = _compile_gold(dev_dataset,
                                  factory_names,
                                  nlp,
                                  make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
    frozen_components = T["frozen_components"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(
            f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)")
        n_missing_vectors = sum(
            gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            )
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_train_data["ner"]
                     if label not in ("O", "-", None))
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        labels_with_counts = [(label, count)
                              for label, count in label_counts.most_common()
                              if label != "-"]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}",
                 show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if gold_train_data["ws_ents"]:
            msg.fail(
                f"{gold_train_data['ws_ents']} invalid whitespace entity spans"
            )
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label)
                if neg_docs == 0:
                    msg.warn(
                        f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good(
                "No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text("Entity spans consisting of or starting/ending "
                     "with whitespace characters are considered invalid.")

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}.")
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task.")
        if (gold_train_data["n_cats_bad_values"] > 0
                or gold_dev_data["n_cats_bad_values"] > 0):
            msg.fail("Unsupported values for cats: the supported values are "
                     "1.0/True and 0.0/False.")
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'.")
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'.")

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}.")
        if (gold_train_data["n_cats_bad_values"] > 0
                or gold_dev_data["n_cats_bad_values"] > 0):
            msg.fail("Unsupported values for cats: the supported values are "
                     "1.0/True and 0.0/False.")
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes.")
        else:
            msg.warn("The train data contains only instances with "
                     "mutually-exclusive classes. You can potentially use the "
                     "component 'textcat' instead of 'textcat_multilabel'.")
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes.")

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(
            gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences.")

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD:
                msg.warn(f"Low number of examples for label '{label}' "
                         f"({gold_train_unpreprocessed_data['deps'][label]})")
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                    and DELIMITER in label):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}")

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance.")
            msg.warn(
                f"Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available.")

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(f"Found {gold_train_data['n_nonproj']} nonprojective "
                     f"projectivized train sentence(s)")
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed"
        )
    if warn_counts:
        msg.warn(
            f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
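
For reference, a direct call to debug_data could look like the sketch below; the config path and the override keys are illustrative placeholders, not values taken from this example:

from pathlib import Path

# Run the data checks with console output enabled; debug_data calls
# sys.exit(1) itself if any check fails.
debug_data(
    Path("config.cfg"),
    config_overrides={"paths.train": "corpus/train.spacy",
                      "paths.dev": "corpus/dev.spacy"},
    verbose=True,
    no_format=False,
    silent=False,
)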
Example #30
def fix_annotations(
    example: Example,
    corrections: List[Correction],
    case_sensitive: bool = False,
    dryrun: bool = False,
) -> Example:
    """Fix annotations in a copy of List[Example] data.
    
    This function will NOT add annotations to your data.
    It will only remove erroneous annotations and fix the
    labels for specific spans.
    
    Args:
        example (Example): Input Example
        corrections (Dict[str, str]): Dictionary of corrections mapping entity text to a new label.
            If the value is set to None, the annotation will be removed
        case_sensitive (bool, optional): Consider case of text for each correction
        dryrun (bool, optional): Treat corrections as a dryrun and just print all changes to be made
    
    Returns:
        Example: Example with fixed annotations
    """

    if not case_sensitive:
        for c in corrections:
            c.annotation = c.annotation.lower()

    corrections_map: Dict[str, Correction] = {c.annotation: c for c in corrections}
    prints: List[str] = []

    ents_to_remove: List[int] = []
    for i, s in enumerate(example.spans):
        t = s.text if case_sensitive else s.text.lower()

        if t in corrections_map:
            c = corrections_map[t]
            if c.to_label is None and s.label in c.from_labels:
                if dryrun:
                    prints.append(f"Deleting span: {s.text}")
                else:
                    ents_to_remove.append(i)
            elif s.label in c.from_labels or "ANY" in c.from_labels:
                if dryrun:
                    prints.append(
                        f"Correction span: {s.text} from labels: {c.from_labels} to label: {c.to_label}"
                    )
                else:
                    s.label = cast(str, c.to_label)

    # Delete in reverse index order so earlier indices remain valid.
    for idx in reversed(ents_to_remove):
        del example.spans[idx]

    if dryrun:
        msg.divider("Example Text")
        msg.text(example.text)
        for line in prints:
            msg.text(line)

    return example
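
A usage sketch for fix_annotations: the Correction constructor arguments are inferred from the attributes the function reads (annotation, from_labels, to_label), and the example object stands in for a loaded Example, so treat both as assumptions rather than the library's exact API:

# Hypothetical corrections: relabel any "apple" span as ORG and drop
# "cheap" spans that were annotated as PRODUCT.
corrections = [
    Correction(annotation="apple", from_labels=["ANY"], to_label="ORG"),
    Correction(annotation="cheap", from_labels=["PRODUCT"], to_label=None),
]

fix_annotations(example, corrections, dryrun=True)   # preview the changes
fixed = fix_annotations(example, corrections)        # apply them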