Example #1
def train_model(
    model,
    train_path,
    eval_path,
    n_iter=10,
    output=None,
    tok2vec=None,
):
    """
    Train a model from Prodigy annotations and optionally save out the best
    model to disk.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(
        component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
Example #2
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
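
The docstring spells out the expected input: one JSON object per line with a
"text" key. A small sketch that writes such a file and profiles against it
(the model name and paths are illustrative):

import srsly

# Two records in the JSONL shape the profiler expects.
srsly.write_jsonl("inputs.jsonl", [
    {"text": "This movie was terrific."},
    {"text": "The plot made no sense."},
])
profile("en_core_web_sm", inputs="inputs.jsonl", n_texts=2)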
Example #3
def train():
    with msg.loading("   Loading BERT"):
        TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
        MODEL = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    msg.good("   BERT loaded")

    articles_dir = os.path.join(SCRIPT_PATH,
                                '../data/raw/CORD-19-research-challenge/')
    articles_folders = [
        'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
        'comm_use_subset/comm_use_subset/pdf_json/',
        'comm_use_subset/comm_use_subset/pmc_json/',
        'noncomm_use_subset/noncomm_use_subset/pdf_json/',
        'noncomm_use_subset/noncomm_use_subset/pmc_json/',
        'custom_license/custom_license/pdf_json/',
        'custom_license/custom_license/pmc_json/'
    ]
    meta_path = articles_dir + 'metadata.csv'

    with msg.loading("   Loading publications"):
        start = time.time()
        data_text, index2paperID, index2paperPath = get_data_texts(
            articles_dir, articles_folders, meta_path)
    msg.good("   Publications loaded - Took {:.2f}s".format(time.time() -
                                                            start))

    covid_q = QuestionCovid(TOKENIZER, MODEL, index2paperID, index2paperPath)
    covid_q.fit(data_text)
    return covid_q
Example #4
def cfg():
    cfg = get_config(TEST_CFG)

    artefacts = [
        "indices.pickle",
        "weights.h5",
    ]

    S3_SLUG = cfg["data"]["s3_slug"]
    OUTPUT_PATH = cfg["build"]["output_path"]
    WORD_EMBEDDINGS = cfg["build"]["word_embeddings"]

    for artefact in artefacts:
        with msg.loading(f"Could not find {artefact} locally, downloading..."):
            try:
                artefact = os.path.join(OUTPUT_PATH, artefact)
                download_model_artefact(artefact, S3_SLUG)
                msg.good(f"Found {artefact}")
            except Exception:
                msg.fail(f"Could not download {S3_SLUG}{artefact}")

    # Check on word embedding and download if not exists


    with msg.loading(
            f"Could not find {WORD_EMBEDDINGS} locally, downloading..."):
        try:
            download_model_artefact(WORD_EMBEDDINGS, S3_SLUG)
            msg.good(f"Found {WORD_EMBEDDINGS}")
        except Exception:
            msg.fail(f"Could not download {S3_SLUG}{WORD_EMBEDDINGS}")

    return cfg
Example #5
def profile(model: str,
            inputs: Optional[Path] = None,
            n_texts: int = 10000) -> None:
    if inputs is not None:
        texts = _read_inputs(inputs, msg)
        texts = list(itertools.islice(texts, n_texts))
    if inputs is None:
        try:
            import ml_datasets
        except ImportError:
            msg.fail(
                "This command, when run without an input file, "
                "requires the ml_datasets library to be installed: "
                "pip install ml_datasets",
                exits=1,
            )

        with msg.loading("Loading IMDB dataset via ml_datasets..."):
            imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
            texts, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
    with msg.loading(f"Loading pipeline '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded pipeline '{model}'")
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
Example #6
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
    if freqs_loc is not None:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)
        msg.good("Counted frequencies")
    else:
        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
    if clusters_loc:
        with msg.loading("Reading clusters..."):
            clusters = read_clusters(clusters_loc)
        msg.good("Read clusters")
    else:
        clusters = {}
    lex_attrs = []
    sorted_probs = sorted(probs.items(),
                          key=lambda item: item[1],
                          reverse=True)
    if len(sorted_probs):
        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
            attrs = {"orth": word, "id": i, "prob": prob}
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            if word in clusters:
                attrs["cluster"] = int(clusters[word][::-1], 2)
            else:
                attrs["cluster"] = 0
            lex_attrs.append(attrs)
    return lex_attrs
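
The decoding comment is easy to miss: Brown cluster IDs arrive as bit strings,
and reversing the string before int(..., 2) stores them little-endian, so that
cluster & 15 recovers the first four bits of the path. A standalone check with
a made-up cluster string:

cluster = "10110"               # hypothetical Brown cluster bit path
packed = int(cluster[::-1], 2)  # reverse -> "01101" -> 13
first4 = packed & 15            # the low bits hold the *first* 4 path bits
assert first4 == int(cluster[:4][::-1], 2)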
Example #7
    def train(self, corpus: List[Fragment], verbose: bool = None):
        if not corpus:
            raise ValueError("cannot train on an empty corpus")
        msg.no_print = not verbose
        with msg.loading("setting things up..."):
            self._setup_training(corpus)
        msg.text("train Naive Bayes model")
        # Nested default dicts stand in for the counter objects this code
        # assumes (requires: from collections import defaultdict).
        feats = defaultdict(lambda: defaultdict(float))  # feats[label][feat] -> count
        totals = defaultdict(float)  # totals[label] -> total feature count
        for frag in corpus:
            for feat, val in frag.features.items():
                feats[frag.label][feat + "_" + val] += 1
            totals[frag.label] += len(frag.features)
        # additive (add-0.1) smoothing and normalization
        with msg.loading("smoothing... "):
            smooth_inc = 0.1
            all_feat_names = set(feats[True].keys()).union(
                set(feats[False].keys()))
            for label in [0, 1]:
                totals[label] += len(all_feat_names) * smooth_inc
                for feat in all_feat_names:
                    feats[label][feat] += smooth_inc
                    feats[label][feat] /= totals[label]
                    self.feats[(label, feat)] = feats[label][feat]
                # prior = this label's share of the total feature mass
                feats[label][self._PRIOR_FEAT] = (totals[label] /
                                                  sum(totals.values()))
                self.feats[(label,
                            self._PRIOR_FEAT)] = feats[label][self._PRIOR_FEAT]
        msg.good("done")
Example #8
def train_model(model,
                train_path,
                eval_path,
                n_iter=10,
                output="./model2/",
                tok2vec=None):
    spacy.util.fix_random_seed(0)

    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat")

        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")
    for label in labels:
        textcat.add_label(label)
    optimizer = nlp.begin_training(
        component_cfg={"textcat": {"exclusive_classes": True}})
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8)
    msg.row(("#", "L", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            scorer = nlp.evaluate(eval_data)
            if scorer.textcat_score > best_acc:
                best_acc = scorer.textcat_score
                if output:
                    best_model = nlp.to_bytes()
        acc = f"{scorer.textcat_score:.3f}"
        msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
Example #9
    def fit(self, data_text):

        self.TFIDF_VECTORIZER = TfidfVectorizer()
        with msg.loading("   Fitting TFIDF"):
            start = time.time()
            self.TFIDF_VECTORIZER.fit(data_text.values())
        msg.good("   TFIDF fitted - Took {:.2f}s".format(time.time() - start))
        with msg.loading("   Creating Articles matrix"):
            start = time.time()
            self.ARTICLES_MATRIX = self.TFIDF_VECTORIZER.transform(
                data_text.values())
        msg.good(
            "   Article matrix created - Took {:.2f}s".format(time.time() -
                                                              start))
Example #10
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in nlp.vocab:
            if lex.rank:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
            with msg.loading("Reading vectors from {}".format(vectors_loc)):
                vectors_data, vector_keys = read_vectors(vectors_loc)
            msg.good("Loaded vectors from {}".format(vectors_loc))
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    lexeme = nlp.vocab[word]
                    lexeme.is_oov = False
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune_vectors >= 1:
        nlp.vocab.prune_vectors(prune_vectors)
Example #11
def _load_file(file_path: Path, msg: Printer):
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        with msg.loading(f"Loading {file_name}..."):
            data = srsly.read_json(file_path)
        msg.good(f"Loaded {file_name}")
        return data
    elif file_path.suffix == ".jsonl":
        with msg.loading(f"Loading {file_name}..."):
            data = srsly.read_jsonl(file_path)
        msg.good(f"Loaded {file_name}")
        return data
    msg.fail(
        f"Can't load file extension {file_path.suffix}",
        "Expected .json or .jsonl",
        exits=1,
    )
Example #12
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors,
                    vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Sucessfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
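
The docstring mentions vectors in Word2Vec text format: a header line with the
vocabulary size and dimensionality, then one word and its vector per line. A
sketch that builds a tiny vectors file and feeds it in; the paths and values
are invented:

from pathlib import Path

# Word2Vec text format: "<vocab_size> <dim>", then "word v1 v2 ... vn" lines.
Path("vectors.txt").write_text(
    "3 4\n"
    "the 0.418 0.250 -0.412 0.120\n"
    "of 0.709 0.571 -0.467 0.181\n"
    "and 0.268 0.143 -0.279 0.441\n"
)
init_model("en", Path("./my_model"), vectors_loc="vectors.txt")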
Example #13
def evaluate_model(model, eval_path):
    """
    Evaluate a trained model on Prodigy annotations and print the accuracy.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    data, _ = format_data(srsly.read_jsonl(eval_path))
    sc = nlp.evaluate(data)
    result = [("F-Score", f"{sc.textcat_score:.3f}")]
    msg.table(result)
Example #14
def swarm_solve(
    problems: Union[List[str], str],
    config: SwarmConfig,
    max_steps: Union[List[int], int] = 128,
    silent: bool = False,
) -> Swarm:
    single_problem: bool = isinstance(problems, str)
    if single_problem:
        problems = [problems]
    if isinstance(max_steps, int):
        max_steps = ([max_steps] if single_problem
                     else [max_steps] * len(problems))
    assert len(problems) > 0, "no problems to solve"
    assert len(problems) == len(max_steps)
    assert isinstance(problems, list)
    current_problem: str = problems.pop(0)
    current_max_moves: int = max_steps.pop(0)

    def env_callable():
        nonlocal current_problem, current_max_moves
        return FragileMathyEnv(
            name="mathy_v0",
            problem=current_problem,
            repeat_problem=True,
            max_steps=current_max_moves,
        )

    mathy_env: MathyEnv = env_callable()._env._env.mathy
    swarm: Swarm = mathy_swarm(config, env_callable)
    while True:
        if not silent:
            with msg.loading(f"Solving {current_problem} ..."):
                swarm.run()
        else:
            swarm.run()

        if not silent:
            if swarm.walkers.best_reward > EnvRewards.WIN:
                last_state = MathyEnvState.from_np(
                    swarm.walkers.states.best_state)
                msg.good(
                    f"Solved! {current_problem} = {last_state.agent.problem}")
                mathy_env.print_history(last_state)
            else:
                msg.fail(f"Failed to find a solution :(")

        if len(max_steps) > 0:
            current_max_moves = max_steps.pop(0)
            current_problem = problems.pop(0)
        else:
            break
    return swarm
Example #15
def fitted_ann_kb(nlp, entities, aliases):
    kb = AnnKnowledgeBase(nlp.vocab, entity_vector_length=300)
    print(vars(kb))

    entity_ids = []
    descriptions = []
    freqs = []
    for e in entities:
        entity_ids.append(e["id"])
        descriptions.append(e.get("description", ""))
        freqs.append(100)

    msg.divider("Apply EntityEncoder")

    with msg.loading("Applying EntityEncoder to descriptions"):
        # get the pretrained entity vectors
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # set the entities, can also be done by calling `kb.add_entity` for each entity
        for i in range(len(entity_ids)):
            entity = entity_ids[i]
            if not kb.contains_entity(entity):
                kb.add_entity(entity, freqs[i], embeddings[i])

        for a in aliases:
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            n_ents = len(ents)
            if n_ents > 0:
                prior_prob = [1.0 / n_ents] * n_ents
                kb.add_alias(alias=a["alias"],
                             entities=ents,
                             probabilities=prior_prob)

    kb.fit_index(verbose=True)

    return kb
Example #16
def _make_graph_components(**kwargs):
    cat2id = {}
    page2id = {}
    id2page = {}
    disambiguations = {}
    pprops = _get_pprops(**kwargs)
    verbose = "verbose" in kwargs and kwargs["verbose"]
    msg_no_print = msg.no_print
    msg.no_print = not verbose
    iter_page_data = dt.iter_page_dump_data(**kwargs)
    for ns_kind, pageid, title in iter_page_data:
        disambi = False
        if pageid in pprops:
            page_props = pprops[pageid]
            if "hiddencat" in page_props or "noindex" in page_props:
                continue
            if "disambiguation" in page_props:
                disambi = True
        if disambi:
            disambiguations.setdefault(title, pageid)
        if ns_kind == dt.WIKI_NS_KIND_PAGE:
            page2id[title] = pageid
            id2page[pageid] = title
        elif ns_kind == dt.WIKI_NS_KIND_CATEGORY:
            cat2id[f"Category:{title}"] = pageid
    category_links = _get_category_links(cat2id, id2page, **kwargs)
    redirects = _get_redirects(page2id, id2page, **kwargs)
    with msg.loading("Removing duplicates..."):
        for title in redirects.values():
            source_id = page2id.pop(title, None)
            id2page.pop(source_id, None)
        for title, pageid in disambiguations.items():
            page2id.pop(title, None)
            id2page.pop(pageid, None)
    with msg.loading("Building graph..."):
        adjacency = _edgelist2adjacency(category_links)
    msg.no_print = msg_no_print
    return page2id, redirects, disambiguations, cat2id, adjacency
Example #17
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
    msg = Printer(no_print=silent, pretty=not silent)
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                f"Server error ({r.status_code})",
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    all_models = set()
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="\\[W09[45]")
        installed_models = get_installed_models()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    pkgs = {}
    for pkg_name in installed_models:
        package = pkg_name.replace("-", "_")
        version = get_package_version(pkg_name)
        if package in compat:
            is_compat = version in compat[package]
            spacy_version = about.__version__
        else:
            model_path = get_package_path(package)
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="\\[W09[45]")
                model_meta = get_model_meta(model_path)
            spacy_version = model_meta.get("spacy_version", "n/a")
            is_compat = is_compatible_version(
                about.__version__, spacy_version)  # type: ignore[assignment]
        pkgs[pkg_name] = {
            "name": package,
            "version": version,
            "spacy": spacy_version,
            "compat": is_compat,
        }
    return pkgs, compat
Example #18
def _add_category_links(g, pages, cat2id, **kwargs):
    edges = []
    for _, source_id, target_title in dt.iter_categorylinks_dump_data(
        **kwargs
    ):
        if target_title not in pages:
            continue
        try:
            source_vx = g.vs.find(source_id)
            if (
                source_vx["kind"] == dt.WIKI_NS_KIND_CATEGORY
                and source_vx["title"] not in pages
            ):
                continue
            target_id = cat2id[target_title]
        except (KeyError, ValueError):
            continue
        edges.append((source_vx["name"], target_id))
    with msg.loading("adding category edges..."):
        g.add_edges(edges)
Example #19
def cli_print_problems(environment: str, difficulty: str, number: int):
    """Print a set of generated problems from a given environment.

    This is useful when developing new environment types, to verify that
    the problems you're generating take the form you expect."""
    import gym
    from mathy_envs.gym import MathyGymEnv

    env_name = f"mathy-{environment}-{difficulty}-v0"
    env: MathyGymEnv = gym.make(env_name)  # type:ignore
    msg.divider(env_name)
    with msg.loading(f"Generating {number} problems..."):
        header = ("Complexity", "Is Valid", "Text")
        widths = (10, 8, 62)
        aligns = ("c", "c", "l")
        data = []
        for i in range(number):
            state, problem = env.mathy.get_initial_state(env.env_problem_args,
                                                         print_problem=False)
            valid = False
            text = problem.text
            try:
                env.mathy.parser.parse(problem.text)
                valid = True
            except BaseException as error:
                text = f"parse failed for '{problem.text}' with error: {error}"
            data.append((
                problem.complexity,
                "✔" if valid else "✘",
                text,
            ))
    msg.good(f"\nGenerated {number} problems!")

    print(
        msg.table(data,
                  header=header,
                  divider=True,
                  widths=widths,
                  aligns=aligns))
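
A hypothetical invocation; the environment and difficulty names below are
illustrative and must be ones that mathy_envs actually registers:

cli_print_problems("poly", "easy", number=10)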
Example #20
def wps(model, data):
    """
    Measure the processing speed in words per second. It's recommended to
    use a larger corpus of raw text here (e.g. a few million words).
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (eg["text"] for eg in srsly.read_jsonl(data))
    n_docs = 0
    n_words = 0
    start_time = timer()
    for doc in nlp.pipe(texts):
        n_docs += 1
        n_words += len(doc)
    end_time = timer()
    wps = int(n_words / (end_time - start_time))
    result = [
        ("Docs", f"{n_docs:,}"),
        ("Words", f"{n_words:,}"),
        ("Words/s", f"{wps:,}"),
    ]
    msg.table(result, widths=(7, 12), aligns=("l", "r"))
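
The data argument follows the same JSONL convention as the profiler in
Example #2: one object per line with a "text" key. A hypothetical call
(model and path are illustrative):

wps("en_core_web_sm", "raw_corpus.jsonl")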
Example #21
def build(
    # fmt: off
    repo: str,
    commit: str,
    package_name: str = Option(None,
                               help="Package name (if different from repo)"),
    py35: bool = Option(False, "--py35", help="Build wheels for Python 3.5"),
    llvm: bool = Option(False, "--llvm", help="Requires LLVM to be installed"),
    rust: bool = Option(False, "--rust", help="Requires Rust to be installed"),
    universal: bool = Option(
        False,
        "--universal",
        help="Build universal (pure Python) wheel and sdist"),
    skip_tests: bool = Option(
        False,
        "--skip-tests",
        help="Don't run tests (e.g. if package doesn't have any)"),
    build_constraints: bool = Option(
        False,
        "--build-constraints",
        help="Use build constraints for build requirements"),
    # fmt: on
):
    """Build wheels for a given repo and commit / tag."""
    print(LOGO)
    repo_id = get_repo_id()
    user, package = repo.lower().split("/", 1)
    if package_name is None:
        package_name = package.replace("-", "_")
    msg.info(f"Building in repo {repo_id}")
    msg.info(f"Building wheels for {user}/{package}\n")
    if universal:
        msg.warn(
            "Building only universal sdist and wheel, no cross-platform wheels"
        )
    if skip_tests:
        msg.warn("Not running any tests")
    clone_url = DEFAULT_CLONE_TEMPLATE.format(f"{user}/{package}")
    repo = get_gh().get_repo(repo_id)
    with msg.loading("Finding a unique name for this release..."):
        # Pick the release_name by finding an unused one
        i = 1
        while True:
            release_name = f"{package_name}-{commit}"
            if i > 1:
                release_name += f"-{i}"
            try:
                repo.get_release(release_name)
            except github.UnknownObjectException:
                break
            i += 1
    branch_name = f"branch-for-{release_name}"
    bs = {
        "clone-url": clone_url,
        "package-name": package_name,
        "commit": commit,
        "options": {
            "llvm": llvm,
            "rust": rust,
            "py35": py35,
            "universal": universal,
            "skip_tests": skip_tests,
            "build_constraints": build_constraints,
        },
        "upload-to": {
            "type": "github-release",
            "repo-id": repo_id,
            "release-id": release_name,
        },
    }
    bs_json = json.dumps(bs)
    bs_json_formatted = json.dumps(bs, indent=4)
    msg.text(f"Creating release {release_name} to collect assets")
    release_text = f"https://github.com/{user}/{package}\n\n### Build spec\n\n```json\n{bs_json_formatted}\n```"
    release = repo.create_git_release(release_name, release_name, release_text)
    with msg.loading("Creating build branch..."):
        # 'master' is a 'Commit'. 'master.commit' is a 'GitCommit'. These are
        # different types that are mostly *not* interchangeable:
        #   https://pygithub.readthedocs.io/en/latest/github_objects/Commit.html
        #   https://pygithub.readthedocs.io/en/latest/github_objects/GitCommit.html
        master = repo.get_commit("master")
        master_gitcommit = master.commit
        patch = github.InputGitTreeElement(
            "build-spec.json",
            "100644",
            "blob",
            content=bs_json,
        )
        tree = repo.create_git_tree([patch], master_gitcommit.tree)
        our_gitcommit = repo.create_git_commit(f"Building: {release_name}",
                                               tree, [master_gitcommit])
        repo.create_git_ref(f"refs/heads/{branch_name}", our_gitcommit.sha)
    msg.good(f"Commit is {our_gitcommit.sha[:8]} in branch {branch_name}")
    msg.text(f"Release: {release.html_url}")
    msg.text(
        f"Checks:  https://github.com/{repo_id}/commit/{our_gitcommit.sha}/checks"
    )
Example #22
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text(
                    "Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
                msg.text(
                    "Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline])
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec,
                                              base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err,
                                 "Original error message: {}".format(e),
                                 exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(
                                    epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg[
                                            "beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    ))
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs,
                                                             verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[
                                    metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[
                                    metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(iter_current -
                                                 iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (final_meta["speed"]["cpu"] is None
                    and final_meta["speed"]["gpu"] is None):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(
                    meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path,
                                                  best_pipes)
        msg.good("Created best model", best_model_path)
Example #23
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    msg = Printer(no_print=silent,
                  pretty=not no_format,
                  ignore_warnings=ignore_warnings)
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [
        p for p in sourced_components if p not in frozen_components
    ]
    pipeline = nlp.pipe_names
    factory_names = [
        nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names
    ]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset,
                                    factory_names,
                                    nlp,
                                    make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(train_dataset,
                                                   factory_names,
                                                   nlp,
                                                   make_proj=False)
    gold_dev_data = _compile_gold(dev_dataset,
                                  factory_names,
                                  nlp,
                                  make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
    frozen_components = T["frozen_components"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(
            f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)")
        n_missing_vectors = sum(
            gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            )
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_train_data["ner"]
                     if label not in ("O", "-", None))
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        labels_with_counts = [(label, count)
                              for label, count in label_counts.most_common()
                              if label != "-"]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}",
                 show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if gold_train_data["ws_ents"]:
            msg.fail(
                f"{gold_train_data['ws_ents']} invalid whitespace entity spans"
            )
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label)
                if neg_docs == 0:
                    msg.warn(
                        f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good(
                "No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text("Entity spans consisting of or starting/ending "
                     "with whitespace characters are considered invalid.")

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}.")
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task.")
        if (gold_train_data["n_cats_bad_values"] > 0
                or gold_dev_data["n_cats_bad_values"] > 0):
            msg.fail("Unsupported values for cats: the supported values are "
                     "1.0/True and 0.0/False.")
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'.")
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'.")

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}.")
        if (gold_train_data["n_cats_bad_values"] > 0
                or gold_dev_data["n_cats_bad_values"] > 0):
            msg.fail("Unsupported values for cats: the supported values are "
                     "1.0/True and 0.0/False.")
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes.")
        else:
            msg.warn("The train data contains only instances with "
                     "mutually-exclusive classes. You can potentially use the "
                     "component 'textcat' instead of 'textcat_multilabel'.")
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes.")

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}.")
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(
            gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences.")

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD:
                msg.warn(f"Low number of examples for label '{label}' "
                         f"({gold_train_unpreprocessed_data['deps'][label]})")
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                    and DELIMITER in label):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}")

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance.")
            msg.warn(
                f"Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available.")

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(f"Found {gold_train_data['n_nonproj']} nonprojective "
                     f"projectivized train sentence(s)")
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed"
        )
    if warn_counts:
        msg.warn(
            f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
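Note: debug_data leans on private helpers that aren't shown in this excerpt. A rough sketch of two of them, _format_labels and _get_examples_without_label; the real implementations in spacy/cli/debug_data.py may differ in detail:

def _format_labels(labels, counts=False):
    # Render labels, or (label, count) pairs, as a quoted comma-separated list.
    if counts:
        return ", ".join(f"'{label}' ({count})" for label, count in labels)
    return ", ".join(f"'{label}'" for label in labels)

def _get_examples_without_label(data, label):
    # Count examples whose gold NER annotation never uses `label`, i.e.
    # usable negative examples for that entity type.
    count = 0
    for eg in data:
        tags = eg.get_aligned_ner()  # e.g. ["O", "B-PER", "I-PER", ...]
        present = {
            tag.split("-", 1)[1] for tag in tags if tag not in ("O", "-", None)
        }
        if label not in present:
            count += 1
    return count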
Exemplo n.º 24
0
def main(vectors,
         gpu_id=-1,
         n_neighbors=100,
         batch_size=1024,
         cutoff=0,
         start=0,
         end=None):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option restricts neighbor
    candidates to the first N rows of the vector table. For instance, if cutoff
    is 100000, no word will have a nearest neighbor outside of the top 100k
    vectors.
    """
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device

        cupy.take_along_axis = take_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        vectors = xp.load(str(vectors_file))
    msg.good(
        f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}"
    )
    norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1
    # Normalize to unit norm
    vectors /= norms
    if cutoff < 1:
        cutoff = vectors.shape[0]
    if end is None:
        end = vectors.shape[0]
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(
        f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    n = min(n_neighbors, vectors.shape[0])
    subset = vectors[:cutoff]
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = vectors[i:i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them.
        for j in range(size):
            if i + j < sims.shape[1]:
                sims[j, i + j] = -xp.inf
        # This used to use argpartition, to do a partial sort...But this ended
        # up being a ratsnest of terrible numpy crap. Just sorting the whole
        # list isn't really slower, and it's much simpler to read.
        ranks = xp.argsort(sims, axis=1)
        batch_rows = ranks[:, -n:]
        # Reverse
        batch_rows = batch_rows[:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        best_rows[i - start:i - start + size] = batch_rows
        scores[i - start:i - start + size] = batch_scores
    msg.info("Saving output")
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")
Exemplo n.º 25
0
def main(
    model="./zh_vectors_web_ud_lg/model-final",
    new_model_name="zh_vectors_web_ud_clue_lg",
    output_dir="./zh_vectors_web_ud_clue_lg",
    train_path="./clue_spacy_train.jsonl",
    dev_path="./clue_spacy_dev.jsonl",
    meta_path="./meta.json",
    use_gpu=0,
    n_iter=50
):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    import tqdm
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for label in LABEL:
        if label not in ner.labels:
            ner.add_label(label)  # add new entity label to entity recognizer

    train_path = ensure_path(train_path)
    dev_path = ensure_path(dev_path)
    output_dir = ensure_path(output_dir)

    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    if output_dir.exists() and [p for p in output_dir.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()

    meta = srsly.read_json(meta_path) if meta_path else {}

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(0))
    corpus = GoldCorpus(train_path, dev_path, limit=0)
    n_train_words = corpus.count_train()
    
    if model is None:   
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    else:
        optimizer = create_default_optimizer(Model.ops)
        # Todo: gpu train?

    dropout_rates = decaying( 0.2, 0.2, 0.0)
    
    batch_sizes = compounding( 100.0, 1000.0 , 1.001)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


    # Define eval_beam_widths up front to avoid "UnboundLocalError: local
    # variable 'has_beam_widths' referenced before assignment"
    # fmt: off
    eval_beam_widths = ""
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    row_head, output_stats = _configure_training_output(["ner"], use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        noise_level = 0.0
        orth_variant_level = 0.0
        gold_preproc = False
        verbose = False

        best_score = 0.0
        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                train_docs = corpus.train_docs(
                    nlp,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                    gold_preproc=gold_preproc,
                    max_length=0,
                    ignore_misaligned=True,
                )
                words_seen = 0
                with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                    losses = {}
                    for batch in minibatch_by_words(train_docs, size=batch_sizes):
                        if not batch:
                            continue
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                        if not int(os.environ.get("LOG_FRIENDLY", 0)):
                            pbar.update(sum(len(doc) for doc in docs))
                        words_seen += sum(len(doc) for doc in docs)
                with nlp.use_params(optimizer.averages):
                    set_env_log(False)
                    epoch_model_path = output_dir / ("model%d" % itn)
                    nlp.to_disk(epoch_model_path)
                    nlp_loaded = load_model_from_path(epoch_model_path)
                    for beam_width in eval_beam_widths:
                        for name, component in nlp_loaded.pipeline:
                            if hasattr(component, "cfg"):
                                component.cfg["beam_width"] = beam_width
                        dev_docs = list(
                            corpus.dev_docs(
                                nlp_loaded,
                                gold_preproc=gold_preproc,
                                ignore_misaligned=True,
                            )
                        )
                        nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                        end_time = timer()
                        if use_gpu < 0:
                            gpu_wps = None
                            cpu_wps = nwords / (end_time - start_time)
                        else:
                            gpu_wps = nwords / (end_time - start_time)
                            with Model.use_device("cpu"):
                                nlp_loaded = load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                        acc_loc = output_dir / ("model%d" % itn) / "accuracy.json"
                        srsly.write_json(acc_loc, scorer.scores)

                        # Update model meta.json
                        meta["lang"] = nlp.lang
                        meta["pipeline"] = nlp.pipe_names
                        meta["spacy_version"] = ">=%s" % spacy.__version__
                        if beam_width == 1:
                            meta["speed"] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                            meta["accuracy"] = scorer.scores
                        else:
                            meta.setdefault("beam_accuracy", {})
                            meta.setdefault("beam_speed", {})
                            meta["beam_accuracy"][beam_width] = scorer.scores
                            meta["beam_speed"][beam_width] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                        meta["vectors"] = {
                            "width": nlp.vocab.vectors_length,
                            "vectors": len(nlp.vocab.vectors),
                            "keys": nlp.vocab.vectors.n_keys,
                            "name": nlp.vocab.vectors.name,
                        }
                        meta.setdefault("name", "model%d" % itn)
                        meta.setdefault("version", "0.0.1")
                        meta["labels"] = nlp.meta["labels"]
                        meta_loc = output_dir / ("model%d" % itn) / "meta.json"
                        srsly.write_json(meta_loc, meta)
                        set_env_log(verbose)

                        progress = _get_progress(
                            itn,
                            losses,
                            scorer.scores,
                            output_stats,
                            beam_width=beam_width if has_beam_widths else None,
                            cpu_wps=cpu_wps,
                            gpu_wps=gpu_wps,
                        )

                        msg.row(progress, **row_settings)

    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_dir / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        meta["pipeline"] = nlp.pipe_names
        meta["labels"] = nlp.meta["labels"]
        meta["factories"] = nlp.meta["factories"]
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Exemplo n.º 26
0
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
    incompat_models = {
        d["name"]
        for _, d in model_pkgs.items() if not d["compat"]
    }
    incompat_models.update(
        [d["name"] for _, d in model_links.items() if not d["compat"]])
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text("The following models are not available for spaCy "
                 "v{}: {}".format(about.__version__, ", ".join(na_models)))
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path())))
    if incompat_models or incompat_links:
        sys.exit(1)
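Note: the compatibility table fetched above maps each spaCy release to the model versions published for it. A minimal sketch of the shape and of the check that helpers like get_model_pkgs/get_model_row perform; the data and the is_compatible helper are hypothetical:

compat_table = {
    "2.2.4": {
        "en_core_web_sm": ["2.2.5", "2.2.0"],
        "de_core_news_sm": ["2.2.5"],
    },
    # ...one entry per spaCy release
}

def is_compatible(current_compat, model_name, installed_version):
    # A model is compatible if its installed version appears among the
    # versions published for the currently installed spaCy release.
    return installed_version in current_compat.get(model_name, [])

current = compat_table["2.2.4"]
print(is_compatible(current, "en_core_web_sm", "2.2.5"))  # True
print(is_compatible(current, "en_core_web_sm", "2.1.0"))  # False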
Exemplo n.º 27
0
def create_wikigraph(
    output_path: Path,
    wiki="en",
    version="latest",
    dumps_path: Path = None,
    max_workers: int = None,
    silent: bool = None,
    force: bool = None,
):
    """
    Create a `WikiGraph` from a specific dump.

    It can then be used by directly loading it, or
    it can be packaged with the `package-wikigraph` command.

    Parameters
    ----------
    output_path : Path
        Path in which to store the `WikiGraph`.
    wiki : str, optional
        Wikipedia dump type to use, by default "en".
    version : str, optional
        Wikipedia dump version to use, by default "latest".
    dumps_path : Path, optional
        Path in which to find previously downloaded dumps,
        or where to save dumps downloaded in this call, by default None.
    max_workers : int, optional
        Maximum number of processes to use, by default None.
    silent : bool, optional
        Do not print anything to stdout, by default None.
    force : bool, optional
        Overwrite content in output_path, if any, by default None.
    """
    if not output_path.exists():
        output_path.mkdir()
        msg.good(f"Created output directory: {output_path}")
    graph_name = f"{wiki}wiki_core"
    graph_path = output_path.joinpath(graph_name)
    if not force and graph_path.exists():
        msg.fail(
            f"Output path already contains {graph_name} directory",
            "Use --force to overwrite it",
            exits=1,
        )
    kwargs = {
        "dumps_path": dumps_path,
        "max_workers": max_workers,
        "wiki": wiki,
        "version": version,
        "verbose": not silent,
    }
    wg = WikiGraph.build(**kwargs)
    if not graph_path.exists():
        graph_path.mkdir()
    graph_format = "picklez"
    with msg.loading("dump to disk..."):
        wg.dump(graph_path, graph_format=graph_format)
    meta = get_meta()
    meta["name"] = graph_name
    meta["version"] = wg.version
    meta["graph_format"] = graph_format
    meta["spikex_version"] = f">={spikex_version}"
    meta["fullname"] = f"{graph_name}-{spikex_version}"
    meta["sources"].append("Wikipedia")
    meta_path = graph_path.joinpath("meta.json")
    meta_path.write_text(json_dumps(meta, indent=2))
    msg.good(f"Successfully created {graph_name}.")
Exemplo n.º 28
0
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    conv_depth=4,
    bilstm_depth=0,
    cnn_pieces=3,
    sa_depth=0,
    use_chars=False,
    cnn_window=1,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if output_dir.exists() and [p for p in output_dir.iterdir()]:
        msg.warn(
            "Output directory is not empty",
            "It is better to use an empty directory or refer to a new output path, "
            "then the new directory will be created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory: {}".format(output_dir))
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=conv_depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            subword_features=not use_chars,  # Set to False for Chinese etc
            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name, so derive epoch_start from it by stripping 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            if not epoch_start:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
Exemplo n.º 29
0
    def run(self):
        msg.info("Extracción de datos de los artículos del corpus.")
        self.start_time = time.perf_counter()

        with msg.loading("Analizando..."):
            asyncio.run(self.main())
Exemplo n.º 30
0
def speech_to_text_demo(asr: "ASROnlineAudioFrame") -> None:
    """Speech to Text (ASR) Microphone Demo.

    Interrupt the notebook's kernel to stop the app from recording.
    """
    asr.reset()
    audio = pyaudio.PyAudio()

    offset = {"count": 0}
    columns = []
    devices = []

    for idx in range(audio.get_device_count()):
        device = audio.get_device_info_by_index(idx)
        if not device.get("maxInputChannels"):
            continue
        devices.append(idx)
        columns.append((idx, device.get("name")))

    if columns:
        msg.good("Found the following input devices!")
        msg.table(columns, header=("ID", "Devices"), divider=True)

    if devices:
        device_index = -2
        while device_index not in devices:
            msg.info("Please enter the device ID")
            device_index = int(input())

        def callback(in_data, frame_count, time_info, status):
            # PyAudio streaming callback: decode the raw buffer into a 16-bit
            # signal, transcribe the frame, and print any recognized text.
            signal = np.frombuffer(in_data, dtype=np.int16)
            text = asr.transcribe(signal)
            if text:
                print(text, end="")
                offset["count"] = asr.params.offset
            elif offset["count"] > 0:
                # Count down the offset window; emit a single space when it
                # expires so consecutive utterances don't run together.
                offset["count"] -= 1
                if offset["count"] == 0:
                    print(" ", end="")
            return (in_data, pyaudio.paContinue)

        stream = audio.open(
            input=True,
            format=pyaudio.paInt16,
            input_device_index=device_index,
            stream_callback=callback,
            channels=asr.params.channels,
            rate=asr.params.sample_rate,
            frames_per_buffer=asr.chunk_size,
        )

        msg.loading("Listening...")
        stream.start_stream()

        try:
            while stream.is_active():
                time.sleep(0.1)
        except (KeyboardInterrupt, Exception) as e:
            stream.stop_stream()
            stream.close()
            audio.terminate()
            msg.warn("WARNING: ASR stream stopped.", e)
    else:
        msg.fail("ERROR", "No audio input device found.")