Example #1
def connect_to_server(
        host: str = typer.Option(...),
        username: str = typer.Option(...),
        tls: bool = True,
        password: str = typer.Option(..., prompt=True, hide_input=True),
):
    global client
    client = Client(host)
    if not client.connect(username, password, starttls=tls):
        msg.fail("Couldn't connect to server. Wrong password?")
        raise typer.Exit(code=1)
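For context, a minimal sketch of how such a command can be wired up with typer and wasabi (the host check below is a stand-in for a real connection attempt):
import typer
from wasabi import msg

app = typer.Typer()

@app.command()
def connect(host: str = typer.Option(...)):
    connected = host.startswith("smtp.")  # stand-in for a real connection attempt
    if not connected:
        msg.fail("Couldn't connect to server.")
        raise typer.Exit(code=1)
    msg.good(f"Connected to {host}")

if __name__ == "__main__":
    app()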
Example #2
def import_code(code_path: Optional[Union[Path, str]]) -> None:
    """Helper to import Python file provided in training commands / commands
    using the config. This makes custom registered functions available.
    """
    if code_path is not None:
        if not Path(code_path).exists():
            msg.fail("Path to Python code not found", code_path, exits=1)
        try:
            import_file("python_code", code_path)
        except Exception as e:
            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
Example #3
def get_gh():
    token_path = ROOT / SECRET_FILE
    if ENV.GH_SECRET in os.environ:
        token = os.environ[ENV.GH_SECRET]
    elif token_path.exists():
        with token_path.open("r", encoding="utf-8") as f:
            token = f.read().strip()
    else:
        err = f"Can't find GitGub token. Not {ENV.GH_SECRET} envvar or in {token_path}"
        msg.fail(err, exits=1)
    return github.Github(token)
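A hedged usage sketch, assuming the PyGithub package is installed and a token is available in an environment variable (the variable name GH_TOKEN is illustrative):
import os
import github  # PyGithub

token = os.environ["GH_TOKEN"]  # illustrative variable name
gh = github.Github(token)
print(gh.get_user().login)  # verify the token by fetching the authenticated user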
Example #4
def get_json(url, desc):
    r = requests.get(url)
    if r.status_code != 200:
        msg.fail(
            "Server error ({})".format(r.status_code),
            "Couldn't fetch {}. Please find a model for your spaCy "
            "installation (v{}), and download it manually. For more "
            "details, see the documentation: "
            "https://spacy.io/usage/models".format(desc, about.__version__),
            exits=1,
        )
    return r.json()
Example #5
def main(self, args: BaseArgumentParser) -> int:
    list_devices_response = self.get_client().list_devices()
    msg.divider("Registered Devices")
    for device in list_devices_response.devices:
        if device.is_available:
            msg.good(f"{device.name}")
        else:
            msg.fail(f"{device.name}:")
            msg.text(
                f"  {color(device.error_type, bold=True)}: {device.error_message}"
            )
    return 0
Example #6
def check_hash(f0):
    key = f0.stem + ".xml.gz"

    if key not in md5:
        msg.fail(
            f"Can't find md5 hash for {key}; this is suspicious and worth looking into!")
        return

    if file_md5sum(f0) != md5[key]:
        msg.fail(f"Checksum failed {key}, should delete!")
        print(f0)
        return
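The file_md5sum helper is not shown in the snippet; a minimal sketch of what it might look like, streaming the file in chunks to keep memory bounded:
import hashlib
from pathlib import Path

def file_md5sum(path: Path, chunk_size: int = 1 << 20) -> str:
    """Hypothetical helper: return the md5 hex digest of a file."""
    digest = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()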
Example #7
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    config = load_project_config(project_dir)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(project_dir, cmd, force=force, dry=dry)
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
        check_spacy_commit = check_bool_env_var(
            ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            rerun = check_rerun(current_dir,
                                cmd,
                                check_spacy_commit=check_spacy_commit)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)
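A hedged usage sketch, assuming a project directory whose project.yml defines a 'train' command and an 'all' workflow (both names are illustrative):
from pathlib import Path

# Force re-running a single command even if nothing changed
project_run(Path("my_project"), "train", force=True)

# Dry-run a workflow: prints the steps without executing them
project_run(Path("my_project"), "all", dry=True)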
Example #8
def swarm_solve(
    problems: Union[List[str], str],
    config: SwarmConfig,
    max_steps: Union[List[int], int] = 128,
    silent: bool = False,
) -> Swarm:
    single_problem: bool = isinstance(problems, str)
    if single_problem:
        problems = [problems]
    if isinstance(max_steps, int):
        max_steps = [max_steps] if single_problem else [max_steps] * len(problems)
    assert len(problems) > 0, "no problems to solve"
    assert len(problems) == len(max_steps)
    assert isinstance(problems, list)
    current_problem: str = problems.pop(0)
    current_max_moves: int = max_steps.pop(0)

    def env_callable():
        nonlocal current_problem, current_max_moves
        return FragileMathyEnv(
            name="mathy_v0",
            problem=current_problem,
            repeat_problem=True,
            max_steps=current_max_moves,
        )

    mathy_env: MathyEnv = env_callable()._env._env.mathy
    swarm: Swarm = mathy_swarm(config, env_callable)
    while True:
        if not silent:
            with msg.loading(f"Solving {current_problem} ..."):
                swarm.run()
        else:
            swarm.run()

        if not silent:
            if swarm.walkers.best_reward > EnvRewards.WIN:
                last_state = MathyEnvState.from_np(
                    swarm.walkers.states.best_state)
                msg.good(
                    f"Solved! {current_problem} = {last_state.agent.problem}")
                mathy_env.print_history(last_state)
            else:
                msg.fail(f"Failed to find a solution :(")

        if len(max_steps) > 0:
            current_max_moves = max_steps.pop(0)
            current_problem = problems.pop(0)
        else:
            break
    return swarm
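A hedged usage sketch (the problem strings and step budget are illustrative):
config = SwarmConfig()  # assumes the default configuration is acceptable
swarm = swarm_solve(["4x + 2x", "7j + j * 2"], config, max_steps=256)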
Example #9
def putscript(
    path: Path = typer.Argument(
        ...,
        exists=True,
        readable=True,
        file_okay=True,
        dir_okay=False,
        resolve_path=True,
    ),
):
    content = path.read_text()
    if client.putscript(path.name, content):
        msg.good(f"Put script {path.name}")
    else:
        msg.fail(f"Failed while putting script {path.name}")
Example #10
def resolve_version(wiki, version):
    url = _WIKI_BASE_DL_PATH.format(w=wiki)
    response = requests.get(url)
    versions = re.findall(r"href=\"(\d+)/\"", response.text)
    if version in versions:
        return version
    if version == "latest":
        return max(versions)
    msg.fail(
        "Wikipedia dump version not found",
        f"Pick one of these: {', '.join(versions)}.",
        exits=1,
    )
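A hedged usage sketch (the wiki name and version strings are illustrative):
latest = resolve_version("enwiki", "latest")    # newest dump listed on the server
pinned = resolve_version("enwiki", "20200401")  # exits with an error if not listed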
Example #11
def run_test(command, directory):
    """Execute a command that runs a test"""
    wrapped_command = "cd %s && %s" % (directory, command)
    pipe = subprocess.Popen(
        wrapped_command,
        shell=True,
    )
    pipe.wait()
    if pipe.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    return pipe.returncode
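An alternative sketch using subprocess.run with its cwd argument, which avoids wrapping the command in a shell-level cd (a variant for comparison, not the original code):
import subprocess
from wasabi import msg

def run_test_alt(command: str, directory: str) -> int:
    """Variant of run_test: subprocess.run handles waiting and the working directory."""
    result = subprocess.run(command, shell=True, cwd=directory)
    if result.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    return result.returncode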
Example #12
def read_vectors(vectors_loc):
    f = open_file(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    for i, line in enumerate(tqdm(f)):
        line = line.rstrip()
        pieces = line.rsplit(" ", vectors_data.shape[1])
        word = pieces.pop(0)
        if len(pieces) != vectors_data.shape[1]:
            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
    return vectors_data, vectors_keys
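The loader expects the standard word2vec text format: a header line giving the number of rows and the vector width, followed by one key and its values per line. A small illustration of that layout (values are made up):
2 3
apple 0.1 0.2 0.3
banana 0.4 0.5 0.6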
Example #13
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if output_dir.exists() and [p for p in output_dir.iterdir()]:
        if resume_path:
            msg.warn(
                "Output directory is not empty.",
                "If you're resuming a run in this directory, the old weights "
                "for the consecutive epochs will be overwritten with the new ones.",
            )
        else:
            msg.warn(
                "Output directory is not empty. ",
                "It is better to use an empty directory or refer to a new output path, "
                "then the new directory will be created for you.",
            )
    if resume_path is not None:
        if resume_path.is_dir():
            # This is necessary because Windows gives a Permission Denied when we
            # try to open the directory later, which is confusing. See #7878
            msg.fail(
                "--resume-path should be a weights file, but {resume_path} is a directory.",
                exits=True,
            )
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        if not model_name and not epoch_resume:
            msg.fail(
                "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
                exits=True,
            )
        elif not model_name and epoch_resume < 0:
            msg.fail(
                f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
                exits=True,
            )
Example #14
def run_test(command, directory):
    """Execute a command that runs a test"""
    msg.text("RUNNING  " + command)
    wrapped_command = f"cd {directory} && {command}"
    pipe = subprocess.Popen(
        wrapped_command, shell=True,
    )
    pipe.wait()
    if pipe.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    msg.text('')
    return pipe.returncode
Example #15
def main(in_file,
         out_dir,
         spacy_model="en_core_web_sm",
         n_process=1,
         max_docs=10**7):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
                output_file = output_path / f"{input_path.stem}.spacy"
            else:
                batch_num += 1
                count = 0
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good(f"Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(
                    attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
        batch_num += 1
        output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
        with output_file.open("wb") as f:
            # The path must be set before opening; the original opened the old
            # output_file first and wrote the final batch to the previous file.
            f.write(doc_bin.to_bytes())
        msg.good("Complete. Saved final parsed docs to file",
                 output_file.resolve())
Example #16
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example #17
def download(f0, f1):
    url = base_url + f0

    r = sess.get(url)
    if not r.ok:
        print(r.content)
        print(r.status_code)
        msg.fail(f"Failed {url}")
        exit()

    with open(f1, "wb") as FOUT:
        FOUT.write(r.content)

    msg.good(f"Saved {f0}")
    time.sleep(5)
Example #18
def _read_inputs(loc, msg):
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
        file_ = (line.encode("utf8") for line in file_)
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info("Using data from {}".format(input_path.parts[-1]))
        file_ = input_path.open()
    for line in file_:
        data = srsly.json_loads(line)
        text = data["text"]
        yield text
Example #19
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
        file_ = (line.encode("utf8") for line in file_)
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info(f"Using data from {input_path.parts[-1]}")
        file_ = input_path.open()  # type: ignore[assignment]
    for line in file_:
        data = srsly.json_loads(line)
        text = data["text"]
        yield text
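Both variants expect newline-delimited JSON with a "text" field per line. A small illustration of a valid input file (contents are made up):
{"text": "First example sentence."}
{"text": "Second example sentence."}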
Example #20
def validate_project_version(config: Dict[str, Any]) -> None:
    """If the project defines a compatible spaCy version range, chec that it's
    compatible with the current version of spaCy.
    config (Dict[str, Any]): The loaded config.
    """
    spacy_version = config.get("spacy_version", None)
    if spacy_version and not is_compatible_version(about.__version__,
                                                   spacy_version):
        err = (
            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
            f"that's not compatible with the version of spaCy you're running "
            f"({about.__version__}). You can edit version requirement in the "
            f"{PROJECT_FILE} to load it, but the project may not run as expected."
        )
        msg.fail(err, exits=1)
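A hedged usage sketch (the version range is illustrative):
config = {"spacy_version": ">=3.0.0,<4.0.0"}
validate_project_version(config)  # exits with an error if the running spaCy doesn't match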
Example #21
def git_sparse_checkout(repo, subpath, dest, branch):
    # We're using Git, partial clone and sparse checkout to
    # only clone the files we need
    # This ends up being RIDICULOUS. omg.
    # So, every tutorial and SO post talks about 'sparse checkout'...But they
    # go and *clone* the whole repo. Worthless. And cloning part of a repo
    # turns out to be completely broken. The only way to specify a "path" is..
    # a path *on the server*? The contents of which, specifies the paths. Wat.
    # Obviously this is hopelessly broken and insecure, because you can query
    # arbitrary paths on the server! So nobody enables this.
    # What we have to do is disable *all* files. We could then just checkout
    # the path, and it'd "work", but be hopelessly slow...Because it goes and
    # transfers every missing object one-by-one. So the final piece is that we
    # need to use some weird git internals to fetch the missings in bulk, and
    # *that* we can do by path.
    with make_tempdir() as tmp_dir:
        # This is the "clone, but don't download anything" part.
        cmd = (f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
               f"-b {branch} --filter=blob:none")
        run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        ret = run_command(cmd, capture=True)
        git_repo = _http_to_git(repo)
        # Now pass those missings into another bit of git internals
        missings = " ".join(
            [x[1:] for x in ret.stdout.split() if x.startswith("?")])
        if not missings:
            err = (
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?")
            msg.fail(err, exits=1)
        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
        run_command(cmd, capture=True)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
        run_command(cmd, capture=True)

        # Get a subdirectory of the cloned path, if appropriate
        source_path = tmp_dir / Path(subpath)
        if not is_subpath_of(tmp_dir, source_path):
            err = f"'{subpath}' is a path outside of the cloned repository."
            msg.fail(err, repo, exits=1)

        shutil.move(str(source_path), str(dest))
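A hedged usage sketch (the repository, subpath, and branch are illustrative):
from pathlib import Path

git_sparse_checkout(
    "https://github.com/explosion/projects",  # illustrative repository
    "pipelines/ner_demo",                     # illustrative subpath within the repo
    Path("ner_demo"),                         # local destination
    "v3",                                     # branch
)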
Example #22
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 2: Preprocess text in sense2vec's format

    Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
    and outputs a text file with one sentence per line in the expected sense2vec
    format (merged noun phrases, concatenated phrases with underscores and
    added "senses").

    Example input:
    Rats, mould and broken furniture: the scandal of the UK's refugee housing

    Example output:
    Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
    the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    with input_path.open("rb") as f:
        doc_bin_bytes = f.read()
    doc_bin = DocBin().from_bytes(doc_bin_bytes)
    msg.good(f"Loaded {len(doc_bin)} parsed docs")
    docs = doc_bin.get_docs(nlp.vocab)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc = merge_phrases(doc)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #23
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as fin:
        texts = [line.rstrip() for line in fin]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # print(doc)
            spans = get_phrases(doc, wn_lemmas)
            spans = filter_spans(spans)
            # print('NOUN SPAN', str(spans))
            doc = merge_phrases(doc, spans)
            spans = get_adjective_phrases(doc)
            spans = filter_spans(spans)
            # print('ADJ SPAN', str(spans))
            # print('*-----------------------------------------*')
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #24
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
        msg.warn(err)
    else:
        if not checksum:
            msg.good(f"Asset already exists: {dest}")
        elif checksum == get_checksum(dest):
            msg.good(f"Asset exists with matching checksum: {dest}")
        else:
            msg.fail(f"Asset available but with incorrect checksum: {dest}")
Example #25
def init_spacy_model(
    lang: str = "en", size: str = "md", model_type: str = "core"
) -> Language:
    name = ""
    for model_name in available_models:
        if (
            lang in model_name
            and size in model_name
            and model_type in model_name
        ):
            msg.good(f"Found model {model_name}")
            name = model_name
            break
        else:
            msg.fail("No compatible model of your choice.")
    model = load_spacy_model(name)
    return model
Example #26
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses all files in that directory.
    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    path = Path(path)
    if path.is_file():
        return hashlib.md5(Path(path).read_bytes()).hexdigest()
    if path.is_dir():
        # TODO: this is currently pretty slow
        dir_checksum = hashlib.md5()
        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
            dir_checksum.update(sub_file.read_bytes())
        return dir_checksum.hexdigest()
    msg.fail(f"Can't get checksum for {path}: not a file or directory",
             exits=1)
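A hedged usage sketch (paths are illustrative):
print(get_checksum("model.bin"))      # md5 of a single file
print(get_checksum(Path("corpus/")))  # combined md5 over all files in a directory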
Example #27
def read_vectors(vectors_loc):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    from tqdm import tqdm

    f = open_file(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    for i, line in enumerate(tqdm(f)):
        line = line.rstrip()
        pieces = line.rsplit(" ", vectors_data.shape[1])
        word = pieces.pop(0)
        if len(pieces) != vectors_data.shape[1]:
            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
    return vectors_data, vectors_keys
Example #28
def _load_file(file_path: Path, msg: Printer) -> None:
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        with msg.loading(f"Loading {file_name}..."):
            data = srsly.read_json(file_path)
        msg.good(f"Loaded {file_name}")
        return data
    elif file_path.suffix == ".jsonl":
        with msg.loading(f"Loading {file_name}..."):
            data = srsly.read_jsonl(file_path)
        msg.good(f"Loaded {file_name}")
        return data
    msg.fail(
        f"Can't load file extension {file_path.suffix}",
        "Expected .json or .jsonl",
        exits=1,
    )
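A hedged usage sketch (the file name is illustrative):
from pathlib import Path
from wasabi import Printer

msg = Printer()
data = _load_file(Path("annotations.jsonl"), msg)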
Example #29
    def commons_files(self):

        if self.wikidata_commons_category():
            category = pwb.Category(self.commons, self.wikidata_commons_category())

            try:
                return sum(1 for image in category.members(recurse=3, namespaces="File"))

            except RecursionError as error:
                msg.fail(f"Ha ocurrido un error de recursión: {error}")
                return "Demasiadas subcategorías"

            except (ValueError, AttributeError) as error:
                msg.fail(f"Ha ocurrido un error inespereado: {error}")
                return "0"
        else:
            return 0
Example #30
def get_compatibility() -> dict:
    version = get_minor_version(about.__version__)
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        msg.fail(
            f"Server error ({r.status_code})",
            f"Couldn't fetch compatibility table. Please find a package for your spaCy "
            f"installation (v{about.__version__}), and download it manually. "
            f"For more details, see the documentation: "
            f"https://spacy.io/usage/models",
            exits=1,
        )
    comp_table = r.json()
    comp = comp_table["spacy"]
    if version not in comp:
        msg.fail(f"No compatible packages found for v{version} of spaCy", exits=1)
    return comp[version]