Example #1
def convert(
    input_file,
    output_dir="-",
    file_type="jsonl",
    n_sents=1,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe it forward to a JSONL file:
    $ spacy convert some_file.conllu > some_file.jsonl
    """
    msg = Printer()
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    input_data = input_path.read_text(encoding="utf-8")
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
Example #2
# Nested helper: in the original source this is defined inside a to_disk
# method, so `self` is captured from the enclosing scope.
def save_pkuseg_processors(path):
    if self.pkuseg_seg:
        data = (
            _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie),
            self.pkuseg_seg.postprocesser.do_process,
            sorted(list(self.pkuseg_seg.postprocesser.common_words)),
            sorted(list(self.pkuseg_seg.postprocesser.other_words)),
        )
        srsly.write_msgpack(path, data)
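
A matching loader is not shown in the source; a sketch of what reading the payload back might look like (msgpack has no tuple type, so the saved tuple comes back as a list):

import srsly

def load_pkuseg_processors(path):
    # unpack the four fields positionally, mirroring the tuple written above
    trie_data, do_process, common_words, other_words = srsly.read_msgpack(path)
    return trie_data, do_process, set(common_words), set(other_words)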
Example #3
    def to_disk(self, path, **kwargs):
        """Serialize waterwheel data to a file.
        
        Parameters
        ----------
        path : Path
            path to file.
        """

        path = ensure_path(path)
        serial = self.to_bytes()
        srsly.write_msgpack(path, serial)
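
A from_disk counterpart would mirror this method; the sketch below assumes the class also defines a matching from_bytes:

import srsly
from spacy.util import ensure_path

def from_disk(self, path, **kwargs):
    """Load waterwheel data from a file (sketch; from_bytes is assumed)."""
    path = ensure_path(path)
    serial = srsly.read_msgpack(path)  # bytes payload written by to_disk
    return self.from_bytes(serial)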
Example #4
    def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Serialize a Sense2Vec object to a directory.

        path (unicode / Path): The path to the output directory.
        exclude (list): Names of serialization fields to exclude.
        """
        path = Path(path)
        self.vectors.to_disk(path)
        srsly.write_json(path / "cfg", self.cfg)
        srsly.write_json(path / "freqs.json", list(self.freqs.items()))
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
        if "cache" not in exclude and self.cache:
            srsly.write_msgpack(path / "cache", self.cache)
Example #5
def write_data_files(vocab: Dict, wikidata: Dict, stop_words: set,
                     doc_bins_bytes: Dict):
    """Writes necessary data to resource files.
    
    Parameters
    ----------
    vocab: Dict
        A dictionary mapping each water body type to its hash value.
    wikidata: Dict
        A dictionary of wikilink data for each water body.
    stop_words: set
        A set of common words in English.
    doc_bins_bytes: Dict
        A dictionary of DocBin bytes for each water body type.
    """

    serial = OrderedDict((
        ('stop_words', list(stop_words)),
        ('vocab', vocab),
        ('wikidata', wikidata),
        ('doc_bins', doc_bins_bytes),
    ))
    srsly.write_msgpack(doc_bins_file, serial)  # doc_bins_file is defined outside this snippet
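
Reading the bundle back is symmetric; a sketch, with a hypothetical path standing in for the module-level doc_bins_file:

import srsly

doc_bins_file = "resources/water_bodies.msgpack"  # hypothetical output path
serial = srsly.read_msgpack(doc_bins_file)
stop_words = set(serial["stop_words"])  # keys mirror the OrderedDict above
vocab, wikidata = serial["vocab"], serial["wikidata"]
doc_bins_bytes = serial["doc_bins"]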
Example #6
    def to_disk(
        self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Serialize the AttributeRuler to disk.

        path (Union[Path, str]): A path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/attributeruler#to_disk
        """
        serialize = {
            "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
            "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
        }
        util.to_disk(path, serialize, exclude)
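
spacy.util.to_disk creates the target directory and calls each lambda with a path inside it, so the vocab data and a msgpack patterns file end up side by side. A hypothetical usage (pattern and directory made up):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("attribute_ruler")
ruler.add(patterns=[[{"ORTH": "Dr."}]], attrs={"LEMMA": "doctor"})
ruler.to_disk("./attribute_ruler")  # writes vocab data plus the patterns file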
Example #7
def main(vectors,
         gpu_id=-1,
         n_neighbors=100,
         batch_size=1024,
         cutoff=0,
         start=0,
         end=None):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option restricts neighbor
    candidates to the first N rows of the vector table. For instance, if cutoff
    is 100000, no word will have a nearest neighbor outside of the top 100k
    vectors.
    """
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device  # also binds the top-level `cupy` name

        # take_along_axis is a compatibility helper defined elsewhere in the
        # original script, backporting the function for older cupy versions.
        cupy.take_along_axis = take_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        vectors = xp.load(str(vectors_file))
    msg.good(
        f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}"
    )
    norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1
    # Normalize to unit norm
    vectors /= norms
    if cutoff < 1:
        cutoff = vectors.shape[0]
    if end is None:
        end = vectors.shape[0]
    # Report the distribution of the original (pre-normalization) norms
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(
        f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    n = min(n_neighbors, vectors.shape[0])
    subset = vectors[:cutoff]
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = vectors[i:i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them.
        for j in range(size):
            if i + j < sims.shape[1]:
                sims[j, i + j] = -xp.inf
        # This used to use argpartition to do a partial sort... but that ended
        # up being a rat's nest of terrible numpy crap. Just sorting the whole
        # list isn't really slower, and it's much simpler to read.
        ranks = xp.argsort(sims, axis=1)
        batch_rows = ranks[:, -n:]
        # Reverse
        batch_rows = batch_rows[:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        best_rows[i:i + size] = batch_rows
        scores[i:i + size] = batch_scores
    msg.info("Saving output")
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")
Example #8
def convert(
    input_file,
    output_dir="-",
    file_type="json",
    n_sents=1,
    seg_sents=False,
    model=None,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe it forward to a JSON file:
    $ spacy convert some_file.conllu > some_file.json
    """
    no_print = output_dir == "-"
    msg = Printer(no_print=no_print)
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    input_data = input_path.read_text(encoding="utf-8")
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter == "ner" or converter == "iob":
        converter_autodetect = autodetect_ner_format(input_data)
        if converter_autodetect == "ner":
            msg.info("Auto-detected token-per-line NER format")
            converter = converter_autodetect
        elif converter_autodetect == "iob":
            msg.info("Auto-detected sentence-per-line NER format")
            converter = converter_autodetect
        else:
            msg.warn(
                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
            )
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    data = func(
        input_data,
        n_sents=n_sents,
        seg_sents=seg_sents,
        use_morphology=morphology,
        lang=lang,
        model=model,
        no_print=no_print,
    )
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(
            input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents): {}".format(
            len(data), output_file))
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)