def init_model(lang, output_dir, freqs_loc=None, vectors_loc=None,
               no_expand_vectors=False, meta_overrides=None,
               prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)
    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find word frequencies file", freqs_loc, exits=1)
    probs, oov_prob = (
        read_freqs(freqs_loc, min_freq=min_word_frequency)
        if freqs_loc is not None
        else ({}, -20)
    )
    vectors_data, vector_keys = (
        read_vectors(vectors_loc) if vectors_loc else (None, None)
    )
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                       not no_expand_vectors, prune_vectors)

    # Insert our custom tokenizer into the base model.
    # nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.tokenizer = Tokenizer(nlp.vocab)

    if meta_overrides is not None:
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
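# A minimal usage sketch for init_model above; the language code, output
# directory and frequency-file path are hypothetical, and freqs_loc may be
# any local path or URL that cached_path accepts.
def _demo_init_model():
    nlp = init_model(
        "en",
        "models/en_demo",
        freqs_loc="data/word_freqs.txt.gz",
        min_word_frequency=50,
    )
    print(nlp.meta.get("version"))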
def test_de_tagger_tagset(NLP, test_file):
    """Check that no tags outside the tagset are used."""
    gold_tags = set([
        "$(", "$,", "$.", "ADJA", "ADJD", "ADV", "APPO", "APPR", "APPRART",
        "APZR", "ART", "CARD", "FM", "ITJ", "KOKOM", "KON", "KOUI", "KOUS",
        "NE", "NN", "NNE", "PDAT", "PDS", "PIAT", "PIS", "PPER", "PPOSAT",
        "PPOSS", "PRELAT", "PRELS", "PRF", "PROAV", "PTKA", "PTKANT",
        "PTKNEG", "PTKVZ", "PTKZU", "PWAT", "PWAV", "PWS", "TRUNC", "VAFIN",
        "VAIMP", "VAINF", "VAPP", "VMFIN", "VMINF", "VMPP", "VVFIN", "VVIMP",
        "VVINF", "VVIZU", "VVPP", "XY"
    ])
    data_path = os.path.join(TEST_FILES_DIR, test_file)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        raise FileNotFoundError("Test corpus not found", data_path)
    corpus = GoldCorpus(data_path, data_path)
    dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
    pred_tags = set()
    tagger = NLP.get_pipe('tagger')
    for doc, _ in dev_docs:
        tagger(doc)
        pred_tags = pred_tags.union(set([t.tag_ for t in doc]))
    assert len(pred_tags - gold_tags) == 0
def from_disk(self, path, **kwargs):
    """Load waterwheel from a file.

    Expects file to contain a bytestring of the following dict format:
    {
        'stop_words': {},
        'vocab': {},
        'wikidata': {},
        'doc_bins': doc_bins_bytes,
    }

    Parameters
    ----------
    path : Path
        Path to the serialized file.

    Returns
    -------
    self : WaterWheel
        The loaded WaterWheel object.
    """
    path = ensure_path(path)
    with open(path, 'rb') as file:
        serial = file.read()
    self.from_bytes(serial)
    return self
def main(name: ("模型名称", "positional", None, None, trf_list), make_cache_dir: (" 创建缓存文件夹", "flag", "mk"), use_local_class: ("不使用网络读取", "flag", "local")): if make_cache_dir: c_path = ensure_path(f"{cache_path + name}") if c_path.exists(): msg.warn(f"{cache_path + name} already exists") else: c_path.mkdir() msg.good(f" 缓存文件夹已创建:\t{cache_path}{name}") msg.warn("\n================url================\n") config_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[name] model_file = ALL_PRETRAINED_MODEL_ARCHIVE_MAP[name] msg.text(f"{config_file}\n{model_file}\n") vocab = get_tokenizer(name, use_local_class) pretrained_vocab_files_map = vocab.pretrained_vocab_files_map for vocab_file in pretrained_vocab_files_map.values(): msg.text(f"{vocab_file[name]}\n") msg.warn("\n================url================\n") msg.good("\n使用下载工具下载后,将模型文件放入缓存文件夹中。")
def to_disk(self, output_path: Path, force: bool = False,
            save_examples: bool = True) -> None:
    """Save Dataset to disk

    Args:
        output_path (Path): Output file path to save data to
        force (bool): Force save to directory. Create parent directories
            or overwrite existing data.
        save_examples (bool): Save the example store along with the state.
    """
    output_path = ensure_path(output_path)
    output_dir = output_path.parent
    state_dir = output_dir / ".recon" / self.name
    if force:
        output_dir.mkdir(parents=True, exist_ok=True)
        if not state_dir.exists():
            state_dir.mkdir(parents=True, exist_ok=True)
    ds_op_state = DatasetOperationsState(
        name=self.name,
        commit=self.commit_hash,
        size=len(self),
        operations=self.operations,
    )
    srsly.write_json(state_dir / "state.json", ds_op_state.dict())
    if save_examples:
        self.example_store.to_disk(state_dir / "example_store.jsonl")
    srsly.write_jsonl(output_path, [e.dict() for e in self.data])
def from_disk(
    self,
    data_dir: Path,
    train_file: str = "train.jsonl",
    dev_file: str = "dev.jsonl",
    test_file: str = "test.jsonl",
    loader_func: Callable = read_jsonl,
) -> "Corpus":
    """Load Corpus from disk given a directory with files named explicitly
    train.jsonl, dev.jsonl, and test.jsonl

    Args:
        data_dir (Path): Directory to load from.
        train_file (str, optional): Filename of train data under data_dir.
            Defaults to train.jsonl.
        dev_file (str, optional): Filename of dev data under data_dir.
            Defaults to dev.jsonl.
        test_file (str, optional): Filename of test data under data_dir.
            Defaults to test.jsonl.
        loader_func (Callable, optional): Callable that reads a file and
            returns a List of examples.
            Defaults to [read_jsonl][recon.loaders.read_jsonl]
    """
    data_dir = ensure_path(data_dir) / self.name
    train = Dataset("train").from_disk(data_dir / train_file)
    dev = Dataset("dev").from_disk(data_dir / dev_file)
    try:
        test = Dataset("test").from_disk(data_dir / test_file)
        corpus = self(self.name, train, dev, test=test)
    except ValueError:
        # No test set on disk; fall back to train/dev only.
        corpus = self(self.name, train, dev)
    return corpus
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    # add ner pipe
    ner1 = nlp1.add_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.initialize()
    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]
    # Add the label explicitly. Previously we didn't require this.
    ner1.add_label("MY_ORG")
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels
    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)
        config = {}
        ner2 = nlp1.create_pipe("ner", config=config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()
    # add ner pipe
    ner = nlp.add_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.initialize()
    # Add entity ruler
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    # serialization works fine with "after"; the bug occurred with "before"
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(patterns)
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"
    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(
        entity_list=["Q1", "Q1"],
        freq_list=[32, 111],
        vector_list=[vector1, vector2],
    )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1
def dump(self, path: Path):
    path = ensure_path(path)
    super().dump(str(path / "kb"))
    cfg = {
        "k": self.k,
        "m_parameter": self.m_parameter,
        "ef_search": self.ef_search,
        "ef_construction": self.ef_construction,
        "n_threads": self.n_threads,
    }
    cg_cfg_path = path / "cg_cfg"
    aliases_path = path / "aliases.json"
    short_aliases_path = path / "short_aliases.json"
    ann_index_path = path / "ann_index.bin"
    tfidf_vectorizer_path = path / "tfidf_vectorizer.joblib"
    tfidf_vectors_path = path / "tfidf_vectors_sparse.npz"

    srsly.write_json(cg_cfg_path, cfg)
    srsly.write_json(aliases_path, self.aliases)
    srsly.write_json(short_aliases_path, list(self.short_aliases))
    self.ann_index.saveIndex(str(ann_index_path))
    joblib.dump(self.vectorizer, tfidf_vectorizer_path)
    scipy.sparse.save_npz(tfidf_vectors_path,
                          self.alias_tfidfs.astype(np.float16))
def test_en_parser_depset(NLP, test_file):
    """Check that no dependency labels outside the label set are produced."""
    gold_deps = set([
        "ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos",
        "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj",
        "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj",
        "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass",
        "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj",
        "predet", "prep", "prt", "punct", "quantmod", "relcl", "root", "xcomp"
    ])
    data_path = os.path.join(TEST_FILES_DIR, test_file)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        raise FileNotFoundError("Test corpus not found", data_path)
    corpus = GoldCorpus(data_path, data_path)
    dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
    pred_deps = set()
    parser = NLP.get_pipe('parser')
    for doc, _ in dev_docs:
        parser(doc)
        pred_deps = pred_deps.union(set([t.dep_ for t in doc]))
    # Print any unexpected labels to aid debugging on failure.
    print(pred_deps - gold_deps)
    assert len(pred_deps - gold_deps) == 0
def test_tokenizer_handle_text_from_file(combined_rule_tokenizer_fixture, file_name):
    loc = util.ensure_path(__file__).parent / file_name
    with loc.open('r', encoding='utf8') as infile:
        text = infile.read()
    assert len(text) != 0
    tokens = combined_rule_tokenizer_fixture(text)
    assert len(tokens) > 100
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    loc = ensure_path(__file__).parent / file_name
    with loc.open("r", encoding="utf8") as infile:
        text = infile.read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"
def load_texts(path):
    """Load inputs from a JSONL file.

    Each line should be a dict like {"text": "..."}.
    """
    path = ensure_path(path)
    with path.open("r", encoding="utf8") as file_:
        texts = [json.loads(line) for line in file_]
    random.shuffle(texts)
    return texts
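# Quick usage sketch for load_texts above; the JSONL path is hypothetical and
# each returned record is the parsed dict from one line, e.g. {"text": "..."}.
def _demo_load_texts():
    for record in load_texts("data/inputs.jsonl")[:3]:
        print(record["text"])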
def to_disk(self, path: Path, exclude: Tuple = tuple(), **kwargs):
    """Serialize RemoteAnnLinker to disk.

    path (Path): directory to serialize to
    exclude (Tuple, optional): config to exclude. Defaults to tuple().
    """
    path = ensure_path(path)
    serializers = {"cfg": lambda p: srsly.write_json(p, self.cfg)}
    to_disk(path, serializers, {})
def test_de_tagger_corpus(NLP, test_file, accuracy_threshold):
    data_path = os.path.join(TEST_FILES_DIR, test_file)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        raise FileNotFoundError("Test corpus not found", data_path)
    corpus = GoldCorpus(data_path, data_path)
    dev_docs = list(corpus.dev_docs(NLP, gold_preproc=False))
    scorer = NLP.evaluate(dev_docs)
    assert scorer.tags_acc > accuracy_threshold
def from_disk(self, path: Path, loader_func: Callable = read_jsonl) -> "Dataset":
    """Load Dataset from disk given a path and a loader function that reads
    the data and returns an iterator of Examples

    Args:
        path (Path): path to load from
        loader_func (Callable, optional): Callable that reads a file and
            returns a List of examples.
            Defaults to [read_jsonl][recon.loaders.read_jsonl]
    """
    path = ensure_path(path)
    ds_op_state = None
    if (path.parent / ".recon" / self.name).exists():
        state = srsly.read_json(path.parent / ".recon" / self.name / "state.json")
        ds_op_state = DatasetOperationsState(**state)
        self.operations = ds_op_state.operations

    data = loader_func(path)
    self.data = data
    for example in self.data:
        self.example_store.add(example)

    if ds_op_state and self.commit_hash != ds_op_state.commit:
        # Dataset changed, examples added
        self.operations.append(
            OperationState(
                name="examples_added_external",
                status=OperationStatus.COMPLETED,
                ts=datetime.now(),
                examples_added=max(len(self) - ds_op_state.size, 0),
                examples_removed=max(ds_op_state.size - len(self), 0),
                examples_changed=0,
                transformations=[],
            )
        )
        for op in self.operations:
            op.status = OperationStatus.NOT_STARTED

    seen: Set[str] = set()
    operations_to_run: Dict[str, OperationState] = {}

    for op in self.operations:
        if (
            op.name not in operations_to_run
            and op.name in registry.operations
            and op.status != OperationStatus.COMPLETED
        ):
            operations_to_run[op.name] = op

    for op_name, state in operations_to_run.items():
        op = registry.operations.get(op_name)
        self.apply_(op, *state.args, initial_state=state, **state.kwargs)  # type: ignore

    return self
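# Hedged round-trip sketch for the Dataset serializers (to_disk appears
# earlier in this file): save to a JSONL path, then reload; from_disk also
# replays any registered operations that are not yet marked COMPLETED. The
# path and dataset name are hypothetical.
def _demo_dataset_roundtrip(dataset):
    dataset.to_disk(Path("corpus/train.jsonl"), force=True)
    reloaded = Dataset("train").from_disk(Path("corpus/train.jsonl"))
    assert len(reloaded) == len(dataset)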
def to_disk(self, path: Path) -> None:
    """Save store to disk

    Args:
        path (Path): Path to save store to
    """
    path = ensure_path(path)
    examples = []
    for example_hash, example in self._map.items():
        examples.append({"example_hash": example_hash, "example": example.dict()})
    srsly.write_jsonl(path, examples)
def __init__(
    self,
    path: Union[str, Path],
    *,
    limit: int = 0,
    min_length: int = 0,
    max_length: int = 0,
) -> None:
    self.path = util.ensure_path(path)
    self.limit = limit
    self.min_length = min_length
    self.max_length = max_length
def to_disk(self, path, **kwargs):
    """Serialize waterwheel data to a file.

    Parameters
    ----------
    path : Path
        Path to file.
    """
    path = ensure_path(path)
    serial = self.to_bytes()
    srsly.write_msgpack(path, serial)
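# Hedged round-trip sketch for the WaterWheel serializers (from_disk appears
# earlier in this file): to_disk persists the component's bytestring and
# from_disk reads it back. `ww` is an initialized WaterWheel instance; the
# file path is hypothetical.
def _demo_waterwheel_roundtrip(ww):
    ww.to_disk(Path("waterwheel.bin"))
    ww.from_disk(Path("waterwheel.bin"))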
def open_file(loc):
    """Handle .gz, .tar.gz, .zip or unzipped files."""
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        return tarfile.open(str(loc), 'r:gz')
    elif loc.parts[-1].endswith('gz'):
        return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
    elif loc.parts[-1].endswith('zip'):
        zip_file = zipfile.ZipFile(str(loc))
        names = zip_file.namelist()
        file_ = zip_file.open(names[0])
        return (line.decode('utf8') for line in file_)
    else:
        return loc.open('r', encoding='utf8')
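# Usage sketch for open_file above; the archive name is hypothetical. For
# gzip, zip and plain-text inputs this yields decoded lines; note that for
# tarballs it returns the TarFile object itself rather than a line iterator.
def _demo_open_file():
    for line in open_file("dumps/enwiki-latest.txt.gz"):
        print(line.rstrip())
        break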
def from_disk(self, path: Path, **kwargs):
    """Deserialize saved RemoteAnnLinker from disk.

    path (Path): directory to deserialize from
    RETURNS (RemoteAnnLinker): Initialized RemoteAnnLinker
    """
    path = ensure_path(path)
    cfg = {}
    deserializers = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
    from_disk(path, deserializers, {})
    self.cfg.update(cfg)
    self.base_url = cfg.get('base_url')
    self.headers = cfg.get('headers', {})
    return self
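# Hedged round-trip sketch for the RemoteAnnLinker serializers above: to_disk
# writes the cfg JSON into a directory and from_disk restores base_url and
# headers from it. `linker` is an initialized RemoteAnnLinker; the directory
# is hypothetical.
def _demo_remote_ann_linker_roundtrip(linker):
    linker.to_disk(Path("linker_cfg"))
    linker.from_disk(Path("linker_cfg"))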
def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.to_disk(str(file_path))
        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.from_disk(str(file_path))
    # final assertions
    _check_kb(kb2)
def from_disk(self, path: Path) -> "ExampleStore": """Load store from disk Args: path (Path): Path to file to load from Returns: ExampleStore: Initialized ExampleStore """ path = ensure_path(path) examples = srsly.read_jsonl(path) for e in examples: example_hash = e["example_hash"] raw_example = e["example"] example = Example(**raw_example) assert hash(example) == example_hash self.add(example) return self
def to_disk(self, path: Path, exclude: Tuple = tuple(), **kwargs):
    """Serialize AnnLinker to disk.

    path (Path): directory to serialize to
    exclude (Tuple, optional): config to exclude. Defaults to tuple().
    """
    path = util.ensure_path(path)
    if not path.exists():
        path.mkdir()
    cfg = {
        "threshold": self.threshold,
        "no_description_threshold": self.no_description_threshold,
        "disambiguate": self.disambiguate,
    }
    srsly.write_json(path / "cfg", cfg)
    self.kb.dump(path / "kb")
    self.cg.to_disk(path)
def main(input_path: str, output_path: str, min_word_frequency: int):
    if input_path is not None:
        input_path = cached_path(input_path)
        input_path = ensure_path(input_path)
    probs, oov_prob = (
        read_freqs(input_path, min_freq=min_word_frequency)
        if input_path is not None
        else ({}, -20)
    )
    with open(output_path, "w") as _jsonl_file:
        # Header line with the language settings, including the estimated
        # out-of-vocabulary probability.
        _jsonl_file.write(
            json.dumps({"lang": "en", "settings": {"oov_prob": oov_prob}})
        )
        _jsonl_file.write("\n")
        for word, prob in probs.items():
            _jsonl_file.write(json.dumps({"orth": word, "prob": prob}))
            _jsonl_file.write("\n")
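# Hypothetical invocation of main above. The output is a lexeme-probability
# JSONL: a settings header line followed by one record per retained word,
# roughly (values illustrative):
#
#   {"lang": "en", "settings": {"oov_prob": -20.5}}
#   {"orth": "the", "prob": -3.07}
def _demo_freqs_to_jsonl():
    main("data/word_freqs.txt.gz", "lexeme_probs.jsonl", min_word_frequency=50)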
def to_disk(self, data_dir: Path, force: bool = False) -> None:
    """Save Corpus to Disk

    Args:
        data_dir (Path): Directory to save data to
        force (bool): Force save to directory. Create parent directories
            or overwrite existing data.
    """
    data_dir = ensure_path(data_dir) / self.name
    state_dir = data_dir / ".recon"
    if force:
        data_dir.mkdir(parents=True, exist_ok=True)
        if not state_dir.exists():
            state_dir.mkdir(parents=True, exist_ok=True)
    self._train.to_disk(data_dir / "train.jsonl", force=force, save_examples=False)
    self._dev.to_disk(data_dir / "dev.jsonl", force=force, save_examples=False)
    if self._test:
        self._test.to_disk(data_dir / "test.jsonl", force=force, save_examples=False)
    self.example_store.to_disk(state_dir / "example_store.jsonl")
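# Hedged round-trip sketch for the Corpus serializers above: to_disk writes
# train/dev(/test) JSONL files under data_dir / corpus name, and the loader
# defined earlier reads them back. In the full implementation from_disk is
# presumably bound as a classmethod (its `self` is called as a constructor),
# so the Corpus class and directory here are hypothetical.
def _demo_corpus_roundtrip(corpus):
    corpus.to_disk(Path("data/corpora"), force=True)
    return Corpus.from_disk(Path("data/corpora"))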
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab
    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()
    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]
    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels
    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)
        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
def load_bulk(self, path: Path):
    path = ensure_path(path)
    super().load_bulk(str(path / "kb"))

    aliases_path = path / "aliases.json"
    short_aliases_path = path / "short_aliases.json"
    ann_index_path = path / "ann_index.bin"
    tfidf_vectorizer_path = path / "tfidf_vectorizer.joblib"
    tfidf_vectors_path = path / "tfidf_vectors_sparse.npz"

    cfg = srsly.read_json(path / "cg_cfg")
    self.k = cfg.get("k", 5)
    self.m_parameter = cfg.get("m_parameter", 100)
    self.ef_search = cfg.get("ef_search", 200)
    self.ef_construction = cfg.get("ef_construction", 2000)
    self.n_threads = cfg.get("n_threads", 60)

    aliases = srsly.read_json(aliases_path)
    short_aliases = set(srsly.read_json(short_aliases_path))
    tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
    alias_tfidfs = scipy.sparse.load_npz(tfidf_vectors_path).astype(np.float32)
    ann_index = nmslib.init(
        method="hnsw",
        space="cosinesimil_sparse",
        data_type=nmslib.DataType.SPARSE_VECTOR,
    )
    ann_index.addDataPointBatch(alias_tfidfs)
    ann_index.loadIndex(str(ann_index_path))
    query_time_params = {"efSearch": self.ef_search}
    ann_index.setQueryTimeParams(query_time_params)

    self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer,
                     alias_tfidfs)
    return self
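# Hedged round-trip sketch for the ANN-backed candidate-generator KB above:
# dump (defined earlier in this file) writes the underlying KB plus the
# TF-IDF vectorizer, sparse vectors and HNSW index into a directory, and
# load_bulk restores all of them. `kb` is an initialized instance; the
# directory is hypothetical and assumed to exist.
def _demo_ann_kb_roundtrip(kb):
    kb.dump(Path("kb_dir"))
    kb.load_bulk(Path("kb_dir"))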
def test_util_ensure_path_succeeds(text):
    path = util.ensure_path(text)
    assert isinstance(path, Path)
def test_tokenizer_handle_text_from_file(tokenizer, file_name):
    loc = ensure_path(__file__).parent / file_name
    with loc.open("r", encoding="utf8") as infile:
        text = infile.read()
    assert len(text) != 0
    tokens = tokenizer(text)
    assert len(tokens) > 100