def train_word_vectors(vectors_loc, lang='zh', model_name='zh_model'):
    """
    Load word vector data and build a spaCy model from scratch.
    :param vectors_loc:
    :param lang:
    :param model_name:
    :return:
    """
    if lang is None:
        nlp = Language()
    else:
        # create an empty language class
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        print(nr_row, nr_dim)
        nlp.vocab.reset_vectors(width=int(nr_dim))
        count = 0
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            # add the vectors to the vocab
            count += 1
            print(f'{word} added {count / int(nr_row) * 100} % ')
            nlp.vocab.set_vector(word, vector)
    nlp.to_disk("data/" + model_name)
    print('finishing!!!')
def load_tospacy(self, lang='en'):
    """
    Loads GloVe vectors from the file specified at initialization, sets the vectors and saves to disk.
    :param lang:
    :return:
    """
    if lang is None:
        # create a blank multi-language class with 'xx' if lang is None
        nlp = spacy.blank('xx')
    else:
        nlp = spacy.blank(lang)
    custom_log('PARSING GLOVE MODEL')
    with open(self.glovedir, 'r', encoding="utf8") as glove_file:
        model = {}
        for line in glove_file:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            nlp.vocab.set_vector(word, embedding)
    custom_log('VECTORS SET, SAVING TO DISK')
    nlp.to_disk(r'glove6B/glove-6B')
def main(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        print(nr_row, nr_dim)
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
            print(word)
    # test the vectors and similarity
    # text = '您好'
    # doc = nlp(text)
    # print(text, doc[0].similarity(doc[1]))
    nlp.to_disk("./zh_model")
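# A minimal sketch (not part of the original snippet) of loading the directory
# written by nlp.to_disk("./zh_model") back and checking that the vectors survived
# the round trip; the probe word is an assumption.
import spacy

nlp = spacy.load("./zh_model")
print(nlp.vocab.vectors.shape)           # rows/width as read from the header above
print(nlp.vocab.get_vector("您好")[:5])   # all zeros if the word was not in the vectors file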
def test_package(nlp: Language, chdir):
    d = Path().cwd()
    modeld = d / "model"
    pkgd = d / "package"
    pkgd.mkdir()
    nlp.to_disk(modeld)
    package(modeld, pkgd)
def test_serialize_config_language_specific():
    """Test that config serialization works as expected with language-specific
    factories."""
    name = "test_serialize_config_language_specific"

    @English.factory(name, default_config={"foo": 20})
    def custom_factory(nlp: Language, name: str, foo: int):
        return lambda doc: doc

    nlp = Language()
    assert not nlp.has_factory(name)
    nlp = English()
    assert nlp.has_factory(name)
    nlp.add_pipe(name, config={"foo": 100}, name="bar")
    pipe_config = nlp.config["components"]["bar"]
    assert pipe_config["foo"] == 100
    assert pipe_config["factory"] == name

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
    assert nlp2.has_factory(name)
    assert nlp2.pipe_names == ["bar"]
    assert nlp2.get_pipe_meta("bar").factory == name
    pipe_config = nlp2.config["components"]["bar"]
    assert pipe_config["foo"] == 100
    assert pipe_config["factory"] == name

    config = Config().from_str(nlp2.config.to_str())
    config["nlp"]["lang"] = "de"
    with pytest.raises(ValueError):
        # German doesn't have a factory, only English does
        load_model_from_config(config)
def test_issue999(train_data):
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
       end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
    ]
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for itn in range(100):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            nlp.update([raw_text], [{"entities": entity_offsets}])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    patterns = [{"label": "ORG", "pattern": "Apple"}]
    config = {"overwrite_ents": True}
    ruler = nlp.add_pipe("entity_ruler", config=config)
    ruler.add_patterns(patterns)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
def main(lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    file_loc = './广电全量地址.txt'
    # file_loc = '/home/siy/Downloads/guizhou/new/txt/0.txt'
    nr_dim = 768
    nlp.vocab.reset_vectors(width=int(nr_dim))
    cnt = 0
    with open(file_loc, 'r') as f:
        # df = pd.read_csv(f)
        lines = f.readlines()
        np.random.shuffle(lines)
        lines = lines[:10000]
        for line in lines:
            # line = line.decode()
            # print(line)
            line = strQ2B(line)
            line = line.strip()
            line = clr(line)
            line = line.strip()
            vecs = []
            try:
                print(line)
                vecs = bc.encode(list(line))
            except:
                traceback.print_exc()
                print(list(line))
                continue
            for char, vec in zip(line, vecs):
                try:
                    # use the precomputed vector for this character
                    nlp.vocab.set_vector(ord(char), vec)
                except BaseException:
                    traceback.print_exc()
                    print(char)
                    continue
            cnt += 1
            print('bingo, i write in %s' % cnt)
    nlp.to_disk('./zh_models')
def main(vectors_loc=None, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(VECTORS_PATH, "rb") as file_:
        print("loading vectors...")
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode("utf8")
            pieces = line.rsplit(" ", int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    print("training tags...")
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)
    # test the trained model
    test_text = "Eu desejo ouvir uma música muito boa"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
    print("Saved model to nl_model_tagger")
    nlp.to_disk("/app/model")
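# A hedged follow-up sketch, not part of the original snippet: reload the tagger
# model saved to /app/model and tag a new sentence (the sentence is an assumption).
import spacy

nlp2 = spacy.load("/app/model")
doc = nlp2("Eu quero ouvir outra música")
print([(t.text, t.tag_, t.pos_) for t in doc])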
def test_serialize_with_custom_tokenizer():
    """Test that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
    suffix_re = re.compile(r"""""")
    infix_re = re.compile(r"""[~]""")

    def custom_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
        )

    nlp = Language()
    nlp.tokenizer = custom_tokenizer(nlp)
    with make_tempdir() as d:
        nlp.to_disk(d)
def main(vectors_loc, lang=None, model_name='zh_model'):
    if lang is None:
        nlp = Language()
    else:
        # create an empty language class
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        print(nr_row, nr_dim)
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            # add the vectors to the vocab
            nlp.vocab.set_vector(word, vector)
    nlp.to_disk("data/" + model_name)
    print('finishing!!!')
def recreateWordVectors(vectors_loc="wordfeats/glove.6B/glove.6B.50d.txt", save_loc="wordfeats"):
    lang = "en"
    if lang is None:
        nlp = Language()
    else:
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        # nr_row, nr_dim = header.split()
        nr_dim = 50
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    nlp.to_disk(save_loc)
    return
def main(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    print('=' * 20)
    new_model = KeyedVectors.load_word2vec_format(vectors_loc, binary=True)
    for word in new_model.wv.index2word:
        vector = numpy.asarray([float(v) for v in new_model[word]], dtype='f')
        nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
        # print(word, vector)
    print('=' * 20)
    # test the vectors and similarity
    # text = '不同'
    # doc = nlp(text)
    # print(text, doc[0].similarity(doc[1]))
    # print('='*20)
    nlp.to_disk("./zh_model")
def save_model(nlp: Language, path: Path) -> None:
    nlp.to_disk(path)
    logger.info(f"Saved the model in {str(path.absolute())}")
def save_model(nlp: Language, output_path: str):
    nlp.to_disk(output_path)
def test_serialize_language_meta_disk(meta_data):
    language = Language(meta=meta_data)
    with make_tempdir() as d:
        language.to_disk(d)
        new_language = Language().from_disk(d)
    assert new_language.meta == language.meta
"neice", "king", "queen", "dude", "guy", "gal", "fire", "dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water", "person", "family", "brother", "sister", ] nlp = spacy.load("en_core_web_md") vec_data = {w: nlp(w).vector for w in words} vocab = Vocab(strings=words) for word, vector in vec_data.items(): vocab.set_vector(word, vector) nlp = Language(vocab=vocab, meta={"lang": "en"}) vocab.to_disk("tests/custom_test_vocab") print("local vocab saved for spacy") nlp.to_disk("tests/custom_test_lang") print("local nlp saved for spacy")
def save_model(model: Language, project=None, session=None):
    output_dir = get_model_dir(project, session)
    logging.debug("Saving model to {}...".format(output_dir))
    model.to_disk(output_dir)
    return output_dir