class CkSpacyModel:
    def __init__(self, xml_dir, output_dir, section_names):
        self.xml_dir = xml_dir
        self.output_dir = output_dir
        self.section_names = section_names
        self.__current_xml_files_for_spacy_preprocessing = []
        self.__filenames = []
        self._TEXTS = []
        self._current_TEXTS_idx = 0
        self.nlp = spacy.load('en_core_web_md')
        self.ruler = EntityRuler(self.nlp, overwrite_ents=True).from_disk("./patterns.jsonl")
        #self.ruler = EntityRuler(self.nlp)
        self._current_sentence_idx = 0
        self.TRAIN_DATA = []
        self.stringstore = 0
        self.matcher = Matcher(self.nlp.vocab)
        Token.set_extension("is_unit", getter=self.is_unit)
        Token.set_extension("alt_text", default=None)  # getter=self.get_alt_text
        Token.set_extension("alt_text_keep", default=True)  # whether this token should be kept in the alternative text (necessary because of trailing whitespace)
        Token.set_extension("alt_text_trailing_whitespace_", default=" ")
        self.matcher_units = PhraseMatcher(self.nlp.vocab)  # the PhraseMatcher for unit detection for alternative words
        self.matcher_alt_text = Matcher(self.nlp.vocab)
        self.pattern_file_custom_matcher_alt_text = "./Lib/units.jsonl"

    def pre_process(self):
        print('starting preprocess')
        self.nlp.add_pipe(self.ruler, after="ner")
        self.nlp.add_pipe(self.custom_pipe_component_phrase_entity, before="ner")
        #self.nlp.add_pipe(self.custom_pipe_component_Name_et_al, after="ner")
        #self.nlp.add_pipe(self.custom_pipe_component_Quantity, last=True)
        #self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit, last=True)
        # load the patterns into the matcher
        self.custom_matcher_alt_text()
        # self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit_text, last=True)
        self.nlp.add_pipe(self.custom_pipe_comp_alt_text, last=True)  # last comes the word substitution for the simplified English, e.g. 10 mg -> xy mg
        self.extract_text()

    def reintegrate_patterns_to_ruler(self, file):
        self.ruler = EntityRuler(self.nlp).from_disk(file)
        #self.nlp.remove_pipe("ruler")
        self.nlp.replace_pipe("entity_ruler", self.ruler)
        #self.nlp.add_pipe(self.ruler, before="ner")
        #* The entity ruler is designed to integrate with spaCy’s existing statistical models
        #* and enhance the named entity recognizer. If it’s added before the "ner" component,
        #* the entity recognizer will respect the existing entity spans and adjust its
        #* predictions around it. This can significantly improve accuracy in some cases.
        #* If it’s added after the "ner" component, the entity ruler will only add spans to
        #* the doc.ents if they don’t overlap with existing entities predicted by the model.
        #* To overwrite overlapping entities, you can set overwrite_ents=True on initialization.
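    # A minimal sketch of the placement rule described above (illustration only,
    # not called anywhere in this class): the same ruler instance can be added
    # before or after "ner", with overwrite_ents=True controlling whether its
    # spans may replace overlapping model predictions.
    #
    #   ruler = EntityRuler(nlp, overwrite_ents=True).from_disk("./patterns.jsonl")
    #   nlp.add_pipe(ruler, before="ner")   # NER adjusts around the rule-based spans
    #   nlp.add_pipe(ruler, after="ner")    # or: the ruler only fills gaps left by NER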
    def show_ents(self, doc):
        if doc.ents:
            for ent in doc.ents:
                print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
        else:
            print('No named entities found.')

    def get_next_sentence(self):
        self._current_TEXT = self._TEXTS[self._current_TEXTS_idx]
        self._current_doc = self.nlp(self._current_TEXT)
        sentences = list(self._current_doc.sents)
        sentence = sentences[self._current_sentence_idx]
        if self._current_sentence_idx < len(sentences) - 1:
            self._current_sentence_idx += 1
        else:
            self._current_sentence_idx = 0
            print('next document')
            if self._current_TEXTS_idx < len(self._TEXTS) - 1:
                self._current_TEXTS_idx += 1
            else:
                print('end of Text list')
        sentence = self.nlp(sentence.text)
        unknown_words = []
        for token in sentence:
            #print("check for : " + token.text)
            if token.is_oov:
                unknown_words.append(token)
                #print("not found: " + token.text)
            print(f"token.text = {token.text:{18}} : token._.alt_text = {token._.alt_text:{10}}")
        return (sentence, unknown_words)

    def add_pattern_to_entity_ruler(self, patterns, file):
        # I could not get the check for identical lines to work,
        # so duplicate lines are detected and removed afterwards
        self.ruler.add_patterns(patterns)
        self.ruler.to_disk(file)
        uniqlines = set(open(file).readlines())
        with open(file, 'w', encoding='utf8') as fp:
            for line in uniqlines:
                fp.write(line)

    def add_sentence_to_TRAIN_DATA(self, sentence, filename):
        exists = os.path.isfile(filename)
        if exists:
            with open(filename, 'r', encoding='utf8') as fh:
                for line in fh:
                    one_line = line[:-1]
                    self.TRAIN_DATA.append(one_line)
        self.TRAIN_DATA.append(sentence)
        if exists:
            # only append the single current list entry
            with open(filename, 'a', encoding='utf8') as fh:
                listitem = self.TRAIN_DATA.pop()
                fh.write('%s\n' % listitem)
        if not exists:
            with open(filename, 'w+', encoding='utf8') as fh:
                for listitem in self.TRAIN_DATA:
                    fh.write('%s\n' % listitem)

    def add_word_to_stringstore(self, word, path):
        try:
            self.stringstore = StringStore().from_disk(path)
            self.stringstore.add(word)
        except:
            self.stringstore = StringStore([word])
        self.stringstore.to_disk(path)

    def add_word_to_vocab_permanently(self, word):
        pass

    def add_word_to_vocab_temporarely(self, word):
        pass

    def add_stringstore_to_vocab_temporarely(self, file):
        try:
            self.stringstore = StringStore().from_disk(file)
            for word in self.stringstore:
                lex = self.nlp.vocab[word]
                self.nlp.vocab[word].is_oov = False
        except:
            print("cannot read stringstore in file " + file)

    def add_pattern_jsonl_file_to_vocab_and_entity_matcher(self, pattern_file):
        (ents, pattern) = self.read_gazetteer(pattern_file)
        # register every pattern with its label
        for i in range(len(ents)):
            #print(ents[i])
            #print(pattern[i])
            #print(type(pattern[i]))
            self.matcher.add(ents[i], None, pattern[i])
            # self.matcher.add(entity, None, *phrases)

    def read_gazetteer(self, loc):
        pattern = []
        ents = []
        idx = 0
        for i, line in enumerate(open(loc)):
            idx += 1
            data = eval(line.strip())
            # data = json.loads(line.strip())
            # add the string to the vocab
            #phrase = self.nlp.tokenizer(data["pattern"])
            #phrase = data["pattern"][0]
            ents.append(data["label"])
            # add the pattern to the matcher
            pattern.append(data["pattern"])
            # add the words to the vocab (best effort, guarded below)
            #print(f"length of the phrases = {len(phrases)}")
            # print(phrase)
            try:
                phrase = data["pattern"][1]["lower"]
                for w in phrase:
                    _ = self.nlp.tokenizer.vocab[w.text]
            except:
                pass
        return (ents, pattern)
        # for i, line in enumerate(open(loc)):
        #     data = json.loads(line.strip())
        #     #! but then these may only be single words
        #     phrase = self.nlp.tokenizer(data["pattern"])
        #     # add the words to the vocab
        #     print(f"length of the phrases = {len(phrase)}")
        #     for w in phrase:
        #         _ = self.nlp.tokenizer.vocab[w.text]
        #     if len(phrase) >= 2:
        #         yield phrase

    #*___________________________________________________________
    #*___________________________________________________________
    #* CUSTOM PIPE COMPONENTS
    #* The custom pipe components go here.
    #* Their main task is to improve entities by means of matchers.
    #* They are added to the pipeline in pre_process.
    def custom_pipe_component_phrase_entity(self, doc):
        # for ent in doc.ents:
        #     print(ent.text)
        # Apply the matcher to the doc
        matches = self.matcher(doc)
        # Create a Span for each match, labelled with the match id
        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
        # Extend doc.ents with the matched spans
        try:
            doc.ents = list(doc.ents) + spans
        except:
            print(f"overlapping Entities with {spans}")
        # doc.ents = spans
        return doc

    def custom_pipe_component_Name_et_al(self, doc):
        print("entering_custom_pipe_component Name et al")
        new_ents = []
        for ent in doc.ents:
            print(f"ent = {ent}")
            # Only relabel if the entity is a person
            replaced = False
            if ent.label_ == "PERSON":  # and ent.end<len(doc)-2:
                # assign the new label if "et al." is inside the PERSON span or follows it
                if 'et' in ent.text and ('al' in ent.text or 'al.' in ent.text):
                    new_ent = Span(doc, ent.start, ent.end, label="REF")
                    replaced = True
                    print("new ents")
                else:
                    # check whether the tokens that follow are "et al"
                    print("within label Person")
                    next_token = doc[ent.end + 1]
                    next_next_token = doc[ent.end + 2]
                    print(next_token.text)
                    print(next_next_token.text)
                    if next_token.text == "et" and next_next_token.text in ("al.", "al"):
                        new_ent = Span(doc, ent.start, ent.end + 2, label="REF")
                        replaced = True
                        print("new_ent")
            # append the new entity
            if replaced:
                new_ents.append(new_ent)
                print('new ent')
            else:
                # carry over the old entity unchanged
                new_ents.append(ent)
                print("old ents")
        doc.ents = new_ents
        print(doc.ents)
        return doc

    def custom_pipe_component_Quantity(self, doc):
        # "10 mg" is usually tagged as 10 (CARDINAL) followed by mg
        # goal: 10 mg (QUANTITY)
        print("entering_custom_pipe_component Quantity")
        print(doc.text)
        new_ents = []
        for ent in doc.ents:
            print(ent.text)
            print(ent.label_)
            # Only extend the span if it is a CARDINAL followed by a unit token
            replaced = False
            if ent.label_ == "CARDINAL":  # and ent.end<len(doc)-2:
                next_token = doc[ent.end]
                if next_token.text in ["mg", "g"]:
                    new_ent = Span(doc, ent.start, ent.end + 1, label="QUANTITY")
                    replaced = True
            # append the new entity
            if replaced:
                new_ents.append(new_ent)
                print('new ent')
            else:
                # carry over the old entity unchanged
                new_ents.append(ent)
                print("old ents")
        try:
            doc.ents = new_ents
        except:
            print("overlapping Entities in Quantity")
            for ent in new_ents:
                print(f"ent = {ent.text} start = {ent.start} stop = {ent.end} label = {ent.label_}")
        #print(doc.ents)
        return doc

    def custom_pipe_component_set_extension_unit(self, doc):
        pass

    #*___________________________________________________________
    #*___________________________________________________________
    #* EXTENSION Methods
    # The extension methods go here.
    # Their main task is setting user-defined attributes, properties and methods.
    # Main goal: give certain tokens a new text in simplified English.
    def custom_pipe_comp_alt_text(self, doc):
        # By default, set the alternative text to the original text
        for token in doc:
            token._.alt_text = token.text
            token._.alt_text_trailing_whitespace_ = token.whitespace_
        # Now the matcher is applied; it searches for the various rules.
        # The matched rules are then handled below and the alternative text
        # is set according to these rules.
        matches = self.matcher_alt_text(doc)
        for match_id, start, end in matches:
            # a number standing alone with entity type CARDINAL
            if self.nlp.vocab.strings[match_id] == "NUMCARDINAL":
                doc[start]._.alt_text = "NUM"
            # UNITS
            # when units stand alone
            if self.nlp.vocab.strings[match_id] == "UNITS":
                doc[start]._.alt_text = "UNITS"
            # when the unit follows a number as its own token
            if self.nlp.vocab.strings[match_id] == "NUM_UNIT":
                doc[start]._.alt_text = "99"
                doc[start + 1]._.alt_text = "UNITS"
            # when the unit follows a number within the same token
            if self.nlp.vocab.strings[match_id] == "NUMUNIT":
                # number and unit were written together
                doc[start]._.alt_text = "99UNITS"
            if self.nlp.vocab.strings[match_id] == "DRUGNAME":
                doc[start]._.alt_text = "DRUGNAME"
            if self.nlp.vocab.strings[match_id] == "NAMEETAL":
                doc[start]._.alt_text = "REF"
                doc[start + 1]._.alt_text = "not to keep"
                doc[start + 1]._.alt_text_keep = False
                doc[start + 2]._.alt_text = "not to keep"
                doc[start + 2]._.alt_text_keep = False
                doc[start + 3]._.alt_text = "not to keep"
                doc[start + 3]._.alt_text_keep = False
            if self.nlp.vocab.strings[match_id] == "REFx":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id] == "REFS":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id] == "REFpunkt":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id] == "XYMIN":
                doc[start]._.alt_text = "XYMIN"
            if self.nlp.vocab.strings[match_id] == "XY-YEARREG":
                doc[start]._.alt_text = "99-year"
            if self.nlp.vocab.strings[match_id] == "XYYEARREG":
                doc[start]._.alt_text = "99year"
            if self.nlp.vocab.strings[match_id] == "XYMINREG":
                doc[start]._.alt_text = "99min"
            if self.nlp.vocab.strings[match_id] == "XY-MINREG":
                doc[start]._.alt_text = "99-min"
            if self.nlp.vocab.strings[match_id] == "XY_PROCENT":
                doc[start]._.alt_text = "99"
                doc[start + 1]._.alt_text = "%"
            if self.nlp.vocab.strings[match_id] == "XY-RECEPTOR":
                doc[start]._.alt_text = "XY"
                doc[start + 1]._.alt_text = "-"
                doc[start + 2]._.alt_text = "receptor"
            if self.nlp.vocab.strings[match_id] == "XY_RECEPTOR":
                doc[start]._.alt_text = "XY"
                doc[start + 1]._.alt_text = "receptor"
        # {"label":"REFS","pattern":[{"TEXT": "AuthorsEtAl"}]}
        # {"label":"REFx","pattern":[{"TEXT": "AuthorEtAl"}]}
        # doc[start]._.alt_text = doc[start].text + " " + self.nlp.vocab.strings[match_id] + " gefunden"
        # spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
        return doc

    def custom_matcher_alt_text(self):
        pattern_file = self.pattern_file_custom_matcher_alt_text
        (ents, pattern) = self.read_pattern_matcher_file(pattern_file)
        # register every pattern with its label
        for i in range(len(ents)):
            self.matcher_alt_text.add(ents[i], None, pattern[i])
            # self.matcher.add(entity, None, *phrases)
        # pattern = []
        # pattern.append([{'IS_DIGIT': True}, {'LOWER':'ng'}])
        # pattern.append([{'IS_DIGIT': True}, {'LOWER':'mg'}])
        # self.matcher_units2.add('UNITS', None, *pattern)

    # this function is meant to set the text of every token
    def custom_pipe_component_set_extension_unit_text(self, doc):
        # invoke the PhraseMatcher for the units
        #self.matcher_units2 = Matcher(self.nlp.vocab)
        self.add_pattern_jsonl_file_Phrasematcher("./Lib/units.jsonl")
        matches = self.matcher_units(doc)
        # Mark the alternative text of each matched unit token
        for match_id, start, end in matches:
            doc[start]._.alt_text = doc[start].text + "_ unit gefunden"
        # spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
        return doc

    def is_unit(self, token):
        return token.text == "mg"

    #def get_alt_text(self, token):
    #    return token._.alt_text

    def add_pattern_jsonl_file_Phrasematcher(self, pattern_file):
        (ents, unit_pattern) = self.read_gazetteer2(pattern_file)
        # add all unit phrases under the single "UNITS" key
        #matcher_units.add("Units", None, *list(nlp.pipe(COUNTRIES)))
        self.matcher_units.add("UNITS", None, *list(self.nlp.pipe(unit_pattern)))
        # self.matcher_units.add(ents[i], None, pattern[i])
        # self.matcher.add(entity, None, *phrases)

    def read_gazetteer2(self, loc):
        pattern = []
        ents = []
        idx = 0
        for i, line in enumerate(open(loc)):
            idx += 1
            data = eval(line.strip())
            ents.append(data["label"])
            # add the pattern to the matcher
            pattern.append(data["pattern"])
        return (ents, pattern)

    def read_pattern_matcher_file(self, loc):
        pattern = []
        ents = []
        for i, line in enumerate(open(loc)):
            data = eval(line.strip())
            ents.append(data["label"])
            pattern.append(data["pattern"])
        return (ents, pattern)

    #*___________________________________________________________
    #*___________________________________________________________
    #* Text extraction from XML to txt
    # Converts the text of the XML documents into plain text;
    # the results are then stored in self.output_dir.
    def extract_text(self):
        idx = 0
        for file in os.listdir(self.xml_dir):
            print(f'extract text loop with : {idx} ')
            if file.endswith('.xml'):
                input_filename = os.path.join(self.xml_dir, file)
                if len(self.section_names) == 1:
                    prefix = self.section_names[0]
                else:
                    prefix = 'section_mix'
                output_filename = os.path.join(self.output_dir, prefix + '_' + file)
                print(output_filename)
                self.__current_xml_files_for_spacy_preprocessing.append(input_filename)
                with open(input_filename, "r", encoding="utf8") as f1:
                    print('-------------------------')
                    print('filename:' + input_filename)
                    xml = f1.read()
                    P = RP.Research_Paper_XMLJSON(xml, "json")
                    P.development_test()
                    #P.analyse_parsing()
                    rtext = ''
                    for section_name in self.section_names:
                        rtext = rtext + P.get_part_of_text(section_name)
                    #print(rtext)
                    with open(output_filename, "w+", encoding="utf8") as f2:
                        self._TEXTS.append(rtext)
                        f2.write(rtext)
                idx += 1
                # ! This has to be removed in further versions
                if idx > 10:
                    break

    def get_sentence_alt_text(self, sent):
        # takes a doc object / sentence
        # returns a text that uses the alternative text
        alt_text = ""
        sent_org_text = sent.text
        for token in sent:
            if token._.alt_text_keep:
                alt_text = alt_text + token._.alt_text + token._.alt_text_trailing_whitespace_
        return alt_text
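# --- Usage sketch (not part of the original class) ----------------------------
# A minimal, hedged example of how CkSpacyModel appears intended to be driven:
# build the pipeline, extract the section texts, and step through the sentences.
# The directories and section name below are placeholders; the patterns.jsonl
# and ./Lib/units.jsonl files referenced in __init__ must already exist.
if __name__ == "__main__":
    model = CkSpacyModel(xml_dir="./xml", output_dir="./txt", section_names=["abstract"])
    model.pre_process()                                   # add pipes, load patterns, extract texts
    sentence, unknown_words = model.get_next_sentence()   # first sentence of the first text
    print(model.get_sentence_alt_text(sentence))          # the simplified-English variant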
class Sense2Vec(object):
    def __init__(
        self,
        shape: tuple = (1000, 128),
        strings: StringStore = None,
        senses: List[str] = [],
        vectors_name: str = "sense2vec",
        overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in methods
            that generate the best sense or other senses.
        vectors_name (unicode): Optional name to assign to the Vectors object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g. {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            "senses": senses,
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)

    @property
    def senses(self) -> Sequence[str]:
        """RETURNS (list): The available senses."""
        return self.cfg.get("senses", [])

    @property
    def frequencies(self) -> List[Tuple[str, int]]:
        """RETURNS (list): The (key, freq) tuples by frequency, descending."""
        freqs = [(self.strings[k], s) for k, s in self.freqs.items() if s is not None]
        return sorted(freqs, key=lambda item: item[1], reverse=True)

    def __len__(self) -> int:
        """RETURNS (int): The number of rows in the vectors table."""
        return len(self.vectors)

    def __contains__(self, key: Union[str, int]) -> bool:
        """Check if a key is in the vectors table.

        key (unicode / int): The key to look up.
        RETURNS (bool): Whether the key is in the table.
        """
        key = self.ensure_int_key(key)
        return key in self.vectors

    def __getitem__(self, key: Union[str, int]) -> Union[numpy.ndarray, None]:
        """Retrieve a vector for a given key. Returns None if the key is not
        in the table.

        key (unicode / int): The key to look up.
        RETURNS (numpy.ndarray): The vector.
        """
        key = self.ensure_int_key(key)
        if key in self.vectors:
            return self.vectors[key]
        return None

    def __setitem__(self, key: Union[str, int], vector: numpy.ndarray):
        """Set a vector for a given key. Will raise an error if the key
        doesn't exist.

        key (unicode / int): The key.
        vector (numpy.ndarray): The vector to set.
        """
        key = self.ensure_int_key(key)
        if key not in self.vectors:
            raise ValueError(f"Can't find key {key} in table")
        self.vectors[key] = vector
        self._row2key = None

    def __iter__(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        yield from self.items()

    def items(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        for key, value in self.vectors.items():
            yield self.strings[key], value

    def keys(self):
        """YIELDS (unicode): The string keys in the table."""
        for key in self.vectors.keys():
            yield self.strings[key]

    def values(self):
        """YIELDS (numpy.ndarray): The vectors in the table."""
        yield from self.vectors.values()

    @property
    def row2key(self):
        if not self._row2key:
            self._row2key = {row: key for key, row in self.vectors.key2row.items()}
        return self._row2key

    @property
    def make_key(self) -> Callable:
        """Get the function to make keys."""
        return registry.make_key.get(self.cfg["make_key"])

    @property
    def split_key(self) -> Callable:
        """Get the function to split keys."""
        return registry.split_key.get(self.cfg["split_key"])

    def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None):
        """Add a new vector to the table.
        key (unicode / int): The key to add.
        vector (numpy.ndarray): The vector to add.
        freq (int): Optional frequency count.
        """
        if not isinstance(key, int):
            key = self.strings.add(key)
        self.vectors.add(key, vector=vector)
        if freq is not None:
            self.set_freq(key, freq)
        self._row2key = None

    def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]:
        """Get the frequency count for a given key.

        key (unicode / int): The key to look up.
        default: Default value to return if no frequency is found.
        RETURNS (int): The frequency count.
        """
        key = self.ensure_int_key(key)
        return self.freqs.get(key, default)

    def set_freq(self, key: Union[str, int], freq: int):
        """Set a frequency count for a given key.

        key (unicode / int): The key to set the count for.
        freq (int): The frequency count.
        """
        if not isinstance(freq, int):
            raise ValueError(f"Invalid frequency count: {repr(freq)} for '{key}'")
        key = self.ensure_int_key(key)
        self.freqs[key] = freq

    def ensure_int_key(self, key: Union[str, int]) -> int:
        """Ensure that a key is an int by looking it up in the string store.

        key (unicode / int): The key.
        RETURNS (int): The integer key.
        """
        return key if isinstance(key, int) else self.strings.add(key)

    def similarity(
        self,
        keys_a: Union[Sequence[Union[str, int]], str, int],
        keys_b: Union[Sequence[Union[str, int]], str, int],
    ) -> float:
        """Make a semantic similarity estimate of two keys or two sets of keys.
        The default estimate is cosine similarity using an average of vectors.

        keys_a (unicode / int / iterable): The string or integer key(s).
        keys_b (unicode / int / iterable): The other string or integer key(s).
        RETURNS (float): The similarity score.
        """
        if isinstance(keys_a, (str, int)):
            keys_a = [keys_a]
        if isinstance(keys_b, (str, int)):
            keys_b = [keys_b]
        average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0)
        average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0)
        return cosine_similarity(average_a, average_b)

    def most_similar(
        self,
        keys: Union[Sequence[Union[str, int]], str, int],
        n: int = 10,
        batch_size: int = 16,
    ) -> List[Tuple[str, float]]:
        """Get the most similar entries in the table. If more than one key is
        provided, the average of the vectors is used.

        keys (unicode / int / iterable): The string or integer key(s) to compare to.
        n (int): The number of similar keys to return.
        batch_size (int): The batch size to use.
        RETURNS (list): The (key, score) tuples of the most similar vectors.
""" if isinstance(keys, (str, int)): keys = [keys] for key in keys: if key not in self: raise ValueError(f"Can't find key {key} in table") if self.cache and self.cache["indices"].shape[1] >= n: n = min(len(self.vectors), n) key = self.ensure_int_key(key) key_row = self.vectors.find(key=key) if key_row < self.cache["indices"].shape[0]: rows = self.cache["indices"][key_row, :n] scores = self.cache["scores"][key_row, :n] entries = zip(rows, scores) entries = [(self.strings[self.row2key[r]], score) for r, score in entries if r in self.row2key] return entries # Always ask for more because we'll always get the keys themselves n = min(len(self.vectors), n + len(keys)) rows = numpy.asarray(self.vectors.find(keys=keys)) vecs = self.vectors.data[rows] average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( average, n=n, batch_size=batch_size) result = list(zip(result_keys.flatten(), scores.flatten())) result = [(self.strings[key], score) for key, score in result if key] result = [(key, score) for key, score in result if key not in keys] return result def get_other_senses(self, key: Union[str, int], ignore_case: bool = True) -> List[str]: """Find other entries for the same word with a different sense, e.g. "duck|VERB" for "duck|NOUN". key (unicode / int): The key to check. ignore_case (bool): Check for uppercase, lowercase and titlecase. RETURNS (list): The string keys of other entries with different senses. """ result = [] key = key if isinstance(key, str) else self.strings[key] word, orig_sense = self.split_key(key) versions = [word, word.upper(), word.title() ] if ignore_case else [word] for text in versions: for sense in self.senses: new_key = self.make_key(text, sense) if sense != orig_sense and new_key in self: result.append(new_key) return result def get_best_sense(self, word: str, senses: Sequence[str] = tuple(), ignore_case: bool = True) -> Union[str, None]: """Find the best-matching sense for a given word based on the available senses and frequency counts. Returns None if no match is found. word (unicode): The word to check. senses (list): Optional list of senses to limit the search to. If not set / empty, all senses in the vectors are used. ignore_case (bool): Check for uppercase, lowercase and titlecase. RETURNS (unicode): The best-matching key or None. """ sense_options = senses or self.senses if not sense_options: return None versions = [word, word.upper(), word.title() ] if ignore_case else [word] freqs = [] for text in versions: for sense in sense_options: key = self.make_key(text, sense) if key in self: freq = self.get_freq(key, -1) freqs.append((freq, key)) return max(freqs)[1] if freqs else None def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: """Serialize a Sense2Vec object to a bytestring. exclude (list): Names of serialization fields to exclude. RETURNS (bytes): The serialized Sense2Vec object. """ vectors_bytes = self.vectors.to_bytes() freqs = list(self.freqs.items()) data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs} if "strings" not in exclude: data["strings"] = self.strings.to_bytes() if "cache" not in exclude: data["cache"] = self.cache return srsly.msgpack_dumps(data) def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a bytestring. bytes_data (bytes): The data to load. exclude (list): Names of serialization fields to exclude. RETURNS (Sense2Vec): The loaded object. 
""" data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) self.freqs = dict(data.get("freqs", [])) self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: self.strings = StringStore().from_bytes(data["strings"]) if "cache" not in exclude and "cache" in data: self.cache = data.get("cache", {}) self._row2key = None return self def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """Serialize a Sense2Vec object to a directory. path (unicode / Path): The path. exclude (list): Names of serialization fields to exclude. """ path = Path(path) self.vectors.to_disk(path) srsly.write_json(path / "cfg", self.cfg) srsly.write_json(path / "freqs.json", list(self.freqs.items())) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") if "cache" not in exclude and self.cache: srsly.write_msgpack(path / "cache", self.cache) def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a directory. path (unicode / Path): The path to load from. exclude (list): Names of serialization fields to exclude. RETURNS (Sense2Vec): The loaded object. """ path = Path(path) strings_path = path / "strings.json" freqs_path = path / "freqs.json" cache_path = path / "cache" self.vectors = Vectors().from_disk(path) self.cfg.update(srsly.read_json(path / "cfg")) if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) if "cache" not in exclude and cache_path.exists(): self.cache = srsly.read_msgpack(cache_path) self._row2key = None return self
class Analz(object):
    """
    >>> from sagas.nlu.analz import analz
    >>> analz.add_pats('typ', ['寄账单地址'])
    >>> analz.add_pats('srv', ['新建'])
    >>> doc=analz.parse("我想要新建一些寄账单地址")
    >>> analz.vis(doc)
    >>> doc.terms
    """

    def __init__(self):
        import os
        from sagas.conf.conf import cf
        from pyltp import Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
        from spacy.strings import StringStore

        self.stringstore = StringStore()
        MODELDIR = f'{cf.conf_dir}/ai/ltp/ltp_data_v3.4.0'
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        par_model_path = os.path.join(MODELDIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        self.conf = AnalConf('zh')
        self.conf.setup(self)

    def add_pats(self, pat_name, pat_text_ls: List[Text]):
        import jieba
        id_hash = self.stringstore.add(pat_name)
        for t in pat_text_ls:
            jieba.add_word(t, tag=id_hash)

    def tokenize(self, sents: Text) -> List[Dict[Text, Text]]:
        import jieba.posseg as pseg
        toks = pseg.cut(sents)
        terms = []
        for i, (word, flag) in enumerate(toks):
            if not isinstance(flag, str):
                ref = self.stringstore[flag]
            else:
                ref = flag
            terms.append({'term': ref, 'value': word})
        return terms

    def parse(self, sents: Text) -> Docz:
        terms = self.tokenize(sents)
        words = [w['value'] for w in terms]
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        netags = self.recognizer.recognize(words, postags)
        # terms=list(filter(lambda x: x['term'] in terms_list, terms))
        return Docz(words, postags, arcs, roles, netags, terms)

    def vis(self, doc):
        from graphviz import Digraph
        f = Digraph('deps', filename='deps.gv')
        f.attr(rankdir='LR', size='8,5')
        f.attr('node', shape='egg', fontname='Calibri')
        for i in range(len(doc.words)):
            idx = int(doc.arcs[i].head) - 1
            if idx == -1:
                continue
            a = doc.words[idx]
            print("%s --> %s|%s|%s|%s" % (a, doc.words[i], doc.arcs[i].relation,
                                          doc.postags[i], doc.netags[i]))
            f.edge(a, doc.words[i], label=doc.arcs[i].relation.lower())
        return f
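# --- Usage sketch (mirrors the class docstring; not part of the original file) --
# Assumes the pyltp models under cf.conf_dir and the sagas configuration are
# available locally; constructs Analz directly instead of using the module-level
# singleton referenced in the docstring.
if __name__ == "__main__":
    analz = Analz()
    analz.add_pats('typ', ['寄账单地址'])    # register custom pattern words with jieba
    analz.add_pats('srv', ['新建'])
    doc = analz.parse("我想要新建一些寄账单地址")
    print(doc.terms)                         # tokens mapped to their pattern/term names
    analz.vis(doc)                           # dependency graph rendered via graphviz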