Example #1
class CkSpacyModel:

    def __init__(self, xml_dir, output_dir, section_names):
        self.xml_dir = xml_dir
        self.output_dir = output_dir
        self.section_names = section_names
        self.__current_xml_files_for_spacy_preprocessing = []
        self.__filenames = []
        self._TEXTS = []
        self._current_TEXTS_idx = 0
        self.nlp = spacy.load('en_core_web_md')
        self.ruler = EntityRuler(self.nlp, overwrite_ents=True).from_disk("./patterns.jsonl")
        #self.ruler = EntityRuler(self.nlp)
        self._current_sentence_idx = 0
        self.TRAIN_DATA = []
        self.stringstore = 0
        self.matcher = Matcher(self.nlp.vocab)
        Token.set_extension("is_unit",  getter= self.is_unit)
        Token.set_extension("alt_text", default = None) #  getter= self.get_alt_text)
        Token.set_extension("alt_text_keep", default = True) #  whether this word should be keeped in the alternative text (necessary because of trailing whitespaces))
        Token.set_extension("alt_text_trailing_whitespace_", default = " ")
        self.matcher_units = PhraseMatcher(self.nlp.vocab) # der PhraseMatcher fuer die Uniterkennung fuer alternative words
        self.matcher_alt_text = Matcher(self.nlp.vocab)
        self.pattern_file_custom_matcher_alt_text = "./Lib/units.jsonl"

    def pre_process(self):
        print('starting preprocess')   
        self.nlp.add_pipe(self.ruler, after="ner")
        self.nlp.add_pipe(self.custom_pipe_component_phrase_entity, before="ner")
        #self.nlp.add_pipe(self.custom_pipe_component_Name_et_al, after="ner")
        #self.nlp.add_pipe(self.custom_pipe_component_Quantity, last=True)
        #self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit, last=True)

        # load the patterns into the matcher
        self.custom_matcher_alt_text()
#        self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit_text, last=True)
        self.nlp.add_pipe(self.custom_pipe_comp_alt_text, last=True)
        # the word replacement for simplified English comes last ... 10 mg = xy mg

        self.extract_text()
        
    def reintegrate_patterns_to_ruler(self, file):
        self.ruler = EntityRuler(self.nlp).from_disk(file)
        #self.nlp.remove_pipe("ruler")
        self.nlp.replace_pipe("entity_ruler", self.ruler)
        #self.nlp.add_pipe(self.ruler, before="ner")

        #* The entity ruler is designed to integrate with spaCy’s existing statistical models 
        #* and enhance the named entity recognizer. If it’s added before the "ner" component, 
        #* the entity recognizer will respect the existing entity spans and adjust its 
        #* predictions around it. This can significantly improve accuracy in some cases. 
        #* If it’s added after the "ner" component, the entity ruler will only add spans to 
        #* the doc.ents if they don’t overlap with existing entities predicted by the model. 
        #* To overwrite overlapping entities, you can set overwrite_ents=True on initialization.
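
        #* A minimal illustration of the two placements described above (a sketch,
        #* not part of this class; assumes a spaCy v2-style pipeline like self.nlp):
        #*   ruler = EntityRuler(nlp, overwrite_ents=True)
        #*   ruler.add_patterns([{"label": "DRUG", "pattern": [{"LOWER": "aspirin"}]}])
        #*   nlp.add_pipe(ruler, before="ner")   # ruler spans take precedence, ner adjusts around them
        #*   # nlp.add_pipe(ruler, after="ner")  # ruler only adds spans that do not overlap
        #*   #                                   # unless overwrite_ents=True is set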




    def show_ents(self, doc):
        if doc.ents:
            for ent in doc.ents:
                print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
        else:
            print('No named entities found.')

    def get_next_sentence(self):
        self._current_TEXT = self._TEXTS[self._current_TEXTS_idx]
        self._current_doc = self.nlp(self._current_TEXT)
        sentences = list(self._current_doc.sents)
        sentence = sentences[self._current_sentence_idx]
        if self._current_sentence_idx < len(sentences)-1:
            self._current_sentence_idx += 1
        else:
            self._current_sentence_idx = 0
            print('next document')
            if self._current_TEXTS_idx < len(self._TEXTS)-1:
                self._current_TEXTS_idx += 1
            else:
                print('end of Text list')
        sentence = self.nlp(sentence.text)
        unknown_words = []
        for token in sentence:
            #print("check for : " + token.text)
            if token.is_oov:
                unknown_words.append(token)
                #print("not found: " + token.text)
            print(f"token.text = {token.text:{18}} : token._.alt_text = {token._.alt_text:{10}}")
        
        return (sentence, unknown_words)

    def add_pattern_to_entity_ruler(self,patterns,file):
        # I couldn't get a check for identical lines to work,
        # so check for duplicates afterwards and remove them
        self.ruler.add_patterns(patterns)
        self.ruler.to_disk(file)
        with open(file, encoding='utf8') as fh:
            uniqlines = set(fh.readlines())
        with open(file,'w',encoding='utf8') as fp:
            for line in uniqlines:
                fp.write(line)

    def add_sentence_to_TRAIN_DATA(self,sentence, filename):
        exists = os.path.isfile(filename)
        if exists:
            with open(filename,'r',encoding='utf8') as fh:
                for line in fh:
                    one_line = line[:-1]
                    self.TRAIN_DATA.append(one_line)
        self.TRAIN_DATA.append(sentence)
        if exists:
            # append only the single most recent list entry
            with open(filename,'a',encoding='utf8') as fh:
                listitem = self.TRAIN_DATA.pop()
                fh.write('%s\n' % listitem)
        if not exists:
            with open(filename,'w+',encoding='utf8') as fh:
                for listitem in self.TRAIN_DATA:
                    fh.write('%s\n' % listitem)

    def add_word_to_stringstore(self, word, path):
        try:
            self.stringstore = StringStore().from_disk(path)
            self.stringstore.add(word)
        except Exception:
            self.stringstore = StringStore([word])
        self.stringstore.to_disk(path)


    def add_word_to_vocab_permanently(self,word):
        pass
    
    def add_word_to_vocab_temporarely(self, word):
        pass

    def add_stringstore_to_vocab_temporarely(self, file):
        try:
            self.stringstore = StringStore().from_disk(file)
            for word in self.stringstore:
                lex = self.nlp.vocab[word]
                lex.is_oov = False
        except Exception:
            print("cannot read stringstore in file " + file)
    

    def add_pattern_jsonl_file_to_vocab_and_entity_matcher(self, pattern_file):
        (ents, pattern) = self.read_gazetteer(pattern_file)
        for i in range(len(ents)):
            #print(ents[i])
            #print(pattern[i])
            #print(type(pattern[i]))
            self.matcher.add(ents[i], None, pattern[i])
#           self.matcher.add(entity, None, *phrases)

    

    def read_gazetteer(self, loc):
        pattern = []
        ents = []
        for line in open(loc):
            data = eval(line.strip())
#            data = json.loads(line.strip())
            # add the string to the vocab
            #phrase = self.nlp.tokenizer(data["pattern"])
            #phrase = data["pattern"][0]
            ents.append(data["label"])
            # add the pattern to the matcher
            pattern.append(data["pattern"])

            # add the words of the pattern to the vocab
            #print(f"length of the phrases = {len(phrases)}")
    #        print(phrase)
            try:
                for token_spec in data["pattern"]:
                    if isinstance(token_spec, dict) and "lower" in token_spec:
                        _ = self.nlp.tokenizer.vocab[token_spec["lower"]]
            except Exception:
                pass
        return (ents, pattern)
        # for i, line in enumerate(open(loc)):
        #     data = json.loads(line.strip())
        #     #! but then these may only be single words
        #     phrase = self.nlp.tokenizer(data["pattern"])
        #     # adde die Worte zum vocab
        #     print(f"laenge der phrases = {len(phrase)}")
        #     for w in phrase:
        #         _ = self.nlp.tokenizer.vocab[w.text]
        #     if len(phrase) >= 2:
        #         yield phrase

#*___________________________________________________________
#*___________________________________________________________
    #* CUSTOM PIPE COMPONENTS
    #* The custom pipe components live here.
    #* Their main task is to improve entities with the help of matchers.
    #* They are integrated into the pipeline in the pre_process function.
    
    def custom_pipe_component_phrase_entity(self, doc):
        # for ent in doc.ents:
        #     print(ent.text)
        # Apply the matcher to the doc
        matches = self.matcher(doc)
        # Create a Span for each match, using the match id as its label
        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
        # Extend doc.ents with the matched spans
        try:
            doc.ents = list(doc.ents) + spans
        except Exception:
            print(f"overlapping entities with {spans}")
#        doc.ents = spans
        return doc     

    def custom_pipe_component_Name_et_al(self, doc):
        print("entering_custom_pipe_component Name et al")
        new_ents = [] 
        for ent in doc.ents:
            print(f"ent = {ent}")
            # Only look for "et al" if the entity is a PERSON
            replaced = False

            if ent.label_ == "PERSON":
                # assign the new label if "et al." is already part of the PERSON span
                if 'et' in ent.text and ('al' in ent.text or 'al.' in ent.text):
                    new_ent = Span(doc, ent.start, ent.end, label="REF")
                    replaced = True
                    print("new ents")
                elif ent.end + 1 < len(doc):
                    # otherwise check whether the tokens right after the entity are "et al"
                    # (ent.end is the index of the first token after the entity)
                    print("within label Person")
                    next_token = doc[ent.end]
                    next_next_token = doc[ent.end + 1]
                    print(next_token.text)
                    print(next_next_token.text)
                    if next_token.text == "et" and next_next_token.text in ("al.", "al"):
                        new_ent = Span(doc, ent.start, ent.end + 2, label="REF")
                        replaced = True
                        print("new_ent")

            # append the new entity
            if replaced:
                new_ents.append(new_ent)
                print('new ent')
            else:
                # carry over the old entity unchanged
                new_ents.append(ent)
                print("old ents")
            
        doc.ents = new_ents
        print(doc.ents)
        return doc     

    def custom_pipe_component_Quantity(self, doc):
        # 10 mg usually comes out as 10 (CARDINAL) mg
        # goal: 10 mg (QUANTITY)
        print("entering_custom_pipe_component Quantity")
        print(doc.text)
        new_ents = []
        for ent in doc.ents:
            print(ent.text)
            print(ent.label_)
            # only extend the entity if it is a CARDINAL followed by a unit token
            replaced = False
            if ent.label_ == "CARDINAL" and ent.end < len(doc):
                next_token = doc[ent.end]
                if next_token.text in ["mg", "g"]:
                    new_ent = Span(doc, ent.start, ent.end+1, label="QUANTITY")
                    replaced = True
            # append the new entity
            if replaced:
                new_ents.append(new_ent)
                print('new ent')
            else:
                # carry over the old entity unchanged
                new_ents.append(ent)
                print("old ents")


        try:
            doc.ents = new_ents
        except Exception:
            print("overlapping Entities in Quantity")
            for ent in new_ents:
                print(f"ent = {ent.text}   start = {ent.start}   stop = {ent.end}  label = {ent.label_}")
        #print(doc.ents)
        return doc     



    def custom_pipe_component_set_extension_unit(self, doc):
        pass
#*___________________________________________________________
#*___________________________________________________________
    #* EXTENSION methods
    # The extension methods live here.
    # Their main task is to set user-defined attributes, properties and methods.
    # The main goal is to give certain tokens an alternative text in simplified English.
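
    # A minimal sketch of how these extensions are read downstream (illustrative only;
    # assumes the pipeline built in pre_process has already run on the doc):
    #   simplified = "".join(t._.alt_text + t._.alt_text_trailing_whitespace_
    #                        for t in doc if t._.alt_text_keep)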
    

    def custom_pipe_comp_alt_text(self, doc):
        # by default, set the alternative text to the original text
        for token in doc:
            token._.alt_text = token.text
            token._.alt_text_trailing_whitespace_ = token.whitespace_
        # now run the matcher, which looks for the various rules;
        # each match is then handled below and the alternative text
        # is set according to the matched rule
        matches = self.matcher_alt_text(doc)
        # handle each match according to its rule name
        for match_id, start, end in matches:
            # a number standing on its own with entity type CARDINAL
            if self.nlp.vocab.strings[match_id]=="NUMCARDINAL":
                doc[start]._.alt_text = "NUM"

            # UNITS
            # when a unit token stands on its own
            if self.nlp.vocab.strings[match_id]=="UNITS":
                doc[start]._.alt_text = "UNITS"
            # when a unit follows a number as a separate token
            if self.nlp.vocab.strings[match_id]=="NUM_UNIT":
                doc[start]._.alt_text = "99"
                doc[start+1]._.alt_text = "UNITS" 
            # when number and unit are written together in one token
            if self.nlp.vocab.strings[match_id]=="NUMUNIT": # number and unit were written together
                doc[start]._.alt_text = "99UNITS"

            if self.nlp.vocab.strings[match_id]=="DRUGNAME":
                doc[start]._.alt_text = "DRUGNAME"
            if self.nlp.vocab.strings[match_id]=="NAMEETAL":
                doc[start]._.alt_text = "REF"
                doc[start+1]._.alt_text = "not to keep"
                doc[start+1]._.alt_text_keep = False
                doc[start+2]._.alt_text = "not to keep"
                doc[start+2]._.alt_text_keep = False
                doc[start+3]._.alt_text = "not to keep"
                doc[start+3]._.alt_text_keep = False
                
            if self.nlp.vocab.strings[match_id]=="REFx":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id]=="REFS":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id]=="REFpunkt":
                doc[start]._.alt_text = "REF"
            if self.nlp.vocab.strings[match_id]=="XYMIN":
                doc[start]._.alt_text = "XYMIN"
            if self.nlp.vocab.strings[match_id]=="XY-YEARREG":
                doc[start]._.alt_text = "99-year"
            if self.nlp.vocab.strings[match_id]=="XYYEARREG":
                doc[start]._.alt_text = "99year"
            if self.nlp.vocab.strings[match_id]=="XYMINREG":
                doc[start]._.alt_text = "99min"
            if self.nlp.vocab.strings[match_id]=="XY-MINREG":
                doc[start]._.alt_text = "99-min"

            if self.nlp.vocab.strings[match_id]=="XY_PROCENT":
                doc[start]._.alt_text = "99"
                doc[start+1]._.alt_text = "%"

            if self.nlp.vocab.strings[match_id]=="XY-RECEPTOR":
                doc[start]._.alt_text = "XY"
                doc[start+1]._.alt_text = "-"
                doc[start+2]._.alt_text = "receptor"
            if self.nlp.vocab.strings[match_id]=="XY_RECEPTOR":
                doc[start]._.alt_text = "XY"
                doc[start+1]._.alt_text = "receptor"


# {"label":"REFS","pattern":[{"TEXT": "AuthorsEtAl"}]}
# {"label":"REFx","pattern":[{"TEXT": "AuthorEtAl"}]}

#            doc[start]._.alt_text = doc[start].text + " " + self.nlp.vocab.strings[match_id] + " gefunden"
#        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]

        return doc   

    def custom_matcher_alt_text(self):
        pattern_file = self.pattern_file_custom_matcher_alt_text
        (ents, pattern) = self.read_pattern_matcher_file(pattern_file)
        for i in range(len(ents)):
            self.matcher_alt_text.add(ents[i], None, pattern[i])
 #           self.matcher.add(entity, None, *phrases)
        # pattern = []
        # pattern.append([{'IS_DIGIT': True}, {'LOWER':'ng'}])
        # pattern.append([{'IS_DIGIT': True}, {'LOWER':'mg'}])
        # self.matcher_units2.add('UNITS', None, *pattern)
    
    


    # this function is supposed to set the alternative text of each token
    def custom_pipe_component_set_extension_unit_text(self, doc):
        # run the PhraseMatcher for the units
        #self.matcher_units2 = Matcher(self.nlp.vocab)
        self.add_pattern_jsonl_file_Phrasematcher("./Lib/units.jsonl")
        matches = self.matcher_units(doc)
        # mark every matched unit token in its alternative text
        for match_id, start, end in matches:
            doc[start]._.alt_text = doc[start].text + "_ unit found"
#        spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]

        return doc     


    def is_unit(self,token):
        return token.text == "mg"

    #def get_alt_text(self,token):
    #    return token._.alt_text




    def add_pattern_jsonl_file_Phrasematcher(self, pattern_file):
        (ents, unit_pattern) = self.read_gazetteer2(pattern_file)
        # add all unit phrases to the PhraseMatcher under a single "UNITS" key
        #matcher_units.add("Units", None, *list(nlp.pipe(COUNTRIES)))
        self.matcher_units.add("UNITS", None, *list(self.nlp.pipe(unit_pattern)))
#            self.matcher_units.add(ents[i], None, pattern[i])
#           self.matcher.add(entity, None, *phrases)


    def read_gazetteer2(self, loc):
        pattern = []
        ents = []
        for line in open(loc):
            data = eval(line.strip())
            ents.append(data["label"])
            # add the pattern to the matcher
            pattern.append(data["pattern"])
        return (ents, pattern)



    def read_pattern_matcher_file(self, loc):
        pattern = []
        ents = []
        for i, line in enumerate(open(loc)):
            data = eval(line.strip())
            ents.append(data["label"])
            pattern.append(data["pattern"])
        return (ents, pattern)

#*___________________________________________________________
#*___________________________________________________________
    #* Text extraction from XML to txt
    # Converts the text of the XML documents into plain text;
    # the results are then stored in self.output_dir.
    #
    def extract_text(self):
        idx = 0
        for file in os.listdir(self.xml_dir):
            print(f'extract_text loop with : {idx} ')
            if file.endswith('.xml'):
                input_filename = os.path.join(self.xml_dir, file)
                if len(self.section_names)==1:
                    prefix = self.section_names[0]
                else:
                    prefix = 'section_mix'

                output_filename = os.path.join(self.output_dir, prefix + '_' + file)
                print(output_filename)
                self.__current_xml_files_for_spacy_preprocessing.append(input_filename)

                with open(input_filename, "r", encoding="utf8") as f1:
                    print('-------------------------')
                    print('filename:' + input_filename)
                    xml = f1.read()
                    P = RP.Research_Paper_XMLJSON(xml, "json")
                    P.development_test()
                    #P.analyse_parsing()
                    rtext = ''
                    for section_name in self.section_names:
                        rtext = rtext + P.get_part_of_text(section_name)
                    #print(rtext)

                with open(output_filename,"w+", encoding="utf8") as f2:
                    self._TEXTS.append(rtext)
                    f2.write(rtext)
                idx += 1
            # ! This has to be removed in further versions    
            if idx > 10:
                break



    def get_sentence_alt_text(self, sent):
        # takes a doc object / sentence
        # returns a text that uses the alternative text of each token
        alt_text = ""
        sent_org_text = sent.text
        for token in sent:
            if token._.alt_text_keep:
                alt_text = alt_text + token._.alt_text + token._.alt_text_trailing_whitespace_
        return alt_text
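
# A minimal usage sketch for CkSpacyModel, not part of the original class.
# The directory paths and the section name below are assumptions for illustration;
# the class additionally expects "./patterns.jsonl" and "./Lib/units.jsonl" to exist.
if __name__ == "__main__":
    model = CkSpacyModel(xml_dir="./xml_in",          # assumed input directory with .xml files
                         output_dir="./txt_out",      # assumed output directory for the plain text
                         section_names=["methods"])   # assumed section name to extract
    model.pre_process()                               # build the pipeline and extract the texts
    sentence, unknown_words = model.get_next_sentence()
    print(model.get_sentence_alt_text(sentence))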
Example #2
class Sense2Vec(object):
    def __init__(
            self,
            shape: tuple = (1000, 128),
            strings: StringStore = None,
            senses: List[str] = [],
            vectors_name: str = "sense2vec",
            overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in methods
            that generate the best sense or other senses.
        vectors_name (unicode): Optional name to assign to the Vectors object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g. {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            "senses": senses,
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)

    @property
    def senses(self) -> Sequence[str]:
        """RETURNS (list): The available senses."""
        return self.cfg.get("senses", [])

    @property
    def frequencies(self) -> List[Tuple[str, int]]:
        """RETURNS (list): The (key, freq) tuples by frequency, descending."""
        freqs = [(self.strings[k], s) for k, s in self.freqs.items()
                 if s is not None]
        return sorted(freqs, key=lambda item: item[1], reverse=True)

    def __len__(self) -> int:
        """RETURNS (int): The number of rows in the vectors table."""
        return len(self.vectors)

    def __contains__(self, key: Union[str, int]) -> bool:
        """Check if a key is in the vectors table.

        key (unicode / int): The key to look up.
        RETURNS (bool): Whether the key is in the table.
        """
        key = self.ensure_int_key(key)
        return key in self.vectors

    def __getitem__(self, key: Union[str, int]) -> Union[numpy.ndarray, None]:
        """Retrieve a vector for a given key. Returns None if the key is not
        in the table.

        key (unicode / int): The key to look up.
        RETURNS (numpy.ndarray): The vector.
        """
        key = self.ensure_int_key(key)
        if key in self.vectors:
            return self.vectors[key]
        return None

    def __setitem__(self, key: Union[str, int], vector: numpy.ndarray):
        """Set a vector for a given key. Will raise an error if the key
        doesn't exist.

        key (unicode / int): The key.
        vector (numpy.ndarray): The vector to set.
        """
        key = self.ensure_int_key(key)
        if key not in self.vectors:
            raise ValueError(f"Can't find key {key} in table")
        self.vectors[key] = vector
        self._row2key = None

    def __iter__(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        yield from self.items()

    def items(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        for key, value in self.vectors.items():
            yield self.strings[key], value

    def keys(self):
        """YIELDS (unicode): The string keys in the table."""
        for key in self.vectors.keys():
            yield self.strings[key]

    def values(self):
        """YIELDS (numpy.ndarray): The vectors in the table."""
        yield from self.vectors.values()

    @property
    def row2key(self):
        if not self._row2key:
            self._row2key = {
                row: key
                for key, row in self.vectors.key2row.items()
            }
        return self._row2key

    @property
    def make_key(self) -> Callable:
        """Get the function to make keys."""
        return registry.make_key.get(self.cfg["make_key"])

    @property
    def split_key(self) -> Callable:
        """Get the function to split keys."""
        return registry.split_key.get(self.cfg["split_key"])

    def add(self,
            key: Union[str, int],
            vector: numpy.ndarray,
            freq: int = None):
        """Add a new vector to the table.

        key (unicode / int): The key to add.
        vector (numpy.ndarray): The vector to add.
        freq (int): Optional frequency count.
        """
        if not isinstance(key, int):
            key = self.strings.add(key)
        self.vectors.add(key, vector=vector)
        if freq is not None:
            self.set_freq(key, freq)
        self._row2key = None

    def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]:
        """Get the frequency count for a given key.

        key (unicode / int): They key to look up.
        default: Default value to return if no frequency is found.
        RETURNS (int): The frequency count.
        """
        key = self.ensure_int_key(key)
        return self.freqs.get(key, default)

    def set_freq(self, key: Union[str, int], freq: int):
        """Set a frequency count for a given key.

        key (unicode / int): The key to set the count for.
        freq (int): The frequency count.
        """
        if not isinstance(freq, int):
            raise ValueError(
                f"Invalid frequency count: {repr(freq)} for '{key}'")
        key = self.ensure_int_key(key)
        self.freqs[key] = freq

    def ensure_int_key(self, key: Union[str, int]) -> int:
        """Ensure that a key is an int by looking it up in the string store.

        key (unicode / int): The key.
        RETURNS (int): The integer key.
        """
        return key if isinstance(key, int) else self.strings.add(key)

    def similarity(
        self,
        keys_a: Union[Sequence[Union[str, int]], str, int],
        keys_b: Union[Sequence[Union[str, int]], str, int],
    ) -> float:
        """Make a semantic similarity estimate of two keys or two sets of keys.
        The default estimate is cosine similarity using an average of vectors.

        keys_a (unicode / int / iterable): The string or integer key(s).
        keys_b (unicode / int / iterable): The other string or integer key(s).
        RETURNS (float): The similarity score.
        """
        if isinstance(keys_a, (str, int)):
            keys_a = [keys_a]
        if isinstance(keys_b, (str, int)):
            keys_b = [keys_b]
        average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0)
        average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0)
        return cosine_similarity(average_a, average_b)

    def most_similar(
        self,
        keys: Union[Sequence[Union[str, int]], str, int],
        n: int = 10,
        batch_size: int = 16,
    ) -> List[Tuple[str, float]]:
        """Get the most similar entries in the table. If more than one key is
        provided, the average of the vectors is used.

        keys (unicode / int / iterable): The string or integer key(s) to compare to.
        n (int): The number of similar keys to return.
        batch_size (int): The batch size to use.
        RETURNS (list): The (key, score) tuples of the most similar vectors.
        """
        if isinstance(keys, (str, int)):
            keys = [keys]
        for key in keys:
            if key not in self:
                raise ValueError(f"Can't find key {key} in table")
        if self.cache and self.cache["indices"].shape[1] >= n:
            n = min(len(self.vectors), n)
            key = self.ensure_int_key(key)
            key_row = self.vectors.find(key=key)
            if key_row < self.cache["indices"].shape[0]:
                rows = self.cache["indices"][key_row, :n]
                scores = self.cache["scores"][key_row, :n]
                entries = zip(rows, scores)
                entries = [(self.strings[self.row2key[r]], score)
                           for r, score in entries if r in self.row2key]
                return entries
        # Always ask for more because we'll always get the keys themselves
        n = min(len(self.vectors), n + len(keys))
        rows = numpy.asarray(self.vectors.find(keys=keys))
        vecs = self.vectors.data[rows]
        average = vecs.mean(axis=0, keepdims=True)
        result_keys, _, scores = self.vectors.most_similar(
            average, n=n, batch_size=batch_size)
        result = list(zip(result_keys.flatten(), scores.flatten()))
        result = [(self.strings[key], score) for key, score in result if key]
        result = [(key, score) for key, score in result if key not in keys]
        return result

    def get_other_senses(self,
                         key: Union[str, int],
                         ignore_case: bool = True) -> List[str]:
        """Find other entries for the same word with a different sense, e.g.
        "duck|VERB" for "duck|NOUN".

        key (unicode / int): The key to check.
        ignore_case (bool): Check for uppercase, lowercase and titlecase.
        RETURNS (list): The string keys of other entries with different senses.
        """
        result = []
        key = key if isinstance(key, str) else self.strings[key]
        word, orig_sense = self.split_key(key)
        versions = [word, word.upper(), word.title()
                    ] if ignore_case else [word]
        for text in versions:
            for sense in self.senses:
                new_key = self.make_key(text, sense)
                if sense != orig_sense and new_key in self:
                    result.append(new_key)
        return result

    def get_best_sense(self,
                       word: str,
                       senses: Sequence[str] = tuple(),
                       ignore_case: bool = True) -> Union[str, None]:
        """Find the best-matching sense for a given word based on the available
        senses and frequency counts. Returns None if no match is found.

        word (unicode): The word to check.
        senses (list): Optional list of senses to limit the search to. If not
            set / empty, all senses in the vectors are used.
        ignore_case (bool): Check for uppercase, lowercase and titlecase.
        RETURNS (unicode): The best-matching key or None.
        """
        sense_options = senses or self.senses
        if not sense_options:
            return None
        versions = [word, word.upper(), word.title()
                    ] if ignore_case else [word]
        freqs = []
        for text in versions:
            for sense in sense_options:
                key = self.make_key(text, sense)
                if key in self:
                    freq = self.get_freq(key, -1)
                    freqs.append((freq, key))
        return max(freqs)[1] if freqs else None

    def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes:
        """Serialize a Sense2Vec object to a bytestring.

        exclude (list): Names of serialization fields to exclude.
        RETURNS (bytes): The serialized Sense2Vec object.
        """
        vectors_bytes = self.vectors.to_bytes()
        freqs = list(self.freqs.items())
        data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs}
        if "strings" not in exclude:
            data["strings"] = self.strings.to_bytes()
        if "cache" not in exclude:
            data["cache"] = self.cache
        return srsly.msgpack_dumps(data)

    def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a bytestring.

        bytes_data (bytes): The data to load.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        data = srsly.msgpack_loads(bytes_data)
        self.vectors = Vectors().from_bytes(data["vectors"])
        self.freqs = dict(data.get("freqs", []))
        self.cfg.update(data.get("cfg", {}))
        if "strings" not in exclude and "strings" in data:
            self.strings = StringStore().from_bytes(data["strings"])
        if "cache" not in exclude and "cache" in data:
            self.cache = data.get("cache", {})
        self._row2key = None
        return self

    def to_disk(self, path: Union[Path, str],
                exclude: Sequence[str] = tuple()):
        """Serialize a Sense2Vec object to a directory.

        path (unicode / Path): The path.
        exclude (list): Names of serialization fields to exclude.
        """
        path = Path(path)
        self.vectors.to_disk(path)
        srsly.write_json(path / "cfg", self.cfg)
        srsly.write_json(path / "freqs.json", list(self.freqs.items()))
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
        if "cache" not in exclude and self.cache:
            srsly.write_msgpack(path / "cache", self.cache)

    def from_disk(self,
                  path: Union[Path, str],
                  exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a directory.

        path (unicode / Path): The path to load from.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        path = Path(path)
        strings_path = path / "strings.json"
        freqs_path = path / "freqs.json"
        cache_path = path / "cache"
        self.vectors = Vectors().from_disk(path)
        self.cfg.update(srsly.read_json(path / "cfg"))
        if freqs_path.exists():
            self.freqs = dict(srsly.read_json(freqs_path))
        if "strings" not in exclude and strings_path.exists():
            self.strings = StringStore().from_disk(strings_path)
        if "cache" not in exclude and cache_path.exists():
            self.cache = srsly.read_msgpack(cache_path)
        self._row2key = None
        return self
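
# A minimal usage sketch for the Sense2Vec class above (illustrative only; the toy
# keys, vectors and frequencies are made up for demonstration):
if __name__ == "__main__":
    s2v = Sense2Vec(shape=(3, 4), senses=["NOUN", "VERB"])
    s2v.add("duck|NOUN", numpy.asarray([1.0, 0.0, 0.0, 0.0], dtype="f"), freq=20)
    s2v.add("duck|VERB", numpy.asarray([0.0, 1.0, 0.0, 0.0], dtype="f"), freq=5)
    s2v.add("goose|NOUN", numpy.asarray([0.9, 0.1, 0.0, 0.0], dtype="f"), freq=7)
    print(s2v.get_best_sense("duck"))          # highest-frequency matching sense
    print(s2v.get_other_senses("duck|NOUN"))   # other senses of the same word
    print(s2v.similarity("duck|NOUN", "goose|NOUN"))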
Example #3
class Analz(object):
    """
    >>> from sagas.nlu.analz import analz
    >>> analz.add_pats('typ', ['寄账单地址'])
    >>> analz.add_pats('srv', ['新建'])
    >>> doc=analz.parse("我想要新建一些寄账单地址")
    >>> analz.vis(doc)
    >>> doc.terms
    """
    def __init__(self):
        import os
        from sagas.conf.conf import cf
        from pyltp import Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
        from spacy.strings import StringStore

        self.stringstore = StringStore()

        MODELDIR = f'{cf.conf_dir}/ai/ltp/ltp_data_v3.4.0'
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        par_model_path = os.path.join(MODELDIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

        self.conf = AnalConf('zh')
        self.conf.setup(self)

    def add_pats(self, pat_name, pat_text_ls: List[Text]):
        import jieba
        id_hash = self.stringstore.add(pat_name)
        for t in pat_text_ls:
            jieba.add_word(t, tag=id_hash)

    def tokenize(self, sents: Text) -> List[Dict[Text, Text]]:
        import jieba.posseg as pseg
        toks = pseg.cut(sents)
        terms = []
        for i, (word, flag) in enumerate(toks):
            if not isinstance(flag, str):
                ref = self.stringstore[flag]
            else:
                ref = flag
            terms.append({'term': ref, 'value': word})
        return terms

    def parse(self, sents: Text) -> Docz:
        terms = self.tokenize(sents)
        words = [w['value'] for w in terms]
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        netags = self.recognizer.recognize(words, postags)

        # terms=list(filter(lambda x: x['term'] in terms_list, terms))
        return Docz(words, postags, arcs, roles, netags, terms)

    def vis(self, doc):
        from graphviz import Digraph
        f = Digraph('deps', filename='deps.gv')
        f.attr(rankdir='LR', size='8,5')
        f.attr('node', shape='egg', fontname='Calibri')
        for i in range(len(doc.words)):
            idx = int(doc.arcs[i].head) - 1
            if idx == -1:
                continue
            a = doc.words[idx]
            print("%s --> %s|%s|%s|%s" %
                  (a, doc.words[i], doc.arcs[i].relation, doc.postags[i],
                   doc.netags[i]))
            f.edge(a, doc.words[i], label=doc.arcs[i].relation.lower())
        return f