Python Pipe.add_ner 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: medcat.pipe

클래스/타입: Pipe

메소드/함수: add_ner

hotexamples.com에서의 예제들: 3

Python Pipe.add_ner - 3개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python medcat.pipe.Pipe.add_ner의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Pipe(7)

add_tagger(5)

destroy(4)

add_linker(3)

add_ner(3)

add_token_normalizer(3)

add_meta_cat(1)

예제 #1

파일 보기

파일: test_ner_archive.py 프로젝트: tomolopolis/MedCAT-1

class NerArchiveTests(unittest.TestCase):
    """Exercise NER and linking on a tiny hand-built concept database.

    ``setUp`` registers three names in a fresh CDB, makes sure a local copy
    of the vocabulary exists and assembles the pipeline
    tokenizer -> skip/punct tagger -> token normalizer -> NER -> linker.
    The individual tests then tweak config values and check how many
    entities are detected in ``self.text``.
    """

    # Where the vocabulary is fetched from when no local copy exists
    VOCAB_URL = "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat"
    # Seconds passed to requests as ``timeout`` so a dead server cannot hang the suite
    VOCAB_TIMEOUT = 60

    def _make_base_pipe(self):
        """Return a new ``Pipe`` with only the tokenizer and the skip/punct tagger."""
        pipe = Pipe(tokenizer=spacy_split_all, config=self.config)
        pipe.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])
        return pipe

    def _ensure_vocab(self, path):
        """Download the vocabulary to ``path`` unless the file already exists.

        The response status is checked and the payload is written to a
        temporary file that is renamed into place, so an HTTP error page or
        an interrupted write never ends up cached under ``path``.

        Raises:
            requests.RequestException: if the download fails or times out.
        """
        if os.path.exists(path):
            return

        import requests
        response = requests.get(self.VOCAB_URL, timeout=self.VOCAB_TIMEOUT)
        response.raise_for_status()

        partial_path = path + ".part"
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, path)

    def setUp(self) -> None:
        self.config = Config()
        self.config.general['log_level'] = logging.INFO
        cdb = CDB(config=self.config)

        # prepare_name needs a tokenizing pipeline, the bare one is enough
        self.nlp = self._make_base_pipe()

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        # Disabled check kept from the original for reference:
        # cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

        self.vocab_path = "./tmp_vocab.dat"
        self._ensure_vocab(self.vocab_path)
        vocab = Vocab.load(self.vocab_path)

        # Make the full pipeline; it replaces the bare one used above
        self.nlp = self._make_base_pipe()
        spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                          config=self.config,
                                          data_vocab=vocab)
        self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                      config=self.config)
        ner = NER(cdb, self.config)
        self.nlp.add_ner(ner)

        # Add Linker
        link = Linker(cdb, vocab, self.config)
        self.nlp.add_linker(link)

        self.text = "CDB - I was running and then Movar    Virus attacked and CDb"

    def tearDown(self) -> None:
        self.nlp.destroy()

    def test_limits_for_tokens_and_uppercase(self):
        self.config.ner['max_skip_tokens'] = 1
        self.config.ner['upper_case_limit_len'] = 4
        self.config.linking['disamb_length_limit'] = 2

        d = self.nlp(self.text)

        assert len(d._.ents) == 2
        assert d._.ents[0]._.link_candidates[0] == 'S-229004'

    def test_change_limit_for_skip(self):
        self.config.ner['max_skip_tokens'] = 3
        d = self.nlp(self.text)
        assert len(d._.ents) == 3

    def test_change_limit_for_upper_case(self):
        self.config.ner['upper_case_limit_len'] = 3
        d = self.nlp(self.text)
        assert len(d._.ents) == 4

    def test_check_name_length_limit(self):
        self.config.ner['min_name_len'] = 4
        d = self.nlp(self.text)
        assert len(d._.ents) == 2

    def test_speed(self):
        # Timing only: prints the elapsed time, asserts nothing
        text = "CDB - I was running and then Movar    Virus attacked and CDb"
        text = text * 300
        self.config.general['spell_check'] = True
        start = timer()
        for _ in range(50):
            self.nlp(text)
        end = timer()
        print("Time: ", end - start)

    def test_without_spell_check(self):
        # Timing only, this time with spell checking switched off
        self.config.general['spell_check'] = False
        start = timer()
        for _ in range(50):
            self.nlp(self.text)
        end = timer()
        print("Time: ", end - start)

    def test_for_linker(self):
        # NOTE(review): self.config is rebound here, while self.nlp was built
        # in setUp with the previous Config object - confirm this is intended.
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        cdb = CDB(config=self.config)

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        # Second concept sharing the name 'Movar', used for disambiguation
        cdb.add_names(cui='S-2290045',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        # Disabled check kept from the original for reference:
        # cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

        # Give every concept random context vectors so similarity can be computed
        cuis = list(cdb.cui2names.keys())
        for cui in cuis[0:50]:
            vectors = {
                'short': np.random.rand(300),
                'long': np.random.rand(300),
                'medium': np.random.rand(300)
            }
            cdb.update_context_vector(cui, vectors, negative=False)

        d = self.nlp(self.text)
        vocab = Vocab.load(self.vocab_path)
        cm = ContextModel(cdb, vocab, self.config)
        cm.train_using_negative_sampling('S-229004')
        self.config.linking['train_count_threshold'] = 0

        # Smoke test: these calls only need to run without raising
        cm.train('S-229004', d._.ents[1], d)

        cm.similarity('S-229004', d._.ents[1], d)

        cm.disambiguate(['S-2290045', 'S-229004'], d._.ents[1], 'movar', d)

예제 #2

파일 보기

class CAT(object):
    r'''
    The main MedCAT class used to annotate documents. It is built on top of spaCy
    and works as a spaCy pipeline: it creates an instance of a spaCy pipeline that
    can be used as a spaCy nlp model.

    Args:
        cdb (medcat.cdb.CDB):
            The concept database that will be used for NER+L
        config (medcat.config.Config):
            Global configuration for medcat
        vocab (medcat.vocab.Vocab):
            Vocabulary used for vector embeddings and spelling.
        meta_cats (list of medcat.meta_cat.MetaCAT, optional):
            A list of models that will be applied sequentially on each
            detected annotation.

    Attributes (limited):
        cdb (medcat.cdb.CDB):
            Concept database used with this CAT instance, please do not assign
            this value directly.
        config (medcat.config.Config):
            The global configuration for medcat. Usually cdb.config can be used for this
            field. The original note on this attribute reads: WILL BE REMOVED -
            TEMPORARY PLACEHOLDER.
        vocab (medcat.utils.vocab.Vocab):
            The vocabulary object used with this instance, please do not assign
            this value directly.

    Examples:
        >>> cat = CAT(cdb, config, vocab)
        >>> spacy_doc = cat("Put some text here")
        >>> print(spacy_doc.ents) # Detected entities
    '''
    # Class-level logger shared by all CAT instances
    log = logging.getLogger(__package__)
    # Add file and console handlers
    log = add_handlers(log)

    def __init__(self, cdb, config, vocab, meta_cats=None):
        r''' Store the resources and assemble the annotation pipeline.

        The pipeline is built in this order: tokenizer, skip/punct tagger,
        token normalizer (spell checker), NER, linker and finally one
        component per entry of `meta_cats`.

        Args:
            cdb:
                The concept database handed to the spell checker, NER and linker.
            config:
                Configuration shared by every pipeline component.
            vocab:
                Vocabulary handed to the spell checker and the linker.
            meta_cats (iterable, optional):
                Meta annotation models, each must expose `category_name`.
                `None` (the default) means no meta annotation models.
        '''
        self.cdb = cdb
        self.vocab = vocab
        # The config is the one passed in, it is not read from the cdb here
        self.config = config

        # Set log level
        self.log.setLevel(self.config.general['log_level'])

        # Build the pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=partial(tag_skip_and_punct,
                                           config=self.config),
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        spell_checker = BasicSpellChecker(cdb_vocab=self.cdb.vocab,
                                          config=self.config,
                                          data_vocab=vocab)
        self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                      config=self.config)

        # Add NER
        self.ner = NER(self.cdb, self.config)
        self.nlp.add_ner(self.ner)

        # Add LINKER
        self.linker = Linker(self.cdb, vocab, self.config)
        self.nlp.add_linker(self.linker)

        # Add meta annotation classes if they exist; the flag records whether
        # at least one was registered
        self._meta_annotations = False
        for meta_cat in (meta_cats if meta_cats is not None else ()):
            self.nlp.add_meta_cat(meta_cat, meta_cat.category_name)
            self._meta_annotations = True

        # Set max document length
        self.nlp.nlp.max_length = self.config.preprocessing.get(
            'max_document_length', 1000000)

    def get_spacy_nlp(self):
        ''' Return the `nlp` object held by this instance's pipeline wrapper
        (`self.nlp.nlp`), i.e. the spacy pipeline with MedCAT.
        '''
        pipe_wrapper = self.nlp
        return pipe_wrapper.nlp

    def __call__(self, text, do_train=False):
        r'''
        Annotate `text` by running it through the pipeline.

        Args:
            text (string):
                The text to be annotated. Anything beyond
                self.config.preprocessing['max_document_length'] characters
                (1000000 when the option is not set) is cut off.
            do_train (bool, defaults to `False`):
                Written to self.config.linking['train'] on every call, so
                training is off unless explicitly requested. Prefer the
                self.train() function for training; this switch exists for
                special cases only.
        Returns:
            A spacy document with the extracted entities, or `None` when
            `text` is empty.
        '''
        # The training switch is updated even when there is nothing to annotate
        self.config.linking['train'] = do_train

        if not text or len(text) <= 0:
            return None

        limit = self.config.preprocessing.get('max_document_length', 1000000)
        return self.nlp(text[0:limit])

    def _print_stats(self,
                     data,
                     epoch=0,
                     use_filters=False,
                     use_overlaps=False,
                     use_cui_doc_limit=False,
                     use_groups=False):
        r''' Print metrics on a dataset (F1, P, R), it will also print the concepts that have the most FP,FN,TP.

        Every document is annotated with this instance and the predictions are
        compared with the manual annotations on (start offset, cui) pairs.
        self.config.linking['filters'] may be changed while the data is scanned;
        the original filters are put back before returning, also when an
        exception interrupts the scan.

        Args:
            data (list of dict):
                The json object that we get from MedCATtrainer on export.
            epoch (int):
                Used during training, so we know what epoch is it.
            use_filters (boolean):
                Each project in medcattrainer can have filters, do we want to respect those filters
                when calculating metrics.
            use_overlaps (boolean):
                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
            use_cui_doc_limit (boolean):
                If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
                if the document was annotated for that CUI. Useful in very specific situations when during the annotation
                process the set of CUIs changed.
            use_groups (boolean):
                If True concepts that have groups will be combined and stats will be reported on groups.

        Returns:
            fps (dict):
                False positives for each CUI
            fns (dict):
                False negatives for each CUI
            tps (dict):
                True positives for each CUI
            cui_prec (dict):
                Precision for each CUI
            cui_rec (dict):
                Recall for each CUI
            cui_f1 (dict):
                F1 for each CUI
            cui_counts (dict):
                Number of occurrence for each CUI
            examples (dict):
                Examples for each of the fp, fn, tp. Format will be examples['fp']['cui'][<list_of_examples>]
        '''
        def _prf(n_tp, n_fp, n_fn):
            # Precision, recall and F1; an empty denominator gives 0.0 so that
            # a run without true positives still produces a report
            n_pred = n_tp + n_fp
            n_gold = n_tp + n_fn
            prec = n_tp / n_pred if n_pred else 0.0
            rec = n_tp / n_gold if n_gold else 0.0
            f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) else 0.0
            return prec, rec, f1

        def _by_count_desc(counts):
            # Same mapping, ordered from the highest count to the lowest
            return dict(
                sorted(counts.items(), key=lambda item: item[1], reverse=True))

        def _display_name(cui):
            # Preferred name if there is one, else any known name, else the cui
            fallback = next(iter(self.cdb.cui2names.get(cui, [cui])), cui)
            return self.cdb.cui2preferred_name.get(cui, fallback)

        def _top_rows(counts):
            # (name, cui, count) for the first ten entries of `counts`
            return [(_display_name(cui), cui, counts[cui])
                    for cui in list(counts)[0:10]]

        tp = 0
        fp = 0
        fn = 0
        fps = {}
        fns = {}
        tps = {}
        cui_prec = {}
        cui_rec = {}
        cui_f1 = {}
        cui_counts = {}
        examples = {'fp': {}, 'fn': {}, 'tp': {}}

        fp_docs = set()
        fn_docs = set()
        # Backup for filters
        _filters = deepcopy(self.config.linking['filters'])
        # Shortcut for filters
        filters = self.config.linking['filters']

        try:
            projects = data['projects']
            for pind, project in tqdm(enumerate(projects),
                                      desc="Stats project",
                                      total=len(projects),
                                      leave=False):
                if use_filters:
                    project_cuis = project.get('cuis', None)
                    if isinstance(project_cuis, str):
                        # Old filters
                        filters['cuis'] = process_old_project_filters(
                            cuis=project_cuis,
                            type_ids=project.get('tuis', None),
                            cdb=self.cdb)
                    elif isinstance(project_cuis, list):
                        # New filters
                        filters['cuis'] = project_cuis

                documents = project['documents']
                for dind, doc in tqdm(enumerate(documents),
                                      desc='Stats document',
                                      total=len(documents),
                                      leave=False):
                    # Annotations come either as a list or as a dict of them
                    raw_anns = doc['annotations']
                    if isinstance(raw_anns, dict):
                        anns = list(raw_anns.values())
                    else:
                        anns = list(raw_anns)

                    # Apply document level filtering if requested
                    if use_cui_doc_limit:
                        _cuis = {ann['cui'] for ann in anns}
                        if _cuis:
                            filters['cuis'] = _cuis

                    text = doc['text']
                    doc_name = doc.get('name', 'unk')
                    spacy_doc = self(text)

                    if spacy_doc is None:
                        # Calling self on empty text returns None: no predictions
                        p_anns = []
                    elif use_overlaps:
                        p_anns = spacy_doc._.ents
                    else:
                        p_anns = spacy_doc.ents

                    # Manual annotations as (start, cui) pairs
                    anns_norm = []
                    anns_norm_neg = []
                    anns_examples = []
                    for ann in anns:
                        cui = ann['cui']
                        if use_filters and not check_filters(cui, filters):
                            continue
                        if use_groups:
                            cui = self.cdb.addl_info['cui2group'].get(cui, cui)
                        if not ann.get('validated', True):
                            continue

                        if ann.get('killed', False) or ann.get('deleted',
                                                                False):
                            anns_norm_neg.append((ann['start'], cui))
                        else:
                            anns_norm.append((ann['start'], cui))
                            anns_examples.append({
                                "text":
                                text[max(0, ann['start'] - 60):ann['end'] +
                                     60],
                                "cui":
                                cui,
                                "source value":
                                ann['value'],
                                "acc":
                                1,
                                "project index":
                                pind,
                                # Misspelled key kept as is, consumers may read it
                                "document inedex":
                                dind
                            })

                        # Counts every validated annotation that passed the filters
                        cui_counts[cui] = cui_counts.get(cui, 0) + 1

                    gold = set(anns_norm)
                    gold_neg = set(anns_norm_neg)

                    # Score the predictions against the manual annotations
                    predicted = set()
                    for ann in p_anns:
                        cui = ann._.cui
                        if use_groups:
                            cui = self.cdb.addl_info['cui2group'].get(cui, cui)

                        key = (ann.start_char, cui)
                        predicted.add(key)
                        example = {
                            "text":
                            text[max(0, ann.start_char - 60):ann.end_char +
                                 60],
                            "cui":
                            cui,
                            "source value":
                            ann.text,
                            "acc":
                            float(ann._.context_similarity),
                            "project index":
                            pind,
                            # Misspelled key kept as is, consumers may read it
                            "document inedex":
                            dind
                        }

                        if key in gold:
                            tp += 1
                            tps[cui] = tps.get(cui, 0) + 1
                            examples['tp'].setdefault(cui, []).append(example)
                        else:
                            fp += 1
                            fps[cui] = fps.get(cui, 0) + 1
                            fp_docs.add(doc_name)

                            if key in gold_neg:
                                # Means that it really was annotated as negative
                                example['real_fp'] = True

                            examples['fp'].setdefault(cui, []).append(example)

                    # Manual annotations that were not predicted
                    for key, example in zip(anns_norm, anns_examples):
                        if key not in predicted:
                            cui = key[1]
                            fn += 1
                            fn_docs.add(doc_name)

                            fns[cui] = fns.get(cui, 0) + 1
                            examples['fn'].setdefault(cui, []).append(example)
        finally:
            # Put the original filters back even if the scan was interrupted
            self.config.linking['filters'] = _filters

        # Reporting is best effort: a failure here is printed and the numbers
        # collected so far are still returned
        try:
            prec, rec, f1 = _prf(tp, fp, fn)
            print("Epoch: {}, Prec: {}, Rec: {}, F1: {}\n".format(
                epoch, prec, rec, f1))
            print("Docs with false positives: {}\n".format("; ".join(
                [str(x) for x in list(fp_docs)[0:10]])))
            print("Docs with false negatives: {}\n".format("; ".join(
                [str(x) for x in list(fn_docs)[0:10]])))

            # Sort by count, highest first
            fps = _by_count_desc(fps)
            fns = _by_count_desc(fns)
            tps = _by_count_desc(tps)

            # F1 per concept, only for concepts with at least one true positive
            for cui in tps:
                c_prec, c_rec, c_f1 = _prf(tps[cui], fps.get(cui, 0),
                                           fns.get(cui, 0))
                cui_prec[cui] = c_prec
                cui_rec[cui] = c_rec
                cui_f1[cui] = c_f1

            # Get top 10
            tables = [
                ("\n\nFalse Positives\n", _top_rows(fps)),
                ("\n\nFalse Negatives\n", _top_rows(fns)),
                ("\n\nTrue Positives\n", _top_rows(tps)),
            ]
            for title, rows in tables:
                print(title)
                for one in rows:
                    print("{:70} - {:20} - {:10}".format(
                        str(one[0])[0:69],
                        str(one[1])[0:19], one[2]))
            print("*" * 110 + "\n")

        except Exception:
            traceback.print_exc()

        return fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples

    def train(self, data_iterator, fine_tune=True, progress_print=1000):
        """ Run training over every non-empty item of `data_iterator`.

        Each item is converted to a string, stripped and pushed through the
        pipeline with training switched on. An item that raises is logged and
        skipped. Note that the maximum length of a line or document is 1M
        characters, anything longer will be trimmed.

        data_iterator:
            Simple iterator over sentences/documents, e.g. an open file
            or an array or anything that we can use in a for loop.
        fine_tune:
            If False old training will be removed
        progress_print:
            Print progress after N lines
        """
        if not fine_tune:
            self.log.info("Removing old training data!")
            self.cdb.reset_training()

        processed = 0
        for raw_line in data_iterator:
            # Nothing to learn from missing or empty items, they are not counted
            if raw_line is None or not raw_line:
                continue

            # Convert to string
            text = str(raw_line).strip()

            try:
                self(text, do_train=True)
            except Exception as err:
                self.log.warning("LINE: '{}...' \t WAS SKIPPED".format(
                    text[0:100]))
                self.log.warning("BECAUSE OF: " + str(err))

            if processed % progress_print == 0:
                self.log.info("DONE: " + str(processed))
            processed += 1

        # Leave the training switch off once all items were seen
        self.config.linking['train'] = False

    def add_cui_to_group(self, cui, group_name, reset_all_groups=False):
        r'''
        Assign a CUI to a group by writing cdb.addl_info['cui2group'][cui].

        Args:
            cui (str):
                The concept to be added
            group_name (str):
                The group to which the concept will be added
            reset_all_groups (boolean):
                If True the whole cui2group mapping is emptied before the
                new entry is written.

        Examples:
            >>> cat.add_cui_to_group("S-17", 'pain')
        '''
        addl_info = self.cdb.addl_info

        # Start from an empty mapping when asked to
        if reset_all_groups:
            addl_info['cui2group'] = {}

        addl_info['cui2group'][cui] = group_name

    def unlink_concept_name(self, cui, name, preprocessed_name=False):
        r'''
        Remove the link between a name and a CUI in the Concept Database (CDB).
        When the config option general['full_unlink'] is set, the name is also
        removed from every other CUI that name2cuis lists for it. As a
        consequence medcat will never again link the `name` to this CUI.

        Args:
            cui (str):
                The CUI from which the `name` will be removed
            name (str):
                The span of text to be removed from the linking dictionary
            preprocessed_name (bool):
                If True `name` is used as given, otherwise it is first run
                through prepare_name.
        Examples:
            >>> # To never again link C0020538 to HTN
            >>> cat.unlink_concept_name('C0020538', 'htn', False)
        '''
        if preprocessed_name:
            names = {name: 'nothing'}
        else:
            names = prepare_name(name, self, {}, self.config)

        # The requested CUI always comes first
        targets = [cui]

        # If full unlink find all CUIs
        if self.config.general.get('full_unlink', False):
            for prepared in names:
                targets.extend(self.cdb.name2cuis.get(prepared, []))

        # Remove name from all CUIs
        for target in targets:
            self.cdb.remove_names(cui=target, names=names)

    def add_and_train_concept(self,
                              cui,
                              name,
                              spacy_doc=None,
                              spacy_entity=None,
                              ontologies=None,
                              name_status='A',
                              type_ids=None,
                              description='',
                              full_build=True,
                              negative=False,
                              devalue_others=False,
                              do_add_concept=True):
        r''' Add a name to an existing concept, or add a new concept, or do not do anything if the name and concept already exist. Perform
        training if spacy_entity and spacy_doc are set.

        Args:
            cui (str):
                CUI of the concept
            name (str):
                Name to be linked to the concept (in the case of MedCATtrainer this is simply the
                selected value in text, no preprocessing or anything needed).
            spacy_doc (spacy.tokens.Doc):
                Spacy representation of the document that was manually annotated.
            spacy_entity (List[spacy.tokens.Token]):
                Given the spacy document, this is the annotated span of text - list of annotated tokens that are marked with this CUI.
            ontologies (set, optional):
                Passed on to CDB.add_concept, a new empty set is used when not given.
            type_ids (set, optional):
                Passed on to CDB.add_concept, a new empty set is used when not given.
            negative (bool):
                Is this a negative or positive example.
            devalue_others:
                If set, cuis to which this name is assigned and are not `cui` will receive negative training given
                that negative=False.
            do_add_concept (bool):
                If False the CDB.add_concept call is skipped and only training is done.

            **other:
                Refer to CDB.add_concept
        '''
        # A new set per call, so nothing handed to the CDB is shared between calls
        if ontologies is None:
            ontologies = set()
        if type_ids is None:
            type_ids = set()

        names = prepare_name(name, self, {}, self.config)
        if do_add_concept:
            self.cdb.add_concept(cui=cui,
                                 names=names,
                                 ontologies=ontologies,
                                 name_status=name_status,
                                 type_ids=type_ids,
                                 description=description,
                                 full_build=full_build)

        # Training needs both the document and the annotated entity
        if spacy_entity is None or spacy_doc is None:
            return

        # Train Linking
        self.linker.context_model.train(cui=cui,
                                        entity=spacy_entity,
                                        doc=spacy_doc,
                                        negative=negative,
                                        names=names)

        if not negative and devalue_others:
            # Find all cuis
            other_cuis = set()
            for prepared in names:
                other_cuis.update(self.cdb.name2cuis.get(prepared, []))
            # Remove the cui for which we just added positive training; it may
            # be absent (e.g. do_add_concept=False), which must not be an error
            other_cuis.discard(cui)
            # Add negative training for all other CUIs that link to these names
            for _cui in other_cuis:
                self.linker.context_model.train(cui=_cui,
                                                entity=spacy_entity,
                                                doc=spacy_doc,
                                                negative=True)

    def train_supervised(self,
                         data_path,
                         reset_cui_count=False,
                         nepochs=1,
                         print_stats=0,
                         use_filters=False,
                         terminate_last=False,
                         use_overlaps=False,
                         use_cui_doc_limit=False,
                         test_size=0,
                         devalue_others=False,
                         use_groups=False,
                         never_terminate=False,
                         train_from_false_positives=False):
        r''' TODO: Refactor, left from old
        Run supervised training on a dataset from MedCATtrainer. Please take care that this is more a simiulated
        online training then supervised.

        Args:
            data_path (str):
                The path to the json file that we get from MedCATtrainer on export.
            reset_cui_count (boolean):
                Used for training with weight_decay (annealing). Each concept has a count that is there
                from the beginning of the CDB, that count is used for annealing. Resetting the count will
                significantly incrase the training impact. This will reset the count only for concepts
                that exist in the the training data.
            nepochs (int):
                Number of epochs for which to run the training.
            print_stats (int):
                If > 0 it will print stats every print_stats epochs.
            use_filters (boolean):
                Each project in medcattrainer can have filters, do we want to respect those filters
                when calculating metrics.
            terminate_last (boolean):
                If true, concept termination will be done after all training.
            use_overlaps (boolean):
                Allow overlapping entites, nearly always False as it is very difficult to annotate overlapping entites.
            use_cui_doc_limit (boolean):
                If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
                if the document was annotated for that CUI. Useful in very specific situations when during the annotation
                process the set of CUIs changed.
            test_size (float):
                If > 0 the data set will be split into train test based on this ration. Should be between 0 and 1.
                Usually 0.1 is fine.
            devalue_others(bool):
                Check add_name for more details.
            use_groups (boolean):
                If True concepts that have groups will be combined and stats will be reported on groups.
            never_terminate (boolean):
                If True no termination will be applied
            train_from_false_positives (boolean):
                If True it will use false positive examples detected by medcat and train from them as negative examples.

        Returns:
            fp (dict):
                False positives for each CUI
            fn (dict):
                False negatives for each CUI
            tp (dict):
                True positives for each CUI
            p (dict):
                Precision for each CUI
            r (dict):
                Recall for each CUI
            f1 (dict):
                F1 for each CUI
            cui_counts (dict):
                Number of occurrence for each CUI
            examples (dict):
                FP/FN examples of sentences for each CUI
        '''
        fp = fn = tp = p = r = f1 = cui_counts = examples = {}
        data = json.load(open(data_path))
        cui_counts = {}

        if test_size == 0:
            self.log.info("Running without a test set, or train=test")
            test_set = data
            train_set = data
        else:
            train_set, test_set, _, _ = make_mc_train_test(data,
                                                           self.cdb,
                                                           test_size=test_size)

        if print_stats > 0:
            self._print_stats(test_set,
                              use_filters=use_filters,
                              use_cui_doc_limit=use_cui_doc_limit,
                              use_overlaps=use_overlaps,
                              use_groups=use_groups)

        if reset_cui_count:
            # Get all CUIs
            cuis = []
            for project in train_set['projects']:
                for doc in project['documents']:
                    if type(doc['annotations']) == list:
                        doc_annotations = doc['annotations']
                    elif type(doc['annotations']) == dict:
                        doc_annotations = doc['annotations'].values()

                    for ann in doc_annotations:
                        cuis.append(ann['cui'])
            for cui in set(cuis):
                if cui in self.cdb.cui2count_train:
                    self.cdb.cui2count_train[cui] = 10

        # Remove entities that were terminated
        if not never_terminate:
            for project in train_set['projects']:
                for doc in project['documents']:
                    if type(doc['annotations']) == list:
                        doc_annotations = doc['annotations']
                    elif type(doc['annotations']) == dict:
                        doc_annotations = doc['annotations'].values()

                    for ann in doc_annotations:
                        if ann.get('killed', False):
                            self.unlink_concept_name(ann['cui'], ann['value'])

        for epoch in tqdm(range(nepochs), desc='Epoch', leave=False):
            # Print acc before training
            for project in tqdm(train_set['projects'],
                                desc='Project',
                                leave=False,
                                total=len(train_set['projects'])):
                for i_doc, doc in tqdm(enumerate(project['documents']),
                                       desc='Document',
                                       leave=False,
                                       total=len(project['documents'])):
                    spacy_doc = self(doc['text'])
                    # Compatibility with old output where annotations are a list
                    if type(doc['annotations']) == list:
                        doc_annotations = doc['annotations']
                    elif type(doc['annotations']) == dict:
                        doc_annotations = doc['annotations'].values()

                    for ann in doc_annotations:
                        if not ann.get('killed', False):
                            cui = ann['cui']
                            start = ann['start']
                            end = ann['end']
                            spacy_entity = tkns_from_doc(spacy_doc=spacy_doc,
                                                         start=start,
                                                         end=end)
                            deleted = ann.get('deleted', False)
                            self.add_and_train_concept(
                                cui=cui,
                                name=ann['value'],
                                spacy_doc=spacy_doc,
                                spacy_entity=spacy_entity,
                                negative=deleted,
                                devalue_others=devalue_others)
                    if train_from_false_positives:
                        fps = get_false_positives(doc, spacy_doc)

                        for fp in fps:
                            self.add_and_train_concept(cui=fp._.cui,
                                                       name=fp.text,
                                                       spacy_doc=spacy_doc,
                                                       spacy_entity=fp,
                                                       negative=True,
                                                       do_add_concept=False)

            if terminate_last and not never_terminate:
                # Remove entities that were terminated, but after all training is done
                for project in train_set['projects']:
                    for doc in project['documents']:
                        if type(doc['annotations']) == list:
                            doc_annotations = doc['annotations']
                        elif type(doc['annotations']) == dict:
                            doc_annotations = doc['annotations'].values()

                        for ann in doc_annotations:
                            if ann.get('killed', False):
                                self.unlink_concept_name(
                                    ann['cui'], ann['value'])

            if print_stats > 0 and (epoch + 1) % print_stats == 0:
                fp, fn, tp, p, r, f1, cui_counts, examples = self._print_stats(
                    test_set,
                    epoch=epoch + 1,
                    use_filters=use_filters,
                    use_cui_doc_limit=use_cui_doc_limit,
                    use_overlaps=use_overlaps,
                    use_groups=use_groups)
        return fp, fn, tp, p, r, f1, cui_counts, examples

    def get_entities(self,
                     text,
                     only_cui=False,
                     addl_info=['cui2icd10', 'cui2ontologies']):
        r''' Annotate `text` and return the detected entities as a dictionary.

        Args:
            text (str):
                Text to be annotated.
            only_cui (boolean):
                If True each entity is reported only as its CUI, otherwise a
                dictionary describing the entity is built.
            addl_info (list of str):
                Names of the tables in `self.cdb.addl_info` in which the CUI is
                looked up. The part of the name after the last "2" is used as the
                output key (e.g. 'cui2icd10' -> 'icd10'). A table that is missing
                from the CDB yields an empty list instead of a KeyError. The
                default list is only read, never modified.

        Returns:
            out (dict):
                {'entities': {<entity id>: <CUI or entity dict>}, 'tokens': [...]}.
                'tokens' is filled only when `doc_extended_info` is set in the
                `annotation_output` config; if the pipeline returns None both
                fields stay empty.
        '''
        cnf_annotation_output = getattr(self.config, 'annotation_output', {})
        doc = self(text)
        out = {'entities': {}, 'tokens': []}
        if doc is None:
            return out

        if self.config.general.get('show_nested_entities', False):
            _ents = doc._.ents
        else:
            _ents = doc.ents

        if cnf_annotation_output.get("lowercase_context", True):
            doc_tokens = [tkn.text_with_ws.lower() for tkn in doc]
        else:
            doc_tokens = [tkn.text_with_ws for tkn in doc]

        context_left = cnf_annotation_output.get('context_left', -1)
        context_right = cnf_annotation_output.get('context_right', -1)
        doc_extended_info = cnf_annotation_output.get(
            'doc_extended_info', False)

        if doc_extended_info:
            # Add tokens if extended info
            out['tokens'] = doc_tokens

        for ent in _ents:
            cui = str(ent._.cui)
            if only_cui:
                out['entities'][ent._.id] = cui
                continue

            cdb_addl_info = self.cdb.addl_info
            # Tolerate a CDB without a type-name table
            type_id2name = cdb_addl_info.get('type_id2name', {})
            tuis = list(self.cdb.cui2type_ids.get(cui, ''))

            # A fresh dictionary per entity, so nothing can leak between entities
            out_ent = {
                'pretty_name': self.cdb.cui2preferred_name.get(cui, ''),
                'cui': cui,
                'tuis': tuis,
                'types': [type_id2name.get(tui, '') for tui in tuis],
                'source_value': ent.text,
                'detected_name': str(ent._.detected_name),
                'acc': float(ent._.context_similarity),
                'context_similarity': float(ent._.context_similarity),
                'start': ent.start_char,
                'end': ent.end_char,
            }

            for addl in addl_info:
                # Missing tables are treated as "no information for this CUI"
                tmp = cdb_addl_info.get(addl, {}).get(cui, [])
                # Sets cannot be serialised to json, convert them to lists
                if isinstance(tmp, (set, frozenset)):
                    tmp = list(tmp)
                out_ent[addl.split("2")[-1]] = tmp

            out_ent['id'] = ent._.id
            out_ent['meta_anns'] = {}

            if doc_extended_info:
                out_ent['start_tkn'] = ent.start
                out_ent['end_tkn'] = ent.end

            # Context is added only if both window sizes are configured
            if context_left > 0 and context_right > 0:
                out_ent['context_left'] = doc_tokens[
                    max(ent.start - context_left, 0):ent.start]
                out_ent['context_right'] = doc_tokens[
                    ent.end:min(ent.end + context_right, len(doc_tokens))]
                out_ent['context_center'] = doc_tokens[ent.start:ent.end]

            if hasattr(ent._, 'meta_anns') and ent._.meta_anns:
                out_ent['meta_anns'] = ent._.meta_anns

            out['entities'][out_ent['id']] = out_ent

        return out

    def get_json(self,
                 text,
                 only_cui=False,
                 addl_info=['cui2icd10', 'cui2ontologies']):
        """ Annotate the text and serialise the result as a json string

        text:  text to be annotated
        only_cui:  forwarded to get_entities
        addl_info:  forwarded to get_entities
        return:  json string with fields {'annotations': <entities>, 'text': text}
        """
        annotated = self.get_entities(text, only_cui, addl_info=addl_info)
        payload = {'annotations': annotated['entities'], 'text': text}
        return json.dumps(payload)

    def multiprocessing(self,
                        in_data,
                        nproc=8,
                        batch_size=100,
                        only_cui=False,
                        addl_info=[]):
        r''' Annotate documents in parallel worker processes. NOT FOR TRAINING

        Args:
            in_data (iterable):
                An iterator or array with format: [(id, text), (id, text), ...]
            nproc (int):
                Number of worker processes to start.
            batch_size (int):
                Number of documents handed to a worker in one batch.
            only_cui (boolean):
                Given to every worker (`_mp_cons`) together with the batches.
            addl_info (list of str):
                Given to every worker (`_mp_cons`) together with the batches.
                The default list is only read, never modified.

        Returns:
            out (list of tuples):
                [(id, doc_json), (id, doc_json), ...] collected from all workers.
                Results are grouped per worker, so the order of `in_data` is
                not preserved.
        '''
        if self._meta_annotations:
            # Hack for torch using multithreading, which is not good here
            import torch
            torch.set_num_threads(1)

        # Create the input output for MP, the queue is bounded so that
        #the producer blocks instead of loading all of in_data at once
        in_q = Queue(maxsize=4 * nproc)
        manager = Manager()
        out_dict = manager.dict()
        out_dict['processed'] = []

        procs = []
        try:
            # Create processes
            for i in range(nproc):
                p = Process(target=self._mp_cons,
                            kwargs={
                                'in_q': in_q,
                                'out_dict': out_dict,
                                'pid': i,
                                'only_cui': only_cui,
                                'addl_info': addl_info
                            })
                p.start()
                procs.append(p)

            batch = []
            for doc_id, text in in_data:
                batch.append((doc_id, str(text)))
                if len(batch) == batch_size:
                    in_q.put(batch)
                    batch = []
            # Put the last batch if it exists
            if batch:
                in_q.put(batch)
        finally:
            # Always tell the started workers we're done and wait for them,
            #otherwise an error while reading in_data would leave them
            #waiting for input forever.
            for _ in procs:
                in_q.put(None)

            for p in procs:
                p.join()

            # Close the queue as it can cause memory leaks
            in_q.close()

        out = []
        for key in out_dict.keys():
            # Workers store their results under 'pid: <n>'
            if 'pid' in key:
                out.extend(out_dict[key])

        # Sometimes necessary to free memory
        out_dict.clear()
        del out_dict

        return out

    def _mp_cons(self, in_q, out_dict, pid=0, only_cui=False, addl_info=[]):
        cnt = 0
        out = []
        while True:
            if not in_q.empty():
                data = in_q.get()
                if data is None:
                    out_dict['pid: {}'.format(pid)] = out
                    break

                for id, text in data:
                    try:
                        doc = self.get_entities(text=text,
                                                only_cui=only_cui,
                                                addl_info=addl_info)
                        doc['text'] = text
                        out.append((id, doc))
                    except Exception as e:
                        self.log.warning("Exception in _mp_cons")
                        self.log.warning(e, stack_info=True)

            sleep(1)

예제 #3

파일 보기

파일: test_ner_archive.py 프로젝트: piotrekwoznicki/MedCAT

    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

# Load the vocabulary stored at vocab_path
vocab = Vocab.load(vocab_path)
# Make the pipeline
nlp = Pipe(tokenizer=spacy_split_all, config=config)
# Tagger step, registered under the name 'skip_and_punct'
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])
# Token normalizer with a spell checker that gets both the CDB vocab and the
# vocabulary loaded above
spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                  config=config,
                                  data_vocab=vocab)
nlp.add_token_normalizer(spell_checker=spell_checker, config=config)
# Add NER
ner = NER(cdb, config)
nlp.add_ner(ner)

# Add Linker
link = Linker(cdb, vocab, config)
nlp.add_linker(link)

# Test limits for tokens and uppercase
# (the config is changed before the pipeline is called on the text)
config.ner['max_skip_tokens'] = 1
config.ner['upper_case_limit_len'] = 4
config.linking['disamb_length_limit'] = 2
text = "CDB - I was running and then Movar    Virus attacked and CDb"
d = nlp(text)

# Two entities are expected, and the first link candidate of the first
# entity has to be the CUI 'S-229004'
assert len(d._.ents) == 2
assert d._.ents[0]._.link_candidates[0] == 'S-229004'