Example #1
    def collect_dimensions(self, ann_dir):
        cm = self.concept_mapping
        file_keys = [
            f.split('.')[0] for f in listdir(ann_dir)
            if isfile(join(ann_dir, f))
        ]
        # collect dimension labels
        for fk in file_keys:
            cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
            t = self.label.replace('neg_', '')
            anns = cr.get_anns_by_label(t)
            neg_anns = cr.get_anns_by_label('neg_' + t)
            for a in anns + neg_anns:
                self.add_label_dimension_by_annotation(a)
                # self.add_context_dimension_by_annotation(a)
                # skip annotations whose negation status does not match the
                # polarity of the target label
                if (a.negation != 'Negated' and self.label.startswith('neg_')) or \
                        (a.negation == 'Negated' and not self.label.startswith('neg_')):
                    continue
                sanns = cr.get_same_sentence_anns(a)
                context_anns = [] + sanns['umls'] + sanns['phenotype']
                # collect cui labels
                for u in sanns['umls']:
                    self._cui2label[u.cui] = u.pref
                for c in context_anns:
                    self.add_context_dimension_by_annotation(c)
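
The polarity filter in the loop above only keeps annotations whose negation status agrees with the target label. A minimal self-contained sketch of that condition, using the same 'Negated' / 'neg_' conventions as the code; `keep_for_label` is a hypothetical helper written for this sketch, not a method of the class:

def keep_for_label(negation, label):
    """Return True when an annotation's negation status matches the label polarity."""
    # mirrors the `continue` condition in collect_dimensions above
    if (negation != 'Negated' and label.startswith('neg_')) or \
            (negation == 'Negated' and not label.startswith('neg_')):
        return False
    return True


assert keep_for_label('Affirmed', 'hypertension')            # affirmed mention, positive label
assert not keep_for_label('Negated', 'hypertension')         # negated mention, positive label
assert keep_for_label('Negated', 'neg_hypertension')         # negated mention, negated label
assert not keep_for_label('Affirmed', 'neg_hypertension')    # affirmed mention, negated label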
Example #2
    def assess_label_quality(self,
                             ann_dir,
                             gold_dir,
                             separate_by_label=True,
                             ignore_context=True):
        if ignore_context:
            logging.info('doing learning without considering contextual info')
        # print self.get_top_tfidf_dimensions(self.max_dimensions)
        cm = self.concept_mapping
        file_keys = [
            f.split('.')[0] for f in listdir(ann_dir)
            if isfile(join(ann_dir, f))
        ]
        label_type = self.label.replace('neg_', '')
        query_label_perform = {}
        for fk in file_keys:
            cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
            if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
                continue
            gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))

            # gold-standard entities with the target label that have not yet
            # been matched against a predicted annotation
            not_matched_gds = []
            for e in gd.get_ess_entities():
                if (ignore_context and e.label.replace('neg_', '') == label_type) \
                        or (not ignore_context and e.label == self.label):
                    not_matched_gds.append(e.id)
            anns = cr.get_anns_by_label(self.label, no_context=ignore_context)
            for a in anns:
                multiple_true_positives = 0
                matched = False
                # a predicted annotation is correct if it overlaps an as-yet-unmatched
                # gold entity with a compatible label
                for g in gd.get_ess_entities():
                    if g.id in not_matched_gds:
                        gt = g.label.replace('neg_', '')
                        if g.overlap(a) and (
                            (g.label == self.label and not ignore_context) or
                            (ignore_context and gt == label_type)):
                            if matched:
                                multiple_true_positives += 1
                            matched = True
                            not_matched_gds.remove(g.id)

                if separate_by_label:
                    lbl = LabelModel.get_ann_query_label(a)
                else:
                    lbl = 'united'
                ql = lbl
                if ql not in query_label_perform:
                    query_label_perform[ql] = {'c': 0, 'w': 0}
                if matched:
                    query_label_perform[ql]['c'] += 1
                else:
                    query_label_perform[ql]['w'] += 1
        # per-label precision: (label, correct / (correct + wrong), correct, wrong)
        lbls = [(l, 1.0 * query_label_perform[l]['c'] /
                 (query_label_perform[l]['c'] + query_label_perform[l]['w']),
                 query_label_perform[l]['c'], query_label_perform[l]['w'])
                for l in query_label_perform]
        return sorted(lbls, key=lambda x: x[1])
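
The returned list pairs each query label with its precision and is sorted so the weakest labels come first. A small self-contained illustration of the same computation on a toy `query_label_perform` dict (label names and counts invented for the example):

query_label_perform = {'hypertension': {'c': 18, 'w': 2},
                       'neg_hypertension': {'c': 3, 'w': 9}}

lbls = [(l, 1.0 * p['c'] / (p['c'] + p['w']), p['c'], p['w'])
        for l, p in query_label_perform.items()]
print(sorted(lbls, key=lambda x: x[1]))
# [('neg_hypertension', 0.25, 3, 9), ('hypertension', 0.9, 18, 2)]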
Example #3
    def load_data(self,
                  ann_dir,
                  gold_dir,
                  verbose=True,
                  ignore_mappings=[],
                  ignore_context=False,
                  separate_by_label=False,
                  ful_text_dir=None,
                  eHostGD=False,
                  annotated_anns={}):
        """

        :param ann_dir:
        :param gold_dir:
        :param verbose:
        :param ignore_mappings:
        :param ignore_context:
        :param separate_by_label:
        :param ful_text_dir:
        :param eHostGD:
        :param annotated_anns: NB: this is for labelling settings where only partial data is annotated on
        the documents. Therefore, we need to filter out those not assessed by the annotators to avoid kill some
        true positives (those are correct but not assessed by annotators)
        :return:
        """
        if ignore_context:
            logging.info('doing learning without considering contextual info')
        # print self.get_top_tfidf_dimensions(self.max_dimensions)
        cm = self.concept_mapping
        file_keys = [
            f[:f.rfind('.')] for f in listdir(ann_dir)
            if isfile(join(ann_dir, f))
        ]
        lbl2data = {}
        false_negatives = 0
        lbl2tps = {}
        label_type = self.label.replace('neg_', '')
        query_label_perform = {}
        for fk in file_keys:
            cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
            fk = fk.replace('se_ann_', '')
            if ful_text_dir is not None:
                cr.full_text_folder = ful_text_dir
            if eHostGD:
                if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)):
                    continue
                # logging.debug('using GD file %s' % join(gold_dir, '%s.txt.knowtator.xml' % fk))
                gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk))
            else:
                if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
                    continue
                logging.debug('using GD file %s' %
                              join(gold_dir, '%s-ann.xml' % fk))
                gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))

            # re-segment sentences
            # cr.re_segment_sentences(fk)
            # cr.relocate_all_anns(fk)
            # gd.relocate_anns(cr.get_full_text(fk))

            not_matched_gds = []
            for e in gd.get_ess_entities():
                if (ignore_context and e.label.replace('neg_', '') == label_type) \
                        or (not ignore_context and e.label == self.label):
                    not_matched_gds.append(e.id)

            anns = cr.get_anns_by_label(self.label,
                                        ignore_mappings=ignore_mappings,
                                        no_context=ignore_context)
            # when only part of the data was annotated, keep only the predicted
            # annotations that were actually assessed by the annotators
            if len(annotated_anns) > 0:
                if '%s.txt' % fk not in annotated_anns:
                    continue
                kept_anns = []
                for a in anns:
                    for aa in annotated_anns['%s.txt' % fk]:
                        if int(aa['s']) == a.start and int(aa['e']) == a.end:
                            kept_anns.append(a)
                anns = kept_anns
            for a in anns:
                logging.debug('%s, %s, %s' % (a.str, a.start, a.end))
                multiple_true_positives = 0
                t2anns = cr.get_prior_anns(a)
                # if len(t2anns['umls']) + len(t2anns['phenotype']) == 0:
                #     t2anns = cr.get_prior_anns(a, contenxt_depth=-2)
                context_anns = [] + t2anns['umls'] + t2anns['phenotype'] + \
                               cr.get_context_words(a, fk)
                # context_anns = cr.get_context_words(a, fk)
                matched = False
                for g in gd.get_ess_entities():
                    if g.id in not_matched_gds:
                        gt = g.label.replace('neg_', '')
                        if g.overlap(a) and (
                            (g.label == self.label and not ignore_context) or
                            (ignore_context and gt == label_type)):
                            if matched:
                                multiple_true_positives += 1
                            matched = True
                            not_matched_gds.remove(g.id)
                if verbose:
                    flag = 'R' if matched else '!'
                    logging.debug(
                        '%s %s %s' %
                        (flag,
                         self.get_ann_dim_label(a) + ' // ' + ' | '.join(
                             self.get_ann_dim_label(c, generalise=True)
                             for c in context_anns), fk))

                lbl = LabelModel.get_label_specific_data(
                    self,
                    lbl2data,
                    a,
                    context_anns,
                    fk,
                    cr,
                    separate_by_label=separate_by_label)

                lbl2data[lbl]['multiple_tps'] += multiple_true_positives
                Y = lbl2data[lbl]['Y']
                Y.append([1 if matched else 0])
                ql = lbl
                if ql not in query_label_perform:
                    query_label_perform[ql] = {'c': 0, 'w': 0}
                if matched:
                    query_label_perform[ql]['c'] += 1
                else:
                    query_label_perform[ql]['w'] += 1
            false_negatives += len(not_matched_gds)

            missed = None
            for g in gd.get_ess_entities():
                if g.id in not_matched_gds:
                    missed = g
                    logging.debug('\t'.join([
                        'M', g.str,
                        str(g.negated),
                        str(g.start),
                        str(g.end),
                        join(gold_dir, '%s-ann.xml' % fk)
                    ]))
            # if len(not_matched_gds) > 0:
            #     print not_matched_gds
            #     for a in anns:
            #         logging.debug(a.str, a.start, a.end, missed.overlap(a))
        bad_labels = []
        for ql in query_label_perform:
            p = query_label_perform[ql]
            if p['c'] == 0 or (1.0 * p['w'] / p['c'] < 0.05):
                bad_labels.append(ql)
        return {
            'lbl2data': lbl2data,
            'fns': false_negatives,
            'bad_labels': bad_labels,
            'files': file_keys
        }
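
The 'bad_labels' computed at the end flag query labels that either never matched a gold entity or whose wrong-to-correct ratio falls below 5%. A small self-contained illustration of that criterion (label names and counts invented for the example):

query_label_perform = {'lbl_a': {'c': 0, 'w': 4},    # never correct -> flagged
                       'lbl_b': {'c': 50, 'w': 1},   # 1 / 50 = 0.02 < 0.05 -> flagged
                       'lbl_c': {'c': 10, 'w': 5}}   # 5 / 10 = 0.5 -> kept

bad_labels = [ql for ql, p in query_label_perform.items()
              if p['c'] == 0 or (1.0 * p['w'] / p['c'] < 0.05)]
print(sorted(bad_labels))   # ['lbl_a', 'lbl_b']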
Example #4
    def collect_tfidf_dimensions(self,
                                 ann_dir,
                                 gold_dir,
                                 ignore_context=False,
                                 separate_by_label=False,
                                 full_text_dir=None,
                                 eHostGD=False):
        cm = self.concept_mapping
        file_keys = [
            f[:f.rfind('.')] for f in listdir(ann_dir)
            if isfile(join(ann_dir, f))
        ]
        # collect dimension labels
        tp_freq = 0
        fp_freq = 0
        label_type = self.label.replace('neg_', '')
        fn_freq = 0
        for fk in file_keys:
            cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
            fk = fk.replace('se_ann_', '')
            if full_text_dir is not None:
                cr.full_text_folder = full_text_dir
            if eHostGD:
                if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)):
                    continue
                gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk))
            else:
                if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
                    continue
                gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
            t = self.label.replace('neg_', '')
            anns = cr.get_anns_by_label(t)
            neg_anns = cr.get_anns_by_label('neg_' + t)

            # re-segment sentences
            # cr.re_segment_sentences(fk)
            # cr.relocate_all_anns(fk)
            # gd.relocate_anns(cr.get_full_text(fk))

            not_matched_gds = []
            for e in gd.get_ess_entities():
                if (ignore_context and e.label.replace('neg_', '') == label_type) \
                        or (not ignore_context and e.label == self.label):
                    not_matched_gds.append(e.id)
            for a in anns + neg_anns:
                # self.add_context_dimension_by_annotation(a)
                self.add_label_dimension_by_annotation(a)
                # if (not ignore_context) and ((a.negation != 'Negated' and self.label.startswith('neg_')) or \
                #         (a.negation == 'Negated' and not self.label.startswith('neg_'))):
                #     logging.info('skipped because context')
                #     continue

                matched = False
                for g in gd.get_ess_entities():
                    if g.id in not_matched_gds:
                        gt = g.label.replace('neg_', '')
                        if g.overlap(a) and (
                            (g.label == self.label and not ignore_context) or
                            (ignore_context and gt == label_type)):
                            matched = True
                            tp_freq += 1
                            not_matched_gds.remove(g.id)
                if not matched:
                    fp_freq += 1

                # contextual features: UMLS/phenotype annotations from the
                # preceding context plus surrounding context words
                sanns = cr.get_prior_anns(a, contenxt_depth=-1)
                context_anns = [] + sanns['umls'] + sanns['phenotype'] + \
                    cr.get_context_words(a, fk)
                # context_anns = cr.get_context_words(a, fk)
                # collect cui labels
                for u in sanns['umls']:
                    self._cui2label[u.cui] = u.pref
                for c in context_anns:
                    self.add_context_dimension_by_annotation(
                        c,
                        tp=True if matched else None,
                        fp=True if not matched else None,
                        lbl='united' if not separate_by_label else
                        LabelModel.get_ann_query_label(a))
            fn_freq += len(not_matched_gds)
        self._tps = tp_freq
        self._fps = fp_freq
        logging.debug('tp: %s, fp: %s, fn: %s' % (tp_freq, fp_freq, fn_freq))
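
The matching loop above counts a predicted annotation as a true positive when it overlaps a not-yet-matched gold entity with a compatible label, counts it as a false positive otherwise, and treats any gold entities left unmatched as false negatives. A minimal self-contained sketch of that bookkeeping, using hypothetical `Span` objects and a simple interval-overlap test standing in for the real annotation classes and their `overlap` method:

from collections import namedtuple

Span = namedtuple('Span', 'id start end')   # hypothetical stand-in for gold/predicted annotations


def overlap(g, a):
    # simple interval overlap, standing in for the real `overlap` method
    return g.start < a.end and a.start < g.end


gold = [Span('g1', 0, 10), Span('g2', 20, 30)]
predicted = [Span('p1', 2, 8), Span('p2', 40, 45)]

not_matched_gds = [g.id for g in gold]
tp_freq = fp_freq = 0
for a in predicted:
    matched = False
    for g in gold:
        if g.id in not_matched_gds and overlap(g, a):
            matched = True
            tp_freq += 1
            not_matched_gds.remove(g.id)
    if not matched:
        fp_freq += 1
fn_freq = len(not_matched_gds)
print(tp_freq, fp_freq, fn_freq)   # 1 1 1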