Example #1
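Streams documents through the full pipeline: each (pmid, doc) pair is wrapped in a one-document Dataset, featurized, tagged with a pre-trained PyCRFSuite model and post-processed; the document is yielded only if it contains low-confidence natural-language mentions (subclass != 0).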
    def filter(self, documents):
        pycrf = PyCRFSuite(self.binary_model)
        for pmid, doc in documents:
            # Wrap the single document in a Dataset so the pipeline components can run on it
            dataset = Dataset()
            dataset.documents[pmid] = doc
            self.pipeline.execute(dataset)
            self.labeler.label(dataset)
            pycrf.tag(dataset, MUT_CLASS_ID)
            PostProcessing().process(dataset)
            ExclusiveNLDefiner().define(dataset)
            total_nl_mentions = []
            for part in doc:
                print_verbose('predicted_annotations:',
                              part.predicted_annotations)
                # Keep only non-standard mentions (subclass != 0) below the confidence threshold
                nl_mentions = [
                    (ann.text, ann.subclass, ann.confidence)
                    for ann in part.predicted_annotations
                    if ann.subclass != 0 and ann.confidence <= self.threshold
                ]
                total_nl_mentions += nl_mentions
            if total_nl_mentions:
                print('nl mentions', json.dumps(total_nl_mentions, indent=4))
                yield pmid, doc
            else:
                print_verbose('nothing found')
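Since filter() is a generator, callers consume it lazily. A hypothetical sketch (doc_filter and corpus are placeholder names, not part of the example):

for pmid, doc in doc_filter.filter(corpus):
    print('kept', pmid)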
Example #2
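A complete Tagger subclass: the constructor wires up the feature pipeline, the binary CRF model and optional post-processing, and tag() runs them in sequence on a dataset.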
class NalaSingleModelTagger(Tagger):
    def __init__(self,
                 class_id=MUT_CLASS_ID,
                 bin_model=pkg_resources.resource_filename(
                     'nala.data', 'all3_model'),
                 features_pipeline=None,
                 execute_pipeline=True,
                 execute_pp=True,
                 keep_silent=True,
                 keep_genetic_markers=True,
                 keep_unnumbered=True,
                 keep_rs_ids=True):

        super().__init__([class_id])

        self.class_id = class_id
        self.bin_model = bin_model
        self.features_pipeline = features_pipeline if features_pipeline \
            else get_prepare_pipeline_for_best_model()
        self.execute_pipeline = execute_pipeline
        # CRF sequence tagger backed by the configured binary model
        self.crf = PyCRFSuite(model_file=self.bin_model)

        self.post = None

        if execute_pp:
            self.post = PostProcessing(
                keep_silent=keep_silent,
                keep_genetic_markers=keep_genetic_markers,
                keep_unnumbered=keep_unnumbered,
                keep_rs_ids=keep_rs_ids)

    def tag(self, dataset, class_id=None):
        class_id = self.class_id if class_id is None else class_id

        if self.execute_pipeline:
            self.features_pipeline.execute(dataset)

        self.crf.annotate(dataset, class_id)

        if self.post:
            self.post.process(dataset, class_id=class_id)
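A minimal usage sketch for the class above, assuming a Dataset that has already been populated with documents (the population step is omitted; only NalaSingleModelTagger and Dataset come from the examples on this page):

from nalaf.structures.data import Dataset

dataset = Dataset()  # assumed to be filled with documents to annotate
tagger = NalaSingleModelTagger()  # defaults to the packaged 'all3_model'
tagger.tag(dataset)  # feature pipeline + CRF annotation + post-processing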
Example #3
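A training helper: it runs the subclass definer, labels and prunes the training set, then delegates to PyCRFSuite.train to write the binary model file.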
    def train(train_set):
        definer.define(train_set)
        train_set.delete_subclass_annotations(args.delete_subclasses)
        features_pipeline.execute(train_set)
        labeler.label(train_set)

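        # Prune the training data: either drop empty parts, or filter sentences
        # with a high-recall regex classifier (keeping a random percentage)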
        if args.pruner == "parts":
            train_set.prune_empty_parts()
        else:
            try:
                f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
            except AssertionError:
                f = (lambda _: False)
            train_set.prune_filtered_sentences(filterin=f,
                                               percent_to_keep=args.ps_random)

        stats(train_set, "training")

        model_path = os.path.join(args.output_folder, args.model_name + ".bin")
        PyCRFSuite.train(train_set, model_path, args.crf_train_params)

        return model_path
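Note that train() is a closure: definer, labeler, features_pipeline, stats and args are captured from the enclosing scope rather than passed as parameters.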
Example #4
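Tags a stratified test split with a trained model, then estimates the standard error on random document samples of growing size (30 to 100 documents).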
def find_number_of_documents():
    data = read_data(39, read_base=False)
    train, test = data.stratified_split()
    del data
    del train

    pipeline = get_prepare_pipeline_for_best_model()
    pipeline.execute(test)
    BIEOLabeler().label(test)
    PyCRFSuite().tag(test, 'idp4_model')
    PostProcessing().process(test)
    ExclusiveNLDefiner().define(test)

    keys = list(test.documents.keys())  # random.sample needs a sequence, not a dict view
    for test_size in range(30, 101, 10):
        sample = Dataset()
        random_keys = random.sample(keys, test_size)
        sample.documents = {key: test.documents[key] for key in random_keys}

        print('============== {} =============='.format(test_size))
        calculate_standard_error(sample)
Example #5
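A bootstrapping document filter driven by regular expressions, optionally cross-checked against CRF ('nala') predictions: sentences are normalized and matched against self.patterns, match offsets are mapped back to document coordinates, and a document is yielded once it accumulates at least min_found positive sentences.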
    def filter(self, documents, min_found=1, use_nala=False):
        """
        :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
        """

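        # Profiling / bookkeeping state for the pattern-matching loop below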
        _progress = 1
        _start_time = time.time()
        _total_time = 0

        _time_avg_per_pattern = 0
        _pattern_calls = 0
        _time_reg_pattern_total = 0
        _time_max_pattern = 0
        _low_performant_pattern = ""

        # NLDefiners init
        exclusive_definer = ExclusiveNLDefiner()
        _e_array = [0, 0, 0]
        inclusive_definer = InclusiveNLDefiner()
        _i_array = [0, 0]

        last_found = 0
        crf = PyCRFSuite(self.location_binary_model)

        for pmid, doc in documents:
            # if any part of the document matches any of the patterns,
            # yield that document

            part_offset = 0
            data_tmp = Dataset()
            data_tmp.documents[pmid] = doc
            data_nala = deepcopy(data_tmp)
            NLTKSplitter().split(data_tmp)
            if use_nala:
                self.pipeline.execute(data_nala)
                self.labeler.label(data_nala)
                crf.tag(data_nala, MUT_CLASS_ID)
                PostProcessing().process(data_nala)
                ExclusiveNLDefiner().define(data_nala)

            used_regexs = {}

            positive_sentences = 0
            for part_id, cur_part in doc.parts.items():
                sent_offset = 0
                sentences = cur_part.sentences_

                for sent in sentences:
                    sent_length = len(sent)
                    # Normalize: lowercase and blank out punctuation before matching
                    new_text = sent.lower()
                    new_text = re.sub(r'[./\-(){}\[\],%]', ' ', new_text)

                    found_in_sentence = False

                    for i, reg in enumerate(self.patterns):
                        _lasttime = time.time()
                        match = reg.search(new_text)

                        # bookkeeping to spot bottleneck (slow) patterns
                        _time_current_reg = time.time() - _lasttime
                        _pattern_calls += 1
                        _time_reg_pattern_total += _time_current_reg
                        if _time_reg_pattern_total > 0:
                            # average time spent per pattern call
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls
                        if match:
                            nala_doc = data_nala.documents.get(pmid)

                            # map the match back to document-level offsets
                            start = part_offset + sent_offset + match.span()[0]
                            end = part_offset + sent_offset + match.span()[1]

                            used_regexs[reg.pattern] = used_regexs.get(reg.pattern, 0) + 1
                            # print the sentence with the matched span highlighted
                            print(color.PURPLE + new_text.replace(
                                match.group(), color.BOLD + color.DARKCYAN +
                                color.UNDERLINE + match.group() + color.END +
                                color.PURPLE) + color.END)
                            if not found_in_sentence:
                                positive_sentences += 1
                                found_in_sentence = True

                            if use_nala:
                                nala_found_mention = nala_doc.overlaps_with_mention(
                                    start, end, annotated=False)
                                if nala_found_mention:
                                    print_verbose(nala_found_mention)
                                    if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                        yield pmid, doc

                        if time.time() - _lasttime > 1:
                            print_verbose('time intensive regex', i)

                    sent_offset += 2 + sent_length

                part_offset += sent_offset
            if use_nala:
                # nala_doc may be unset when no pattern matched, so fetch it explicitly
                nala_doc = data_nala.documents[pmid]
                for part in nala_doc:
                    for ann in part.predicted_annotations:
                        if ann.subclass > 0:
                            print_verbose(part.text[:ann.offset] + color.BOLD +
                                          ann.text + color.END +
                                          part.text[ann.offset + len(ann.text):])
                            positive_sentences += min_found
            _old_time = _start_time
            _start_time = time.time()
            _one_time = _start_time - _old_time

            if _one_time > 0.3 and positive_sentences > min_found:
                _progress += 1
                _total_time += _one_time

            _time_per_doc = _total_time / _progress
            print_verbose(
                "PROGRESS: {:.2f} secs total, ETA per positive document:"
                " {:.2f} secs".format(_total_time, _time_per_doc))
            print_debug('used regular expressions:',
                        json.dumps(used_regexs, indent=4))
            if positive_sentences >= min_found:
                last_found = 0
                print_verbose('YEP', pmid)
                yield pmid, doc
            else:
                print_verbose('NOPE', pmid)
Example #6
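Command-line fragment: reads input from a string, PMIDs or files, annotates entities with the example CRF model shipped in the nalaf source distribution, tags genes via GNormPlus, adds stub same-sentence relations, and exports as ann.json, PubTator, or to the console.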
    warning = 'Due to a dependence on GNormPlus, running nalaf with -s and -d switches might take a long time.'
    if args.string:
        print(warning)
        dataset = StringReader(args.string).read()
    elif args.pmids:
        dataset = PMIDReader(args.pmids).read()
    elif os.path.exists(args.dir_or_file):
        print(warning)
        dataset = TextFilesReader(args.dir_or_file).read()
    else:
        raise FileNotFoundError('directory or file "{}" does not exist'.format(args.dir_or_file))

    PrepareDatasetPipeline().execute(dataset)

    # get the predictions -- "example_entity_model" is only available in the nalaf src distribution
    crf = PyCRFSuite(model_file=pkg_resources.resource_filename('nalaf.data', 'example_entity_model'))
    crf.annotate(dataset, class_id=ENT2_CLASS_ID)

    GNormPlusGeneTagger(ENT1_CLASS_ID, ENTREZ_GENE_ID, UNIPROT_ID).tag(dataset, uniprot=True)
    StubSameSentenceRelationExtractor(ENT1_CLASS_ID, ENT2_CLASS_ID, REL_ENT1_ENT2_CLASS_ID).annotate(dataset)

    if args.output_dir:
        if not os.path.isdir(args.output_dir):
            raise NotADirectoryError('{} is not a directory'.format(args.output_dir))

        if args.file_format == 'ann.json':
            TagTogFormat(dataset, use_predicted=True, to_save_to=args.output_dir).export(threshold_val=0)
        elif args.file_format == 'pubtator':
            PubTatorFormat(dataset, location=os.path.join(args.output_dir, 'pubtator.txt')).export()
    else:
        ConsoleWriter(ENT1_CLASS_ID, ENT2_CLASS_ID, args.color).write(dataset)