def filter(self, documents):
    pycrf = PyCRFSuite(self.binary_model)

    for pmid, doc in documents:
        dataset = Dataset()
        dataset.documents[pmid] = doc

        self.pipeline.execute(dataset)
        self.labeler.label(dataset)
        pycrf.tag(dataset, MUT_CLASS_ID)
        PostProcessing().process(dataset)
        ExclusiveNLDefiner().define(dataset)

        total_nl_mentions = []
        for part in doc:
            # print(part.annotations)
            print_verbose('predicted_annotations:', part.predicted_annotations)

            nl_mentions = [(ann.text, ann.subclass, ann.confidence)
                           for ann in part.predicted_annotations
                           if ann.subclass != 0 and ann.confidence <= self.threshold]
            total_nl_mentions += nl_mentions

        if any(total_nl_mentions):
            print('nl mentions', json.dumps(total_nl_mentions, indent=4))
            yield pmid, doc
        else:
            print_verbose('nothing found')
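# Usage sketch (illustrative, not part of the original module): `filter` is a
# generator over (pmid, doc) pairs, so it can sit lazily between a document
# source and a selection step. `nl_filter`, `candidate_documents`, and
# `selected` are assumed names, not defined in this file:
#
#     candidate_documents = ...  # any iterable of (pmid, Document) pairs
#     selected = Dataset()
#     for pmid, doc in nl_filter.filter(candidate_documents):
#         # only documents with at least one low-confidence NL mention get here
#         selected.documents[pmid] = doc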
class NalaSingleModelTagger(Tagger):

    def __init__(self, class_id=MUT_CLASS_ID,
                 bin_model=pkg_resources.resource_filename('nala.data', 'all3_model'),
                 features_pipeline=None,
                 execute_pipeline=True,
                 execute_pp=True,
                 keep_silent=True,
                 keep_genetic_markers=True,
                 keep_unnumbered=True,
                 keep_rs_ids=True):
        super().__init__([class_id])
        self.class_id = class_id
        self.bin_model = bin_model
        self.features_pipeline = features_pipeline if features_pipeline else get_prepare_pipeline_for_best_model()
        self.execute_pipeline = execute_pipeline
        # ---
        self.crf = PyCRFSuite(model_file=self.bin_model)
        self.post = None
        if execute_pp:
            self.post = PostProcessing(keep_silent=keep_silent,
                                       keep_genetic_markers=keep_genetic_markers,
                                       keep_unnumbered=keep_unnumbered,
                                       keep_rs_ids=keep_rs_ids)

    def tag(self, dataset, class_id=None):
        class_id = self.class_id if class_id is None else class_id

        if self.execute_pipeline:
            self.features_pipeline.execute(dataset)

        self.crf.annotate(dataset, class_id)

        if self.post:
            self.post.process(dataset, class_id=class_id)
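# Usage sketch (illustrative): tagging free text with the bundled model. The
# StringReader import path mirrors its use in nalaf's example script below and
# is otherwise an assumption, as is the iteration over predicted annotations:
#
#     from nalaf.utils.readers import StringReader
#
#     dataset = StringReader('We found a deletion of exon 4 in the BRCA1 gene.').read()
#     tagger = NalaSingleModelTagger()  # defaults to the packaged 'all3_model'
#     tagger.tag(dataset)
#     for ann in dataset.predicted_annotations():
#         print(ann.text, ann.subclass, ann.confidence)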
def train(train_set):
    definer.define(train_set)
    train_set.delete_subclass_annotations(args.delete_subclasses)
    features_pipeline.execute(train_set)
    labeler.label(train_set)

    if args.pruner == "parts":
        train_set.prune_empty_parts()
    else:
        try:
            f = HighRecallRegexClassifier(ST=args.ps_ST, NL=args.ps_NL)
        except AssertionError:
            f = (lambda _: False)
        train_set.prune_filtered_sentences(filterin=f, percent_to_keep=args.ps_random)

    stats(train_set, "training")

    model_path = os.path.join(args.output_folder, args.model_name + ".bin")
    PyCRFSuite.train(train_set, model_path, args.crf_train_params)

    return model_path
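# Context sketch (assumed wiring, for readability): `train` relies on
# module-level objects created during script setup. One plausible arrangement,
# using names that appear elsewhere in this codebase; `parse_arguments` and
# `training_dataset` are hypothetical:
#
#     definer = ExclusiveNLDefiner()
#     labeler = BIEOLabeler()
#     features_pipeline = get_prepare_pipeline_for_best_model()
#     args = parse_arguments()
#
#     model_path = train(training_dataset)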
def find_number_of_documents():
    data = read_data(39, read_base=False)
    train, test = data.stratified_split()
    del data
    del train

    pipeline = get_prepare_pipeline_for_best_model()
    pipeline.execute(test)
    BIEOLabeler().label(test)
    PyCRFSuite().tag(test, 'idp4_model')
    PostProcessing().process(test)
    ExclusiveNLDefiner().define(test)

    keys = list(test.documents.keys())  # materialize: random.sample needs a sequence
    for test_size in range(30, 101, 10):
        sample = Dataset()
        random_keys = random.sample(keys, test_size)
        sample.documents = {key: test.documents[key] for key in random_keys}
        print('============== {} =============='.format(test_size))
        calculate_standard_error(sample)
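# `calculate_standard_error` is defined elsewhere. Purely to illustrate the
# statistic being estimated over these growing samples -- this is NOT the
# project's implementation, and `score_for_document` is hypothetical:
#
#     import math
#
#     def calculate_standard_error_sketch(sample):
#         scores = [score_for_document(doc) for doc in sample.documents.values()]
#         mean = sum(scores) / len(scores)
#         variance = sum((s - mean) ** 2 for s in scores) / (len(scores) - 1)
#         print('standard error:', math.sqrt(variance / len(scores)))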
def filter(self, documents, min_found=1, use_nala=False):
    """
    :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
    """
    _progress = 1
    _start_time = time.time()
    _total_time = 0

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    last_found = 0
    crf = PyCRFSuite(self.location_binary_model)

    # counter_to_stop_for_caching = 0

    for pmid, doc in documents:
        # if any part of the document contains any of the keywords, yield that document

        # if counter_to_stop_for_caching > 400:
        #     break
        # counter_to_stop_for_caching += 1
        # print(counter_to_stop_for_caching)

        part_offset = 0
        data_tmp = Dataset()
        data_tmp.documents[pmid] = doc
        data_nala = deepcopy(data_tmp)
        NLTKSplitter().split(data_tmp)
        # data_tmvar = TmVarTagger().generate_abstracts([pmid])

        if use_nala:
            self.pipeline.execute(data_nala)
            self.labeler.label(data_nala)
            crf.tag(data_nala, MUT_CLASS_ID)
            PostProcessing().process(data_nala)
            ExclusiveNLDefiner().define(data_nala)

        used_regexs = {}
        positive_sentences = 0

        for i, x in enumerate(doc.parts):
            # print("Part", i)
            sent_offset = 0
            cur_part = doc.parts.get(x)
            sentences = cur_part.sentences_

            for sent in sentences:
                sent_length = len(sent)
                new_text = sent.lower()
                new_text = re.sub(r'[./\-(){}\[\],%]', ' ', new_text)
                # new_text = re.sub(r'\W+', ' ', new_text)

                found_in_sentence = False

                for i, reg in enumerate(self.patterns):
                    _lasttime = time.time()  # time start var
                    match = reg.search(new_text)

                    # debug bottleneck patterns
                    _time_current_reg = time.time() - _lasttime  # time end var
                    _pattern_calls += 1  # pattern calls already occurred
                    _time_reg_pattern_total += _time_current_reg  # total time spent searching with patterns
                    if _time_reg_pattern_total > 0:
                        _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg time per pattern call

                    # todo create pattern performance eval for descending amount of recognized patterns
                    # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                    #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                    # if _time_max_pattern < _time_current_reg:
                    #     _time_max_pattern = _time_current_reg
                    #     _low_performant_pattern = reg.pattern
                    #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                    # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                    #     if _time_current_reg > _time_avg_per_pattern * 10:
                    #         # print(_time_avg_per_pattern, _time_current_reg)
                    #         f.write("BAD_PATTERN\n")
                    #         f.write(sent + "\n")
                    #         f.write(new_text + "\n")

                    if match:
                        # if pmid in data_tmvar.documents:
                        #     anti_doc = data_tmvar.documents.get(pmid)
                        nala_doc = data_nala.documents.get(pmid)

                        start = part_offset + sent_offset + match.span()[0]
                        end = part_offset + sent_offset + match.span()[1]
                        # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                        # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                        if reg.pattern in used_regexs:
                            used_regexs[reg.pattern] += 1
                        else:
                            used_regexs[reg.pattern] = 1

                        print(color.PURPLE + new_text.replace(
                            match.group(),
                            color.BOLD + color.DARKCYAN + color.UNDERLINE +
                            match.group() + color.END + color.PURPLE) + color.END)

                        if not found_in_sentence:
                            positive_sentences += 1
                            found_in_sentence = True

                        # if not anti_doc.overlaps_with_mention(start, end) \
                        #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                        #     _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _e_array[_e_result] += 1
                        #     _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #     _i_array[_i_result] += 1
                        #     # todo write to file param + save to manually annotate and find tp + fp for performance eval on each pattern
                        #     # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #     last_found += 1
                        #     found_in_sentence = True
                        # else:
                        #     # if nala is not used, only tmvar is considered
                        #     if not anti_doc.overlaps_with_mention(start, end):
                        #         _e_result = exclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _e_array[_e_result] += 1
                        #         _i_result = inclusive_definer.define_string(new_text[match.span()[0]:match.span()[1]])
                        #         _i_array[_i_result] += 1
                        #         # todo write to file param + save to manually annotate and find tp + fp for performance eval on each pattern
                        #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                        #         last_found += 1
                        #         found_in_sentence = True

                        if use_nala:
                            nala_found_mention = nala_doc.overlaps_with_mention(start, end, annotated=False)
                            if nala_found_mention:
                                print_verbose(nala_found_mention)
                                if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                    yield pmid, doc

                    if _lasttime - time.time() > 1:
                        print_verbose('time intensive regex', i)

                sent_offset += 2 + sent_length

                # for per-sentence positives
                if found_in_sentence:
                    positive_sentences += 1

            part_offset += sent_offset

        if use_nala:
            for part in nala_doc:
                for ann in part.predicted_annotations:
                    if ann.subclass > 0:
                        print_verbose(part.text[:ann.offset] + color.BOLD + ann.text + color.END +
                                      part.text[ann.offset + len(ann.text):])
                        positive_sentences += min_found

        _old_time = _start_time
        _start_time = time.time()
        _one_time = _start_time - _old_time

        if _one_time > 0.3 and positive_sentences > min_found:
            _progress += 1
            _total_time += _one_time
            _time_per_doc = _total_time / _progress
            print_verbose("PROGRESS: {:.2f} secs ETA per one positive document:"
                          " {:.2f} secs".format(_total_time, _time_per_doc))

        print_debug('used regular expressions:', json.dumps(used_regexs, indent=4))

        if positive_sentences >= min_found:
            last_found = 0
            print_verbose('YEP', pmid)
            yield pmid, doc
        else:
            print_verbose('NOPE', pmid)
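# Usage sketch (illustrative): like the NL-mention filter above, this regex
# filter is a generator over (pmid, doc) pairs and composes the same way.
# `regex_filter`, `candidate_documents`, and `to_annotate` are assumed names:
#
#     to_annotate = Dataset()
#     for pmid, doc in regex_filter.filter(candidate_documents, min_found=1, use_nala=True):
#         to_annotate.documents[pmid] = doc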
warning = 'Due to a dependence on GNormPlus, running nalaf with -s and -d switches might take a long time.'

if args.string:
    print(warning)
    dataset = StringReader(args.string).read()
elif args.pmids:
    dataset = PMIDReader(args.pmids).read()
elif os.path.exists(args.dir_or_file):
    print(warning)
    dataset = TextFilesReader(args.dir_or_file).read()
else:
    raise FileNotFoundError('directory or file "{}" does not exist'.format(args.dir_or_file))

PrepareDatasetPipeline().execute(dataset)

# get the predictions -- "example_entity_model" is only available in the nalaf src distribution
crf = PyCRFSuite(model_file=pkg_resources.resource_filename('nalaf.data', 'example_entity_model'))
crf.annotate(dataset, class_id=ENT2_CLASS_ID)

GNormPlusGeneTagger(ENT1_CLASS_ID, ENTREZ_GENE_ID, UNIPROT_ID).tag(dataset, uniprot=True)
StubSameSentenceRelationExtractor(ENT1_CLASS_ID, ENT2_CLASS_ID, REL_ENT1_ENT2_CLASS_ID).annotate(dataset)

if args.output_dir:
    if not os.path.isdir(args.output_dir):
        raise NotADirectoryError('{} is not a directory'.format(args.output_dir))

    if args.file_format == 'ann.json':
        TagTogFormat(dataset, use_predicted=True, to_save_to=args.output_dir).export(threshold_val=0)
    elif args.file_format == 'pubtator':
        PubTatorFormat(dataset, location=os.path.join(args.output_dir, 'pubtator.txt')).export()
else:
    ConsoleWriter(ENT1_CLASS_ID, ENT2_CLASS_ID, args.color).write(dataset)
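# Context sketch (assumed): the `args` used above come from an argparse setup
# along these lines. Flag names are inferred from the attributes this script
# reads and from the -s/-d switches mentioned in the warning; they may differ
# from the real CLI:
#
#     import argparse
#
#     parser = argparse.ArgumentParser(description='Run the nalaf example tagger')
#     group = parser.add_mutually_exclusive_group()
#     group.add_argument('-s', '--string', help='tag a raw string')
#     group.add_argument('-p', '--pmids', nargs='+', help='tag PubMed abstracts by PMID')
#     group.add_argument('-d', '--dir_or_file', default='', help='tag a text file or a directory of text files')
#     parser.add_argument('--output_dir', help='write results here instead of the console')
#     parser.add_argument('--file_format', choices=['ann.json', 'pubtator'], default='ann.json')
#     parser.add_argument('--color', action='store_true', help='colorize console output')
#     args = parser.parse_args()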