def __read_dictionaries(dic_paths, read_function, string_tokenizer,
                        case_sensitive, stop_words):
    """
    Build one DictionaryFeatureGenerator per dictionary path.

    Each path is opened with `read_function`; the returned reader is always
    closed. A dictionary that cannot be read is reported (traceback plus a
    debug message) and skipped, so one bad file does not abort the others.

    :param dic_paths: iterable of dictionary paths
    :param read_function: callable taking a path and returning a reader
        object that has a `close()` method
    :param string_tokenizer: passed through to `construct_words_set`
    :param case_sensitive: passed through to `construct_words_set` and to
        each generator
    :param stop_words: normalized once, then passed to `construct_words_set`
    :returns: list of the generators that were built successfully
    """
    normalized_stop_words = DictionaryFeatureGenerator.__normalize_stop_words(
        stop_words)

    generators = []
    for path in dic_paths:
        try:
            handle = read_function(path)
            try:
                dictionary_name = DictionaryFeatureGenerator.__get_filename(path)
                words = DictionaryFeatureGenerator.construct_words_set(
                    handle, string_tokenizer, case_sensitive,
                    normalized_stop_words)
                generators.append(DictionaryFeatureGenerator(
                    dictionary_name, words, case_sensitive))
            finally:
                handle.close()
        except Exception as e:
            # Best effort: report the failure and go on with the next path
            traceback.print_exc()
            print_debug("Could not read dictionary: {}".format(path), e)

    print_verbose("Using dictionaries: {}".format(
        ", ".join(map(repr, generators))))
    return generators
def read_predictions(self, dataset, predictionsfile, classification_threshold=None):
    """
    Read one numeric prediction per line and assign +1/-1 targets to edges.

    A prediction strictly greater than the threshold becomes +1, anything
    else -1. The n-th value is written to `pred_target` of the n-th edge
    yielded by `dataset.edges()`.

    :param dataset: object providing `edges()` and `form_predicted_relations()`
    :param predictionsfile: open, seekable file object; it is rewound, read
        completely, and closed by this method
    :param classification_threshold: cut-off; defaults to
        `self.classification_threshold` when None
    :returns: the result of `dataset.form_predicted_relations()`
    :raises Exception: if the file holds no predictions but the dataset has
        at least one edge
    """
    if classification_threshold is None:
        classification_threshold = self.classification_threshold

    values = []
    with predictionsfile:
        predictionsfile.seek(0)
        for line in predictionsfile:
            prediction = float(line.strip())
            print_verbose(" pred: " + str(prediction))
            values.append(+1 if prediction > classification_threshold else -1)

    # Any non-empty prediction list is usable. The previous `len(values) > 1`
    # test wrongly reported a file with exactly one prediction as empty.
    if values:
        for index, edge in enumerate(dataset.edges()):
            edge.pred_target = values[index]
    elif next(dataset.edges(), None):
        raise Exception(
            "EMPTY PREDICTIONS FILE -- This may be due to too small dataset or too few of features. Predictions file: "
            + predictionsfile.name)

    return dataset.form_predicted_relations()
def add_to_feature_set(self, feature_set, edge, feature_name, value=1):
    """
    Set `value` for the named feature on `edge`, registering the feature in
    `feature_set` first when that is allowed.

    Returns True if the value was stored on the edge, False otherwise.
    A `feature_name` of None is never added (see: self.mk_feature_name).
    An unknown feature is registered with the next free index only while
    `feature_set` is not locked; with a locked set it is ignored.
    """
    if feature_name is None:
        return False

    final_name = self.__set_final_name(feature_name)
    may_grow = not feature_set.is_locked
    index = feature_set.get(final_name, None)

    if index is None:
        if not may_grow:
            # Locked feature set: unknown features are dropped
            return False
        index = len(feature_set)
        feature_set[final_name] = index
        print_verbose(
            "Feature map: {} == {} -- _1st_ value: {}".format(
                str(index), final_name, str(value)))

    edge.features[index] = value
    return True
def filter(self, documents):
    """
    Yield the documents in which the CRF tagger predicts qualifying mentions.

    Every document is wrapped in its own Dataset, run through
    `self.pipeline` and `self.labeler`, tagged with the CRF model loaded
    from `self.binary_model`, post-processed, and given subclasses by
    ExclusiveNLDefiner. A predicted annotation qualifies when its subclass
    is not 0 and its confidence is at most `self.threshold`.

    :param documents: iterable of (pmid, document) pairs
    :returns: generator of the (pmid, document) pairs that have at least one
        qualifying annotation
    """
    pycrf = PyCRFSuite(self.binary_model)

    for pmid, doc in documents:
        dataset = Dataset()
        dataset.documents[pmid] = doc
        self.pipeline.execute(dataset)
        self.labeler.label(dataset)
        pycrf.tag(dataset, MUT_CLASS_ID)
        PostProcessing().process(dataset)
        ExclusiveNLDefiner().define(dataset)

        total_nl_mentions = []
        for part in doc:
            print_verbose('predicted_annotations:', part.predicted_annotations)
            total_nl_mentions.extend(
                (ann.text, ann.subclass, ann.confidence)
                for ann in part.predicted_annotations
                if ann.subclass != 0 and ann.confidence <= self.threshold)

        if total_nl_mentions:
            print('nl mentions', json.dumps(total_nl_mentions, indent=4))
            yield pmid, doc
        else:
            # Log the miss only for documents that are actually rejected;
            # it used to be emitted even when mentions had been found.
            print_verbose('nothing found')
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Write the in-memory cache to `self.cache_filename` as JSON.

    Nothing is written when the cache is empty. Returns None, so an
    exception raised inside the `with` block is not suppressed.
    """
    if not self.cache:
        return

    print_verbose('writing the cache {}'.format(self.cache_filename))
    # exist_ok avoids failing when the directory is created by someone else
    # between an existence check and the creation
    os.makedirs(self.cache_directory, exist_ok=True)
    with open(self.cache_filename, 'w') as file:
        json.dump(self.cache, file)
def __init__(self, model_file, n_bins=300):
    """
    Load a Word2Vec model and compute histogram bin edges over all values
    of all of its word vectors.

    :param model_file: path handed to `Word2Vec.load`
    :param n_bins: number of histogram bins; the resulting edges are stored
        in `self.bin_edges`
    """
    import numpy as np

    self.model = Word2Vec.load(model_file)
    # np.vstack needs a real sequence; feeding it a generator is deprecated
    # and rejected by newer NumPy versions
    data = np.vstack([self.model[word] for word in self.model.vocab])
    # Only the bin edges are kept; the per-bin counts are not needed
    _, self.bin_edges = np.histogram(data.flatten(), bins=n_bins)
    print_verbose('word embddings loaded with vocab size:', len(self.model.vocab))
def execute(self, dataset):
    """
    Run the pipeline on `dataset`: split, tokenize, then apply every
    feature generator in the order given by `self.feature_generators`.

    :type dataset: nalaf.structures.data.Dataset()
    """
    self.splitter.split(dataset)
    self.tokenizer.tokenize(dataset)

    for generator in self.feature_generators:
        print_verbose('Apply feature generator:', type(generator))
        generator.generate(dataset)
def __init__(self, model_file):
    """
    Load a Word2Vec model and compute, per embedding dimension, the mean of
    the positive values and the mean of the negative values.

    The means are stored in `self.pos_means` and `self.neg_means`.

    :param model_file: path handed to `Word2Vec.load`
    """
    import numpy as np

    self.model = Word2Vec.load(model_file)
    # np.vstack needs a real sequence; feeding it a generator is deprecated
    # and rejected by newer NumPy versions
    data = np.vstack([self.model[word] for word in self.model.vocab])
    # Boolean masks as weights: only values of the wanted sign contribute
    self.pos_means = np.average(data, axis=0, weights=(data > 0))
    self.neg_means = np.average(data, axis=0, weights=(data < 0))
    print_verbose('word embddings loaded with vocab size:', len(self.model.vocab))
def get_word_embeddings_feature_generator(model_location=None, additive=None, multiplicative=None):
    """
    Return the module-wide WordEmbeddingsFeatureGenerator, creating it on
    the first call.

    Arguments are only used by the call that creates the singleton; later
    calls return the existing instance unchanged. When `model_location` is
    None the packaged default model is used and, if it is not on disk yet,
    downloaded and unpacked into the `nala.data` package directory.

    :param model_location: path of a model file, or None for the default
    :param additive: defaults to 0 when None
    :param multiplicative: defaults to 1 when None
    :raises requests.HTTPError: if the model download answers with an HTTP
        error status
    :returns: nalaf.features.embeddings.WordEmbeddingsFeatureGenerator
    """
    global _SINGLETON_WE_GENERATOR

    if _SINGLETON_WE_GENERATOR is None:
        additive = 0 if additive is None else additive
        multiplicative = 1 if multiplicative is None else multiplicative

        import tarfile
        import pkg_resources
        import requests
        from nalaf.features.embeddings import WordEmbeddingsFeatureGenerator
        from nalaf import print_verbose, print_warning

        if model_location is None:
            # D=100, no discretization, epoch=1, window=10
            last_model = "word_embeddings_2016-03-28"

            we_model = pkg_resources.resource_filename(
                'nala.data', os.path.join(last_model, 'word_embeddings.model'))

            if not os.path.exists(we_model):
                print_warning(
                    'Downloading Word Embeddings Model (this may take a long time). Expected path: '
                    + we_model)

                # TODO requests doesn't support ftp, but better use: ftp://rostlab.org/jmcejuela/...last_model...
                tar_name = '{}.tar.gz'.format(last_model)
                model_url = '{}/{}'.format('https://rostlab.org/~cejuela', tar_name)
                we_model_tar_gz = pkg_resources.resource_filename(
                    'nala.data', tar_name)

                response = requests.get(url=model_url, stream=True)
                try:
                    # Fail here instead of saving an HTTP error page as the
                    # archive and failing later while unpacking it
                    response.raise_for_status()
                    with open(we_model_tar_gz, 'wb') as file:
                        for chunk in response.iter_content(8048):
                            if chunk:
                                print('.', end="", flush=True)
                                file.write(chunk)
                finally:
                    response.close()
                print()

                # Unpack the model
                print_verbose('Extracting')
                # NOTE(review): extractall() trusts the member paths of a
                # downloaded archive; consider validating them (or using an
                # extraction filter where the Python version supports it).
                with tarfile.open(we_model_tar_gz) as archive:
                    archive.extractall(
                        path=pkg_resources.resource_filename('nala.data', ''))

            _SINGLETON_WE_GENERATOR = WordEmbeddingsFeatureGenerator(
                we_model, additive, multiplicative)
        else:
            _SINGLETON_WE_GENERATOR = WordEmbeddingsFeatureGenerator(
                model_location, additive, multiplicative)

    return _SINGLETON_WE_GENERATOR
def export_ann_json(self, threshold_val=None):
    """
    Write one `<docid>.ann.json` file per document into `self.annjson_path`.

    The content of each file comes from `self.get_single_ann_json`.
    Description of the ann.json format:
    "https://github.com/tagtog/tagtog-doc/wiki/ann.json"

    :param threshold_val: forwarded unchanged to `get_single_ann_json`
    :return:
    """
    for docid in self.data.documents.keys():
        target = os.path.join(self.annjson_path, docid + ".ann.json")
        print_verbose(target)
        with open(target, 'w', encoding='utf-8') as out:
            json.dump(self.get_single_ann_json(threshold_val, docid), out)
def __init__(self, dataset, use_predicted, to_save_to="resources/corpora/sample/anndoc", who="ml:nalaf", _annjson_folder="annjson", _html_folder="html", use_original_partids=True):
    """
    Store the export settings and, unless `to_save_to` is falsy, prepare the
    output folder together with its annjson and html subfolders.

    :type dataset: nalaf.structures.data.Dataset
    :param use_predicted: stored as `self.use_predicted`
    :param to_save_to: usually resources/corpora/[name of corpus]/anndoc/;
        pass a falsy value to use the instance without touching the disk
    :param who: stored as `self.who`
    :param _annjson_folder: name of the subfolder for ann.json files
    :param _html_folder: name of the subfolder for html files
    :param use_original_partids: stored as `self.use_original_partids`
    :return:
    """
    # root folder that documents are saved into
    self.location = to_save_to
    # dataset param
    self.data = dataset
    # who parameter
    self.who = who
    self.use_original_partids = use_original_partids
    self.use_predicted = use_predicted

    # Possibility to use instance without writing files to disk
    if to_save_to:
        # check for root folder for files to save to
        if not os.path.isdir(self.location):
            print_verbose("mkdir", os.path.abspath(self.location))
            try:
                os.makedirs(self.location)
            except FileExistsError:
                pass

        # Create the subfolders if they do not exist. exist_ok gives them
        # the same tolerance to a concurrent creation as the root folder.

        # subfolder where ann.json files are saved into
        self.annjson_path = os.path.join(self.location, _annjson_folder)
        os.makedirs(self.annjson_path, exist_ok=True)

        # subfolder where html files are saved into
        self.html_folder = os.path.join(self.location, _html_folder)
        os.makedirs(self.html_folder, exist_ok=True)
def evaluate(self, dataset):
    """
    Count true positives, false positives and false negatives of the
    predicted relations of type `self.rel_type`, per document.

    Gold and predicted relations are obtained with `doc.map_relations` and
    compared with `self.relation_accept_fun(gold, pred)`, which must return
    True (match), False (no match) or None (ignore):

    * a prediction is a false positive when no gold relation yields True or
      None for it (this includes documents without gold relations);
    * a gold relation is a true positive when at least one prediction is
      accepted for it, otherwise a false negative.

    :type dataset: nala.structures.data.Dataset
    :returns Evaluations
    """
    subcounts = ['tp', 'fp', 'fn']
    counts = {docid: dict.fromkeys(subcounts, 0) for docid in dataset.documents.keys()}

    # The search space is derived from the whole dataset, so build it once
    # instead of once per document.
    if self.evaluate_only_on_edges_plausible_relations:
        # a set would be better, but so far Relation is unhashable
        relations_search_space = list(dataset.plausible_relations_from_generated_edges())
    else:
        relations_search_space = None

    print_verbose()

    for docid, doc in dataset.documents.items():
        gold = doc.map_relations(use_predicted=False, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun, relations_search_space=relations_search_space).keys()
        pred = doc.map_relations(use_predicted=True, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun).keys()

        for r_pred in pred:
            accept_decisions = {self.relation_accept_fun(r_gold, r_pred) for r_gold in gold}
            assert set.issubset(accept_decisions, {True, False, None}), "`relation_accept_fun` cannot return: " + str(accept_decisions)

            if True in accept_decisions:
                # Count the true positives while iterating on gold
                pass
            elif None in accept_decisions:
                # Ignore as documented
                pass
            else:
                # either False or the set is empty, meaning that there are no gold annotations
                print_debug("    ", docid, ": FALSE POSITIV", r_pred)
                counts[docid]['fp'] += 1

        for r_gold in gold:
            r_preds = [r_pred for r_pred in pred if self.relation_accept_fun(r_gold, r_pred)]

            if len(r_preds) > 0:
                # we could also do any(...); we have this in place only for debugging purposes
                print_verbose("    ", docid, ": true positive", r_gold)
                counts[docid]['tp'] += 1
            else:
                print_debug("    ", docid, ": FALSE NEGATIV", r_gold)
                counts[docid]['fn'] += 1

    print_verbose()

    evaluations = Evaluations()
    evaluations.add(EvaluationWithStandardError(self.rel_type, counts))
    return evaluations
def __enter__(self):
    """
    Load the JSON cache of this class from `~/.nalaf` into `self.cache`.

    The file is named after the concrete class (`<ClassName>_cache.json`).
    `self.cache` starts empty when no file exists, or when the cache is
    timed and the file is older than `self.max_time_in_seconds` (the stale
    file is deleted in that case).

    :returns: self
    """
    home = os.path.expanduser('~')
    self.cache_directory = os.path.join(home, '.nalaf')
    self.cache_filename = '{}_cache.json'.format(
        os.path.join(self.cache_directory, self.__class__.__name__))

    if not os.path.exists(self.cache_filename):
        print_verbose('no cache found {}'.format(self.cache_filename))
        self.cache = {}
        return self

    # if the file is too old reset the cache
    expired = self.is_timed and (
        time.time() - os.path.getctime(self.cache_filename)) > self.max_time_in_seconds

    if expired:
        print_verbose('resetting the cache {}'.format(self.cache_filename))
        os.remove(self.cache_filename)
        self.cache = {}
    else:
        print_verbose('reading from cache {}'.format(self.cache_filename))
        with open(self.cache_filename) as f:
            self.cache = json.load(f)

    return self
def filter(self, documents, min_found=1, use_nala=False):
    """
    Yield documents whose sentences match the high-recall regex patterns.

    Each document is sentence-split; every sentence is lower-cased, has a
    fixed set of punctuation characters replaced by spaces, and is searched
    with every pattern in `self.patterns`. Matches are printed highlighted
    and counted per pattern.

    With `use_nala`, a deep copy of the document is additionally run through
    `self.pipeline`, `self.labeler`, the CRF model and post-processing.
    The document is then also yielded for each regex match that overlaps a
    predicted mention with subclass > 0 and confidence <= `self.threshold`,
    and every predicted annotation with subclass > 0 adds `min_found` to the
    positive count. A document can therefore be yielded more than once.

    :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
    :param min_found: minimum positive count for the final yield
    :param use_nala: also run the nala tagger as described above
    :returns: generator of (pmid, document) pairs
    """
    _progress = 1
    _start_time = time.time()
    _total_time = 0

    # NOTE(review): neither definer is used below; they are still constructed
    # in case construction has side effects. Confirm and remove.
    exclusive_definer = ExclusiveNLDefiner()
    inclusive_definer = InclusiveNLDefiner()

    crf = PyCRFSuite(self.location_binary_model)

    # Same character class as before, now a raw string (the non-raw literal
    # contained invalid escape sequences) and compiled once.
    punctuation = re.compile(r'[\./\-(){}\[\],%]')

    for pmid, doc in documents:
        part_offset = 0
        data_tmp = Dataset()
        data_tmp.documents[pmid] = doc
        data_nala = deepcopy(data_tmp)
        # Bound once per document. It used to be bound only after a regex
        # match, so the final use_nala loop could raise NameError or read the
        # previous document.
        nala_doc = data_nala.documents.get(pmid)
        NLTKSplitter().split(data_tmp)

        if use_nala:
            self.pipeline.execute(data_nala)
            self.labeler.label(data_nala)
            crf.tag(data_nala, MUT_CLASS_ID)
            PostProcessing().process(data_nala)
            ExclusiveNLDefiner().define(data_nala)

        used_regexs = {}
        positive_sentences = 0

        for part_id in doc.parts:
            sent_offset = 0
            cur_part = doc.parts.get(part_id)

            for sent in cur_part.sentences_:
                sent_length = len(sent)
                new_text = punctuation.sub(' ', sent.lower())
                found_in_sentence = False

                for pattern_index, reg in enumerate(self.patterns):
                    _lasttime = time.time()
                    match = reg.search(new_text)
                    _time_current_reg = time.time() - _lasttime

                    # TODO create pattern performance eval for descending
                    # amount of recognized patterns

                    if match:
                        start = part_offset + sent_offset + match.span()[0]
                        end = part_offset + sent_offset + match.span()[1]

                        used_regexs[reg.pattern] = used_regexs.get(reg.pattern, 0) + 1

                        print(color.PURPLE + new_text.replace(
                            match.group(),
                            color.BOLD + color.DARKCYAN + color.UNDERLINE +
                            match.group() + color.END + color.PURPLE) + color.END)

                        if not found_in_sentence:
                            positive_sentences += 1
                            found_in_sentence = True

                        # TODO write to file param + saving to manually
                        # annotate and find tp + fp for performance eval on
                        # each pattern

                        if use_nala:
                            nala_found_mention = nala_doc.overlaps_with_mention(
                                start, end, annotated=False)
                            if nala_found_mention:
                                print_verbose(nala_found_mention)
                                if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                    yield pmid, doc

                    # Report searches slower than one second. The old test
                    # subtracted in the wrong order and could never be true.
                    if _time_current_reg > 1:
                        print_verbose('time intensive regex', pattern_index)

                sent_offset += 2 + sent_length

                # for per sentence positives
                # NOTE(review): a matching sentence was already counted at its
                # first match, so it is counted twice. Kept unchanged; confirm
                # whether this is intended.
                if found_in_sentence:
                    positive_sentences += 1

            part_offset += sent_offset

        if use_nala:
            for part in nala_doc:
                for ann in part.predicted_annotations:
                    if ann.subclass > 0:
                        print_verbose(part.text[:ann.offset] + color.BOLD + ann.text + color.END + part.text[ann.offset + len(ann.text):])
                        positive_sentences += min_found

        _old_time = _start_time
        _start_time = time.time()
        _one_time = _start_time - _old_time

        if _one_time > 0.3 and positive_sentences > min_found:
            _progress += 1
            _total_time += _one_time
            _time_per_doc = _total_time / _progress
            print_verbose(
                "PROGRESS: {:.2f} secs ETA per one positive document:"
                " {:.2f} secs".format(_total_time, _time_per_doc))

        print_debug('used regular expressions:', json.dumps(used_regexs, indent=4))

        if positive_sentences >= min_found:
            print_verbose('YEP', pmid)
            yield pmid, doc
        else:
            print_verbose('NOPE', pmid)
def __init__(self, model_file, additive=0, multiplicative=1):
    """
    Load a Word2Vec model and remember the two numeric parameters.

    :param model_file: path handed to `Word2Vec.load`
    :param additive: stored as `self.additive`
    :param multiplicative: stored as `self.multiplicative`
    """
    self.model = Word2Vec.load(model_file)
    self.additive, self.multiplicative = additive, multiplicative

    vocab_size = len(self.model.vocab)
    print_verbose('word embddings loaded with vocab size:', vocab_size)
def evaluate(self, dataset):
    """
    :type dataset: nalaf.structures.data.Dataset

    :returns: Evaluations holding one EvaluationWithStandardError per label
        (the total plus every label found among gold and predicted entities)

    Counts tp, fp and fn per label and document, from which precision,
    recall and F1 follow:
        * precision: number of correctly predicted items as a percentage of
          the total number of predicted items,
          len(predicted items that are also real)/len(predicted),
          or in other words tp / (tp + fp)
        * recall: number of correctly predicted items as a percentage of the
          total number of correct items,
          len(real items that are also predicted)/len(real),
          or in other words tp / (tp + fn)

    Entities are first mapped with `self.entity_map_fun` (falsy results are
    dropped) and compared with `self.entity_accept_fun(gold, pred)`, which
    must return True, False or None.
    """
    TOTAL = EntityEvaluator.TOTAL_LABEL

    # find all possible subclasses or otherwise full classes
    # A single set for gold and predicted entities: a label occurring in both
    # used to be listed twice and so received two evaluation entries.
    found_labels = set(__class__._labelize(e) for e in dataset.entities())
    found_labels.update(
        __class__._labelize(e) for e in dataset.predicted_entities())
    found_labels.discard(TOTAL)
    labels = [TOTAL] + list(found_labels)

    docids = dataset.documents.keys()
    subcounts = ['tp', 'fp', 'fn']

    counts = {
        label: {docid: dict.fromkeys(subcounts, 0) for docid in docids}
        for label in labels
    }

    for docid, doc in dataset.documents.items():
        for partid, part in doc.parts.items():
            gold_anns = set(
                filter(None, (self.entity_map_fun(e) for e in part.annotations)))
            pred_anns = set(
                filter(None, (self.entity_map_fun(e) for e in part.predicted_annotations)))

            for pred in pred_anns:
                accept_decisions = {
                    self.entity_accept_fun(gold, pred) for gold in gold_anns
                }
                assert set.issubset(
                    accept_decisions,
                    {True, False, None}), "did not expect: " + str(accept_decisions)

                if True in accept_decisions:
                    # Count the true positives while iterating on gold
                    pass
                elif None in accept_decisions:
                    pass
                else:
                    # either False or the set is empty, meaning that there are no gold annotations
                    print_debug("    ", docid, ": FALSE POSITIV", pred)
                    counts[TOTAL][docid]['fp'] += 1
                    counts[__class__._labelize(pred)][docid]['fp'] += 1

            for gold in gold_anns:
                accept_decisions = {
                    self.entity_accept_fun(gold, pred) for pred in pred_anns
                }

                if True in accept_decisions:
                    print_verbose("    ", docid, ": true positive", gold)
                    counts[TOTAL][docid]['tp'] += 1
                    counts[__class__._labelize(gold)][docid]['tp'] += 1
                elif "UNKNOWN:" in gold:
                    # Pass when unknown normalization
                    pass
                else:
                    print_debug("    ", docid, ": FALSE NEGATIV", gold)
                    counts[TOTAL][docid]['fn'] += 1
                    counts[__class__._labelize(gold)][docid]['fn'] += 1

    evaluations = Evaluations()
    for label in labels:
        evaluations.add(EvaluationWithStandardError(label, counts[label]))

    return evaluations
def evaluate(self, dataset):
    """
    :type dataset: nalaf.structures.data.Dataset

    :returns: Evaluations holding one EvaluationWithStandardError per label
        (the total and, with `self.subclass_analysis`, every subclass label)

    Counts tp, fp, fn and the overlapping variants fp_ov / fn_ov per label
    and document, from which precision, recall and F1 follow:
        * precision: number of correctly predicted items as a percentage of
          the total number of predicted items,
          len(predicted items that are also real)/len(predicted),
          or in other words tp / (tp + fp)
        * recall: number of correctly predicted items as a percentage of the
          total number of correct items,
          len(real items that are also predicted)/len(real),
          or in other words tp / (tp + fn)
        * possibly considers overlapping matches as well

    Side effects: `Entity.equality_operator` is switched to 'overlapping'
    for the overlap pass and is 'exact' afterwards. With subclass analysis,
    a predicted annotation that overlaps a gold annotation of a different
    label has its `subclass` overwritten with the gold one.
    """
    TOTAL = MentionLevelEvaluator.TOTAL_LABEL
    labels = [TOTAL]

    def labelize(e):
        """
        Use this to represent an entity subclass as string and, if this is None or False (but not 0!),
        represent the entity with its class_id

        Convert to subclasses / classes ids to avoid the misstep of comparing possible subclass '0' with False,
        which in python breaks the universe --> info: https://twitter.com/juanmirocks/status/802209750612054016
        """
        return str(e.subclass) if str(
            e.subclass) not in ['None', 'False'] else str(e.class_id)

    if self.subclass_analysis:
        # find all possible subclasses or otherwise full classes
        subclasses = set(labelize(e) for e in dataset.entities())
        subclasses.update(
            set(labelize(e) for e in dataset.predicted_entities()))
        labels.extend(subclasses)

    docids = dataset.documents.keys()
    subcounts = ['tp', 'fp', 'fn', 'fp_ov', 'fn_ov']
    counts = {
        label: {docid: dict.fromkeys(subcounts, 0) for docid in docids}
        for label in labels
    }

    for docid, doc in dataset.documents.items():
        for partid, part in doc.parts.items():
            overlap_real = {label: [] for label in labels}
            overlap_predicted = {label: [] for label in labels}

            Entity.equality_operator = 'overlapping'
            try:
                for ann_a in part.annotations:
                    for ann_b in part.predicted_annotations:
                        if ann_a == ann_b:
                            # equal according to exclusive overlapping eq (not exact)
                            overlap_real[TOTAL].append(ann_a)
                            overlap_predicted[TOTAL].append(ann_b)

                            if self.subclass_analysis:
                                if labelize(ann_a) != labelize(ann_b):
                                    print_debug(
                                        'overlapping subclasses do not match',
                                        ann_a.subclass, ann_b.subclass)
                                    ann_b.subclass = ann_a.subclass

                                overlap_real[labelize(ann_a)].append(ann_a)
                                overlap_predicted[labelize(ann_b)].append(
                                    ann_b)
            finally:
                # The operator is class-wide state: never leave it on
                # 'overlapping', not even when the pass above raises.
                Entity.equality_operator = 'exact'

            for ann in part.predicted_annotations:
                if ann in part.annotations:
                    counts[TOTAL][docid]['tp'] += 1
                    print_verbose("    ", docid, ": TRUE POSITVE", ann)

                    if self.subclass_analysis:
                        counts[labelize(ann)][docid]['tp'] += 1
                else:
                    counts[TOTAL][docid]['fp'] += 1

                    if ann in overlap_predicted[TOTAL]:
                        counts[TOTAL][docid]['fp_ov'] += 1
                    else:
                        print_debug("    ", docid, ": FALSE POSITIV", ann)

                    if self.subclass_analysis:
                        counts[labelize(ann)][docid]['fp'] += 1

                        if ann in overlap_predicted[labelize(ann)]:
                            counts[labelize(ann)][docid]['fp_ov'] += 1

            for ann in part.annotations:
                if ann not in part.predicted_annotations:
                    counts[TOTAL][docid]['fn'] += 1

                    if ann in overlap_real[TOTAL]:
                        counts[TOTAL][docid]['fn_ov'] += 1
                    else:
                        print_debug("    ", docid, ": FALSE NEGATIV", ann)

                    if self.subclass_analysis:
                        counts[labelize(ann)][docid]['fn'] += 1

                        if ann in overlap_real[labelize(ann)]:
                            counts[labelize(ann)][docid]['fn_ov'] += 1

    evaluations = Evaluations()
    for label in labels:
        evaluations.add(EvaluationWithStandardError(label, counts[label]))

    return evaluations
def create_nalaf_entity(self, tagger_entity, original_text, offset_adjustment=0):
    """
    Convert one entity returned by the tagger into an Entity, or reject it.

    The entity class is derived from the "type" of the entries in
    `tagger_entity["ids"]`:

    * "-3": organism, normalized with `self.taxonomy_norm_id`
    * "-22": localization, normalized with `self.go_norm_id`; accepted only
      if the GO id is a child of one of `self.filter_in_go_localizations`
      and of none of `self.filter_out_go_localizations`
    * "uniprot_ac:<organism>": protein, normalized with
      `self.uniprot_norm_id`
    * "string_id:...": protein without normalization

    :param tagger_entity: dict with "start", "end" and "ids" (a list of
        dicts with "id" and "type")
    :param original_text: text that start/end (after adjustment) index into
    :param offset_adjustment: added to both "start" and "end"
    :returns: the Entity, or None when no id determines an entity class.
        Its norms are `{norm class id: comma-joined sorted ids}`, with None
        as the value when no id is left, or None altogether when no norm
        class was set.
    """
    offset = tagger_entity["start"] + offset_adjustment
    end = tagger_entity["end"] + offset_adjustment
    entity_text = original_text[offset:end]

    e_class_id = n_class_id = None
    norms = []
    organisms_proteins = {}

    for norm in tagger_entity["ids"]:
        # assumption: the e_class_id and n_class_id once set will not change
        norm_id = norm["id"]
        norm_type = norm["type"]

        if norm_type == "-3":
            e_class_id = self.organism_id
            n_class_id = self.taxonomy_norm_id
            norms.append(norm_id)

        elif norm_type == "-22":
            try:
                accepted = any(
                    are_go_parent_and_child(in_parent, norm_id)
                    for in_parent in self.filter_in_go_localizations) and not any(
                        are_go_parent_and_child(out_parent, norm_id)
                        for out_parent in self.filter_out_go_localizations)
            except KeyError:
                # A KeyError during the GO lookup counts as a rejection
                accepted = False

            if accepted:
                e_class_id = self.localization_id
                n_class_id = self.go_norm_id
                norms.append(norm_id)
            else:
                print_verbose("REJECT", norm_id, get_localization_name(norm_id))

        elif norm_type.startswith("uniprot_ac:"):
            organism = int(norm_type.split(":")[1])
            organisms_proteins.setdefault(organism, set()).add(norm_id)

            e_class_id = self.protein_id
            n_class_id = self.uniprot_norm_id
            norms.append(norm_id)

        elif norm_type.startswith("string_id:"):
            # Set e_class_id thus not to reject the protein; this happens in the few cases the string id cannot be normalized to uniprot
            e_class_id = self.protein_id

    if not e_class_id:
        return None  # reject

    # convert to set first just in case the original tagger returns repeated ids (happened)
    unique_norms = set(norms)

    if self.remove_ambiguous_proteins:
        # Remove ambiguous ids; heuristic: different normalizations for a same organism are considered ambiguous
        for proteins in organisms_proteins.values():
            if len(proteins) > 1:
                # difference_update tolerates an id that was already removed
                # for another organism; set.remove() would raise KeyError
                unique_norms.difference_update(proteins)

    # Sorted so that the joined string does not depend on set iteration order
    norms_str = ",".join(sorted(unique_norms)) if unique_norms else None
    norms_dic = {n_class_id: norms_str} if n_class_id else None

    return Entity(class_id=e_class_id, offset=offset, text=entity_text, norms=norms_dic)