import json
import re
import time
import unittest

# NOTE: the exact module paths of these project-internal imports are assumed
# from the nala/nalaf layout referenced in the docstrings below.
from nala.preprocessing.definers import ExclusiveNLDefiner, InclusiveNLDefiner
from nala.utils import MUT_CLASS_ID
from nalaf.structures.data import Entity
from nalaf.utils.readers import TmVarReader


class TestExclusiveNLDefiner(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        # note: setUpClass receives the class object, not an instance
        cls.definer = ExclusiveNLDefiner()

    def test_on_empty_string(self):
        try:
            self.definer.define_string("")
        except Exception:
            self.fail("the result for an empty string is undefined, but it should not raise an exception")

    def test_define_string(self):
        f = self.definer.define_string
        testEqual = self.assertEqual

        testEqual(0, f("rs206437"))  # rsid
        testEqual(0, f("ss469415642"))  # ssid

        testEqual(2, f("C226 to T"))
        testEqual(2, f("G446 to A"))
        testEqual(2, f("C821 to T"))
        testEqual(2, f("Arg76 to Trp"))
        testEqual(2, f("Arg149 to Gln"))
        testEqual(2, f("Pro274 to Leu"))
        testEqual(2, f("T320 to C"))
        testEqual(2, f("Leu107 to Pro"))
        testEqual(2, f("C631 to T"))
        testEqual(2, f("Arg211 to Cys"))
        testEqual(2, f("Ala215 to Thr"))

        testEqual(1, f("deletion of its cytoplasmic tail"))
        testEqual(1, f("nonsense mutation Q3X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("G-->A transition of a CpG dinucleotide"))
        testEqual(1, f("A C-->T transition of the same CpG"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("replacement of this CpG hotspot by CpA"))
        testEqual(0, f("R142X"))
        testEqual(1, f("caused skipping of the exon"))
        testEqual(1, f("Absence of exon 5"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("G to A transition at nt22"))
        testEqual(1, f("asparagine for aspartic acid at codon 8"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("substitution of neutral asparagine for anionic aspartic acid"))
        testEqual(1, f("G to A transition is at a CpG dinucleotide"))
        testEqual(1, f("codon CAA encoding glutamine-2153 to UAA, a stop codon"))
        testEqual(1, f("attaching an epitope tag sequence to the C terminus of the editing protein"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(0, f("573 + IG-->A"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(1, f("skipping of exon 5"))
        testEqual(0, f("H15D"))
        testEqual(1, f("Replacement of these small hydrophobic Ala residues with the charged, more bulky Asp side chain"))
        testEqual(0, f("G20R"))
        testEqual(1, f("G to A transition at a CpG"))
        testEqual(1, f("glycine to arginine substitution at codon 20"))
        testEqual(0, f("26delA"))
        testEqual(0, f("delPhe1388"))
        testEqual(1, f("deleted C1 domain"))
        testEqual(0, f("Q115P"))
        testEqual(0, f("g.3912G>C"))
        testEqual(0, f("c.925delA"))
        testEqual(0, f("c.388+3insT"))
        testEqual(0, f("3992-9g-->a"))
        testEqual(2, f("3992-9g-->a mutation"))
        testEqual(2, f("G643 to A"))
        testEqual(2, f("leucine for arginine 90"))
        testEqual(1, f("deletion of aa 527-534"))
        testEqual(1, f("deletion of 10 and 8 residues from the N- and C-terminals"))
        testEqual(1, f("143 from alanine to glycine"))
        testEqual(1, f("alterations of amino acid residue 143 from alanine to glycine"))
        testEqual(1, f("trinucleotide deletion"))
        testEqual(1, f("arginine-141 to serine substitution"))
        testEqual(1, f("mutations at Arg885"))
        testEqual(1, f("point mutation at Cys93"))
        testEqual(1, f("heterozygous missense 3035G>T"))
        testEqual(2, f("synonymous 696T>C"))
        testEqual(2, f("missense Glu285Ala"))
        testEqual(1, f("somatic 16-bp deletion"))
        testEqual(1, f("serine 749 is phosphorylated"))
        testEqual(1, f("Ser58 to Glu substitution"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(0, f("GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 98 GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 92, TAC-->TAT"))
        testEqual(1, f("arginine-127 into glutamine and arginine-469 into tryptophan"))
        testEqual(2, f("arginine-127 into glutamine"))
        testEqual(2, f("arginine-469 into tryptophan"))
        testEqual(0, f("TP73Δex2/3"))
        testEqual(1, f("abrogated loss of Chr19"))

        # More difficult
        testEqual(2, f("chromothripsis"))
        testEqual(2, f("Morpholino knockdown"))
        testEqual(2, f("methionine replaces lysine 27"))
        testEqual(2, f("lysine(27)-to-methionine"))
        testEqual(1, f("C-tail displacement"))
        testEqual(1, f("22q11 deletion syndrome"))
        testEqual(1, f("hippocampal neuron L1 insertions"))
        testEqual(1, f("copy-number variants"))
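# Illustration of the subclass codes the assertions above encode (a sketch;
# the 0/1/2 naming follows the nala convention of standard vs. natural-language
# vs. semi-standard mentions and is an interpretation, not verified API doc):
#
#   definer = ExclusiveNLDefiner()
#   definer.define_string("R142Q")               # -> 0  standard mention
#   definer.define_string("skipping of exon 5")  # -> 1  natural-language mention
#   definer.define_string("Arg76 to Trp")        # -> 2  semi-standard mention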
def pattern_stats(dataset):
    """
    Testing ground (Carsten): development version of the high-recall pattern
    creation method, ported here.

    :param dataset: dataset to perform the pattern evaluation on (must include annotations)
    :type dataset: nala.structures.data.Dataset
    :return: nothing (print statements only, for the moment)
    """
    ExclusiveNLDefiner().define(dataset)
    # PubTatorFormat(dataset, no_annotations=False).export()
    print(dataset)

    nl_annotations = []

    # import connecting_words.json
    with open('nala/data/connecting_words.json', 'r') as f:
        regexs = json.load(f)
    # print(regexs)
    compiled_regexs = [re.compile(x) for x in regexs]

    nr_word_regex = re.compile(r'\b(one|two|three|four|five|six|seven|eight|nine|ten)\b')
    aa_short_regex = re.compile(
        r'\b(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr)\b')
    aa_long_regex = re.compile(
        r'\b(glutamine|glutamic acid|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine'
        r'|threonine|histidine|aspartic acid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine'
        r'|serine|glutamate|tyrosine)\b')
    bp_code = re.compile(r'\b\w\b')

    wordlist = []

    # for ann in dataset.annotations():
    #     if ann.subclass == 1 or ann.subclass == 2:
    #         new_text = ann.text.lower()
    #         for reg in compiled_regexs:
    #             new_text = reg.sub('_TT_', new_text)
    #         # re.sub('\\b\\d+\\b]', '_NR_', new_text)
    #         new_text = re.sub('\\b\\w*\\d+\\w*\\b', '_CODE_', new_text)
    #         new_text = nr_word_regex.sub('_TT_', new_text)
    #         new_text = aa_short_regex.sub('_AA_', new_text)
    #         new_text = aa_long_regex.sub('_AA_', new_text)
    #         new_text = bp_code.sub('_TT_', new_text)
    #         new_text = re.sub('\\W', ' ', new_text)
    #         # new_text = re.sub('\\b(\\w{1,3})\\b', '_TT_', new_text)
    #         # wordlist.extend(new_text.split(' '))
    #         # print(new_text)
    #         nl_annotations.append(new_text)
    #
    # wordset = set(wordlist)
    # wordlist = sorted(list(wordset))
    # print(json.dumps(wordlist, indent=2, sort_keys=True))
    # print(json.dumps(nl_annotations, indent=2, sort_keys=True))

    # TODO provide a method to create new patterns on an automated basis

    # read in nl_patterns
    with open('nala/data/nl_patterns.json', 'r') as f:
        regexs = json.load(f)
    patterns = [re.compile(x) for x in regexs]

    # per-pattern performance: [TP, FP, TP/FP ratio]
    # (the original comment said "f-measure", but the stored ratio is TP/FP)
    _perf_patterns = {}
    for reg in patterns:
        _perf_patterns[reg.pattern] = [0, 0, -1]

    # check for annotations
    # for part in dataset.parts():
    #     print(part.text)

    # dataset with tmVar results
    # TODO change: if idp4, use those results; otherwise use tmvartagger and caching
    dataset_high_recall = TmVarReader('resources/corpora/idp4/pubtator_tmvar.txt').read()
    TP = 0
    FP = 0
    _length = len(dataset.documents.keys())
    _progress = 0
    _timestart = time.time()

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""

    _avg_chars_per_doc = dataset.get_size_chars() / len(dataset.documents.keys())

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]

    # TODO make the output file a parameter
    with open('results/testing_ground_carsten.txt', 'w', encoding='utf-8') as f:
        for did, doc in dataset.documents.items():
            part_offset = 0
            for part_index, x in enumerate(doc.parts):
                # print("Part", part_index)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences
                # new_text = cur_part.text.lower()
                # new_text = re.sub('\s+', ' ', new_text)
                # sentences = new_text.split('. ')
                for sent in sentences:
                    sent_len = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub(r'[\./\-(){}\[\],%]', '', new_text)
                    new_text = re.sub(r'\W+', ' ', new_text)

                    for i, reg in enumerate(patterns):
                        _lasttime = time.time()  # start timing this pattern
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time() - _lasttime  # time spent on this search
                        _pattern_calls += 1  # pattern calls so far
                        _time_reg_pattern_total += _time_current_reg  # total time spent searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg time per pattern call
                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")

                        if match:
                            if did in dataset_high_recall.documents:
                                anti_doc = dataset_high_recall.documents.get(did)
                                start = part_offset + sent_offset + match.span()[0]
                                end = part_offset + sent_offset + match.span()[1]
                                if not anti_doc.overlaps_with_mention(start, end):
                                    _e_result = exclusive_definer.define_string(
                                        new_text[match.span()[0]:match.span()[1]])
                                    _e_array[_e_result] += 1
                                    _i_result = inclusive_definer.define_string(
                                        new_text[match.span()[0]:match.span()[1]])
                                    _i_array[_i_result] += 1
                                    if doc.overlaps_with_mention(start, end):
                                        TP += 1
                                        f.write("{}\tTP\te{}\ti{}\t{}\t{}\t{}\n".format(
                                            did, _e_result, _i_result, sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][0] += 1
                                    else:
                                        FP += 1
                                        f.write("{}\tFP\te{}\ti{}\t{}\t{}\t{}\n".format(
                                            did, _e_result, _i_result, sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][1] += 1

                                    if _perf_patterns[reg.pattern][1] > 0:
                                        _perf_patterns[reg.pattern][2] = \
                                            _perf_patterns[reg.pattern][0] / _perf_patterns[reg.pattern][1]

                        # report slow patterns (the original compared `_lasttime - time.time()`,
                        # which is always negative and thus never fired)
                        if time.time() - _lasttime > 1:
                            print(i)

                    sent_offset += 2 + sent_len  # assumes sentences were joined by '. ' (2 chars)

                part_offset += sent_offset

            _progress += doc.get_size() / _avg_chars_per_doc
            _time_progressed = time.time() - _timestart
            _time_per_doc = _time_progressed / _progress
            _time_req_time = _time_per_doc * _length
            _time_eta = _time_req_time - _time_progressed
            print("PROGRESS: {:.3%} ELAPSED: {:.2f} secs ETA: {:.2f} secs".format(
                _progress / _length, _time_progressed, _time_eta))

    if TP + FP > 0:
        print('STATS: TP:{}, FP:{}, TP+FP:{} %containingNLmentions:{:.4%}'.format(
            TP, FP, TP + FP, TP / (TP + FP)))
    print("Exclusive Definer:", _e_array)
    print("Inclusive Definer:", _i_array)

    for key, value in _perf_patterns.items():
        if value[2] != -1:
            print(value, key)
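# Hedged helper (not part of the original module): _perf_patterns above stores
# [TP, FP, TP/FP] per pattern; precision TP / (TP + FP) is usually the more
# interpretable summary, so this sketch derives it from the same triples.
def pattern_precision(perf_patterns):
    """Return {pattern: precision} for every pattern that matched at least once."""
    return {
        pattern: tp / (tp + fp)
        for pattern, (tp, fp, _ratio) in perf_patterns.items()
        if tp + fp > 0
    }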
class PostProcessing:
    """
    Rule-based post-processing of predicted mutation mentions: adds mentions
    matched by the regexes below, widens or shrinks annotation boundaries,
    splits compound mentions, and filters negatives (optionally silent
    mutations, unnumbered mentions, genetic markers, and rs/ss ids).
    """

    def __init__(self, keep_silent=True, keep_genetic_markers=True, keep_unnumbered=True, keep_rs_ids=True):
        amino_acids = [
            'alanine', 'ala', 'arginine', 'arg', 'asparagine', 'asn', 'aspartic acid', 'aspartate', 'asp',
            'cysteine', 'cys', 'glutamine', 'gln', 'glutamic acid', 'glutamate', 'glu', 'glycine', 'gly',
            'histidine', 'his', 'isoleucine', 'ile', 'leucine', 'leu', 'lysine', 'lys', 'methionine', 'met',
            'phenylalanine', 'phe', 'proline', 'pro', 'serine', 'ser', 'threonine', 'thr', 'tryptophan', 'trp',
            'tyrosine', 'tyr', 'valine', 'val',
            'aspartic acid', 'asparagine', 'asx', 'glutamine', 'glutamic acid', 'glx'
        ]

        nucleotides = ['adenine', 'guanine', 'thymine', 'cytosine', 'uracil']

        keywords = [
            r'substit\w*', r'lead\w*', r'exchang\w*', r'chang\w*', r'mutant\w*', r'mutate\w*', r'devia\w*',
            r'modif\w*', r'alter\w*', r'switch\w*', r'variat\w*', r'instead\w*', r'replac\w*', 'in place',
            r'convert\w*', r'becom\w*'
        ]

        # AA = '|'.join(amino_acids)
        AA_NN = '|'.join(amino_acids + nucleotides)
        AA_LL = '|'.join(amino_acids + list('CISQMNPKDTFAGHLRWVEYX'))
        KK = '|'.join(keywords)

        genetic_marker_regex = re.compile(r'\bD\d+([A-Z]\d+)?S\d{2,}\b')
        rs_id_regex = re.compile(r'\b\[?rs\]? *\d{3,}(,\d+)*\b')
        ss_id_regex = re.compile(r'\b\[?ss\]? *\d{3,}(,\d+)*\b')

        self.patterns = [
            # prose amino acid / nucleotide substitutions, e.g. "Ser58 to Glu", "arginine-127 into glutamine"
            re.compile(r'({SS})[- ]*[1-9][0-9]* +(in|to|into|for|of|by|with|at) +({SS})( *(,|,?or|,?and) +({SS}))*'
                       .format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'({SS}) +(in|to|into|for|of|by|with|at) +({SS})[- ]*[1-9][0-9]*'
                       r'( *(,|,?or|,?and) +({SS})[- ]*[1-9][0-9]*)*'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'({SS})(( (({KK})) (in|to|into|for|of|by|with|at) (a|an|the|) '
                       r'*({SS})[1-9]\d*( *(,|or|and|, and|, or) ({SS})[1-9]\d*)*)'
                       r'|([- ]*[1-9]\d*( +((has|have|had) +been|is|are|was|were|) '
                       r'+(({KK})))? +(in|to|into|for|of|by|with|at) +({SS})( *(,|or|and|, and|, or) +({SS}))*))'
                       .format(SS=AA_NN, KK=KK), re.IGNORECASE),
            # "p. S113R"-style protein mutations
            re.compile(r'\bp\. *({SS}) *[-+]*\d+ *({SS})\b'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'\b({SS})[-to ]*[-+]*\d+[-to ]*({SS})\b'.format(SS=AA_NN), re.IGNORECASE),
            # one-letter substitution codes, e.g. "K382Q", "A-1144-to-C"
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX](/|-|-*>|→|-to-)[CISQMNPKDTFAGHLRWVEYX] *[-+]*\d+\b'),
            re.compile(r'((?<!\w)[-+]*\d+:? *?)??[CISQMNPKDTFAGHLRWVEYX] *(/|-|-*>|→|-*to-*) *[CISQMNPKDTFAGHLRWVEYX]\b'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]{3,}/-(?<!\w)'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX] *\d{2,} *[CISQMNPKDTFAGHLRWVEYX]( *(/) *[CISQMNPKDTFAGHLRWVEYX])*\b'),
            genetic_marker_regex,
            rs_id_regex,
            ss_id_regex,
            # delta notation, e.g. "TP73Delta ex2/3"
            re.compile(r'\b(\d+-)?\d*[Dd]elta(\d{2,}|[CISQMNPKDTFAGHLRWVEYX])\b'),
            # nucleotide-level substitutions, e.g. "c. 925 A>G", "IVS3+1G>A"
            re.compile(r'\b(c\. *)?[ATCG] *([-+]|\d)\d+ *[ATCG]\b'),
            re.compile(r'\b(c\.|E(X|x)\d+) *([-+]|\d)\d+[ATCG] *> *[ATCG]\b'),
            re.compile(r'\b[ATCG][-+]*\d+[ATCG]/[ATCG]\b'),
            # insertions / deletions / duplications, e.g. "16-bp deletion", "698insC"
            re.compile(r'(?<!\w)[-+]?\d+ *\d* *(b|bp|N|ntb|p|BP|B) *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'),
            re.compile(r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[0-9CISQMNPKDTFAGHLRWVEYX]+\b'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'),
            re.compile(r'\b(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup) *(\d+(b|bp|N|ntb|p|BP|B)|[ATCG]{1,})\b'),
            re.compile(r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[CISQMNPKDTFAGHLRWVEYX]+\b'),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b')
        ]

        self.negative_patterns = [
            # single AAs
            re.compile(r'^({SS}) *\d+$'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'^[CISQMNPKDTFAGHLRWVEYX]+ *\d+$'),
            re.compile(r'^({SS})([-/>]({SS}))*$'.format(SS=AA_LL), re.IGNORECASE),
            # just numbers
            re.compile(r'^[-+]?\d+([-+/ ]+\d+)*( *(b|bp|N|ntb|p|BP|B))?$')
        ]
        if not keep_genetic_markers:
            self.negative_patterns.append(genetic_marker_regex)
        if not keep_rs_ids:
            self.negative_patterns.append(rs_id_regex)
            self.negative_patterns.append(ss_id_regex)

        self.keep_unnumbered = keep_unnumbered
        self.at_least_one_letter_n_number_letter_n_number = re.compile('(?=.*[A-Za-z])(?=.*[0-9])[A-Za-z0-9]+')
        self.keep_silent = keep_silent
        self.definer = ExclusiveNLDefiner()

    def process(self, dataset, class_id=MUT_CLASS_ID):
        for doc_id, doc in dataset.documents.items():
            for part_id, part in doc.parts.items():
                self.__fix_issues(part)
                for regex in self.patterns:
                    for match in regex.finditer(part.text):
                        start = match.start()
                        end = match.end()
                        matched_text = part.text[start:end]
                        ann = Entity(class_id, start, matched_text)

                        # add the match only if no exact or overlapping annotation exists yet
                        Entity.equality_operator = 'exact_or_overlapping'
                        if ann not in part.predicted_annotations:
                            part.predicted_annotations.append(ann)
                        # if an overlapping annotation exists, keep the longer text
                        Entity.equality_operator = 'overlapping'
                        if ann in part.predicted_annotations:
                            for index, ann_b in enumerate(part.predicted_annotations):
                                if ann == ann_b and len(matched_text) > len(ann_b.text):
                                    part.predicted_annotations[index] = ann

                to_delete = [
                    index for index, ann in enumerate(part.predicted_annotations)
                    if any(r.search(ann.text) for r in self.negative_patterns)
                    or (not self.keep_silent and self.__is_silent(ann))
                    or (not self.keep_unnumbered and not self._is_numbered(ann))
                ]
                part.predicted_annotations = [
                    ann for index, ann in enumerate(part.predicted_annotations)
                    if index not in to_delete
                ]

        # sanity check: make sure annotations match their offsets
        for part in dataset.parts():
            for ann in part.predicted_annotations:
                assert ann.text == part.text[ann.offset:ann.offset + len(ann.text)]
                # trim leading/trailing spaces, adjusting the offset accordingly
                while ann.text[0] == ' ':
                    ann.offset += 1
                    ann.text = ann.text[1:]
                while ann.text[-1] == ' ':
                    ann.text = ann.text[:-1]
                # assert ann.text == ann.text.strip(), ("'" + ann.text + "'")

    def __is_silent(self, ann):
        # a silent mutation mention reads like "Arg76Arg": the same token twice
        split = re.split('[^A-Za-z]+', ann.text)
        return len(split) == 2 and split[0] == split[1]

    def _is_numbered(self, ann):
        return any(c.isdigit() for c in ann.text) or self.definer.define_string(ann.text) == 1

    def __fix_issues(self, part):
        """
        :type part: nalaf.structures.data.Part
        """
        to_be_removed = []
        for index, ann in enumerate(part.predicted_annotations):
            start = ann.offset
            end = ann.offset + len(ann.text)

            # split multiple mentions
            split = re.split(r' *(?:\band\b|/|\\|,|;|\bor\b) *', ann.text)
            if len(split) > 1:
                # for each split part, calculate the offset and the constraints
                offset = 0
                split_info = []
                for text in split:
                    split_info.append((text,
                                       self.definer.define_string(text),
                                       ann.text.find(text, offset),
                                       self.at_least_one_letter_n_number_letter_n_number.search(text)))
                    offset += len(text)

                split_parts = [split_part for split_part in split_info if split_part[0] != '']
                lens = [len(split_part[0]) for split_part in split_parts]
                patterns = [
                    re.sub(r'\W+', '', re.sub('[0-9]', '0', re.sub('[a-zA-Z]', 'a', parts[0])))
                    for parts in split_parts
                ]

                # split if all the non-empty parts are of class ST (0) and contain at least
                # one number and one letter, or if the split parts all have the same length,
                # or if they all follow the same letter/digit pattern
                if all(split_part[1] == 0 and split_part[3] for split_part in split_parts) \
                        or max(lens) == min(lens) or len(set(patterns)) == 1:
                    to_be_removed.append(index)

                    # add the split parts as separate annotations
                    for split_text, split_class, split_offset, aonanl in split_info:
                        if split_text != '':
                            part.predicted_annotations.append(
                                Entity(ann.class_id, ann.offset + split_offset, split_text))

            # fix boundary, 1858C>T --> +1858C>T
            if re.search('^[0-9]', ann.text) and re.search(r'[\-\+]', part.text[start - 1]):
                ann.offset -= 1
                ann.text = part.text[start - 1] + ann.text
                start -= 1

            # fix boundary: delete unbalanced (
            if ann.text[0] == '(' and ')' not in ann.text:
                ann.offset += 1
                ann.text = ann.text[1:]
                start += 1

            # fix boundary: delete unbalanced )
            if ann.text[-1] == ')' and '(' not in ann.text:
                ann.text = ann.text[:-1]

            # fix boundary: add missing (
            if part.text[start - 1] == '(' and ')' in ann.text:
                ann.offset -= 1
                ann.text = '(' + ann.text
                start -= 1

            # fix boundary: add missing )
            try:
                if part.text[end] == ')' and '(' in ann.text:
                    ann.text += ')'
            except IndexError:
                pass

            # fix boundary: add missing number after fsX
            try:
                found_missing_fsx = False
                if part.text[end:end + 2] == 'fs':
                    ann.text += 'fs'
                    end += 2
                    found_missing_fsx = True
                if ann.text.endswith('fs') and part.text[end] == 'X':
                    ann.text += 'X'
                    end += 1
                    found_missing_fsx = True
                if found_missing_fsx:
                    while part.text[end].isnumeric():
                        ann.text += part.text[end]
                        end += 1
            except IndexError:
                pass

            # fix boundary: add missing c. or p. before the annotation
            try:
                if ann.text.startswith('.'):
                    if part.text[start - 1] in ('c', 'p'):
                        ann.offset -= 1
                        ann.text = part.text[start - 1] + ann.text
                        start -= 1
                elif part.text[start - 2:start] in ('c.', 'p.', 'rt'):
                    ann.offset -= 2
                    ann.text = part.text[start - 2:start] + ann.text
                    start -= 2
            except IndexError:
                pass

            # fix boundary: add missing \d+ at the beginning
            if ann.text[0] == '-' or part.text[start - 1] == '-':
                tmp = start
                while tmp - 1 > 0 and (part.text[tmp - 1].isnumeric() or part.text[tmp - 1] == '-'):
                    tmp -= 1
                if part.text[tmp - 1] == ' ':
                    ann.offset = tmp
                    ann.text = part.text[ann.offset:start] + ann.text
                    start = tmp

            isword = re.compile(r'\w')

            # the mention must be delimited by a space on the left;
            # not matched: 'and+2740 A>G'
            if isword.search(ann.text[0]) and \
                    (not (ann.offset >= 3 and part.text[ann.offset - 3:ann.offset] == "and"
                          or (ann.offset >= 2 and part.text[ann.offset - 2:ann.offset] == "or"))):
                while ann.offset > 0 and isword.search(part.text[ann.offset - 1]):
                    ann.text = part.text[ann.offset - 1] + ann.text
                    ann.offset -= 1

            veryend = len(part.text)
            end = ann.offset + len(ann.text)

            # the mention must be delimited by a space on the right
            while end < veryend and isword.search(part.text[end]):
                ann.text = ann.text + part.text[end]
                end += 1

            # remove surrounding parentheses if the mention is fully parenthesised
            # and contains no other parenthesis in between
            if ann.text[0] == '(' and ann.text[-1] == ')' \
                    and ann.text.count('(') < 2 and ann.text.count(')') < 2:
                ann.offset += 1
                ann.text = ann.text[1:-1]

            # follow the rule of abbreviations + first gene mutation (then protein mutation)
            if ((ann.text[-1] == ')' or (end < veryend and part.text[end] == ")"))
                    and ann.text[:-1].count('(') == 1):
                # Requirement 1: there must be a space to the left of '(',
                # so as not to match things like Arg407(AGG) or IVS3(+1)
                p = re.compile(r'\s+\(')
                split = p.split(ann.text)
                if len(split) == 2:
                    # Requirement 2: both parts must contain a number (== position; they can stand alone)
                    def req2():
                        return any(c.isdigit() for c in split[0]) and any(c.isdigit() for c in split[1])

                    # other requirements on the left part
                    def req3():
                        return any(c.isalpha() for c in split[0].replace('and', '').replace('or', ''))

                    # other requirements on the right part
                    def req4():
                        return any(c.isalpha() for c in split[1].replace('and', '').replace('or', ''))

                    if req2() and len(split[0]) > 2 and req3() and req4():
                        # Neg.: Arg407(AGG) - single amino acid substitution (Phe for Val) - nonsense mutation (286T)
                        # Neg.: deletion (229bp) - nonsense mutation (glycine 568 to stop)
                        # Neg.: one insertion mutation (698insC) - AChR epsilon (CHRNE E376K)
                        # Neg. (other reqs): M1 (Val213) - 207 and 208 (207-HA)
                        # Neg. (other reqs): located 14 amino acids toward the amino-terminal end from the (682)
                        #
                        # Pos.: serine to arginine at the codon 113 (p. S113R)
                        # Pos.: mutagenesis of the initial ATG codon to ACG (Met 1 to Thr) - H2A at position 105 (Q105)
                        # Pos.: Trp replacing Gln in position 156 (A*2406) - A-1144-to-C transversion (K382Q)
                        # Pos.: deletion of 123 bases (41 codons) - exon 12 (R432T)
                        ann1text = split[0]
                        to_be_removed.append(index)
                        part.predicted_annotations.append(Entity(ann.class_id, ann.offset, ann1text))

                        ann2text = split[1] if ann.text[-1] != ')' else split[1][:-1]
                        # the offset gap is the number of spaces plus the '(' consumed by the split
                        ann2offset = ann.offset + len(ann1text) + (len(ann.text) - sum(len(x) for x in split))
                        part.predicted_annotations.append(Entity(ann.class_id, ann2offset, ann2text))

        part.predicted_annotations = [
            ann for index, ann in enumerate(part.predicted_annotations)
            if index not in to_be_removed
        ]
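# Minimal usage sketch (assumptions: the corpus path is the one used in
# pattern_stats above, and an upstream tagger has already filled
# part.predicted_annotations for each part):
#
#   dataset = TmVarReader('resources/corpora/idp4/pubtator_tmvar.txt').read()
#   PostProcessing(keep_silent=False, keep_unnumbered=False).process(dataset)

if __name__ == '__main__':
    unittest.main()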