Example #1
import unittest

# Import path assumed from the nala project layout; adjust to your package.
from nala.preprocessing.definers import ExclusiveNLDefiner


class TestExclusiveNLDefiner(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.definer = ExclusiveNLDefiner()

    def test_on_empty_string(self):
        try:
            self.definer.define_string("")
        except Exception:
            self.fail(
                "empty string result is undefined but should not throw an exception"
            )

    def test_define_string(self):
        f = self.definer.define_string
        testEqual = self.assertEqual

        testEqual(0, f("rs206437"))  # rsid
        testEqual(0, f("ss469415642"))  # ssid

        testEqual(2, f("C226 to T"))
        testEqual(2, f("G446 to A"))
        testEqual(2, f("C821 to T"))
        testEqual(2, f("Arg76 to Trp"))
        testEqual(2, f("Arg149 to Gln"))
        testEqual(2, f("Pro274 to Leu"))
        testEqual(2, f("T320 to C"))
        testEqual(2, f("Leu107 to Pro"))
        testEqual(2, f("C631 to T"))
        testEqual(2, f("Arg211 to Cys"))
        testEqual(2, f("Ala215 to Thr"))
        testEqual(1, f("deletion of its cytoplasmic tail"))
        testEqual(1, f("nonsense mutation Q3X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("G-->A transition of a CpG dinucleotide"))
        testEqual(1, f("A C-->T transition of the same CpG"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142X"))
        testEqual(0, f("R142Q"))
        testEqual(1, f("replacement of this CpG hotspot by CpA"))
        testEqual(0, f("R142X"))
        testEqual(1, f("caused skipping of the exon"))
        testEqual(1, f("Absence of exon 5"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("G to A transition at nt22"))
        testEqual(1, f("asparagine for aspartic acid at codon 8"))
        testEqual(0, f("Asp8Asn"))
        testEqual(1, f("substitution of neutral asparagine for anionic aspartic acid"))
        testEqual(1, f("G to A transition is at a CpG dinucleotide"))
        testEqual(1, f("codon CAA encoding glutamine-2153 to UAA, a stop codon"))
        testEqual(1, f("attaching an epitope tag sequence to the C terminus of the editing protein"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(0, f("573 + IG-->A"))
        testEqual(0, f("H15D"))
        testEqual(0, f("A83D"))
        testEqual(0, f("A179D"))
        testEqual(1, f("skipping of exon 5"))
        testEqual(0, f("H15D"))
        testEqual(1, f("Replacement of these small hydrophobic Ala residues with the charged, more bulky Asp side chain"))
        testEqual(0, f("G20R"))
        testEqual(1, f("G to A transition at a CpG"))
        testEqual(1, f("glycine to arginine substitution at codon 20"))
        testEqual(0, f("26delA"))

        testEqual(0, f("delPhe1388"))
        testEqual(1, f("deleted C1 domain"))

        testEqual(0, f("Q115P"))
        testEqual(0, f("g.3912G>C"))
        testEqual(0, f("c.925delA"))
        testEqual(0, f("c.388+3insT"))

        testEqual(0, f("3992-9g-->a"))
        testEqual(2, f("3992-9g-->a mutation"))
        testEqual(2, f("G643 to A"))
        testEqual(2, f("leucine for arginine 90"))

        testEqual(1, f("deletion of aa 527-534"))
        testEqual(1, f("deletion of 10 and 8 residues from the N- and C-terminals"))
        testEqual(1, f("143 from alanine to glycine"))
        testEqual(1, f("alterations of amino acid residue 143 from alanine to glycine"))

        testEqual(1, f("trinucleotide deletion"))

        testEqual(1, f("arginine-141 to serine substitution"))
        testEqual(1, f("mutations at Arg885"))
        testEqual(1, f("point mutation at Cys93"))
        testEqual(1, f("heterozygous missense 3035G>T"))
        testEqual(2, f("synonymous 696T>C"))
        testEqual(2, f("missense Glu285Ala"))
        testEqual(1, f("somatic 16-bp deletion"))
        testEqual(1, f("serine 749 is phosphorylated"))
        testEqual(1, f("Ser58 to Glu substitution"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(1, f("deletion of"))
        testEqual(0, f("GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 98 GAT-->GTT, Asp-->Val"))
        testEqual(2, f("codon 92, TAC-->TAT"))
        testEqual(1, f("arginine-127 into glutamine and arginine-469 into tryptophan"))
        testEqual(2, f("arginine-127 into glutamine"))
        testEqual(2, f("arginine-469 into tryptophan"))

        testEqual(0, f("TP73Δex2/3"))
        testEqual(1, f("abrogated loss of Chr19"))

        # More difficult

        testEqual(2, f("chromothripsis"))
        testEqual(2, f("Morpholino knockdown"))
        testEqual(2, f("methionine replaces lysine 27"))
        testEqual(2, f("lysine(27)-to-methionine"))

        testEqual(1, f("C-tail displacement"))
        testEqual(1, f("22q11 deletion syndrome"))
        testEqual(1, f("hippocampal neuron L1 insertions"))
        testEqual(1, f("copy-number variants"))
Example #2
import json
import re
import time

# Import paths assumed from the nala/nalaf project layout; adjust as needed.
from nala.preprocessing.definers import ExclusiveNLDefiner, InclusiveNLDefiner
from nala.utils.readers import TmVarReader


def pattern_stats(dataset):
    """
    Testing ground (Carsten): development version of the high-recall
    pattern-creation method, ported here.

    :param dataset: dataset to evaluate the patterns on (must include annotations)
    :type dataset: nala.structures.data.Dataset
    :return: nothing (print statements for the moment)
    """
    ExclusiveNLDefiner().define(dataset)

    # PubTatorFormat(dataset, no_annotations=False).export()

    print(dataset)

    nl_annotations = []

    # load connecting_words.json
    with open('nala/data/connecting_words.json', 'r') as f:
        regexs = json.load(f)

    # print(regexs)
    compiled_regexs = [re.compile(x) for x in regexs]

    # Normalization regexes for the (commented-out) pattern-mining pass below:
    # number words and single letters collapse to _TT_, amino acids to _AA_.
    nr_word_regex = re.compile(
        r'\b(one|two|three|four|five|six|seven|eight|nine|ten)\b')
    aa_short_regex = re.compile(
        r'\b(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr)\b'
    )
    aa_long_regex = re.compile(
        r'\b(glutamine|glutamic acid|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic acid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine)\b'
    )
    bp_code = re.compile(r'\b\w\b')

    wordlist = []

    # for ann in dataset.annotations():
    #     if ann.subclass == 1 or ann.subclass == 2:
    #         new_text = ann.text.lower()
    #         for reg in compiled_regexs:
    #             new_text = reg.sub('_TT_', new_text)
    #         # re.sub('\\b\\d+\\b]', '_NR_', new_text)
    #         new_text = re.sub('\\b\\w*\\d+\\w*\\b', '_CODE_', new_text)
    #         new_text = nr_word_regex.sub('_TT_', new_text)
    #         new_text = aa_short_regex.sub('_AA_', new_text)
    #         new_text = aa_long_regex.sub('_AA_', new_text)
    #         new_text = bp_code.sub('_TT_', new_text)
    #         new_text = re.sub('\\W', ' ', new_text)
    #         # new_text = re.sub('\\b(\\w{1,3})\\b', '_TT_', new_text)
    #
    #         wordlist.extend(new_text.split(' '))
    #         # print(new_text)
    #         nl_annotations.append(new_text)
    #
    # wordset = set(wordlist)
    # wordlist = sorted(list(wordset))
    # print(json.dumps(wordlist, indent=2, sort_keys=True))
    # print(json.dumps(nl_annotations, indent=2, sort_keys=True))

    # TODO provide a method to create new patterns on an automated basis
    # read in nl_patterns
    with open('nala/data/nl_patterns.json', 'r') as f:
        regexs = json.load(f)

    patterns = [re.compile(x) for x in regexs]

    # per-pattern performance counters: [TP, FP, TP/FP ratio]
    _perf_patterns = {}
    for reg in patterns:
        _perf_patterns[reg.pattern] = [0, 0, -1]

    # check for annotations

    # for part in dataset.parts():
    #     print(part.text)

    # dataset annotated with tmVar
    # TODO if idp4, reuse those results; otherwise run tmVarTagger with caching
    dataset_high_recall = TmVarReader(
        'resources/corpora/idp4/pubtator_tmvar.txt').read()
    TP = 0
    FP = 0
    _length = len(dataset.documents)
    _progress = 0
    _timestart = time.time()

    _time_avg_per_pattern = 0
    _pattern_calls = 0
    _time_reg_pattern_total = 0
    _time_max_pattern = 0
    _low_performant_pattern = ""
    _avg_chars_per_doc = dataset.get_size_chars() / len(dataset.documents)

    # NLDefiners init
    exclusive_definer = ExclusiveNLDefiner()
    _e_array = [0, 0, 0]  # counts per exclusive subclass (0, 1, 2)
    inclusive_definer = InclusiveNLDefiner()
    _i_array = [0, 0]  # counts per inclusive subclass (0, 1)

    # TODO make the output file path a parameter
    with open('results/testing_ground_carsten.txt', 'w',
              encoding='utf-8') as f:
        for did, doc in dataset.documents.items():
            part_offset = 0
            for part_id, cur_part in doc.parts.items():
                # print("Part", part_id)
                sent_offset = 0
                sentences = cur_part.sentences
                # new_text = cur_part.text.lower()
                # new_text = re.sub(r'\s+', ' ', new_text)
                # sentences = new_text.split('. ')
                for sent in sentences:
                    sent_len = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub(r'[./\-(){}\[\],%]', '', new_text)
                    new_text = re.sub(r'\W+', ' ', new_text)
                    for i, reg in enumerate(patterns):

                        _lasttime = time.time()  # start timing this pattern
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time() - _lasttime
                        _pattern_calls += 1  # pattern calls so far
                        _time_reg_pattern_total += _time_current_reg  # total time spent searching with patterns
                        if _time_reg_pattern_total > 0:
                            # average time spent per pattern call
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls

                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")

                        if match:
                            if did in dataset_high_recall.documents:
                                anti_doc = dataset_high_recall.documents.get(did)
                                start = part_offset + sent_offset + match.span()[0]
                                end = part_offset + sent_offset + match.span()[1]
                                # count only matches that tmVar did not already find
                                if not anti_doc.overlaps_with_mention(start, end):
                                    matched_text = new_text[match.span()[0]:match.span()[1]]
                                    _e_result = exclusive_definer.define_string(matched_text)
                                    _e_array[_e_result] += 1
                                    _i_result = inclusive_definer.define_string(matched_text)
                                    _i_array[_i_result] += 1
                                    if doc.overlaps_with_mention(start, end):
                                        TP += 1
                                        f.write("{}\tTP\te{}\ti{}\t{}\t{}\t{}\n".format(
                                            did, _e_result, _i_result, sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][0] += 1
                                    else:
                                        FP += 1
                                        f.write("{}\tFP\te{}\ti{}\t{}\t{}\t{}\n".format(
                                            did, _e_result, _i_result, sent, match, reg.pattern))
                                        _perf_patterns[reg.pattern][1] += 1

                                    if _perf_patterns[reg.pattern][1] > 0:
                                        _perf_patterns[reg.pattern][2] = (
                                            _perf_patterns[reg.pattern][0] /
                                            _perf_patterns[reg.pattern][1])
                        if time.time() - _lasttime > 1:
                            print(i)
                    # +2 assumes a 2-character '. ' separator between sentences
                    sent_offset += sent_len + 2
                part_offset += sent_offset
            _progress += doc.get_size() / _avg_chars_per_doc
            _time_progressed = time.time() - _timestart
            _time_per_doc = _time_progressed / _progress
            _time_req_time = _time_per_doc * _length
            _time_eta = _time_req_time - _time_progressed
            print("PROGRESS: {:.3%} ELAPSED: {:.2f} secs ETA: {:.2f} secs".format(
                _progress / _length, _time_progressed, _time_eta))
            if TP + FP > 0:
                print('STATS: TP:{}, FP:{}, TP+FP:{} %containingNLmentions:{:.4%}'.format(
                    TP, FP, TP + FP, TP / (TP + FP)))

    print("Exclusive Definer:", _e_array)
    print("Inclusive Definer:", _i_array)

    for key, value in _perf_patterns.items():
        if value[2] != -1:
            print(value, key)
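
# Minimal driver sketch for pattern_stats. Reusing the tmVar corpus as the
# annotated input dataset is an illustration-only assumption; any nala Dataset
# that includes annotations will do:
#
#     dataset = TmVarReader('resources/corpora/idp4/pubtator_tmvar.txt').read()
#     pattern_stats(dataset)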
Example #3
import re

# Import paths assumed from the nala/nalaf project layout; adjust as needed.
from nalaf.structures.data import Entity
from nala.preprocessing.definers import ExclusiveNLDefiner
from nala.utils import MUT_CLASS_ID


class PostProcessing:
    def __init__(self,
                 keep_silent=True,
                 keep_genetic_markers=True,
                 keep_unnumbered=True,
                 keep_rs_ids=True):

        # full amino-acid names and 3-letter codes; 'asx'/'glx' are ambiguity codes
        amino_acids = [
            'alanine', 'ala', 'arginine', 'arg', 'asparagine', 'asn',
            'aspartic acid', 'aspartate', 'asp', 'cysteine', 'cys',
            'glutamine', 'gln', 'glutamic acid', 'glutamate', 'glu', 'glycine',
            'gly', 'histidine', 'his', 'isoleucine', 'ile', 'leucine', 'leu',
            'lysine', 'lys', 'methionine', 'met', 'phenylalanine', 'phe',
            'proline', 'pro', 'serine', 'ser', 'threonine', 'thr',
            'tryptophan', 'trp', 'tyrosine', 'tyr', 'valine', 'val',
            'aspartic acid', 'asparagine', 'asx', 'glutamine', 'glutamic acid',
            'glx'
        ]

        nucleotides = ['adenine', 'guanine', 'thymine', 'cytosine', 'uracil']

        keywords = [
            r'substit\w*', r'lead\w*', r'exchang\w*', r'chang\w*',
            r'mutant\w*', r'mutate\w*', r'devia\w*', r'modif\w*',
            r'alter\w*', r'switch\w*', r'variat\w*', r'instead\w*',
            r'replac\w*', r'in place', r'convert\w*', r'becom\w*'
        ]

        # AA = '|'.join(amino_acids)
        AA_NN = '|'.join(amino_acids + nucleotides)
        AA_LL = '|'.join(amino_acids + list('CISQMNPKDTFAGHLRWVEYX'))
        KK = '|'.join(keywords)

        genetic_marker_regex = re.compile(r'\bD\d+([A-Z]\d+)?S\d{2,}\b')
        rs_id_regex = re.compile(r'\b\[?rs\]? *\d{3,}(,\d+)*\b')
        ss_id_regex = re.compile(r'\b\[?ss\]? *\d{3,}(,\d+)*\b')

        # Positive patterns, tried in order; each targets one mention shape
        # (amino-acid substitutions, nucleotide changes, indel codes, IDs)
        self.patterns = [
            re.compile(
                '({SS})[- ]*[1-9][0-9]* +(in|to|into|for|of|by|with|at) +({SS})( *(,|,?or|,?and) +({SS}))*'
                .format(SS=AA_NN), re.IGNORECASE),
            re.compile(
                '({SS}) +(in|to|into|for|of|by|with|at) +({SS})[- ]*[1-9][0-9]*'
                '( *(,|,?or|,?and) +({SS})[- ]*[1-9][0-9]*)*'.format(SS=AA_NN),
                re.IGNORECASE),
            re.compile(
                r'({SS})(( (({KK})) (in|to|into|for|of|by|with|at) (a|an|the|) '
                r'*({SS})[1-9]\d*( *(,|or|and|, and|, or) ({SS})[1-9]\d*)*)'
                r'|([- ]*[1-9]\d*( +((has|have|had) +been|is|are|was|were|) '
                r'+(({KK})))? +(in|to|into|for|of|by|with|at) +({SS})( *(,|or|and|, and|, or) +({SS}))*))'
                .format(SS=AA_NN, KK=KK), re.IGNORECASE),
            re.compile(r'\bp\. *({SS}) *[-+]*\d+ *({SS})\b'.format(SS=AA_NN),
                       re.IGNORECASE),
            re.compile(
                r'\b({SS})[-to ]*[-+]*\d+[-to ]*({SS})\b'.format(SS=AA_NN),
                re.IGNORECASE),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX](/|-|-*>|→|-to-)[CISQMNPKDTFAGHLRWVEYX] *[-+]*\d+\b'
            ),
            re.compile(
                r'((?<!\w)[-+]*\d+:? *?)??[CISQMNPKDTFAGHLRWVEYX] *(/|-|-*>|→|-*to-*) *[CISQMNPKDTFAGHLRWVEYX]\b'
            ),
            re.compile(r'\b[CISQMNPKDTFAGHLRWVEYX]{3,}/-(?<!\w)'),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX] *\d{2,} *[CISQMNPKDTFAGHLRWVEYX]( *(/) *[CISQMNPKDTFAGHLRWVEYX])*\b'
            ),
            genetic_marker_regex,
            rs_id_regex,
            ss_id_regex,
            re.compile(
                r'\b(\d+-)?\d*[Dd]elta(\d{2,}|[CISQMNPKDTFAGHLRWVEYX])\b'),
            re.compile(r'\b(c\. *)?[ATCG] *([-+]|\d)\d+ *[ATCG]\b'),
            re.compile(r'\b(c\.|E(X|x)\d+) *([-+]|\d)\d+[ATCG] *> *[ATCG]\b'),
            re.compile(r'\b[ATCG][-+]*\d+[ATCG]/[ATCG]\b'),
            re.compile(
                r'(?<!\w)[-+]?\d+ *\d* *(b|bp|N|ntb|p|BP|B) *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            ),
            re.compile(
                r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[0-9CISQMNPKDTFAGHLRWVEYX]+\b'
            ),
            re.compile(
                r'\b[CISQMNPKDTFAGHLRWVEYX]+ *[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)\b'
            ),
            re.compile(
                r'\b(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup) *(\d+(b|bp|N|ntb|p|BP|B)|[ATCG]{1,})\b'
            ),
            re.compile(
                r'(?<!\w)[-+]*\d+ *(INS|DEL|INDEL|DELINS|DUP|ins|del|indel|delins|dup)[CISQMNPKDTFAGHLRWVEYX]+\b'
            )
        ]

        self.negative_patterns = [
            # single AAs
            re.compile(r'^({SS}) *\d+$'.format(SS=AA_NN), re.IGNORECASE),
            re.compile(r'^[CISQMNPKDTFAGHLRWVEYX]+ *\d+$'),
            re.compile(r'^({SS})([-/>]({SS}))*$'.format(SS=AA_LL),
                       re.IGNORECASE),
            # just numbers
            re.compile(r'^[-+]?\d+([-+/ ]+\d+)*( *(b|bp|N|ntb|p|BP|B))?$')
        ]
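        # For illustration: standalone tokens such as 'Ser195', 'L1', or '12 bp'
        # match these negative patterns and are filtered out as non-mentions.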

        if not keep_genetic_markers:
            self.negative_patterns.append(genetic_marker_regex)

        if not keep_rs_ids:
            self.negative_patterns.append(rs_id_regex)
            self.negative_patterns.append(ss_id_regex)

        self.keep_unnumbered = keep_unnumbered

        # matches tokens containing at least one letter and one digit, e.g. 'A123'
        self.at_least_one_letter_n_number_letter_n_number = re.compile(
            '(?=.*[A-Za-z])(?=.*[0-9])[A-Za-z0-9]+')
        self.keep_silent = keep_silent
        self.definer = ExclusiveNLDefiner()

    def process(self, dataset, class_id=MUT_CLASS_ID):
        for doc_id, doc in dataset.documents.items():
            for part_id, part in doc.parts.items():
                self.__fix_issues(part)
                for regex in self.patterns:
                    for match in regex.finditer(part.text):
                        start = match.start()
                        end = match.end()
                        matched_text = part.text[start:end]
                        ann = Entity(class_id, start, matched_text)

                        # add only if no equal or overlapping annotation exists yet
                        Entity.equality_operator = 'exact_or_overlapping'
                        if ann not in part.predicted_annotations:
                            part.predicted_annotations.append(ann)

                        # replace shorter overlapping annotations with the longer match
                        Entity.equality_operator = 'overlapping'
                        if ann in part.predicted_annotations:
                            for index, ann_b in enumerate(part.predicted_annotations):
                                if ann == ann_b and len(matched_text) > len(ann_b.text):
                                    part.predicted_annotations[index] = ann

                to_delete = [
                    index
                    for index, ann in enumerate(part.predicted_annotations)
                    if any(r.search(ann.text) for r in self.negative_patterns)
                    or (not self.keep_silent and self.__is_silent(ann)) or
                    (not self.keep_unnumbered and not self._is_numbered(ann))
                ]

                part.predicted_annotations = [
                    ann for index, ann in enumerate(part.predicted_annotations)
                    if index not in to_delete
                ]

        # sanity check: make sure annotations match their offsets
        for part in dataset.parts():
            for ann in part.predicted_annotations:
                assert ann.text == part.text[ann.offset:ann.offset + len(ann.text)]
                # strip leading/trailing spaces, adjusting the offset accordingly
                while ann.text[0] == ' ':
                    ann.offset += 1
                    ann.text = ann.text[1:]
                while ann.text[-1] == ' ':
                    ann.text = ann.text[:-1]
                # assert ann.text == ann.text.strip(), ("'" + ann.text + "'")

    def __is_silent(self, ann):
        # e.g. 'Val158Val' -> ['Val', 'Val']: same residue, i.e. a silent mutation
        split = re.split('[^A-Za-z]+', ann.text)
        return len(split) == 2 and split[0] == split[1]

    def _is_numbered(self, ann):
        return (any(c.isdigit() for c in ann.text)
                or self.definer.define_string(ann.text) == 1)

    def __fix_issues(self, part):
        """
        :type part: nalaf.structures.data.Part
        """
        to_be_removed = []
        for index, ann in enumerate(part.predicted_annotations):
            start = ann.offset
            end = ann.offset + len(ann.text)

            # split multiple mentions
            split = re.split(r' *(?:\band\b|/|\\|,|;|\bor\b) *', ann.text)
            if len(split) > 1:
                # for each split part, record its text, subclass, offset within
                # the mention, and whether it has both a letter and a number
                offset = 0
                split_info = []
                for text in split:
                    split_info.append((
                        text,
                        self.definer.define_string(text),
                        ann.text.find(text, offset),
                        self.at_least_one_letter_n_number_letter_n_number.search(text)))
                    offset += len(text)

                split_parts = [
                    split_part for split_part in split_info
                    if split_part[0] != ''
                ]
                lens = [len(split_part[0]) for split_part in split_parts]
                # reduce each part to its character pattern: letters -> 'a', digits -> '0'
                patterns = [
                    re.sub(r'\W+', '',
                           re.sub('[0-9]', '0',
                                  re.sub('[a-zA-Z]', 'a', parts[0])))
                    for parts in split_parts
                ]

                # if all the non-empty parts are of class ST (0) and each contains
                # at least one number and one letter, or if the split parts all
                # have equal lengths or share the same character pattern
                if (all(split_part[1] == 0 and split_part[3]
                        for split_part in split_parts)
                        or max(lens) == min(lens)
                        or len(set(patterns)) == 1):
                    to_be_removed.append(index)

                    # add the split parts as separate annotations
                    for split_text, split_class, split_offset, aonanl in split_info:
                        if split_text != '':
                            part.predicted_annotations.append(
                                Entity(ann.class_id, ann.offset + split_offset,
                                       split_text))

            # fix boundary, 1858C>T --> +1858C>T
            if re.search('^[0-9]', ann.text) and re.search(
                    r'[-+]', part.text[start - 1]):
                ann.offset -= 1
                ann.text = part.text[start - 1] + ann.text
                start -= 1

            # fix boundary delete (
            if ann.text[0] == '(' and ')' not in ann.text:
                ann.offset += 1
                ann.text = ann.text[1:]
                start += 1

            # fix boundary delete )
            if ann.text[-1] == ')' and '(' not in ann.text:
                ann.text = ann.text[:-1]

            # fix boundary add missing (
            if part.text[start - 1] == '(' and ')' in ann.text:
                ann.offset -= 1
                ann.text = '(' + ann.text
                start -= 1

            # fix boundary add missing )
            try:
                if part.text[end] == ')' and '(' in ann.text:
                    ann.text += ')'
            except IndexError:
                pass

            # fix boundary add missing number after fsX
            try:
                found_missing_fsx = False
                if part.text[end:end + 2] == 'fs':
                    ann.text += 'fs'
                    end += 2
                    found_missing_fsx = True
                if ann.text.endswith('fs') and part.text[end] == 'X':
                    ann.text += 'X'
                    end += 1
                    found_missing_fsx = True
                if found_missing_fsx:
                    while part.text[end].isnumeric():
                        ann.text += part.text[end]
                        end += 1
            except IndexError:
                pass

            # fix boundary add missing c. or p. before ann
            try:
                if ann.text.startswith('.'):
                    if part.text[start - 1] in ('c', 'p'):
                        ann.offset -= 1
                        ann.text = part.text[start - 1] + ann.text
                        start -= 1
                elif part.text[start - 2:start] in ('c.', 'p.', 'rt'):
                    ann.offset -= 2
                    ann.text = part.text[start - 2:start] + ann.text
                    start -= 2
            except IndexError:
                pass

            # fix boundary add missing \d+ at the beginning
            if ann.text[0] == '-' or part.text[start - 1] == '-':
                tmp = start
                while tmp - 1 > 0 and (part.text[tmp - 1].isnumeric()
                                       or part.text[tmp - 1] == '-'):
                    tmp -= 1
                if part.text[tmp - 1] == ' ':
                    ann.offset = tmp
                    ann.text = part.text[ann.offset:start] + ann.text
                    start = tmp

            isword = re.compile(r'\w')

            # Expand left to the word boundary, unless the preceding word is
            # 'and' or 'or' (e.g. 'and+2740 A>G' must not be swallowed)
            if (isword.search(ann.text[0])
                    and not (ann.offset >= 3 and part.text[ann.offset - 3:ann.offset] == "and"
                             or (ann.offset >= 2 and part.text[ann.offset - 2:ann.offset] == "or"))):

                while ann.offset > 0 and isword.search(part.text[ann.offset - 1]):
                    ann.text = part.text[ann.offset - 1] + ann.text
                    ann.offset -= 1

            veryend = len(part.text)
            end = ann.offset + len(ann.text)

            # Expand right to the word boundary
            while end < veryend and isword.search(part.text[end]):
                ann.text = ann.text + part.text[end]
                end += 1

            # Strip the parentheses when the mention is wrapped in exactly one
            # pair with no nested parenthesis, e.g. '(A123T)' -> 'A123T'
            if (ann.text[0] == '(' and ann.text[-1] == ')'
                    and ann.text.count('(') < 2 and ann.text.count(')') < 2):
                ann.offset += 1
                ann.text = ann.text[1:-1]

            # Handle 'mention (abbreviation)' pairs: first the gene-level
            # mutation, then the protein-level one in parentheses
            if ((ann.text[-1] == ')' or
                 (end < veryend and part.text[end] == ")"))
                    and ann.text[:-1].count('(') == 1):
                # Requirement 1: must be space to the left of (, not to match things like in Arg407(AGG) or IVS3(+1)
                p = re.compile("\\s+\\(")
                split = p.split(ann.text)
                if len(split) == 2:

                    # Requirement 2: both parts must contain a number (== position; they can stand alone)
                    def req2():
                        return any(c.isdigit() for c in split[0]) and any(
                            c.isdigit() for c in split[1])

                    # Other Reqs on left part
                    def req3():
                        return any(c.isalpha() for c in split[0].replace(
                            'and', '').replace('or', ''))

                    # Other Reqs on right part
                    def req4():
                        return any(c.isalpha() for c in split[1].replace(
                            'and', '').replace('or', ''))

                    if req2() and len(split[0]) > 2 and req3() and req4():
                        # Neg.: Arg407(AGG) - single amino acid substitution (Phe for Val) - nonsense mutation (286T)
                        # Neg.: deletion (229bp) -  nonsense mutation (glycine 568 to stop)
                        # Neg.: one insertion mutation (698insC) - AChR epsilon (CHRNE E376K)
                        # Neg. (other reqs): M1 (Val213) - 207 and 208 (207-HA)
                        # Neg. (other reqs): located 14 amino acids toward the amino-terminal end from the (682)
                        #
                        # Pos.: serine to arginine at the codon 113 (p. S113R)
                        # Pos.: mutagenesis of the initial ATG codon to ACG (Met 1 to Thr) - H2A at position 105 (Q105)
                        # Pos.: Trp replacing Gln in position 156 (A*2406) - A-1144-to-C transversion (K382Q)
                        # Pos: deletion of 123 bases (41 codons) - exon 12 (R432T)

                        ann1text = split[0]
                        to_be_removed.append(index)
                        part.predicted_annotations.append(
                            Entity(ann.class_id, ann.offset, ann1text))
                        ann2text = split[1] if ann.text[-1] != ')' else split[1][:-1]
                        # offset of the second part = offset + len(first part)
                        # + the separator (spaces plus '(') swallowed by the split
                        ann2offset = ann.offset + len(ann1text) + (
                            len(ann.text) - sum(len(x) for x in split))
                        part.predicted_annotations.append(
                            Entity(ann.class_id, ann2offset, ann2text))

        part.predicted_annotations = [
            ann for index, ann in enumerate(part.predicted_annotations)
            if index not in to_be_removed
        ]
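
# Hypothetical usage sketch (construction of the dataset is elided; any
# dataset whose parts carry predicted_annotations from an upstream tagger
# should work):
#
#     postprocessor = PostProcessing(keep_silent=False)
#     postprocessor.process(dataset)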