Exemplo n.º 1
0
    def _cleanup_label(label):
        """
        Turn an ALL CAPS OMIM label into a more readable, title-cased one.

        Steps applied, in order:
        1.  keep only the text before the first ';'
            (this drops the abbreviation suffixes)
        2.  for every word except the first, rewrite a roman numeral
            whose value is between 1 and 99 as an integer,
            keeping any non-roman suffix (for example IIIB or IVA)
        3.  title-case each word, but leave the listed
            conjunctions/prepositions/articles lowercase
            unless they are the first word

        :param label: the raw label text
        :return: the reformatted label, words joined by single spaces
        """
        # words that stay lowercase when they are not the first word
        small_words = (
            'and', 'but', 'yet', 'for', 'nor', 'so',
            'at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or',
            'a', 'an', 'the',
        )

        # keep only the part in front of the abbreviation(s)
        primary = label.split(r';')[0]

        pretty = []
        for position, token in enumerate(primary.split(), start=1):
            is_first = position == 1

            # the first word is never treated as a roman numeral
            # (this permits things like "X inactivation")
            if not is_first and re.match(romanNumeralPattern, token):
                value = fromRoman(token)
                # the number of syndromes is assumed to be < 100, so that
                # "SYNDROME C" is retained instead of becoming "SYNDROME 100"
                if 0 < value < 100:
                    # whatever remains after removing the numeral once
                    # is the non-roman suffix, if present
                    token = str(value) + token.replace(toRoman(value), '', 1)

            token = token.title()

            # interior small words go back to lowercase
            if not is_first and token.lower() in small_words:
                token = token.lower()

            pretty.append(token)

        return ' '.join(pretty)
Exemplo n.º 2
0
    def _cleanup_label(label):
        """
        Make an ALL CAPS OMIM label more pleasant to read.

        The label is processed as follows:
        1.  everything from the first ';' onward is discarded
            (the abbreviation suffixes)
        2.  roman numerals with a value from 1 to 99 are converted to
            integer numbers, except in the first word; a non-roman suffix
            (as in IIIB or IVA) is kept after the number
        3.  every word is title-cased, and the listed
            conjunctions/prepositions/articles are then lowercased
            wherever they are not the first word

        :param label: the raw label text
        :return: the cleaned-up label, words separated by single spaces
        """
        # conjunctions, little prepositions and articles kept lowercase
        keep_lower = {
            'and', 'but', 'yet', 'for', 'nor', 'so',
            'at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or',
            'a', 'an', 'the',
        }

        # strip the abbreviation part
        head = re.split(r';', label)[0]

        cleaned = []
        for idx, word in enumerate(head.split()):
            interior = idx > 0

            # assume the first word is not a roman numeral
            # (this permits things like "X inactivation")
            if interior and re.match(romanNumeralPattern, word):
                number = fromRoman(word)
                # assume fewer than 100 syndromes: this retains
                # "SYNDROME C" rather than converting it to "SYNDROME 100"
                if 0 < number < 100:
                    # non-roman suffix, if present (e.g. IIIB or IVA)
                    remainder = word.replace(toRoman(number), '', 1)
                    word = str(number) + remainder

            titled = word.title()
            lowered = titled.lower()

            # interior small words are emitted in lowercase,
            # everything else in title case
            cleaned.append(
                lowered if interior and lowered in keep_lower else titled)

        return ' '.join(cleaned)