Пример #1
0
class EnglishPOSMapper:
    """Look up English POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/english/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/english/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the cleaned POS tag.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the cleaned tag, or an empty string if
            the tag is not listed.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Reduce a raw POS tag to its first component."""
        return cls.extract_first_pos(pos)

    @staticmethod
    def extract_first_pos(pos):
        """Return everything before the first ':' of the POS tag.

        Several POS tags may be chained with ':'; only the first one is
        kept. A tag without ':' is returned unchanged.
        """
        first, _, _ = pos.partition(':')
        return first
Пример #2
0
class QaqetPOSMapper:
    """Look up Qaqet POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/qaqet/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/qaqet/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Map a raw POS tag.

        Tags carrying a morpheme delimiter ('-' or '=') at an edge are
        classified by the position of that delimiter and are not looked up
        in the tables.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            str: 'sfx' for a leading delimiter, 'pfx' for a trailing
            delimiter, otherwise the table entry for the cleaned tag (an
            empty string if the tag is not listed).
        """
        delimiters = ('-', '=')

        # Leading delimiter: the morpheme attaches to what precedes it.
        if pos.startswith(delimiters):
            return 'sfx'

        # Trailing delimiter: the morpheme attaches to what follows it,
        # i.e. it is a prefix (same convention as the Ku Waru and
        # Chintang POS mappers).
        if pos.endswith(delimiters):
            return 'pfx'

        pos = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')

    @classmethod
    def clean_pos(cls, pos):
        """Unify unknown markers, then strip morpheme delimiters."""
        pos = cls.unify_unknowns_morpheme(pos)
        pos = ToolboxMorphemeCleaner.remove_morpheme_delimiters(pos)
        return pos

    @classmethod
    def unify_unknowns_morpheme(cls, morpheme):
        """Replace the placeholders for unknowns by '???'.

        Replaced are: a run of 'x' at a word boundary, '??' and '***'.
        """
        unknown_re = re.compile(r'\bx+|\?{2}|\*{3}')
        return unknown_re.sub('???', morpheme)
Пример #3
0
class JapaneseMiyataPOSMapper:
    """Look up Japanese Miyata POS tags in the tables from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/japanese_miyata/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/japanese_miyata/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the cleaned POS tag.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the cleaned tag, or an empty string if
            the tag is not listed.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize the POS tag (colons become dots)."""
        return cls.replace_colon_by_dot_pos(pos)

    @classmethod
    def replace_colon_by_dot_pos(cls, pos):
        """Return the POS tag with every ':' turned into '.'."""
        return '.'.join(pos.split(':'))
Пример #4
0
class NungonPOSMapper:
    """Look up Nungon POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/nungon/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/nungon/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the POS tag without leading '?'.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the stripped tag, or an empty string if
            the tag is not listed.
        """
        stripped = cls.remove_question_mark(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(stripped, '')

    @staticmethod
    def remove_question_mark(morpheme):
        """Strip all leading question marks from the morpheme.

        Question marks might code insecure annotations. They are prefixed
        to the morpheme.
        """
        leading_marker = '?'
        return morpheme.lstrip(leading_marker)
Пример #5
0
class QaqetGlossMapper:
    """Map Qaqet glosses piecewise via the table built from `gloss.csv`."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/qaqet/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Strip morpheme delimiters from the gloss and map it."""
        without_delimiters = (
            ToolboxMorphemeCleaner.remove_morpheme_delimiters(gloss))
        return cls.infer_gloss(without_delimiters)

    @classmethod
    def infer_gloss(cls, gloss):
        """Map every dot-separated atom of the gloss.

        Atoms missing from `gloss_dict` become '???'.

        Args:
            gloss (str): The gloss with atoms separated by '.'.

        Returns:
            str: The mapped atoms joined by '.'. An empty string is
            returned when the gloss is empty or when every atom ended up
            as '???'.
        """
        if not gloss:
            return ''

        mapped_atoms = [
            cls.gloss_dict[atom] if atom in cls.gloss_dict else '???'
            for atom in gloss.split('.')
        ]

        # At least one atom has to be known for the gloss to be kept.
        if any(atom != '???' for atom in mapped_atoms):
            return '.'.join(mapped_atoms)

        return ''
Пример #6
0
class KuWaruPOSMapper:
    """Look up Ku Waru POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/ku_waru/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/ku_waru/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Map the POS tag with the table selected by `ud`.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is used instead of
                `pos_dict`.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return cls.infer_pos(pos, table)

    @classmethod
    def infer_pos(cls, pos, pos_dict):
        """Classify affixes, otherwise look the cleaned tag up.

        Args:
            pos (str): The raw POS tag.
            pos_dict: The table in which non-affix tags are looked up.

        Returns:
            'sfx' or 'pfx' for affixes, otherwise the table entry for the
            cleaned tag (an empty string if it is not listed).
        """
        if cls.is_suffix(pos):
            return 'sfx'
        if cls.is_prefix(pos):
            return 'pfx'

        cleaned = ToolboxMorphemeCleaner.clean(pos)
        return pos_dict.get(cleaned, '')

    @staticmethod
    def is_suffix(pos):
        """Tell whether the tag starts with '-' or '='."""
        return pos.startswith(('-', '='))

    @staticmethod
    def is_prefix(pos):
        """Tell whether the tag ends with '-' or '='."""
        return pos.endswith(('-', '='))
Пример #7
0
class InuktitutGlossMapper:
    """Look up Inuktitut glosses in the table built from `gloss.csv`."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/inuktitut/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the table entry for the cleaned gloss.

        Args:
            gloss (str): The raw gloss.

        Returns:
            The value stored for the cleaned gloss, or an empty string if
            it is not listed.
        """
        cleaned = cls.clean_gloss(gloss)
        return cls.gloss_dict.get(cleaned, '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Normalize the gloss (ampersand connectors become dots)."""
        return cls.replace_stem_gram_gloss_connector(gloss)

    @staticmethod
    def replace_stem_gram_gloss_connector(gloss):
        """Turn every '&' of the gloss into '.'.

        The ampersand connects a stem gloss with a grammatical gloss.

        Args:
            gloss (str): The gloss.

        Returns:
            str: The gloss with dots in place of the ampersands.
        """
        return '.'.join(gloss.split('&'))
Пример #8
0
class TurkishGloss2SegmentMapper:
    """Look up Turkish glosses in the table from `gloss2segment.csv`."""

    gloss2seg = parse_csv(
        get_full_path(
            'parsers/corpora/main/turkish/resources/gloss2segment.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the entry stored for the gloss.

        Args:
            gloss (str): The gloss to look up.

        Returns:
            The value found in `gloss2seg`, or an empty string if the
            gloss is not listed.
        """
        lookup = cls.gloss2seg
        return lookup.get(gloss, '')
Пример #9
0
class RussianGlossMapper:
    """Look up Russian glosses in the table built from `gloss.csv`."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/russian/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the entry stored for the gloss.

        Args:
            gloss (str): The gloss to look up.

        Returns:
            The value found in `gloss_dict`, or an empty string if the
            gloss is not listed.
        """
        lookup = cls.gloss_dict
        return lookup.get(gloss, '')
Пример #10
0
class ChintangGlossMapper:
    """Look up Chintang glosses in the table built from `gloss.csv`."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/chintang/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the entry stored for the delimiter-free gloss.

        Args:
            gloss (str): The raw gloss.

        Returns:
            The value found in `gloss_dict` after morpheme delimiters were
            removed, or an empty string if the gloss is not listed.
        """
        without_delimiters = (
            ToolboxMorphemeCleaner.remove_morpheme_delimiters(gloss))
        return cls.gloss_dict.get(without_delimiters, '')
Пример #11
0
class JapaneseMiyataGlossMapper:
    """Look up Japanese Miyata glosses in the table from `gloss.csv`."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/japanese_miyata/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the entry stored for the gloss.

        Args:
            gloss (str): The gloss to look up.

        Returns:
            The value found in `gloss_dict`, or an empty string if the
            gloss is not listed.
        """
        lookup = cls.gloss_dict
        return lookup.get(gloss, '')
Пример #12
0
class JapaneseMiiProGloss2SegmentMapper:
    """Look up MiiPro glosses in the table from `gloss2segment.csv`."""

    gloss2seg = parse_csv(get_full_path(
        'parsers/corpora/main/japanese_miipro/resources/gloss2segment.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the entry stored for the gloss.

        Args:
            gloss (str): The gloss to look up.

        Returns:
            The value found in `gloss2seg`, or an empty string if the
            gloss is not listed.
        """
        lookup = cls.gloss2seg
        return lookup.get(gloss, '')
Пример #13
0
class RussianPOSMapper:
    """Look up Russian POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/russian/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/russian/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the POS tag.

        Args:
            pos (str): The POS tag, looked up as is.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the tag, or an empty string if the tag is
            not listed.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
Пример #14
0
class JapaneseMiiProPOSMapper:
    """Look up Japanese MiiPro POS tags in the tables from `pos.csv`."""

    pos_dict = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the POS tag.

        Args:
            pos (str): The POS tag, looked up as is.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the tag, or an empty string if the tag is
            not listed.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
Пример #15
0
class ChintangPOSMapper:
    """Look up Chintang POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/chintang/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/chintang/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Map a raw POS tag.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            'sfx' if the tag starts with '-', 'pfx' if it ends with '-',
            otherwise the table entry for the cleaned tag (an empty string
            if it is not listed).
        """
        hyphen = '-'
        if pos.startswith(hyphen):
            return 'sfx'
        if pos.endswith(hyphen):
            return 'pfx'

        cleaned = ToolboxMorphemeCleaner.clean(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')
Пример #16
0
class TuatschinPOSMapper:
    """Look up Tuatschin POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/tuatschin/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/tuatschin/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the cleaned POS tag.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the cleaned tag, or an empty string if
            the tag is not listed.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Clean the POS tag by dropping its specifications."""
        return cls.remove_specifications(pos)

    @staticmethod
    def remove_specifications(pos):
        """Drop every specification from the POS tag.

        A specification is an underscore followed by one or more
        characters other than an underscore.

        Examples:
        - words erroneously written apart: _cont
        - child forms: _Chld
        - discourse particles: _Discpart
        ...
        """
        return re.sub(r'_[^_]+', '', pos)
Пример #17
0
class InuktitutPOSMapper:
    """Look up Inuktitut POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/inuktitut/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/inuktitut/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the cleaned POS tag.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the cleaned tag, or an empty string if
            the tag is not listed.
        """
        cleaned = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cleaned, '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize the POS tag (pipe separators become dots)."""
        return cls.replace_pos_separator(pos)

    @staticmethod
    def replace_pos_separator(pos):
        """Turn every '|' of the POS tag into '.'.

        A morpheme may carry several POS tags separated by a pipe, with
        tags to the right being subcategories of the tags to the left.

        Args:
            pos (str): The POS tag.

        Returns:
            str: The POS tag with dots in place of the pipes.
        """
        return '.'.join(pos.split('|'))
Пример #18
0
class CreePOSMapper:
    """Look up Cree POS tags in the tables built from `pos.csv`."""

    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the table entry for the cleaned POS tag.

        Args:
            pos (str): The raw POS tag.
            ud (bool): If true, `pos_ud_dict` is consulted instead of
                `pos_dict`.

        Returns:
            The value stored for the cleaned tag, or an empty string if
            the tag is not listed.
        """
        pos = cls.clean_pos(pos)

        if ud:
            return cls.pos_ud_dict.get(pos, '')
        else:
            return cls.pos_dict.get(pos, '')

    @staticmethod
    def uppercase_pos_in_parentheses(pos):
        """Uppercase POS tags in parentheses.

        Parentheses indicate covert grammatical categories.

        Args:
            pos (str): The POS tag.

        Returns:
            str: The POS tag in which the text of every parenthesized,
            whitespace-free group is uppercased. Tags without such a group
            are returned unchanged.
        """
        pos_in_parentheses_regex = re.compile(r'(\()(\S+)(\))')

        def uppercase_match(match):
            # Build the replacement from the match itself. A callable is
            # used instead of a replacement template so that text starting
            # with a digit or containing a backslash is not misread as a
            # group reference or an escape, and so that each group gets
            # its own uppercased text.
            return match.group(1) + match.group(2).upper() + match.group(3)

        return pos_in_parentheses_regex.sub(uppercase_match, pos)

    @classmethod
    def clean_pos(cls, pos):
        """Clean the POS tag by uppercasing its parenthesized parts."""
        return cls.uppercase_pos_in_parentheses(pos)
Пример #19
0
class CreeGlossMapper:
    """Look up Cree glosses in the table built from `gloss.csv`."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/cree/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the table entry for the cleaned gloss.

        Args:
            gloss (str): The raw gloss.

        Returns:
            The value stored for the cleaned gloss, or an empty string if
            it is not listed.
        """
        cleaned = cls.clean_gloss(gloss)
        return cls.gloss_dict.get(cleaned, '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Return the gloss unchanged.

        The connector replacement (`replace_gloss_connector`) is currently
        not applied here.
        """
        return gloss

    @staticmethod
    def replace_gloss_connector(gloss):
        """Turn the gloss connectors ',' and '+' into '.'.

        There are three different gloss connectors: '.', '+' and ','.
        ',' adds an additional specification to a gloss, e.g. 'p,quest'
        (question particle).
        """
        with_dots_for_commas = gloss.replace(',', '.')
        return with_dots_for_commas.replace('+', '.')
Пример #20
0
class NungonGlossMapper:
    """Look up Nungon glosses in the table built from `gloss.csv`."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/nungon/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the table entry for the cleaned gloss.

        Args:
            gloss (str): The raw gloss.

        Returns:
            The value stored for the cleaned gloss, or an empty string if
            it is not listed.
        """
        cleaned = cls.clean_gloss(gloss)
        return cls.gloss_dict.get(cleaned, '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Apply the cleaning steps in a fixed order.

        Leading question marks are stripped first, then slashes between
        digits and finally plus signs are turned into dots.
        """
        gloss = cls.remove_question_mark(gloss)
        gloss = cls.replace_slash(gloss)
        gloss = cls.replace_plus(gloss)
        return gloss

    @staticmethod
    def remove_question_mark(morpheme):
        """Strip all leading question marks from the morpheme.

        Question marks might code insecure annotations. They are prefixed
        to the morpheme.
        """
        leading_marker = '?'
        return morpheme.lstrip(leading_marker)

    @staticmethod
    def replace_slash(gloss):
        """Turn a slash standing between two digits into a dot."""
        digit_slash_digit = re.compile(r'(\d)/(\d)')
        return digit_slash_digit.sub(r'\1.\2', gloss)

    @staticmethod
    def replace_plus(gloss):
        """Turn every plus sign into a dot."""
        return '.'.join(gloss.split('+'))
Пример #21
0
class KuWaruGlossMapper:
    """Map Ku Waru glosses via the table built from `gloss.csv`."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/ku_waru/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Map the gloss; see `infer_gloss` for the rules."""
        return cls.infer_gloss(gloss)

    @classmethod
    def infer_gloss(cls, gloss):
        """Normalize the gloss and map it.

        The gloss is cleaned with `ToolboxMorphemeCleaner.clean`, colons
        are replaced, BT/TP categories are removed and multi-word glosses
        are collapsed. The result is then mapped by the first rule that
        applies:

        1. It is a key of `gloss_dict`: the stored value is returned.
        2. It is a number-person combination: it is returned as is.
        3. It is all lowercase, or one of PERSON/PLACE/TRIBE: ''.
        4. It contains dots: every category is mapped on its own, with
           unknown categories becoming '???'; a lowercase category turns
           the whole result into ''.
        5. Anything else: ''.

        Args:
            gloss (str): The raw gloss.

        Returns:
            The mapped gloss or an empty string.
        """
        person_number = re.compile(r'[1-3/]+(SG|DU|PL)')

        normalized = ToolboxMorphemeCleaner.clean(gloss)
        for step in (cls.replace_colons,
                     cls.remove_bt_tp,
                     cls.replace_many_to_one):
            normalized = step(normalized)

        # Direct mapping.
        if normalized in cls.gloss_dict:
            return cls.gloss_dict[normalized]

        # Number-person combinations pass through.
        if person_number.fullmatch(normalized):
            return normalized

        # Lexical glosses and named-entity labels are dropped.
        if normalized.islower() or normalized in ['PERSON', 'PLACE', 'TRIBE']:
            return ''

        # Only multi-category morphemes are left to handle.
        if '.' not in normalized:
            return ''

        mapped = []
        for category in normalized.split('.'):
            if person_number.fullmatch(category):
                mapped.append(category)
            elif category.islower():
                return ''
            else:
                mapped.append(cls.gloss_dict.get(category, '???'))

        return '.'.join(mapped)

    @staticmethod
    def replace_colons(gloss):
        """Turn every colon into a dot.

        Args:
            gloss (str): The gloss.

        Example:
            IMP:2/3DU => IMP.2/3DU
        """
        return '.'.join(gloss.split(':'))

    @staticmethod
    def remove_bt_tp(gloss):
        """Delete every '.BT' and '.TP' from the gloss.

        `BT` denotes baby talk
        `TP` denotes Tok Pisin

        Args:
            gloss (str): The gloss.

        Example:
            banana.BT => banana
        """
        bt_tp = re.compile(r'\.(BT|TP)')
        return bt_tp.sub('', gloss)

    @staticmethod
    def replace_many_to_one(gloss):
        """Collapse multi-word glosses into one-word glosses.

        Args:
            gloss (str): The gloss.

        Replacements, applied in this order:
            TAG.Q               => Q
            that.ABK, that.ABU  => DEM
            that.ANA, that.END  => DIST
            this.DEF, this.IP   => PROX

        Example:
            this.DEF => PROX
        """
        collapsed = gloss.replace('TAG.Q', 'Q')

        for pattern, one_word in (
                (r'that\.AB[KU]', 'DEM'),
                (r'that\.(ANA|END)', 'DIST'),
                (r'this\.(DEF|IP)', 'PROX'),
        ):
            collapsed = re.sub(pattern, one_word, collapsed)

        return collapsed
Пример #22
0
class SesothoGlossMapper:
    """Look up Sesotho glosses in the table built from `gloss.csv`."""

    gloss_dict = parse_csv(
        get_full_path('parsers/corpora/main/sesotho/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Return the table entry for the cleaned gloss.

        Args:
            gloss (str): The raw gloss.

        Returns:
            The value stored for the cleaned gloss, or an empty string if
            it is not listed.
        """
        gloss = cls.clean_gloss(gloss)
        return cls.gloss_dict.get(gloss, '')

    @classmethod
    def clean_gloss(cls, gloss):
        """Clean a Sesotho gloss.

        The steps run in a fixed order: noun/verb markers, proper names,
        nominal concord markers, untranscribed glosses.
        """
        for method in [
                cls.remove_markers, cls.clean_proper_names_gloss_words,
                cls.remove_nominal_concord_markers,
                cls.unify_untranscribed_glosses
        ]:
            gloss = method(gloss)
        return gloss

    @classmethod
    def remove_markers(cls, gloss):
        """Remove noun and verb markers."""
        gloss = cls.remove_noun_markers(gloss)
        gloss = cls.remove_verb_markers(gloss)
        return gloss

    @staticmethod
    def remove_noun_markers(gloss):
        """Remove noun markers ('n^' or 'N^' directly before a digit)."""
        return re.sub(r'[nN]\^(?=\d)', '', gloss)

    @staticmethod
    def remove_verb_markers(gloss):
        """Remove verb markers ('v^' and 's^')."""
        return re.sub(r'[vs]\^', '', gloss)

    @staticmethod
    def clean_proper_names_gloss_words(gloss):
        """Clean glosses of proper names.

        In proper names substitute the 'n^' marker with 'a_'.
        If the label is capitalized, the whole gloss is lowercased.
        """
        gloss = re.sub(r'[nN]\^([gG]ame|[nN]ame|[pP]lace|[sS]ong)', r'a_\1',
                       gloss)
        if re.search(r'a_(Game|Name|Place|Song)', gloss):
            gloss = gloss.lower()
        return gloss

    @staticmethod
    def remove_nominal_concord_markers(gloss):
        """Remove a leading marker for nominal concord.

        The markers 'd', 'lr', 'obr', 'or', 'pn' and 'ps' are removed when
        they start the gloss and are directly followed by a digit.

        Args:
            gloss (str): The gloss.

        Returns:
            str: The gloss without the leading marker, e.g. 'd10' => '10'.
        """
        # Anchor the removal so that only the detected leading marker is
        # stripped, not every later occurrence of the same letters.
        return re.sub(r'^(d|lr|obr|or|pn|ps)(?=\d)', '', gloss)

    @staticmethod
    def unify_untranscribed_glosses(gloss):
        """Unify untranscribed glosses.

        In Sesotho glossing, words which are not understood or could not
        be analyzed are marked by 'word' or by 'xxx'. Both are turned into
        the standard '???'.
        """
        if gloss in ('word', 'xxx'):
            return '???'

        return gloss
Пример #23
0
 def __init__(self, path_label2macro_role=None):
     """Set up the `label2macro_role` table.

     Args:
         path_label2macro_role: Path handed to `parse_csv`. If it is
             falsy, an empty dict is used instead.
     """
     self.label2macro_role = (
         parse_csv(path_label2macro_role) if path_label2macro_role else {}
     )
Пример #24
0
class TuatschinGlossMapper:
    """Map Tuatschin glosses piecewise via the table from `gloss.csv`."""

    gloss_dict = parse_csv(get_full_path(
        'parsers/corpora/main/tuatschin/resources/gloss.csv'))

    @classmethod
    def map(cls, gloss):
        """Map every dot-separated part of the cleaned gloss.

        Person/number combinations ('3.Sing', '1.Plur', ...) are first
        fused into '3SG', '1PL', ... and kept as they are. All other parts
        are looked up in `gloss_dict`.

        Args:
            gloss (str): The raw gloss.

        Returns:
            str: The mapped parts joined by '.'. An empty string is
            returned when the cleaned gloss is empty, or as soon as a part
            is not listed in `gloss_dict` or is mapped to '???'.
        """
        cleaned = cls.clean_gloss(gloss)
        if not cleaned:
            return ''

        # Fuse person/number combinations before splitting at the dots.
        cleaned = re.sub(r'([0123])\.(Sing)', r'\1SG', cleaned)
        cleaned = re.sub(r'([0123])\.(Plur)', r'\1PL', cleaned)

        mapped = []
        for segment in cleaned.split('.'):
            if re.search(r'[0123](SG|PL)', segment):
                mapped.append(segment)
                continue

            if segment not in cls.gloss_dict:
                return ''

            target = cls.gloss_dict[segment]
            if target == '???':
                return ''

            mapped.append(target)

        return '.'.join(mapped)

    @classmethod
    def clean_gloss(cls, gloss):
        """Clean the gloss by dropping its leading POS tag."""
        return cls.remove_pos(gloss)

    @staticmethod
    def remove_pos(gloss):
        """Drop everything up to and including the first dot.

        Morpho-syntactic annotations start with the POS tag:
        [POS].[SUB-GLOSS1].[SUB-GLOSS2]

        A gloss without a dot is returned unchanged.

        Example:
            ADJ.Fem.Sing => Fem.Sing
        """
        return re.sub(r'^[^.]+\.', '', gloss)