Exemplo n.º 1
0
class EnglishPOSMapper:
    """Map POS tags of the English corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/english/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/english/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Look the tag up in `pos_ud_dict` instead of `pos_dict`.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cls.clean_pos(pos), '')

    @classmethod
    def clean_pos(cls, pos):
        """Reduce `pos` to its first tag before the lookup."""
        first_pos = cls.extract_first_pos(pos)
        return first_pos

    @staticmethod
    def extract_first_pos(pos):
        """Return everything before the first ':' in `pos`.

        Several POS tags are separated by ':'; only the first one is kept.
        A tag without ':' is returned unchanged.
        """
        head, _, _ = pos.partition(':')
        return head
Exemplo n.º 2
0
class KuWaruPOSMapper:
    """Map POS tags of the Ku Waru corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/ku_waru/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/ku_waru/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos` from the table selected by `ud`."""
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return cls.infer_pos(pos, table)

    @classmethod
    def infer_pos(cls, pos, pos_dict):
        """Classify affixes directly, otherwise look the tag up.

        Args:
            pos (str): The raw POS tag.
            pos_dict: Lookup table queried with `.get`.

        Returns:
            str: 'sfx' for suffixes, 'pfx' for prefixes, otherwise the table
            entry for the cleaned tag, or '' if there is none.
        """
        # The suffix check comes first, so a tag with a delimiter on both
        # sides counts as a suffix.
        if cls.is_suffix(pos):
            return 'sfx'
        if cls.is_prefix(pos):
            return 'pfx'

        cleaned = ToolboxMorphemeCleaner.clean(pos)
        return pos_dict.get(cleaned, '')

    @staticmethod
    def is_suffix(pos):
        """Return True if `pos` begins with '-' or '='."""
        return pos.startswith(('-', '='))

    @staticmethod
    def is_prefix(pos):
        """Return True if `pos` ends with '-' or '='."""
        return pos.endswith(('-', '='))
Exemplo n.º 3
0
class NungonPOSMapper:
    """Map POS tags of the Nungon corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/nungon/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/nungon/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        Leading question marks are stripped before the lookup.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        key = cls.remove_question_mark(pos)
        return table.get(key, '')

    @staticmethod
    def remove_question_mark(morpheme):
        """Strip all leading question marks from the morpheme.

        Question marks may mark uncertain annotations. They are prefixed to
        the morpheme, so only the left edge is stripped.
        """
        stripped = morpheme.lstrip('?')
        return stripped
Exemplo n.º 4
0
class JapaneseMiyataPOSMapper:
    """Map POS tags of the Japanese Miyata corpus via its `pos.csv`."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/japanese_miyata/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/japanese_miyata/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        Colons in `pos` are turned into dots before the lookup.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cls.clean_pos(pos), '')

    @classmethod
    def replace_colon_by_dot_pos(cls, pos):
        """Return `pos` with every ':' replaced by '.'."""
        return '.'.join(pos.split(':'))

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` for the lookup (colon-to-dot replacement only)."""
        return cls.replace_colon_by_dot_pos(pos)
Exemplo n.º 5
0
class QaqetPOSMapper:
    """Map POS tags of the Qaqet corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/qaqet/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/qaqet/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Classify affixes directly, otherwise look the tag up.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Look the tag up in `pos_ud_dict` instead of `pos_dict`.

        Returns:
            str: 'sfx' if `pos` starts with '-' or '=', 'pfx' if it ends with
            one of them, otherwise the table entry for the cleaned tag, or ''
            if there is none.
        """
        # A leading delimiter attaches the morpheme to what precedes it.
        if pos.startswith(('-', '=')):
            return 'sfx'
        # A trailing delimiter attaches it to what follows, i.e. a prefix.
        # This branch used to return 'sfx' as well, which made it
        # indistinguishable from the branch above.
        if pos.endswith(('-', '=')):
            return 'pfx'

        pos = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')

    @classmethod
    def clean_pos(cls, pos):
        """Unify unknown markers, then remove morpheme delimiters."""
        pos = cls.unify_unknowns_morpheme(pos)
        pos = ToolboxMorphemeCleaner.remove_morpheme_delimiters(pos)
        return pos

    @classmethod
    def unify_unknowns_morpheme(cls, morpheme):
        """Replace every unknown marker in `morpheme` by '???'.

        Matched markers are: a run of 'x' starting at a word boundary,
        two question marks, or three asterisks.
        """
        unknown_re = re.compile(r'\bx+|\?{2}|\*{3}')
        return unknown_re.sub('???', morpheme)
Exemplo n.º 6
0
class RussianPOSMapper:
    """Map POS tags of the Russian corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/russian/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/russian/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        The tag is looked up as-is, without any cleaning.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
Exemplo n.º 7
0
class JapaneseMiiProPOSMapper:
    """Map POS tags of the Japanese MiiPro corpus via its `pos.csv`."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(
        get_full_path(
            'parsers/corpora/main/japanese_miipro/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        The tag is looked up as-is, without any cleaning.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')
Exemplo n.º 8
0
class ChintangPOSMapper:
    """Map POS tags of the Chintang corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/chintang/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/chintang/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Classify affixes directly, otherwise look the tag up.

        Returns:
            str: 'sfx' if `pos` starts with '-', 'pfx' if it ends with '-',
            otherwise the table entry for the cleaned tag, or '' if there is
            none.
        """
        # The suffix check comes first, so '-x-' counts as a suffix.
        if pos.startswith('-'):
            return 'sfx'
        if pos.endswith('-'):
            return 'pfx'

        table = cls.pos_ud_dict if ud else cls.pos_dict
        cleaned = ToolboxMorphemeCleaner.clean(pos)
        return table.get(cleaned, '')
Exemplo n.º 9
0
class TuatschinPOSMapper:
    """Map POS tags of the Tuatschin corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/tuatschin/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/tuatschin/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        Specifications (`_...` parts) are removed before the lookup.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cls.clean_pos(pos), '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` for the lookup (specification removal only)."""
        return cls.remove_specifications(pos)

    @staticmethod
    def remove_specifications(pos):
        """Strip every specification from the POS tag.

        A specification is an underscore followed by one or more
        non-underscore characters.

        Examples:
        - words erroneously written apart: _cont
        - child forms: _Chld
        - discourse particles: _Discpart
        ...
        """
        return re.sub(r'_[^_]+', '', pos)
Exemplo n.º 10
0
class InuktitutPOSMapper:
    """Map POS tags of the Inuktitut corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(get_full_path(
        'parsers/corpora/main/inuktitut/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(get_full_path(
        'parsers/corpora/main/inuktitut/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        Pipes in `pos` are turned into dots before the lookup.
        """
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(cls.clean_pos(pos), '')

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` for the lookup (separator replacement only)."""
        cleaned = cls.replace_pos_separator(pos)
        return cleaned

    @staticmethod
    def replace_pos_separator(pos):
        """Turn the pipe separator between POS tags into a dot.

        A morpheme may carry several POS tags separated by a pipe, where
        tags to the right are subcategories of the tags to the left.

        Args:
            pos (str): The POS tag.

        Returns:
            str: The POS tag with every '|' replaced by '.'.
        """
        return '.'.join(pos.split('|'))
Exemplo n.º 11
0
class CreePOSMapper:
    """Map POS tags of the Cree corpus via its `pos.csv` resource."""

    # Both lookup tables are built once, when the class body is executed.
    pos_dict = parse_csv(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    # Table used when `ud=True` (presumably Universal Dependencies tags;
    # confirm against `parse_pos_ud`).
    pos_ud_dict = parse_pos_ud(
        get_full_path('parsers/corpora/main/cree/resources/pos.csv'))

    @classmethod
    def map(cls, pos, ud=False):
        """Return the mapped tag for `pos`, or '' if it is not in the table.

        Args:
            pos (str): The raw POS tag.
            ud (bool): Look the tag up in `pos_ud_dict` instead of `pos_dict`.
        """
        pos = cls.clean_pos(pos)
        table = cls.pos_ud_dict if ud else cls.pos_dict
        return table.get(pos, '')

    @staticmethod
    def uppercase_pos_in_parentheses(pos):
        """Uppercase POS tags in parentheses.

        Parentheses indicate covert grammatical categories.

        Args:
            pos (str): The POS tag.

        Returns:
            str: `pos` with the text inside every parenthesized,
            whitespace-free span uppercased; unchanged if there is none.
        """
        pos_in_parentheses_regex = re.compile(r'(\()(\S+)(\))')

        def uppercase_match(match):
            # Rebuild each match from its own groups so that every
            # parenthesized span keeps its own content.
            return match.group(1) + match.group(2).upper() + match.group(3)

        # A replacement function is used instead of a formatted template:
        # in a template, content starting with a digit (e.g. '(3sg)') would
        # merge with the '\1' backreference into an invalid group number,
        # and backslashes in the content would be read as escapes.
        return pos_in_parentheses_regex.sub(uppercase_match, pos)

    @classmethod
    def clean_pos(cls, pos):
        """Normalize `pos` for the lookup (parenthesis uppercasing only)."""
        return cls.uppercase_pos_in_parentheses(pos)