Пример #1
0
    def __init__(self, db):
        """Initialize the reinflector from a morphological database.

        Arguments:
            db: Database to analyze/generate with; must be a
                ``CalimaStarDB`` whose generation flag is set.

        Raises:
            ReinflectorError: If ``db`` is not a ``CalimaStarDB`` instance
                or does not support generation (required for reinflection).
        """
        if not isinstance(db, CalimaStarDB):
            raise ReinflectorError('DB is not an instance of CalimaStarDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')

        self._db = db

        # Reinflection pipeline: analyze the input word, then re-generate
        # from the analysis with the requested features substituted.
        self._analyzer = CalimaStarAnalyzer(db)
        self._generator = CalimaStarGenerator(db)
Пример #2
0
    def __init__(self, grammar, separator, min_base_length, dialects):
        """Configure the analyzer.

        Arguments:
            grammar: Path to a grammar database, or 'built-in' for the
                free bundled one.
            separator: Clitic separator character used in segmentations.
            min_base_length: Minimum length for a predicted base token.
            dialects: Dialects (subgrammars) whose analyses are accepted.
        """
        self.separator = separator
        self.grammar_file = grammar
        self.min_base_length = min_base_length
        ### Dialects are represented as subgrammars that are merged to form the full grammar
        ## Each analysis produced by a grammar should report the subgrammar generating it
        ## These can then be pruned if they do not appear in the list of desired dialects here
        self.dialects = dialects
        ### The free built-in grammar database doesn't distinguish bases
        ## If you use this grammar, here's a cheap hack to predict the base token
        if self.grammar_file == 'built-in' or 'built-in' in self.dialects:
            ### The free grammar only supports the MSA variety of Arabic
            self.dialects = ['built-in', 'MSA']
            self.grammar = CalimaStarDB(os.path.join(DESEG_DIR, 'grammar.db'),
                                        'a')
            ### Order of tags used to predict which token belongs to base when multiple tags occur
            ## I did this in 5 minutes as proof of concept.. the order could be improved
            ## If you really want good results on MSA, consider buying the Sama database
            ## BUG FIX: a missing comma after 'PRON_2D' used to fuse it with
            ## 'PRON_2MS' into the bogus tag 'PRON_2DPRON_2MS', so neither tag
            ## matched at that priority. Some tags still appear twice below;
            ## only the first occurrence matters for the priority scan.
            self.open_classes_hierarchy = [
                'NOUN', 'ADJ', 'VERB', 'IV', 'PV', 'CV', 'ADV', 'NOUN_PROP',
                'IV_PASS', 'PV_PASS', 'VERB_PART', 'FOREIGN', 'PSEUDO_VERB',
                'FOCUS_PART', 'REL_ADV', 'ABBREV', 'PART', 'INTERROG_PRON',
                'REL_PRON', 'NOUN_QUANT', 'PRON_3MS', 'PRON_3MP', 'PRON_3D',
                'PRON_2D',
                'PRON_2MS', 'PRON_2FS', 'PRON_1S', 'PRON_2MS', 'PRON_2MP',
                'PRON_3FS', 'PRON_3FP', 'PRON_2D', 'PRON_1P', 'DEM_PRON_FP',
                'DEM_PRON_MP', 'DEM_PRON_MS', 'DEM_PRON', 'DEM_PRON_F',
                'DEM_PRON_FD', 'DEM_PRON_MD', 'DEM_PRON_FS', 'FUT_PART',
                'NEG_PART', 'VOC_PART', 'NOUN_NUM', 'PREP', 'SUB_CONJ', 'CONJ',
                'INTERJ', 'INTERROG_ADV', 'INTERROG_PART', 'EXCLAM_PRON',
                'NUMERIC_COMMA', 'PUNC', 'DET'
            ]

        else:
            ### Try to load the specified grammar database in analyze mode
            try:
                self.grammar = CalimaStarDB(grammar, 'a')
            ### Resort to the free built-in grammar database if the specified database can't be found
            except FileNotFoundError:
                stderr.write(
                    '\nCould not locate grammar database "{}"\nLoading built-in database almor-msa\n'
                    .format(grammar))
                self.grammar = CalimaStarDB(
                    os.path.join(DESEG_DIR, 'grammar.db'), 'a')
                self.grammar_file = 'built-in'
                self.dialects = ['built-in', 'MSA']

        ### Run the analyzer in back-off mode, where input words can be any POS
        self.analyzer = CalimaStarAnalyzer(self.grammar, 'NOAN_ALL')
Пример #3
0
def _analyze(db, fin, fout, backoff, cache):
    """Analyze every token of *fin*, writing serialized analyses to *fout*.

    Each token's serialized analyses are followed by a blank line. When
    *cache* is truthy, serialized output is memoized per token so repeated
    tokens are not re-analyzed.
    """
    analyzer = CalimaStarAnalyzer(db, backoff)
    memo = {} if cache else None

    raw = force_unicode(fin.readline())

    while raw:
        # Defensive skip of empty reads; readline() normally yields at
        # least a newline until EOF, so this guard rarely fires.
        if len(raw) == 0:
            raw = force_unicode(fin.readline())
            continue

        for tok in _tokenize(raw.strip()):
            if cache and tok in memo:
                serialized = memo[tok]
            else:
                serialized = _serialize_analyses(fout, tok,
                                                 analyzer.analyze(tok),
                                                 db.order)
                if cache:
                    memo[tok] = serialized

            # Python 2 file objects want encoded bytes, not unicode.
            fout.write(serialized if six.PY3 else force_encoding(serialized))
            fout.write('\n\n')

        raw = force_unicode(fin.readline())
def _analyze(db, fin, fout, backoff, cache, num_disambig=None):
    """Analyze every token of *fin*, writing serialized analyses to *fout*.

    Arguments:
        db: Morphological database (provides ``order`` for serialization).
        fin: Input file object read line by line.
        fout: Output file object; each token's analyses are followed by a
            blank line.
        backoff: Back-off mode passed to the analyzer.
        cache: If truthy, build the analyzer with an internal cache.
        num_disambig: If not None, run an MLE disambiguator and keep only
            the top ``num_disambig`` analyses per token.
    """
    if cache:
        analyzer = CalimaStarAnalyzer(db, backoff, cache_size=1024)
    else:
        analyzer = CalimaStarAnalyzer(db, backoff)

    disambig = MLEDisambiguator(analyzer) if num_disambig is not None else None

    line = force_unicode(fin.readline())

    while line:
        # Defensive skip; readline() returns '' only at EOF, which already
        # terminates the loop, so this rarely fires.
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            # Fix: the analyzer used to be invoked unconditionally here and
            # then AGAIN in the else branch, doubling the work (and wasting
            # the first result entirely when disambiguation is enabled).
            # Analyze exactly once per token.
            if num_disambig is not None:
                dambg = disambig.disambiguate([token], num_disambig)
                analyses = [a.analysis for a in dambg[0].analyses]
            else:
                analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            # Python 2 file objects want encoded bytes, not unicode.
            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
Пример #5
0
from camel_tools.calima_star.database import CalimaStarDB
from camel_tools.calima_star.analyzer import CalimaStarAnalyzer
import camel_tools.utils

# Load the ALMOR-MSA database in analyze ('a') mode from a hard-coded
# site-packages path, then build an analyzer over it.
db = CalimaStarDB('/usr/local/lib/python3.7/site-packages/camel_tools/calima_star/databases/almor-msa/almor-msa-r13.db', 'a')
analyzer = CalimaStarAnalyzer(db)

# Sample Arabic word list — presumably plural forms sharing one pattern,
# used as test inputs for the analyzer below; TODO confirm intent.
PP4S = [
'فتية',
'صبية',
'غلمة',
'جيرة',
'إخوة',
'شيخة',
'ثيرة'
]

def isAVowel(char):
    """Return True if *char* is one of the Arabic letters treated as
    vowels here: ا, أ, ي, ى, و.

    NOTE(review): other hamza carriers (إ, آ, ؤ, ئ) and short-vowel
    diacritics are not included — presumably intentional; confirm
    against callers.
    """
    # Idiom fix: a single membership test replaces five sequential
    # if/return-True branches; behavior is identical.
    return char in ('ا', 'أ', 'ي', 'ى', 'و')

#أفعل
Пример #6
0
class CalimaStarReinflector(object):
    """CALIMA Star reinflector component.

    Arguments:
        db (:obj:`~camel_tools.calima_star.database.CalimaStarDB`): Database to
            use for generation. Must be opened in reinflection mode or both
            analysis and generation modes.

    Raises:
        :obj:`~camel_tools.calima_star.errors.ReinflectorError`: If **db** is
            not an instance of
            :obj:`~camel_tools.calima_star.database.CalimaStarDB` or if **db**
            does not support reinflection.
    """
    def __init__(self, db):
        if not isinstance(db, CalimaStarDB):
            raise ReinflectorError('DB is not an instance of CalimaStarDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')

        self._db = db

        # Reinflection pipeline: analyze the word, then re-generate from
        # each compatible analysis with the requested features applied.
        self._analyzer = CalimaStarAnalyzer(db)
        self._generator = CalimaStarGenerator(db)

    def reinflect(self, word, feats):
        """Generate analyses for a given word from a given set of inflectional
        features.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/calima_star_features` for more information
                on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses.
            See :doc:`/reference/calima_star_features` for more information on
            features and their values.

        Raises:
            :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeature`:
                If a feature is given that is not defined in database.
            :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeatureValue`:
                If an invalid value is given to a feature or if 'pos' feature
                is not defined.
        """

        analyses = self._analyzer.analyze(word)

        # Idiom fix: `not analyses or len(analyses) == 0` was redundant —
        # an empty list is already falsy.
        if not analyses:
            return []

        # Validate requested features against the database's definitions.
        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            elif self._db.defines[feat] is not None:
                if feat in _ANY_FEATS and feats[feat] == 'ANY':
                    continue
                elif feats[feat] not in self._db.defines[feat]:
                    raise InvalidReinflectorFeatureValue(feat, feats[feat])

        # Idiom fix: any() replaces the manual flag-and-break loop.
        has_clitics = any(feat in feats for feat in _CLITIC_FEATS)

        results = deque()

        for analysis in analyses:
            # Skip analyses whose dediacritized surface form differs from
            # the input word.
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue

            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            # Strip lemma-index suffix before comparison/generation.
            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

            if 'lex' in feats and feats['lex'] != lemma:
                continue

            is_valid = True
            generate_feats = {}

            # Merge requested features over the analysis's own features,
            # honoring the ignore/clitic exclusion lists.
            for feat in analysis.keys():
                if feat in _IGNORED_FEATS:
                    continue
                elif feat in _SPECIFIED_FEATS and feat not in feats:
                    continue
                elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                    continue
                else:
                    if feat in feats:
                        if feats[feat] == 'ANY':
                            continue
                        elif analysis[feat] != 'na':
                            generate_feats[feat] = feats[feat]
                        else:
                            # Requested feature not applicable ('na') for
                            # this analysis — drop the whole analysis.
                            is_valid = False
                            break
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = analysis[feat]

            if is_valid:
                generated = self._generator.generate(lemma, generate_feats)
                if generated is not None:
                    results.extend(generated)

        return list(results)
Пример #7
0
class Analyzer:
    """
    Analyzer that runs an input word through a de-lexical grammar.

    get_possible_segmentations() returns a list of candidate segmentations.
    NOTE(review): although the original docstring described triples, each
    candidate is actually a 4-item list:
        [ [proclitic tokens], base string, [enclitic tokens], dialect ]
    """
    def __init__(self, grammar, separator, min_base_length, dialects):
        """Configure the analyzer: grammar database path (or 'built-in'),
        clitic separator, minimum base-token length, accepted dialects."""
        self.separator = separator
        self.grammar_file = grammar
        self.min_base_length = min_base_length
        ### Dialects are represented as subgrammars that are merged to form the full grammar
        ## Each analysis produced by a grammar should report the subgrammar generating it
        ## These can then be pruned if they do not appear in the list of desired dialects here
        self.dialects = dialects
        ### The free built-in grammar database doesn't distinguish bases
        ## If you use this grammar, here's a cheap hack to predict the base token
        if self.grammar_file == 'built-in' or 'built-in' in self.dialects:
            ### The free grammar only supports the MSA variety of Arabic
            self.dialects = ['built-in', 'MSA']
            self.grammar = CalimaStarDB(os.path.join(DESEG_DIR, 'grammar.db'),
                                        'a')
            ### Order of tags used to predict which token belongs to base when multiple tags occur
            ## I did this in 5 minutes as proof of concept.. the order could be improved
            ## If you really want good results on MSA, consider buying the Sama database
            ## BUG FIX: a missing comma after 'PRON_2D' used to fuse it with
            ## 'PRON_2MS' into the bogus tag 'PRON_2DPRON_2MS'. Some tags
            ## appear twice; only the first occurrence matters for the scan.
            self.open_classes_hierarchy = [
                'NOUN', 'ADJ', 'VERB', 'IV', 'PV', 'CV', 'ADV', 'NOUN_PROP',
                'IV_PASS', 'PV_PASS', 'VERB_PART', 'FOREIGN', 'PSEUDO_VERB',
                'FOCUS_PART', 'REL_ADV', 'ABBREV', 'PART', 'INTERROG_PRON',
                'REL_PRON', 'NOUN_QUANT', 'PRON_3MS', 'PRON_3MP', 'PRON_3D',
                'PRON_2D',
                'PRON_2MS', 'PRON_2FS', 'PRON_1S', 'PRON_2MS', 'PRON_2MP',
                'PRON_3FS', 'PRON_3FP', 'PRON_2D', 'PRON_1P', 'DEM_PRON_FP',
                'DEM_PRON_MP', 'DEM_PRON_MS', 'DEM_PRON', 'DEM_PRON_F',
                'DEM_PRON_FD', 'DEM_PRON_MD', 'DEM_PRON_FS', 'FUT_PART',
                'NEG_PART', 'VOC_PART', 'NOUN_NUM', 'PREP', 'SUB_CONJ', 'CONJ',
                'INTERJ', 'INTERROG_ADV', 'INTERROG_PART', 'EXCLAM_PRON',
                'NUMERIC_COMMA', 'PUNC', 'DET'
            ]

        else:
            ### Try to load the specified grammar database in analyze mode
            try:
                self.grammar = CalimaStarDB(grammar, 'a')
            ### Resort to the free built-in grammar database if the specified database can't be found
            except FileNotFoundError:
                stderr.write(
                    '\nCould not locate grammar database "{}"\nLoading built-in database almor-msa\n'
                    .format(grammar))
                self.grammar = CalimaStarDB(
                    os.path.join(DESEG_DIR, 'grammar.db'), 'a')
                self.grammar_file = 'built-in'
                self.dialects = ['built-in', 'MSA']

        ### Run the analyzer in back-off mode, where input words can be any POS
        self.analyzer = CalimaStarAnalyzer(self.grammar, 'NOAN_ALL')

    def get_possible_segmentations(self, word):
        """Return candidate segmentations of *word* as
        [[proclitics], base, [enclitics], dialect] lists."""
        ### Assumes input word is already normalized if necessary
        possible_segmentations = []
        min_base_length = min(len(word), self.min_base_length)

        ### Run the analyzer
        try:
            analyses = self.analyzer.analyze(word)
            completed_analyses = {}

            ### Parse each analysis
            for analysis in analyses:

                ### Check the subgrammar that produced it
                dialect = self.get_analysis_dialect(analysis)
                if dialect in self.dialects:

                    possible_segmentation = [[], None, [], dialect]
                    ### Parse free built-in Almor grammar analysis
                    if 'built-in' in self.dialects:
                        analysis = dediacritize_normalize(
                            self.accomodate_built_in_grammar(word, analysis))
                    ### Parse non-standard dialect subgrammar analysis
                    elif dialect != 'MSA':
                        analysis = dediacritize_normalize(
                            self.accomodate_DA_grammar(word, analysis))
                    ### Parse Sama MSA grammar analysis
                    else:
                        analysis = dediacritize_normalize(
                            analysis.get('d3seg', None))

                    ### If no analysis, default to the entire word as the base
                    # (idiom fix: `is None` instead of `== None`)
                    if analysis is None:
                        possible_segmentation[1] = word
                        possible_segmentations.append(possible_segmentation)
                        break

                    ### Make sure no segmentations leak into the segmentations
                    ## (our grammars are adapted from databases designed for segmentation)
                    if (analysis, dialect) not in completed_analyses:
                        completed_analyses[(analysis, dialect)] = True
                        cat_tok = analysis.replace('+', '').replace('_', '')
                        if cat_tok == word:

                            ### Separate tokens
                            analysis = analysis.split('_')
                            ### Handle words entirely consisting of diacritics
                            if len(analysis) == 0:
                                possible_segmentation[1] = word
                            ### For non-empty words
                            else:
                                ### Only consider tokens consisting of more than just diacritics
                                all_tokens_empty = True
                                for token in analysis:
                                    if len(token.strip(self.separator)) != 0:
                                        all_tokens_empty = False

                                        ### handle proclitics
                                        if self.separator == token[-1]:
                                            possible_segmentation[0].append(
                                                token)
                                        ### handle enclitics
                                        elif self.separator == token[0]:
                                            possible_segmentation[2].append(
                                                token)
                                        ### handle base
                                        else:
                                            possible_segmentation[1] = token
                                ### Finish handling words entirely consisting of diacritics
                                if all_tokens_empty:
                                    possible_segmentation[1] = word

                            ### Prune ill-formed bases
                            base = possible_segmentation[1]
                            if base is not None and len(
                                    base
                            ) >= min_base_length and possible_segmentation not in possible_segmentations:  # and base in self.vocabulary
                                possible_segmentations.append(
                                    possible_segmentation)

        ### If inconsistency in the database, word will be the base with no clitics
        except KeyError:
            possible_segmentation = [[], word, [], 'MSA']
            possible_segmentations.append(possible_segmentation)
            stderr.write(
                '\nGrammar database key error for {}\nUsing default segmentation analysis {}\n'
                .format(word, str(possible_segmentations)))

        ### And if no reasonable analyses are produced, default base is the word with no clitics
        if len(possible_segmentations) == 0:
            possible_segmentations = [[[], word, [], 'MSA']]

        return possible_segmentations

    def accomodate_DA_grammar(self, word, analysis):
        """Rebuild a '#'-delimited DA diac analysis into the token string
        format used downstream ('pro+_', base, '_+en')."""
        ### DA doesn't give D3tok so we need to parse diac
        analysis_seg = analysis['diac'].replace('_', '+').split('#')

        # Malformed analyses fall back to the whole word as the base.
        if len(analysis_seg) != 3:
            stderr.write('Bad Analysis!!!\n\t{}\n{}\n{}\n\n'.format(
                str(analysis_seg), str(word), str(analysis)))
            analysis_seg = ['', word, '']

        tokens = []

        proclitics = analysis_seg[0].split('+')
        for pro in proclitics:
            tokens.append('{}+_'.format(pro))

        tokens.append(analysis_seg[1])

        enclitics = analysis_seg[2].split('+')
        for en in enclitics:
            tokens.append('_+{}'.format(en))

        return ''.join(tokens)

    def accomodate_built_in_grammar(self, word, analysis):
        """Predict the base token of an Almor BW analysis using the
        open_classes_hierarchy, returning the downstream token string."""
        ### Almor doesn't give D3tok so we need to parse BW
        analysis = analysis['bw'].replace('+', '/').strip('/').split('/')

        # First hierarchy tag present in the analysis wins as the base.
        open_class_tag = None
        for open_class in self.open_classes_hierarchy:
            if open_class in analysis:
                open_class_tag = open_class
                break

        # Robustness fix: `assert` wrapped in a bare `except:` was used for
        # validation — asserts vanish under `python -O` and the bare except
        # swallowed unrelated errors. Explicit checks preserve the original
        # stderr-and-exit behavior.
        if open_class_tag is None:
            stderr.write(
                'Could not find a base token!\nPlease add the problematic tag to the open_classes_hierarchy in the greedy_analyzer.py'
            )
            stderr.write('{}\n'.format(word))
            stderr.write('{}\n'.format(str(analysis)))
            stderr.write('{}\n'.format(str(self.open_classes_hierarchy)))
            exit()

        # BW analyses alternate form/tag pairs, so the length must be even.
        if len(analysis) % 2 != 0:
            stderr.write('Malformed analysis!\n')
            stderr.write('{}\n'.format(word))
            stderr.write('{}\n'.format(str(analysis)))
            exit()

        tokens = []
        pro = True
        for m in range(0, len(analysis), 2):
            token = dediacritize_normalize(analysis[m])
            if len(token) > 0:
                # Everything before the base tag is a proclitic, everything
                # after it an enclitic.
                if pro and analysis[m + 1] == open_class_tag:
                    pro = False
                    tokens.append('{}'.format(token))
                else:
                    if pro:
                        tokens.append('{}+_'.format(token))
                    else:
                        tokens.append('_+{}'.format(token))

        return ''.join(tokens)

    def get_analysis_dialect(self, analysis_dict):
        """Return the dialect code of the subgrammar that produced the
        analysis, or None if it cannot be determined from the gloss."""
        if 'built-in' in self.dialects:
            return 'MSA'
        else:
            dialect = DIALECT_RE.findall(analysis_dict['gloss'])
            if len(dialect) == 0:
                return None
            else:
                # Strip the surrounding markup, keeping the 3-letter code.
                return dialect[0][1:4]
Пример #8
0
 def __init__(self, analyzer_db_path):
     """Load a CALIMA Star database and build an analyzer plus an MLE
     disambiguator on top of it.

     Arguments:
         analyzer_db_path: Path to the morphological database file.
     """
     self.db = CalimaStarDB(analyzer_db_path)
     # Large analyzer cache: memoizes analyses for up to 46000 words.
     self.analyzer = CalimaStarAnalyzer(self.db, cache_size=46000)
     self.disambiguator = MLEDisambiguator(self.analyzer)
     # word -> features mapping; presumably filled lazily by code outside
     # this view — confirm against callers.
     self.w_to_features = {}