Exemplo n.º 1
0
 def __init__(self):
     """Set up the resources this instance works with.

     Populates, in order: the curated list, the Arabic stop-word set, a
     morphological analyzer over the default built-in database, the
     sentence-segmentation pattern and the ``DotChar`` placeholder.
     """
     # Produced by this class's own loadCuratedList(); its contents are not
     # visible from here.
     self.CuratedList = self.loadCuratedList()
     # Set built from the 'arabic' stop-word list (presumably NLTK's
     # stopwords corpus -- confirm against the file's imports).
     self.stop_words = set(stopwords.words('arabic'))
     # Note: despite the attribute name, this holds an Analyzer instance
     # created over the default built-in morphology database.
     self.arStemmer = Analyzer(MorphologyDB.builtin_db())
     # Produced by this class's own loadSentSegmentationList().
     self.sentSegRegexPattern = self.loadSentSegmentationList()
     # Stored as an underscore; presumably substituted for dots during
     # sentence segmentation -- TODO confirm against the methods using it.
     self.DotChar = '_'
Exemplo n.º 2
0
    def pretrained(model_name='msa',
                   top=1,
                   use_gpu=True,
                   batch_size=32,
                   cache_size=10000):
        """Build a disambiguator from a pre-trained model shipped with
        camel_tools.

        The model is looked up in the catalogue under
        'DisambigBertUnfactored' and configured from the
        'default_config.json' file found in the model's directory.

        Args:
            model_name (:obj:`str`, optional): Name of the pre-trained model
                to load. Three models are available: 'msa', 'egy', and
                'glf'. Defaults to 'msa'.
            top (:obj:`int`, optional): The maximum number of top analyses to
                return. Defaults to 1.
            use_gpu (:obj:`bool`, optional): Whether to use a GPU.
                Defaults to True.
            batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
            cache_size (:obj:`int`, optional): Cache size handed to the
                analyzer. If greater than zero, the analyzer caches the
                analyses of the cache_size most frequent words, otherwise
                no analyses are cached. Defaults to 10000.

        Returns:
            :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
            pre-trained model.
        """

        entry = CATALOGUE.get_dataset('DisambigBertUnfactored', model_name)
        config = _read_json(Path(entry.path, 'default_config.json'))
        weights_dir = str(entry.path)
        feature_set = FEATURE_SET_MAP[config['feature']]

        # The database is opened with the 'a' flag for the analyzer.
        morph_db = MorphologyDB.builtin_db(config['db_name'], 'a')
        morph_analyzer = Analyzer(morph_db,
                                  backoff=config['backoff'],
                                  cache_size=cache_size)

        # Remaining settings come partly from the caller and partly from the
        # model's own configuration file.
        options = {
            'top': top,
            'features': feature_set,
            'scorer': config['scorer'],
            'tie_breaker': config['tie_breaker'],
            'use_gpu': use_gpu,
            'batch_size': batch_size,
            'ranking_cache': config['ranking_cache'],
        }
        return BERTUnfactoredDisambiguator(weights_dir,
                                           morph_analyzer,
                                           **options)
Exemplo n.º 3
0
    def pretrained_from_config(config,
                               top=1,
                               use_gpu=True,
                               batch_size=32,
                               cache_size=10000):
        """Build a disambiguator from a model described by a config file.

        The config is read as JSON and must provide the keys 'model_path',
        'feature', 'db_path', 'backoff', 'scorer', 'tie_breaker' and
        'ranking_cache'.

        Args:
            config (:obj:`str`): Config file that defines the model details.
                This argument is required.
            top (:obj:`int`, optional): The maximum number of top analyses
                to return. Defaults to 1.
            use_gpu (:obj:`bool`, optional): Whether to use a GPU.
                Defaults to True.
            batch_size (:obj:`int`, optional): The batch size.
                Defaults to 32.
            cache_size (:obj:`int`, optional): Cache size handed to the
                analyzer. If greater than zero, the analyzer caches the
                analyses of the cache_size most frequent words, otherwise
                no analyses are cached. Defaults to 10000.

        Returns:
            :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
            pre-trained model.
        """

        settings = _read_json(config)
        weights_dir = settings['model_path']
        feature_set = FEATURE_SET_MAP[settings['feature']]

        # Unlike a built-in database, this one is opened from the path named
        # in the config file (with the 'a' flag).
        morph_db = MorphologyDB(settings['db_path'], 'a')
        morph_analyzer = Analyzer(morph_db,
                                  backoff=settings['backoff'],
                                  cache_size=cache_size)

        options = {
            'top': top,
            'features': feature_set,
            'scorer': settings['scorer'],
            'tie_breaker': settings['tie_breaker'],
            'use_gpu': use_gpu,
            'batch_size': batch_size,
            'ranking_cache': settings['ranking_cache'],
        }
        return BERTUnfactoredDisambiguator(weights_dir,
                                           morph_analyzer,
                                           **options)
Exemplo n.º 4
0
def load(lang, nlp=None):
    """Create a configured ERRANT annotator for the given language.

    Args:
        lang: Language code; only "en" and "ar" are accepted.
        nlp: For "en", an already loaded spaCy pipeline to reuse; when it is
            falsy, ``spacy.load(lang, disable=["ner"])`` is called instead.
            For "ar" the passed value is ignored and replaced by a
            two-element list ``[analyzer, tagger]``.

    Returns:
        An ``Annotator`` built from the language, the ``nlp`` object and the
        language-specific merger and classifier modules.

    Raises:
        Exception: If ``lang`` is not one of the supported languages.
    """
    # Reject anything other than the two supported languages up front.
    if lang not in {"en", "ar"}:
        raise Exception("%s is an unsupported or unknown language" % lang)

    english = lang == "en"

    if english:
        # English relies on spaCy; reuse the caller's pipeline if given.
        nlp = nlp or spacy.load(lang, disable=["ner"])
    else:
        # Arabic uses a morphological analyzer plus a POS tagger driven by
        # the pre-trained MLE disambiguator instead of a spaCy pipeline.
        morph_db = MorphologyDB.builtin_db()
        morph_analyzer = Analyzer(morph_db)
        disambiguator = MLEDisambiguator.pretrained()
        pos_tagger = DefaultTagger(disambiguator, 'pos')
        nlp = [morph_analyzer, pos_tagger]

    # Language-specific edit merger and edit classifier modules.
    merger = import_module("errant.%s.merger" % lang)
    classifier = import_module("errant.%s.classifier" % lang)

    if english:
        # Only the English classifier is handed the spaCy pipeline.
        classifier.nlp = nlp

    return Annotator(lang, nlp, merger, classifier)
Exemplo n.º 5
0
def _calima_egy_r13_analyzer():
    """Return an Analyzer over the built-in 'calima-egy-r13' database.

    The database is opened with the 'a' flag and the analyzer is created
    with 'NOAN_PROP' as its backoff argument.
    """
    return Analyzer(MorphologyDB.builtin_db('calima-egy-r13', 'a'),
                    'NOAN_PROP')
    if s_size > max_sentence:
        max_sentence = s_size

    sentence_size += s_size
    

    fd.close()
    


print(min_sentence, max_sentence, sentence_size/len(sentences))

# Extract Morphological properties of every word from corpus

# Open the default built-in morphology database and create an analyzer over
# it; no backoff mode is passed here.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

# Alternative kept for reference: create the analyzer with NOAN_PROP backoff.
# analyzer = Analyzer(db, 'NOAN_PROP')

# Presumably filled by the per-sentence loop that follows -- TODO confirm,
# the loop body is not fully visible here.
training_set = []

for sentence in sentences:
    s = []
    for word in sentence:
        
        analyses = analyzer.analyze(word['INPUT STRING'])
        # print(word, analyses)
        for d in analyses:
            # print(get_tag(d['bw']) == sentences[0][0]['POS'])
Exemplo n.º 7
0
def main():  # pragma: no cover
    """Run the morphology command-line tool.

    Parses the command line with docopt, validates the backoff mode, opens
    the input/output files, loads the requested morphology database and then
    runs the analyze, generate or reinflect mode.

    The function never returns normally: it always leaves through
    ``sys.exit``. The status is 0 once the selected mode has finished (or
    when no mode flag is set) and 1 after the database listing, after any
    reported error and on keyboard interrupt.
    """

    def _die(message):
        # Report the problem on stderr and stop with a failure status.
        sys.stderr.write(message)
        sys.exit(1)

    try:
        arguments = docopt(__doc__,
                           version='CAMeL Tools v{}'.format(__version__))

        if arguments.get('--list', False):
            _list_dbs()
            # NOTE(review): the listing exits with status 1 even though it
            # succeeded -- confirm whether 0 was intended.
            sys.exit(1)

        analyze = arguments.get('analyze', False)
        generate = arguments.get('generate', False)
        reinflect = arguments.get('reinflect', False)
        cache = arguments.get('--cache', False)

        # A missing backoff option is treated as 'NONE'; the accepted values
        # depend on the selected mode.
        backoff = arguments.get('--backoff', 'NONE')
        if backoff is None:
            backoff = 'NONE'
        backoff_rejected = (
            (analyze and backoff not in _ANALYSIS_BACKOFFS) or
            (generate and backoff not in _GENARATION_BACKOFFS))
        if backoff_rejected:
            _die('Error: invalid backoff mode.\n')

        # Open files (or just use stdin and stdout)
        fin, fout = _open_files(arguments['FILE'], arguments['--output'])

        # Database flags: 'a' for analysis, 'g' for generation without
        # backoff, 'r' for everything else.
        dbflags = ('a' if analyze
                   else 'g' if generate and backoff == 'NONE'
                   else 'r')

        # Built-in databases are loaded by name, anything else is handed to
        # the MorphologyDB constructor directly.
        try:
            dbname = arguments.get('--db', _DEFAULT_DB)
            open_db = (MorphologyDB.builtin_db if dbname in _BUILTIN_DBS
                       else MorphologyDB)
            db = open_db(dbname, dbflags)
        except DatabaseError:
            _die('Error: Couldn\'t parse database.\n')
        except IOError:
            _die('Error: Database file could not be read.\n')

        # Continue execution in requested mode
        if analyze:
            try:
                _analyze(db, fin, fout, backoff, cache)
            except AnalyzerError as error:
                _die('Error: {}\n'.format(error.msg))
            except IOError:
                _die('Error: An IO error occurred.\n')
        elif generate:
            try:
                _generate(db, fin, fout, backoff)
            except IOError:
                _die('Error: An IO error occurred.\n')
        elif reinflect:
            try:
                _reinflect(db, fin, fout)
            except IOError:
                _die('Error: An IO error occurred.\n')

        sys.exit(0)

    except KeyboardInterrupt:
        _die('Exiting...\n')
    except Exception:
        # Last-resort boundary: SystemExit raised above is not an Exception
        # subclass and therefore passes through untouched.
        _die('Error: An unknown error occurred.\n')
Exemplo n.º 8
0
def _list_dbs():
    """Write one 'name<TAB>version' line per built-in database to stdout.

    The entries returned by ``MorphologyDB.list_builtin_dbs()`` are written
    in sorted order.
    """
    entries = sorted(MorphologyDB.list_builtin_dbs())
    for entry in entries:
        line = '{}\t{}\n'.format(entry.name, entry.version)
        sys.stdout.write(line)
Exemplo n.º 9
0
import camel_tools as camelt
from camel_tools.utils.charsets import AR_DIAC_CHARSET
from camel_tools.utils.stringutils import force_unicode, force_encoding
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.generator import Generator
from camel_tools.morphology.reinflector import Reinflector
from camel_tools.morphology.errors import DatabaseError, AnalyzerError
from camel_tools.morphology.errors import GeneratorError, MorphologyError

# The tool reports the camel_tools package version as its own.
__version__ = camelt.__version__

# Names of the backoff modes accepted as valid for analysis.
_ANALYSIS_BACKOFFS = frozenset(
    ('NONE', 'NOAN_ALL', 'NOAN_PROP', 'ADD_ALL', 'ADD_PROP'))
# Names of the backoff modes accepted as valid for generation.
# NOTE: "GENARATION" is a misspelling of "GENERATION"; the identifier is left
# unchanged because it may be referenced elsewhere under this exact name.
_GENARATION_BACKOFFS = frozenset(('NONE', 'REINFLECT'))
# Names (the ``name`` attribute) of every built-in database entry.
_BUILTIN_DBS = frozenset([db.name for db in MorphologyDB.list_builtin_dbs()])
# Presumably the database used when none is requested -- confirm at the
# point of use.
_DEFAULT_DB = 'calima-msa-r13'

# Character class matching any single character of AR_DIAC_CHARSET; the
# characters are escaped so none of them is read as regex syntax.
_DIAC_RE = re.compile(r'[' + re.escape(u''.join(AR_DIAC_CHARSET)) + r']')


def _tokenize(s):
    return s.split()


def _dediac(word):
    """Return *word* with every match of ``_DIAC_RE`` removed."""
    stripped = _DIAC_RE.sub('', word)
    return stripped


def _to_int(s):
    s = str(s)
Exemplo n.º 10
0
"""

import re
import sys

from docopt import docopt

import camel_tools
from camel_tools.morphology.database import MorphologyDB
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.cli.utils import open_files

# The tool reports the camel_tools package version as its own.
__version__ = camel_tools.__version__

# Built-in database entries exactly as returned by the library (the entries
# themselves, not reduced to their names).
_BUILTIN_DBS = MorphologyDB.list_builtin_dbs()
# Presumably the database used when none is requested -- confirm at the
# point of use.
_DEFAULT_DB = 'calima-msa-r13'
# Matches either a run of whitespace or a run of non-whitespace characters,
# so, despite its name, it matches every part of a string and not only the
# whitespace.
_WHITESPACE_RE = re.compile(r'\s+|\S+')


def _diac_tokens(tokens, disambig, ignore_markers, marker, strip_markers,
                 pretokenized):
    result = []

    for token in tokens:
        if len(token.strip()) == 0:
            result.append(token)
        elif ignore_markers and token.startswith(marker):
            if strip_markers:
                result.append(token[len(marker):])
            else: