示例#1
0
def main():  # pragma: no cover
    """Entry point of the ``camel_data`` command-line tool.

    Parses the command line with docopt, then either prints the list of
    available downloads (``--list``) or downloads the package named by
    ``<PACKAGE>``.

    Exits with status 0 after listing packages, and with status 1 when the
    package lookup fails, the download fails, the user interrupts the
    program, or any other unexpected error occurs.
    """
    try:
        version = 'CAMeL Tools v{}'.format(__version__)
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for dl in DataCatalogue.downloads_list():
                print("{}\t{}\t{}".format(dl.name, dl.size, dl.description))
            sys.exit(0)

        package_name = arguments.get('<PACKAGE>', None)

        try:
            dl_info = DataCatalogue.get_download_info(package_name)
        except Exception:
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit are not misreported as an invalid package name; an
            # interrupt falls through to the dedicated handler below.
            sys.stderr.write('Error: Invalid package name. Run `camel_data -l`'
                             ' to get a list of available packages.\n')
            sys.exit(1)

        try:
            DataDownloader.download(dl_info)
        except DownloaderError as e:
            sys.stderr.write('Error: {}\n'.format(e.msg))
            sys.exit(1)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)

    except Exception:
        # Top-level boundary: SystemExit is not an Exception subclass, so the
        # sys.exit() calls above pass through this handler untouched.
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
示例#2
0
    def pretrained(model_name=None, analyzer=None, top=1, cache_size=100000):
        """Build an MLE disambiguator from a pre-trained model shipped with
        CAMeL Tools.

        Args:
            model_name (:obj:`str`, optional): Name of the pre-trained model
                to load. When None, the default model ('calima-msa-r13') is
                used. Currently the available model names match those listed
                in :ref:`camel_morphology_dbs`. Defaults to None.
            analyzer (:obj:`Analyzer`, optional): Analyzer to use instead of
                the model's default one. When None, the default analyzer
                registered for the selected model is instantiated.
                Defaults to None.
            top (:obj:`int`, optional): Upper bound on the number of top
                analyses returned. Defaults to 1.
            cache_size (:obj:`int`, optional): How many unique word
                disambiguations are cached. The cache evicts entries using a
                least-frequently-used policy. Defaults to 100000.

        Returns:
            :obj:`MLEDisambiguator`: The loaded MLE disambiguator.
        """

        info = DataCatalogue.get_dataset_info('DisambigMLE', model_name)
        model_file = info.path / 'model.json'

        # No analyzer supplied: look up the factory registered under the
        # resolved model name and instantiate it.
        if analyzer is None:
            make_analyzer = _MLE_ANALYZER_MAP[info.name]
            analyzer = make_analyzer()

        return MLEDisambiguator(analyzer, str(model_file), top, cache_size)
示例#3
0
    def list_builtin_dbs():
        """List the builtin morphology databases provided with CAMeL Tools.

        Returns:
            :obj:`list` of :obj:`~camel_tools.data.DatasetInfo`: One entry per
            dataset of the 'MorphologyDB' catalogue component.
        """

        component = DataCatalogue.get_component_info('MorphologyDB')
        # Materialise the component's datasets into a new list for the caller.
        return list(component.datasets)
示例#4
0
    def pretrained(model_name=None):
        """Create a recognizer from a pre-trained model shipped with
        camel_tools.

        Args:
            model_name (:obj:`str`, optional): Name of the pre-trained model
                to load. One model is available: 'arabert'.
                When None, the default model ('arabert') is loaded.
                Defaults to None.

        Returns:
            :obj:`NERecognizer`: Instance with loaded pre-trained model.
        """

        # Resolve the model through the data catalogue; the recognizer is
        # given the model location as a plain string.
        info = DataCatalogue.get_dataset_info('NamedEntityRecognition',
                                              model_name)
        return NERecognizer(str(info.path))
示例#5
0
    def pretrained(model_name=None):
        """Create a sentiment analyzer from a pre-trained model shipped with
        camel_tools.

        Args:
            model_name (:obj:`str`, optional): Name of the pre-trained model
                to load.
                Two models are available: 'arabert' and 'mbert'.
                When None, the default model ('arabert') is loaded.
                Defaults to None.

        Returns:
            :obj:`SentimentAnalyzer`: Instance with loaded pre-trained model.
        """

        # Resolve the model through the data catalogue; the analyzer is given
        # the model location as a plain string.
        info = DataCatalogue.get_dataset_info('SentimentAnalysis', model_name)
        return SentimentAnalyzer(str(info.path))
示例#6
0
    def builtin_db(db_name='calima-msa-r13', flags='a'):
        """Instantiate :obj:`MorphologyDB` from one of the builtin databases.

        Args:
            db_name (:obj:`str`, optional): Name of the builtin database.
                Use :meth:`list_builtin_dbs` to list the builtin databases,
                or see :ref:`camel_morphology_dbs`.
                Defaults to 'calima-msa-r13'.
            flags (:obj:`str`, optional): Flag string forwarded to the
                :obj:`MorphologyDB` constructor. Defaults to 'a'.

        Returns:
            :obj:`MorphologyDB`: Instance of builtin database with given flags.
        """

        info = DataCatalogue.get_dataset_info('MorphologyDB', db_name)

        # The database file sits directly inside the dataset's directory.
        db_file = Path(info.path) / 'morphology.db'
        return MorphologyDB(str(db_file), flags)
示例#7
0
    'JED': 'Gulf',
    'JER': 'Levant',
    'KHA': 'Nile Basin',
    'MOS': 'Gulf',
    'MSA': 'Modern Standard Arabic',
    'MUS': 'Gulf',
    'RAB': 'Maghreb',
    'RIY': 'Gulf',
    'SAL': 'Levant',
    'SAN': 'Gulf of Aden',
    'SFX': 'Maghreb',
    'TRI': 'Maghreb',
    'TUN': 'Maghreb'
}

# Location of the 'DialectID' dataset as reported by the data catalogue.
_DATA_DIR = DataCatalogue.get_dataset_info('DialectID').path

# Character- and word-level language model directories under <data dir>/lm.
_CHAR_LM_DIR = Path(_DATA_DIR) / 'lm' / 'char'
_WORD_LM_DIR = Path(_DATA_DIR) / 'lm' / 'word'

# TSV corpus files located directly in the data directory.
_TRAIN_DATA_PATH = Path(_DATA_DIR) / 'corpus_26_train.tsv'
_TRAIN_DATA_EXTRA_PATH = Path(_DATA_DIR) / 'corpus_6_train.tsv'
_DEV_DATA_PATH = Path(_DATA_DIR) / 'corpus_26_dev.tsv'
_TEST_DATA_PATH = Path(_DATA_DIR) / 'corpus_26_test.tsv'


class DIDPred(collections.namedtuple('DIDPred', ['top', 'scores'])):
    """A named tuple containing dialect ID prediction results.

    Attributes:
        top (:obj:`str`): The dialect label with the highest score. See
            :ref:`dialectid_labels` for a list of output labels.
        scores (:obj:`dict`): A dictionary mapping each dialect label to it's