Exemplo n.º 1
0
def download(lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None):
    """Download Stanza models for a language into a local model directory.

    Fetches the remote resources index (``resources.json``), resolves
    language aliases, then downloads either the default zipped bundle or
    the individually requested processor models.

    Args:
        lang: language code or alias to download models for.
        dir: destination directory for models and ``resources.json``.
        package: ``'default'`` for the bundled zip, or a named package.
        processors: mapping of processor name -> package name; empty/None
            means use the default bundle. (NOTE: the mutable default is
            never mutated here, so it is safe to keep for compatibility.)
        logging_level: logging level name passed to ``set_logging_level``.
        verbose: optional verbosity flag overriding ``logging_level``.

    Raises:
        Exception: if the language is unsupported, or a requested
            processor/model combination is absent from the resources index.
    """
    # set global logging level
    set_logging_level(logging_level, verbose)
    # normalize/canonicalize the pipeline parameters
    lang, dir, package, processors = process_pipeline_parameters(lang, dir, package, processors)

    # Download resources.json to obtain latest packages.
    logger.debug('Downloading resource file...')
    request_file(f'{DEFAULT_RESOURCES_URL}/resources_{__resources_version__}.json', os.path.join(dir, 'resources.json'))
    # BUG FIX: the original used json.load(open(...)) and never closed the
    # file handle; a context manager guarantees it is closed promptly.
    with open(os.path.join(dir, 'resources.json')) as fin:
        resources = json.load(fin)
    if lang not in resources:
        raise Exception(f'Unsupported language: {lang}.')
    if 'alias' in resources[lang]:
        logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
        lang = resources[lang]['alias']
    # lang_name is used for logging only; missing entries degrade to ''
    lang_name = resources[lang].get('lang_name', '')
    url = resources['url']

    # Default: download zipfile and unzip
    if package == 'default' and (processors is None or len(processors) == 0):
        logger.info(f'Downloading default packages for language: {lang} ({lang_name})...')
        request_file(f'{url}/{__resources_version__}/{lang}/default.zip', os.path.join(dir, lang, 'default.zip'), md5=resources[lang]['default_md5'])
        unzip(os.path.join(dir, lang), 'default.zip')
    # Customize: maintain download list
    else:
        download_list = maintain_processor_list(resources, lang, package, processors)
        download_list = add_dependencies(resources, lang, download_list)
        download_list = flatten_processor_list(download_list)
        download_table = make_table(['Processor', 'Package'], download_list)
        logger.info(f'Downloading these customized packages for language: {lang} ({lang_name})...\n{download_table}')

        # Download each (processor, package) model file individually
        for key, value in download_list:
            try:
                request_file(f'{url}/{__resources_version__}/{lang}/{key}/{value}.pt', os.path.join(dir, lang, key, f'{value}.pt'), md5=resources[lang][key][value]['md5'])
            except KeyError as e:
                # a KeyError here means the resources index has no entry for
                # this processor/model pair; surface a human-readable error
                raise Exception(f"Cannot find the following processor and model name combination: {key}, {value}. Please check if you have provided the correct model name.") from e
    logger.info(f'Finished downloading models and saved to {dir}.')
Exemplo n.º 2
0
def download(lang='en',
             dir=DEFAULT_MODEL_DIR,
             package='default',
             processors={},
             logging_level='INFO',
             verbose=None,
             resources_url=DEFAULT_RESOURCES_URL,
             resources_branch=None,
             resources_version=DEFAULT_RESOURCES_VERSION,
             model_url=DEFAULT_MODEL_URL):
    """Download Stanza models for a language from a configurable mirror.

    Like the basic downloader, but the resources index location, branch,
    version, and the model host can all be overridden.

    Args:
        lang: language code or alias to download models for.
        dir: destination directory for models and ``resources.json``.
        package: ``'default'`` for the bundled zip, or a named package.
        processors: mapping of processor name -> package name; empty/None
            means use the default bundle.
        logging_level: logging level name passed to ``set_logging_level``.
        verbose: optional verbosity flag overriding ``logging_level``.
        resources_url: URL (or shorthand 'stanford'/'stanfordnlp') of the
            resources index host.
        resources_branch: git branch of the resources repo; only honored
            when ``resources_url`` is left at its default.
        resources_version: version tag of the resources index to fetch.
        model_url: model host URL, or 'default' to use the URL advertised
            inside the resources index.

    Raises:
        Exception: if the resources file cannot be loaded, the language is
            unsupported, or a processor/model combination is not found.
    """
    # set global logging level
    set_logging_level(logging_level, verbose)
    # process different pipeline parameters
    lang, dir, package, processors = process_pipeline_parameters(
        lang, dir, package, processors)

    if resources_url == DEFAULT_RESOURCES_URL and resources_branch is not None:
        resources_url = STANZA_RESOURCES_GITHUB + resources_branch
    # Download resources.json to obtain latest packages.
    logger.debug('Downloading resource file...')
    # handle short name for resources urls; otherwise treat it as url
    if resources_url.lower() in ('stanford', 'stanfordnlp'):
        resources_url = STANFORDNLP_RESOURCES_URL
    # make request
    request_file(f'{resources_url}/resources_{resources_version}.json',
                 os.path.join(dir, 'resources.json'))
    # unpack results
    # BUG FIX: the original used a bare `except:` (which also swallows
    # KeyboardInterrupt/SystemExit and hides the real cause) and leaked the
    # open file handle.  Catch only I/O and JSON-decode failures, chain the
    # cause with `from e`, and let a context manager close the file.
    try:
        with open(os.path.join(dir, 'resources.json')) as fin:
            resources = json.load(fin)
    except (OSError, json.JSONDecodeError) as e:
        raise Exception(
            f'Cannot load resource file. Please check your network connection, '
            f'or provided resource url and resource version.') from e
    if lang not in resources:
        raise Exception(f'Unsupported language: {lang}.')
    if 'alias' in resources[lang]:
        logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
        lang = resources[lang]['alias']
    # lang_name is used for logging only; missing entries degrade to ''
    lang_name = resources[lang].get('lang_name', '')
    url = resources['url'] if model_url.lower() == 'default' else model_url

    # Default: download zipfile and unzip
    if package == 'default' and (processors is None or len(processors) == 0):
        logger.info(
            f'Downloading default packages for language: {lang} ({lang_name})...'
        )
        request_file(f'{url}/{resources_version}/{lang}/default.zip',
                     os.path.join(dir, lang, 'default.zip'),
                     md5=resources[lang]['default_md5'])
        unzip(os.path.join(dir, lang), 'default.zip')
    # Customize: maintain download list
    else:
        download_list = maintain_processor_list(resources, lang, package,
                                                processors)
        download_list = add_dependencies(resources, lang, download_list)
        download_list = flatten_processor_list(download_list)
        download_table = make_table(['Processor', 'Package'], download_list)
        logger.info(f'Downloading these customized packages for language: '
                    f'{lang} ({lang_name})...\n{download_table}')

        # Download each (processor, package) model file individually
        for key, value in download_list:
            try:
                request_file(
                    f'{url}/{resources_version}/{lang}/{key}/{value}.pt',
                    os.path.join(dir, lang, key, f'{value}.pt'),
                    md5=resources[lang][key][value]['md5'])
            except KeyError as e:
                # missing index entry for this processor/model pair
                raise Exception(
                    f'Cannot find the following processor and model name combination: '
                    f'{key}, {value}. Please check if you have provided the correct model name.'
                ) from e
    logger.info(f'Finished downloading models and saved to {dir}.')
Exemplo n.º 3
0
    def __init__(self,
                 lang='en',
                 dir=DEFAULT_MODEL_DIR,
                 package='default',
                 processors={},
                 logging_level='INFO',
                 verbose=None,
                 use_gpu=True,
                 **kwargs):
        """Build a text-processing pipeline for the given language.

        Reads the local ``resources.json``, resolves the list of processor
        models to load (with dependencies), then constructs each processor,
        collecting requirement failures and reporting them all at once.

        Args:
            lang: language code or alias; resolved via resources.json.
            dir: directory holding downloaded models and resources.json.
            package: named package, or 'default'.
            processors: mapping of processor name -> package, overriding
                the package choice per processor.
            logging_level: logging level name applied globally.
            verbose: optional verbosity flag overriding ``logging_level``.
            use_gpu: use CUDA when available.
            **kwargs: per-processor config overrides merged into the config.

        Raises:
            Exception: when resources.json is missing, or no processor can
                be loaded for the requested language/package.
            PipelineRequirementsException: when one or more processors fail
                their requirement checks.
        """
        # remember the raw constructor arguments for later introspection
        self.lang, self.dir, self.kwargs = lang, dir, kwargs

        # set global logging level
        set_logging_level(logging_level, verbose)
        # record the effective level name after set_logging_level applied it
        self.logging_level = logging.getLevelName(logger.level)
        # process different pipeline parameters
        lang, dir, package, processors = process_pipeline_parameters(
            lang, dir, package, processors)

        # Load resources.json to obtain latest packages.
        logger.debug('Loading resource file...')
        resources_filepath = os.path.join(dir, 'resources.json')
        if not os.path.exists(resources_filepath):
            raise Exception(
                f"Resources file not found at: {resources_filepath}. Try to download the model again."
            )
        with open(resources_filepath) as infile:
            resources = json.load(infile)
        # resolve language aliases and look up the human-readable name
        if lang in resources:
            if 'alias' in resources[lang]:
                logger.info(
                    f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
                lang = resources[lang]['alias']
            lang_name = resources[lang][
                'lang_name'] if 'lang_name' in resources[lang] else ''
        else:
            # NOTE: lang_name stays unset on this path; the empty load_list
            # below raises before lang_name is ever used.
            logger.warning(f'Unsupported language: {lang}.')

        # Maintain load list
        self.load_list = maintain_processor_list(
            resources, lang, package, processors) if lang in resources else []

        self.load_list = add_dependencies(
            resources, lang, self.load_list) if lang in resources else []
        self.load_list = self.update_kwargs(kwargs, self.load_list)
        if len(self.load_list) == 0:
            raise Exception(
                'No processor to load. Please check if your language or package is correctly set.'
            )
        load_table = make_table(['Processor', 'Package'],
                                [row[:2] for row in self.load_list])
        logger.info(
            f'Loading these models for language: {lang} ({lang_name}):\n{load_table}'
        )

        # merge the resolved per-processor defaults with caller overrides;
        # kwargs win on conflict
        self.config = build_default_config(resources, lang, dir,
                                           self.load_list)
        self.config.update(kwargs)

        # Load processors
        self.processors = {}

        # configs that are the same for all processors
        pipeline_level_configs = {'lang': lang, 'mode': 'predict'}
        self.use_gpu = torch.cuda.is_available() and use_gpu
        logger.info("Use device: {}".format("gpu" if self.use_gpu else "cpu"))

        # set up processors
        pipeline_reqs_exceptions = []
        for item in self.load_list:
            processor_name, _, _ = item
            logger.info('Loading: ' + processor_name)
            curr_processor_config = self.filter_config(processor_name,
                                                       self.config)
            curr_processor_config.update(pipeline_level_configs)
            logger.debug('With settings: ')
            logger.debug(curr_processor_config)
            try:
                # try to build processor, throw an exception if there is a requirements issue
                self.processors[processor_name] = NAME_TO_PROCESSOR_CLASS[
                    processor_name](config=curr_processor_config,
                                    pipeline=self,
                                    use_gpu=self.use_gpu)
            except ProcessorRequirementsException as e:
                # if there was a requirements issue, add it to list which will be printed at end
                pipeline_reqs_exceptions.append(e)
                # add the broken processor to the loaded processors for the sake of analyzing the validity of the
                # entire proposed pipeline, but at this point the pipeline will not be built successfully
                self.processors[processor_name] = e.err_processor

        # if there are any processor exceptions, throw an exception to indicate pipeline build failure
        if pipeline_reqs_exceptions:
            logger.info('\n')
            raise PipelineRequirementsException(pipeline_reqs_exceptions)

        logger.info("Done loading processors!")
Exemplo n.º 4
0
def download(lang='en',
             model_dir=DEFAULT_MODEL_DIR,
             package='default',
             processors={},
             logging_level=None,
             verbose=None,
             resources_url=DEFAULT_RESOURCES_URL,
             resources_branch=None,
             resources_version=DEFAULT_RESOURCES_VERSION,
             model_url=DEFAULT_MODEL_URL,
             proxies=None):
    """Fetch models for ``lang`` into ``model_dir``.

    With the default ``package`` and no explicit ``processors``, a single
    zipped bundle is downloaded and extracted.  Otherwise each requested
    (processor, package) model file -- plus its dependencies -- is fetched
    individually from the model host.
    """
    set_logging_level(logging_level, verbose)
    lang, model_dir, package, processors = process_pipeline_parameters(
        lang, model_dir, package, processors)

    # Refresh the resources index so we see the latest available packages.
    download_resources_json(model_dir, resources_url, resources_branch,
                            resources_version, proxies)
    with open(os.path.join(model_dir, 'resources.json')) as fin:
        resources = json.load(fin)

    if lang not in resources:
        raise ValueError(f'Unsupported language: {lang}.')
    lang_resources = resources[lang]
    if 'alias' in lang_resources:
        logger.info(f'"{lang}" is an alias for "{lang_resources["alias"]}"')
        lang = lang_resources['alias']
        lang_resources = resources[lang]
    lang_name = lang_resources.get('lang_name', '')
    # the resources index advertises a model host; an explicit model_url wins
    url = model_url if model_url.lower() != 'default' else resources['url']

    if package == 'default' and (processors is None or len(processors) == 0):
        # Single bundled archive for the default configuration.
        logger.info(
            f'Downloading default packages for language: {lang} ({lang_name})...'
        )
        request_file(
            f'{url}/{resources_version}/{lang}/default.zip',
            os.path.join(model_dir, lang, 'default.zip'),
            proxies,
            md5=lang_resources['default_md5'],
        )
        unzip(os.path.join(model_dir, lang), 'default.zip')
    else:
        # Resolve the individual models (plus dependencies) to fetch.
        pieces = maintain_processor_list(resources, lang, package, processors)
        pieces = flatten_processor_list(
            add_dependencies(resources, lang, pieces))
        table = make_table(['Processor', 'Package'], pieces)
        logger.info(f'Downloading these customized packages for language: '
                    f'{lang} ({lang_name})...\n{table}')

        for proc, model in pieces:
            try:
                request_file(
                    f'{url}/{resources_version}/{lang}/{proc}/{model}.pt',
                    os.path.join(model_dir, lang, proc, f'{model}.pt'),
                    proxies,
                    md5=lang_resources[proc][model]['md5'])
            except KeyError as e:
                # no index entry for this processor/model pair
                raise ValueError(
                    f'Cannot find the following processor and model name combination: '
                    f'{proc}, {model}. Please check if you have provided the correct model name.'
                ) from e
    logger.info(f'Finished downloading models and saved to {model_dir}.')
Exemplo n.º 5
0
    def __init__(self,
                 lang='en',
                 dir=DEFAULT_MODEL_DIR,
                 package='default',
                 processors={},
                 logging_level=None,
                 verbose=None,
                 use_gpu=True,
                 model_dir=None,
                 **kwargs):
        """Build a text-processing pipeline for the given language.

        Reads the local ``resources.json``, resolves the list of processor
        models to load (optionally auto-adding MWT), then constructs each
        processor, collecting requirement failures and translating missing
        model files into targeted download suggestions.

        Args:
            lang: language code or alias; resolved via resources.json.
            dir: directory holding downloaded models and resources.json.
            package: named package, or 'default'.
            processors: mapping of processor name -> package, overriding
                the package choice per processor.
            logging_level: logging level name applied globally.
            verbose: optional verbosity flag overriding ``logging_level``.
            use_gpu: use CUDA when available.
            model_dir: alias for ``dir``; honored only when ``dir`` was
                left at its default.
            **kwargs: per-processor config overrides merged into the config.

        Raises:
            ResourcesFileNotFoundError: when resources.json is missing.
            ValueError: when no processor can be loaded for the language.
            PipelineRequirementsException: when one or more processors fail
                their requirement checks.
            LanguageNotDownloadedError / UnsupportedProcessorError /
                FileNotFoundError: when model files cannot be located.
        """
        # remember the raw constructor arguments for later introspection
        self.lang, self.dir, self.kwargs = lang, dir, kwargs
        # model_dir is an alias for dir, used only when dir was not set
        if model_dir is not None and dir == DEFAULT_MODEL_DIR:
            self.dir = model_dir

        # set global logging level
        set_logging_level(logging_level, verbose)
        # process different pipeline parameters
        lang, self.dir, package, processors = process_pipeline_parameters(
            lang, self.dir, package, processors)

        # Load resources.json to obtain latest packages.
        logger.debug('Loading resource file...')
        resources_filepath = os.path.join(self.dir, 'resources.json')
        if not os.path.exists(resources_filepath):
            raise ResourcesFileNotFoundError(resources_filepath)
        with open(resources_filepath) as infile:
            resources = json.load(infile)
        # resolve language aliases and look up the human-readable name
        if lang in resources:
            if 'alias' in resources[lang]:
                logger.info(
                    f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
                lang = resources[lang]['alias']
            lang_name = resources[lang][
                'lang_name'] if 'lang_name' in resources[lang] else ''
        else:
            # NOTE: lang_name stays unset on this path; the empty load_list
            # below raises before lang_name is ever used.
            logger.warning(f'Unsupported language: {lang}.')

        # Maintain load list
        # auto-insert the MWT processor when the language/config needs it
        processors = self.maybe_add_mwt(kwargs, resources, lang, processors)
        self.load_list = maintain_processor_list(
            resources, lang, package, processors) if lang in resources else []
        self.load_list = add_dependencies(
            resources, lang, self.load_list) if lang in resources else []
        self.load_list = self.update_kwargs(kwargs, self.load_list)
        if len(self.load_list) == 0:
            raise ValueError(
                'No processors to load for language {}.  Please check if your language or package is correctly set.'
                .format(lang))
        load_table = make_table(['Processor', 'Package'],
                                [row[:2] for row in self.load_list])
        logger.info(
            f'Loading these models for language: {lang} ({lang_name}):\n{load_table}'
        )

        # merge the resolved per-processor defaults with caller overrides;
        # kwargs win on conflict
        self.config = build_default_config(resources, lang, self.dir,
                                           self.load_list)
        self.config.update(kwargs)

        # Load processors
        self.processors = {}

        # configs that are the same for all processors
        pipeline_level_configs = {'lang': lang, 'mode': 'predict'}
        self.use_gpu = torch.cuda.is_available() and use_gpu
        logger.info("Use device: {}".format("gpu" if self.use_gpu else "cpu"))

        # set up processors
        pipeline_reqs_exceptions = []
        for item in self.load_list:
            processor_name, _, _ = item
            logger.info('Loading: ' + processor_name)
            curr_processor_config = self.filter_config(processor_name,
                                                       self.config)
            curr_processor_config.update(pipeline_level_configs)
            # TODO: this is obviously a hack
            # a better solution overall would be to make a pretagged version of the pos annotator
            # and then subsequent modules can use those tags without knowing where those tags came from
            if "pretagged" in self.config and "pretagged" not in curr_processor_config:
                curr_processor_config["pretagged"] = self.config["pretagged"]
            logger.debug('With settings: ')
            logger.debug(curr_processor_config)
            try:
                # try to build processor, throw an exception if there is a requirements issue
                self.processors[processor_name] = NAME_TO_PROCESSOR_CLASS[
                    processor_name](config=curr_processor_config,
                                    pipeline=self,
                                    use_gpu=self.use_gpu)
            except ProcessorRequirementsException as e:
                # if there was a requirements issue, add it to list which will be printed at end
                pipeline_reqs_exceptions.append(e)
                # add the broken processor to the loaded processors for the sake of analyzing the validity of the
                # entire proposed pipeline, but at this point the pipeline will not be built successfully
                self.processors[processor_name] = e.err_processor
            except FileNotFoundError as e:
                # For a FileNotFoundError, we try to guess if there's
                # a missing model directory or file.  If so, we
                # suggest the user try to download the models
                if 'model_path' in curr_processor_config:
                    model_path = curr_processor_config['model_path']
                    model_dir, model_name = os.path.split(model_path)
                    lang_dir = os.path.dirname(model_dir)
                    if not os.path.exists(lang_dir):
                        # model files for this language can't be found in the expected directory
                        raise LanguageNotDownloadedError(
                            lang, lang_dir, model_path) from e
                    if processor_name not in resources[lang]:
                        # user asked for a model which doesn't exist for this language?
                        raise UnsupportedProcessorError(processor_name, lang)
                    if not os.path.exists(model_path):
                        model_name, _ = os.path.splitext(model_name)
                        # TODO: before recommending this, check that such a thing exists in resources.json.
                        # currently that case is handled by ignoring the model, anyway
                        raise FileNotFoundError(
                            'Could not find model file %s, although there are other models downloaded for language %s.  Perhaps you need to download a specific model.  Try: stanza.download(lang="%s",package=None,processors={"%s":"%s"})'
                            % (model_path, lang, lang, processor_name,
                               model_name)) from e

                # if we couldn't find a more suitable description of the
                # FileNotFoundError, just raise the old error
                raise

        # if there are any processor exceptions, throw an exception to indicate pipeline build failure
        if pipeline_reqs_exceptions:
            logger.info('\n')
            raise PipelineRequirementsException(pipeline_reqs_exceptions)

        logger.info("Done loading processors!")