def download(lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None):
    """Download Stanza model packages for a language.

    Fetches the latest resources.json index, then either downloads and
    unzips the default package bundle or downloads each requested
    processor model individually.

    Args:
        lang: language code (or alias) to download models for.
        dir: directory where resources.json and models are stored.
        package: 'default' for the default bundle, or a named package.
        processors: mapping of processor name -> model name for a
            customized download; empty/None means "use the package".
        logging_level: global logging level name.
        verbose: convenience verbosity flag; both are forwarded to
            set_logging_level.

    Raises:
        Exception: if the language is unsupported, or a requested
            processor/model combination is not listed in resources.json.
    """
    # set global logging level
    set_logging_level(logging_level, verbose)
    # process different pipeline parameters
    lang, dir, package, processors = process_pipeline_parameters(lang, dir, package, processors)

    # Download resources.json to obtain latest packages.
    logger.debug('Downloading resource file...')
    request_file(f'{DEFAULT_RESOURCES_URL}/resources_{__resources_version__}.json',
                 os.path.join(dir, 'resources.json'))
    # BUGFIX: original used json.load(open(...)), leaking the file handle;
    # a context manager closes it deterministically.
    with open(os.path.join(dir, 'resources.json')) as fin:
        resources = json.load(fin)

    if lang not in resources:
        raise Exception(f'Unsupported language: {lang}.')
    if 'alias' in resources[lang]:
        logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
        lang = resources[lang]['alias']
    lang_name = resources[lang]['lang_name'] if 'lang_name' in resources[lang] else ''
    url = resources['url']

    # Default: download zipfile and unzip
    if package == 'default' and (processors is None or len(processors) == 0):
        logger.info(f'Downloading default packages for language: {lang} ({lang_name})...')
        request_file(f'{url}/{__resources_version__}/{lang}/default.zip',
                     os.path.join(dir, lang, 'default.zip'),  # F541: dropped pointless f-prefix
                     md5=resources[lang]['default_md5'])
        unzip(os.path.join(dir, lang), 'default.zip')
    # Customize: maintain download list
    else:
        download_list = maintain_processor_list(resources, lang, package, processors)
        download_list = add_dependencies(resources, lang, download_list)
        download_list = flatten_processor_list(download_list)
        download_table = make_table(['Processor', 'Package'], download_list)
        logger.info(f'Downloading these customized packages for language: {lang} ({lang_name})...\n{download_table}')
        # Download packages
        for key, value in download_list:
            try:
                request_file(f'{url}/{__resources_version__}/{lang}/{key}/{value}.pt',
                             os.path.join(dir, lang, key, f'{value}.pt'),
                             md5=resources[lang][key][value]['md5'])
            except KeyError as e:
                # A missing md5 entry means the (processor, model) pair does
                # not exist in resources.json for this language.
                raise Exception(f"Cannot find the following processor and model name combination: {key}, {value}. Please check if you have provided the correct model name.") from e
    logger.info(f'Finished downloading models and saved to {dir}.')
def download(lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None, resources_url=DEFAULT_RESOURCES_URL, resources_branch=None, resources_version=DEFAULT_RESOURCES_VERSION, model_url=DEFAULT_MODEL_URL):
    """Download Stanza model packages for a language.

    Fetches resources.json from a (possibly customized) resources URL,
    then either downloads and unzips the default package bundle or
    downloads each requested processor model individually.

    Args:
        lang: language code (or alias) to download models for.
        dir: directory where resources.json and models are stored.
        package: 'default' for the default bundle, or a named package.
        processors: mapping of processor name -> model name for a
            customized download; empty/None means "use the package".
        logging_level: global logging level name.
        verbose: convenience verbosity flag; forwarded with
            logging_level to set_logging_level.
        resources_url: where to fetch resources.json from; the short
            names 'stanford'/'stanfordnlp' select the StanfordNLP mirror.
        resources_branch: if set (and resources_url is the default),
            fetch resources.json from this GitHub branch instead.
        resources_version: version tag used in resource/model URLs.
        model_url: 'default' to use the URL from resources.json,
            otherwise an explicit model base URL.

    Raises:
        Exception: if resources.json cannot be loaded, the language is
            unsupported, or a requested processor/model combination is
            not listed in resources.json.
    """
    # set global logging level
    set_logging_level(logging_level, verbose)
    # process different pipeline parameters
    lang, dir, package, processors = process_pipeline_parameters(
        lang, dir, package, processors)
    if resources_url == DEFAULT_RESOURCES_URL and resources_branch is not None:
        resources_url = STANZA_RESOURCES_GITHUB + resources_branch

    # Download resources.json to obtain latest packages.
    logger.debug('Downloading resource file...')
    # handle short name for resources urls; otherwise treat it as url
    if resources_url.lower() in ('stanford', 'stanfordnlp'):
        resources_url = STANFORDNLP_RESOURCES_URL
    # make request
    request_file(f'{resources_url}/resources_{resources_version}.json',
                 os.path.join(dir, 'resources.json'))
    # unpack results
    # BUGFIX: original used a bare `except:` (which also catches
    # SystemExit/KeyboardInterrupt) around json.load(open(...)) (which
    # leaks the file handle) and dropped the original exception. Catch
    # only the errors reading/parsing can raise, and chain the cause.
    try:
        with open(os.path.join(dir, 'resources.json')) as fin:
            resources = json.load(fin)
    except (OSError, ValueError) as e:  # json.JSONDecodeError subclasses ValueError
        raise Exception(
            'Cannot load resource file. Please check your network connection, '
            'or provided resource url and resource version.') from e

    if lang not in resources:
        raise Exception(f'Unsupported language: {lang}.')
    if 'alias' in resources[lang]:
        logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
        lang = resources[lang]['alias']
    lang_name = resources[lang]['lang_name'] if 'lang_name' in resources[lang] else ''
    url = resources['url'] if model_url.lower() == 'default' else model_url

    # Default: download zipfile and unzip
    if package == 'default' and (processors is None or len(processors) == 0):
        logger.info(
            f'Downloading default packages for language: {lang} ({lang_name})...')
        request_file(f'{url}/{resources_version}/{lang}/default.zip',
                     os.path.join(dir, lang, 'default.zip'),  # F541: dropped pointless f-prefix
                     md5=resources[lang]['default_md5'])
        unzip(os.path.join(dir, lang), 'default.zip')
    # Customize: maintain download list
    else:
        download_list = maintain_processor_list(resources, lang, package, processors)
        download_list = add_dependencies(resources, lang, download_list)
        download_list = flatten_processor_list(download_list)
        download_table = make_table(['Processor', 'Package'], download_list)
        logger.info(f'Downloading these customized packages for language: '
                    f'{lang} ({lang_name})...\n{download_table}')
        # Download packages
        for key, value in download_list:
            try:
                request_file(
                    f'{url}/{resources_version}/{lang}/{key}/{value}.pt',
                    os.path.join(dir, lang, key, f'{value}.pt'),
                    md5=resources[lang][key][value]['md5'])
            except KeyError as e:
                # A missing md5 entry means the (processor, model) pair
                # does not exist in resources.json for this language.
                raise Exception(
                    f'Cannot find the following processor and model name combination: '
                    f'{key}, {value}. Please check if you have provided the correct model name.'
                ) from e
    logger.info(f'Finished downloading models and saved to {dir}.')
def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None, use_gpu=True, **kwargs):
    """Build a processing pipeline from locally downloaded models.

    Reads resources.json from `dir`, resolves the list of processors to
    load (from `package`/`processors` plus their dependencies), then
    instantiates each processor in order. If any processor reports a
    requirements problem, all such problems are collected and raised
    together at the end.

    Args:
        lang: language code (or alias) of the models to load.
        dir: directory containing resources.json and the models.
        package: 'default' or a named model package.
        processors: mapping of processor name -> model/package name;
            overrides the package selection per processor.
        logging_level: global logging level name.
        verbose: convenience verbosity flag; forwarded with
            logging_level to set_logging_level.
        use_gpu: run on GPU if CUDA is available.
        **kwargs: extra processor-specific config options, merged into
            the per-processor configs.

    Raises:
        Exception: if resources.json is missing, or no processor could
            be selected for this language/package.
        PipelineRequirementsException: if one or more processors failed
            their requirements checks.
    """
    self.lang, self.dir, self.kwargs = lang, dir, kwargs
    # set global logging level
    set_logging_level(logging_level, verbose)
    # remember the effective level name after set_logging_level applied it
    self.logging_level = logging.getLevelName(logger.level)
    # process different pipeline parameters
    lang, dir, package, processors = process_pipeline_parameters(
        lang, dir, package, processors)

    # Load resources.json to obtain latest packages.
    logger.debug('Loading resource file...')
    resources_filepath = os.path.join(dir, 'resources.json')
    if not os.path.exists(resources_filepath):
        raise Exception(
            f"Resources file not found at: {resources_filepath}. Try to download the model again."
        )
    with open(resources_filepath) as infile:
        resources = json.load(infile)
    # Resolve language aliases (e.g. shorthand codes) to canonical entries.
    if lang in resources:
        if 'alias' in resources[lang]:
            logger.info(
                f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
            lang = resources[lang]['alias']
        lang_name = resources[lang][
            'lang_name'] if 'lang_name' in resources[lang] else ''
    else:
        # An unknown language only warns here; the empty load list below
        # turns it into a hard error.
        logger.warning(f'Unsupported language: {lang}.')

    # Maintain load list: requested processors plus their dependencies,
    # then merged with any per-processor overrides given via kwargs.
    self.load_list = maintain_processor_list(
        resources, lang, package, processors) if lang in resources else []
    self.load_list = add_dependencies(
        resources, lang, self.load_list) if lang in resources else []
    self.load_list = self.update_kwargs(kwargs, self.load_list)
    if len(self.load_list) == 0:
        raise Exception(
            'No processor to load. Please check if your language or package is correctly set.'
        )
    load_table = make_table(['Processor', 'Package'],
                            [row[:2] for row in self.load_list])
    logger.info(
        f'Loading these models for language: {lang} ({lang_name}):\n{load_table}'
    )

    # Per-processor config from resources.json, overridden by user kwargs.
    self.config = build_default_config(resources, lang, dir, self.load_list)
    self.config.update(kwargs)

    # Load processors
    self.processors = {}

    # configs that are the same for all processors
    pipeline_level_configs = {'lang': lang, 'mode': 'predict'}
    self.use_gpu = torch.cuda.is_available() and use_gpu
    logger.info("Use device: {}".format("gpu" if self.use_gpu else "cpu"))

    # set up processors
    pipeline_reqs_exceptions = []
    for item in self.load_list:
        processor_name, _, _ = item
        logger.info('Loading: ' + processor_name)
        curr_processor_config = self.filter_config(processor_name,
                                                   self.config)
        curr_processor_config.update(pipeline_level_configs)
        logger.debug('With settings: ')
        logger.debug(curr_processor_config)
        try:
            # try to build processor, throw an exception if there is a requirements issue
            self.processors[processor_name] = NAME_TO_PROCESSOR_CLASS[
                processor_name](config=curr_processor_config,
                                pipeline=self,
                                use_gpu=self.use_gpu)
        except ProcessorRequirementsException as e:
            # if there was a requirements issue, add it to list which will be printed at end
            pipeline_reqs_exceptions.append(e)
            # add the broken processor to the loaded processors for the sake of analyzing the validity of the
            # entire proposed pipeline, but at this point the pipeline will not be built successfully
            self.processors[processor_name] = e.err_processor

    # if there are any processor exceptions, throw an exception to indicate pipeline build failure
    if pipeline_reqs_exceptions:
        logger.info('\n')
        raise PipelineRequirementsException(pipeline_reqs_exceptions)

    logger.info("Done loading processors!")
def download(lang='en', model_dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level=None, verbose=None, resources_url=DEFAULT_RESOURCES_URL, resources_branch=None, resources_version=DEFAULT_RESOURCES_VERSION, model_url=DEFAULT_MODEL_URL, proxies=None):
    """Download Stanza model packages for a language.

    Downloads resources.json (via download_resources_json), then either
    downloads and unzips the default package bundle or downloads each
    requested processor model individually.

    Args:
        lang: language code (or alias) to download models for.
        model_dir: directory where resources.json and models are stored.
        package: 'default' for the default bundle, or a named package.
        processors: mapping of processor name -> model name for a
            customized download; empty/None means "use the package".
        logging_level: global logging level name (None keeps current).
        verbose: convenience verbosity flag; forwarded with
            logging_level to set_logging_level.
        resources_url: where to fetch resources.json from.
        resources_branch: alternative GitHub branch for resources.json.
        resources_version: version tag used in resource/model URLs.
        model_url: 'default' to use the URL from resources.json,
            otherwise an explicit model base URL.
        proxies: optional proxy configuration forwarded to request_file.

    Raises:
        ValueError: if the language is unsupported, or a requested
            processor/model combination is not listed in resources.json.
    """
    # set global logging level
    set_logging_level(logging_level, verbose)
    # process different pipeline parameters
    lang, model_dir, package, processors = process_pipeline_parameters(
        lang, model_dir, package, processors)

    download_resources_json(model_dir, resources_url, resources_branch,
                            resources_version, proxies)
    # unpack results
    with open(os.path.join(model_dir, 'resources.json')) as fin:
        resources = json.load(fin)
    if lang not in resources:
        raise ValueError(f'Unsupported language: {lang}.')
    if 'alias' in resources[lang]:
        logger.info(f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
        lang = resources[lang]['alias']
    lang_name = resources[lang]['lang_name'] if 'lang_name' in resources[lang] else ''
    url = resources['url'] if model_url.lower() == 'default' else model_url

    # Default: download zipfile and unzip
    if package == 'default' and (processors is None or len(processors) == 0):
        logger.info(
            f'Downloading default packages for language: {lang} ({lang_name})...')
        request_file(
            f'{url}/{resources_version}/{lang}/default.zip',
            os.path.join(model_dir, lang, 'default.zip'),  # F541: dropped pointless f-prefix
            proxies,
            md5=resources[lang]['default_md5'],
        )
        unzip(os.path.join(model_dir, lang), 'default.zip')
    # Customize: maintain download list
    else:
        download_list = maintain_processor_list(resources, lang, package, processors)
        download_list = add_dependencies(resources, lang, download_list)
        download_list = flatten_processor_list(download_list)
        download_table = make_table(['Processor', 'Package'], download_list)
        logger.info(f'Downloading these customized packages for language: '
                    f'{lang} ({lang_name})...\n{download_table}')
        # Download packages
        for key, value in download_list:
            try:
                request_file(
                    f'{url}/{resources_version}/{lang}/{key}/{value}.pt',
                    os.path.join(model_dir, lang, key, f'{value}.pt'),
                    proxies,
                    md5=resources[lang][key][value]['md5'])
            except KeyError as e:
                # A missing md5 entry means the (processor, model) pair
                # does not exist in resources.json for this language.
                raise ValueError(
                    f'Cannot find the following processor and model name combination: '
                    f'{key}, {value}. Please check if you have provided the correct model name.'
                ) from e
    logger.info(f'Finished downloading models and saved to {model_dir}.')
def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level=None, verbose=None, use_gpu=True, model_dir=None, **kwargs):
    """Build a processing pipeline from locally downloaded models.

    Reads resources.json from the model directory, resolves the list of
    processors to load (from `package`/`processors` plus dependencies),
    then instantiates each processor in order. Requirements problems are
    collected and raised together at the end; a FileNotFoundError while
    loading a processor is translated into a more specific, actionable
    error when possible.

    Args:
        lang: language code (or alias) of the models to load.
        dir: directory containing resources.json and the models.
        package: 'default' or a named model package.
        processors: mapping of processor name -> model/package name;
            overrides the package selection per processor.
        logging_level: global logging level name (None keeps current).
        verbose: convenience verbosity flag; forwarded with
            logging_level to set_logging_level.
        use_gpu: run on GPU if CUDA is available.
        model_dir: alias for `dir`; only honored when `dir` was left at
            its default, so `dir` wins if both are given.
        **kwargs: extra processor-specific config options, merged into
            the per-processor configs.

    Raises:
        ResourcesFileNotFoundError: if resources.json is missing.
        ValueError: if no processor could be selected for this
            language/package.
        LanguageNotDownloadedError: if the models for this language are
            not downloaded at all.
        UnsupportedProcessorError: if a requested processor does not
            exist for this language.
        FileNotFoundError: if a specific model file is missing.
        PipelineRequirementsException: if one or more processors failed
            their requirements checks.
    """
    self.lang, self.dir, self.kwargs = lang, dir, kwargs
    # `model_dir` is an alias of `dir`; it only applies when `dir` is default
    if model_dir is not None and dir == DEFAULT_MODEL_DIR:
        self.dir = model_dir
    # set global logging level
    set_logging_level(logging_level, verbose)
    # process different pipeline parameters
    lang, self.dir, package, processors = process_pipeline_parameters(
        lang, self.dir, package, processors)

    # Load resources.json to obtain latest packages.
    logger.debug('Loading resource file...')
    resources_filepath = os.path.join(self.dir, 'resources.json')
    if not os.path.exists(resources_filepath):
        raise ResourcesFileNotFoundError(resources_filepath)
    with open(resources_filepath) as infile:
        resources = json.load(infile)
    # Resolve language aliases (e.g. shorthand codes) to canonical entries.
    if lang in resources:
        if 'alias' in resources[lang]:
            logger.info(
                f'"{lang}" is an alias for "{resources[lang]["alias"]}"')
            lang = resources[lang]['alias']
        lang_name = resources[lang][
            'lang_name'] if 'lang_name' in resources[lang] else ''
    else:
        # An unknown language only warns here; the empty load list below
        # turns it into a hard error.
        logger.warning(f'Unsupported language: {lang}.')

    # Maintain load list: possibly inject the mwt processor, add the
    # requested processors plus dependencies, then merge kwargs overrides.
    processors = self.maybe_add_mwt(kwargs, resources, lang, processors)
    self.load_list = maintain_processor_list(
        resources, lang, package, processors) if lang in resources else []
    self.load_list = add_dependencies(
        resources, lang, self.load_list) if lang in resources else []
    self.load_list = self.update_kwargs(kwargs, self.load_list)
    if len(self.load_list) == 0:
        raise ValueError(
            'No processors to load for language {}. Please check if your language or package is correctly set.'
            .format(lang))
    load_table = make_table(['Processor', 'Package'],
                            [row[:2] for row in self.load_list])
    logger.info(
        f'Loading these models for language: {lang} ({lang_name}):\n{load_table}'
    )

    # Per-processor config from resources.json, overridden by user kwargs.
    self.config = build_default_config(resources, lang, self.dir,
                                       self.load_list)
    self.config.update(kwargs)

    # Load processors
    self.processors = {}

    # configs that are the same for all processors
    pipeline_level_configs = {'lang': lang, 'mode': 'predict'}
    self.use_gpu = torch.cuda.is_available() and use_gpu
    logger.info("Use device: {}".format("gpu" if self.use_gpu else "cpu"))

    # set up processors
    pipeline_reqs_exceptions = []
    for item in self.load_list:
        processor_name, _, _ = item
        logger.info('Loading: ' + processor_name)
        curr_processor_config = self.filter_config(processor_name,
                                                   self.config)
        curr_processor_config.update(pipeline_level_configs)
        # TODO: this is obviously a hack
        # a better solution overall would be to make a pretagged version of the pos annotator
        # and then subsequent modules can use those tags without knowing where those tags came from
        if "pretagged" in self.config and "pretagged" not in curr_processor_config:
            curr_processor_config["pretagged"] = self.config["pretagged"]
        logger.debug('With settings: ')
        logger.debug(curr_processor_config)
        try:
            # try to build processor, throw an exception if there is a requirements issue
            self.processors[processor_name] = NAME_TO_PROCESSOR_CLASS[
                processor_name](config=curr_processor_config,
                                pipeline=self,
                                use_gpu=self.use_gpu)
        except ProcessorRequirementsException as e:
            # if there was a requirements issue, add it to list which will be printed at end
            pipeline_reqs_exceptions.append(e)
            # add the broken processor to the loaded processors for the sake of analyzing the validity of the
            # entire proposed pipeline, but at this point the pipeline will not be built successfully
            self.processors[processor_name] = e.err_processor
        except FileNotFoundError as e:
            # For a FileNotFoundError, we try to guess if there's a missing
            # model directory or file. If so, we suggest the user try to
            # download the models.
            if 'model_path' in curr_processor_config:
                model_path = curr_processor_config['model_path']
                model_dir, model_name = os.path.split(model_path)
                lang_dir = os.path.dirname(model_dir)
                if not os.path.exists(lang_dir):
                    # model files for this language can't be found in the expected directory
                    raise LanguageNotDownloadedError(
                        lang, lang_dir, model_path) from e
                if processor_name not in resources[lang]:
                    # user asked for a model which doesn't exist for this language?
                    raise UnsupportedProcessorError(processor_name, lang)
                if not os.path.exists(model_path):
                    model_name, _ = os.path.splitext(model_name)
                    # TODO: before recommending this, check that such a thing exists in resources.json.
                    # currently that case is handled by ignoring the model, anyway
                    raise FileNotFoundError(
                        'Could not find model file %s, although there are other models downloaded for language %s. Perhaps you need to download a specific model. Try: stanza.download(lang="%s",package=None,processors={"%s":"%s"})'
                        % (model_path, lang, lang, processor_name, model_name)) from e
            # if we couldn't find a more suitable description of the
            # FileNotFoundError, just raise the old error
            raise

    # if there are any processor exceptions, throw an exception to indicate pipeline build failure
    if pipeline_reqs_exceptions:
        logger.info('\n')
        raise PipelineRequirementsException(pipeline_reqs_exceptions)

    logger.info("Done loading processors!")