def import_data(data_dir, db_params, ed_component_path): """ call the right component to import the data in the directory we loop through all files until we recognize one on them """ log = logging.getLogger(__name__) files = glob.glob(data_dir + "/*") data_type, file_to_load = utils.type_of_data(files) if not data_type: log.info('unknown data type for dir {}, skipping'.format(data_dir)) return # Note, we consider that we only have to load one kind of data per directory import_component = data_type + '2ed' if ed_component_path: import_component = os.path.join(ed_component_path, import_component) if file_to_load.endswith('.zip') or file_to_load.endswith('.geopal'): # TODO: handle geopal as non zip # if it's a zip, we unzip it zip_file = zipfile.ZipFile(file_to_load) zip_file.extractall(path=data_dir) file_to_load = data_dir if launch_exec( import_component, ["-i", file_to_load, "--connection-string", db_params.old_school_cnx_string()], log ): raise Exception('Error: problem with running {}, stoping'.format(import_component))
def import_in_mimir(_file, instance, asynchronous=True): """ Import pt data stops to autocomplete """ datatype, _ = utils.type_of_data(_file) family_type = utils.family_of_data(datatype) current_app.logger.debug("Import {} data to mimir".format(family_type)) action = None if family_type == 'pt': if instance.import_ntfs_in_mimir: action = ntfs2mimir.si(instance.name, _file) if instance.import_stops_in_mimir and not instance.import_ntfs_in_mimir: action = stops2mimir.si(instance.name, _file) elif family_type == 'poi': action = poi2mimir.si(instance.name, _file) else: current_app.logger.warning("Unsupported family_type {}".format(family_type)) if asynchronous: return action.delay() else: # all job are run in sequence and import_in_mimir will only return when all the jobs are finish return action.apply()
def import_data(data_dir, db_params, ed_component_path):
    # type: (str, DbParams, str) -> None
    """
    Call the right binary for its data (all the "*2ed") to create the data, then load it in the database.

    :param data_dir: the directory containing the data for "*2ed"
    :param db_params: the parameters of the database
    :param ed_component_path: the path of the directory containing the binary "*2ed"
    """
    files = glob.glob(data_dir + "/*")  # type: List[str]
    data_type, file_to_load = utils.type_of_data(files)  # type: str, str
    if not data_type:
        logger.info('unknown data type for dir {}, skipping'.format(data_dir))
        return

    # we consider that we only have to load one kind of data per directory
    import_component = data_type + '2ed'  # type: str
    if ed_component_path:
        import_component = os.path.join(ed_component_path, import_component)

    if file_to_load.endswith('.zip') or file_to_load.endswith('.geopal'):
        # TODO: handle geopal as non zip; if it's a zip, we unzip it
        zip_file = zipfile.ZipFile(file_to_load)  # type: zipfile.ZipFile
        zip_file.extractall(path=data_dir)
        file_to_load = data_dir

    if launch_exec.launch_exec(
        import_component,
        ["-i", file_to_load, "--connection-string", db_params.old_school_cnx_string()],
        logger,
    ):
        raise Exception('Error: problem with running {}, stopping'.format(import_component))
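# Hypothetical usage of the function above; the data directory, the binary directory and the
# db_params object (anything exposing old_school_cnx_string()) are assumptions for illustration.
import_data('/srv/ed/input/fr-idf', db_params, ed_component_path='/usr/bin')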
def import_in_mimir(_file, instance, asynchronous=True): """ Import pt data stops to autocomplete """ datatype, _ = utils.type_of_data(_file) family_type = utils.family_of_data(datatype) current_app.logger.debug("Import {} data to mimir".format(family_type)) action = None if family_type == 'pt': if instance.import_ntfs_in_mimir: action = ntfs2mimir.si(instance.name, _file) # Deprecated: https://github.com/CanalTP/mimirsbrunn/blob/4430eed1d81247fffa7cf32ba675a9c5ad8b1cbe/documentation/components.md#stops2mimir if instance.import_stops_in_mimir and not instance.import_ntfs_in_mimir: action = stops2mimir.si(instance.name, _file) elif family_type == 'poi': action = poi2mimir.si(instance.name, _file) else: current_app.logger.warning("Unsupported family_type {}".format(family_type)) if asynchronous: return action.delay() else: # all job are run in sequence and import_in_mimir will only return when all the jobs are finish return action.apply()
def import_in_mimir(_file, instance, asynchronous=True): """ Import pt data stops to autocomplete """ datatype, _ = utils.type_of_data(_file) family_type = utils.family_of_data(datatype) current_app.logger.debug("Import {} data to mimir".format(family_type)) actions = [] for version in (2, 7): if not is_activate_autocomplete_version(version): logging.getLogger(__name__).info( "Disable import mimir version {}".format(version)) continue if family_type == 'pt': if instance.import_ntfs_in_mimir: actions.append(ntfs2mimir.si(instance.name, _file, version)) # Deprecated: https://github.com/hove-io/mimirsbrunn/blob/4430eed1d81247fffa7cf32ba675a9c5ad8b1cbe/documentation/components.md#stops2mimir if instance.import_stops_in_mimir and not instance.import_ntfs_in_mimir: actions.append(stops2mimir.si(instance.name, _file, version)) elif family_type == 'poi': actions.append(poi2mimir.si(instance.name, _file, version)) else: current_app.logger.warning( "Unsupported family_type {}".format(family_type)) if asynchronous: return chain(*actions).delay() else: # all job are run in sequence and import_in_mimir will only return when all the jobs are finish return chain(*actions).apply()
    'fusio': fusio2ed,
    'osm': osm2ed,
    'geopal': geopal2ed,
    'fare': fare2ed,
    'poi': poi2ed,
    'synonym': synonym2ed,
    'shape': shape2ed,
}

for _file in files:
    filename = None

    dataset = models.DataSet()
    # NOTE: for the moment we do not use the path to load the data here
    # but we'll need to refactor this to take it into account
    dataset.type, _ = utils.type_of_data(_file)
    dataset.family_type = utils.family_of_data(dataset.type)
    if dataset.type in task:
        if backup_file:
            filename = move_to_backupdirectory(_file, instance_config.backup_directory)
        else:
            filename = _file
        actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid))
    else:
        # unknown type, we skip it
        current_app.logger.debug("unknown file type: {} for file {}".format(dataset.type, _file))
        continue

    # currently the name of a dataset is the path to it
def import_data(
    files, instance, backup_file, asynchronous=True, reload=True, custom_output_dir=None, skip_mimir=False
):
    """
    Import the data contained in the list of 'files' into the 'instance'.

    :param files: files to import
    :param instance: instance to receive the data
    :param backup_file: If True the files are moved to a backup directory, else they are not moved
    :param asynchronous: If True all jobs are run in background, else the jobs are run in sequence
        and the function will only return when all of them are finished
    :param reload: If True kraken will be reloaded at the end of the process
    :param custom_output_dir: subdirectory for the nav file created. If not given, the instance default one is taken
    :param skip_mimir: skip importing data into mimir

    Runs the whole data import process:
    - data import in bdd (fusio2ed, gtfs2ed, poi2ed, ...)
    - export bdd to nav file
    - update the jormungandr db with the new data for the instance
    - reload the krakens
    """
    actions = []
    job = models.Job()
    instance_config = load_instance_config(instance.name)
    job.instance = instance
    job.state = 'running'
    task = {
        'gtfs': gtfs2ed,
        'fusio': fusio2ed,
        'osm': osm2ed,
        'geopal': geopal2ed,
        'fare': fare2ed,
        'poi': poi2ed,
        'synonym': synonym2ed,
        'shape': shape2ed,
    }

    for _file in files:
        filename = None

        dataset = models.DataSet()
        # NOTE: for the moment we do not use the path to load the data here
        # but we'll need to refactor this to take it into account
        try:
            dataset.type, _ = utils.type_of_data(_file)
            dataset.family_type = utils.family_of_data(dataset.type)
        except Exception:
            if backup_file:
                move_to_backupdirectory(_file, instance_config.backup_directory)
            current_app.logger.debug(
                "Corrupted source file: {} moved to {}".format(_file, instance_config.backup_directory)
            )
            continue

        if dataset.type in task:
            if backup_file:
                filename = move_to_backupdirectory(_file, instance_config.backup_directory)
            else:
                filename = _file
            actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid))
        else:
            # unknown type, we skip it
            current_app.logger.debug("unknown file type: {} for file {}".format(dataset.type, _file))
            continue

        # currently the name of a dataset is the path to it
        dataset.name = filename
        models.db.session.add(dataset)
        job.data_sets.append(dataset)

    if actions:
        models.db.session.add(job)
        models.db.session.commit()
        # We pass the job id to each task, but the job needs to be committed to have an id
        for action in actions:
            action.kwargs['job_id'] = job.id
        # Create binary file (new .nav.lz4)
        binarisation = [ed2nav.si(instance_config, job.id, custom_output_dir)]
        actions.append(chain(*binarisation))
        # Reload kraken with new data after binarisation (new .nav.lz4)
        if reload:
            actions.append(reload_data.si(instance_config, job.id))

        if not skip_mimir:
            for dataset in job.data_sets:
                actions.extend(send_to_mimir(instance, dataset.name, dataset.family_type))
        else:
            current_app.logger.info("skipping mimir import")

        actions.append(finish_job.si(job.id))
        if asynchronous:
            return chain(*actions).delay()
        else:
            # all jobs are run in sequence; import_data only returns when they are all finished
            return chain(*actions).apply()
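# Hypothetical call of the pipeline above; 'instance' is assumed to be a models.Instance
# already loaded from the db, and the file paths are placeholders.
import_data(
    files=['/srv/ed/input/fr-idf/gtfs.zip', '/srv/ed/input/fr-idf/poi.zip'],
    instance=instance,
    backup_file=True,
    asynchronous=False,  # run the whole chain in-process and block until it finishes
)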
def import_data(
    files,
    instance,
    backup_file,
    asynchronous=True,
    reload=True,
    custom_output_dir=None,
    skip_mimir=False,
    skip_2ed=False,
):
    """
    Import the data contained in the list of 'files' into the 'instance'.

    :param files: files to import
    :param instance: instance to receive the data
    :param backup_file: If True the files are moved to a backup directory, else they are not moved
    :param asynchronous: If True all jobs are run in background, else the jobs are run in sequence
        and the function will only return when all of them are finished
    :param reload: If True kraken will be reloaded at the end of the process
    :param custom_output_dir: subdirectory for the nav file created. If not given, the instance default one is taken
    :param skip_mimir: skip importing data into mimir
    :param skip_2ed: skip inserting last_load_dataset files into ed database

    Runs the whole data import process:
    - data import in bdd (fusio2ed, gtfs2ed, poi2ed, ...)
    - export bdd to nav file
    - update the jormungandr db with the new data for the instance
    - reload the krakens
    """
    actions = []
    job = models.Job()
    instance_config = load_instance_config(instance.name)
    job.instance = instance
    job.state = 'running'
    task = {
        'gtfs': gtfs2ed,
        'fusio': fusio2ed,
        'osm': osm2ed,
        'geopal': geopal2ed,
        'fare': fare2ed,
        'poi': poi2ed,
        'synonym': synonym2ed,
        'shape': shape2ed,
    }

    def process_ed2nav():
        models.db.session.add(job)
        models.db.session.commit()
        # We pass the job id to each task, but the job needs to be committed to have an id
        for action in actions:
            action.kwargs['job_id'] = job.id
        # Create binary file (new .nav.lz4)
        binarisation = [ed2nav.si(instance_config, job.id, custom_output_dir)]
        actions.append(chain(*binarisation))
        # Reload kraken with new data after binarisation (new .nav.lz4)
        if reload:
            actions.append(reload_data.si(instance_config, job.id))

        if not skip_mimir:
            for dataset in job.data_sets:
                actions.extend(send_to_mimir(instance, dataset.name, dataset.family_type))
        else:
            current_app.logger.info("skipping mimir import")

        actions.append(finish_job.si(job.id))
        # We should delete old backup directories related to this instance
        actions.append(purge_instance.si(instance.id, current_app.config['DATASET_MAX_BACKUPS_TO_KEEP']))

        if asynchronous:
            return chain(*actions).delay()
        else:
            # all jobs are run in sequence; import_data only returns when they are all finished
            return chain(*actions).apply()

    if skip_2ed:
        # For skip_2ed, skip inserting last_load_dataset files into ed database
        return process_ed2nav()

    for _file in files:
        filename = None

        dataset = models.DataSet()
        # NOTE: for the moment we do not use the path to load the data here
        # but we'll need to refactor this to take it into account
        try:
            dataset.type, _ = utils.type_of_data(_file)
            dataset.family_type = utils.family_of_data(dataset.type)
        except Exception:
            if backup_file:
                move_to_backupdirectory(_file, instance_config.backup_directory)
            current_app.logger.debug(
                "Corrupted source file: {} moved to {}".format(_file, instance_config.backup_directory)
            )
            continue

        if dataset.type in task:
            if backup_file:
                filename = move_to_backupdirectory(_file, instance_config.backup_directory, manage_sp_char=True)
            else:
                filename = _file

            has_pt_planner_loki = (
                hasattr(instance, 'pt_planners_configurations')
                and "loki" in instance.pt_planners_configurations
            )
            if has_pt_planner_loki:
                loki_data_source = instance.pt_planners_configurations.get('loki', {}).get('data_source')
                if loki_data_source is not None:
                    if loki_data_source == "minio":
                        if dataset.type == "fusio":
                            actions.append(fusio2s3.si(instance_config, filename, dataset_uid=dataset.uid))
                        if dataset.type == "gtfs":
                            actions.append(gtfs2s3.si(instance_config, filename, dataset_uid=dataset.uid))
                    elif loki_data_source == "local" and dataset.type in ["fusio", "gtfs"]:
                        zip_file = zip_if_needed(filename)
                        dest = os.path.join(os.path.dirname(instance_config.target_file), "ntfs.zip")
                        shutil.copy(zip_file, dest)
                    else:
                        current_app.logger.debug(
                            "unknown loki data_source '{}' for coverage '{}'".format(loki_data_source, instance.name)
                        )

            actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid))
        else:
            # unknown type, we skip it
            current_app.logger.debug("unknown file type: {} for file {}".format(dataset.type, _file))
            continue

        # currently the name of a dataset is the path to it
        dataset.name = filename
        dataset.state = "pending"
        models.db.session.add(dataset)
        job.data_sets.append(dataset)

    if actions:
        return process_ed2nav()
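# Assumed shape of the configuration read by the loki branch above (illustrative only);
# 'data_source' selects whether fusio/gtfs datasets are pushed to s3 (minio) or copied next
# to the instance target_file.
instance.pt_planners_configurations = {
    'loki': {
        'data_source': 'minio',  # or 'local'
    },
}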