def create_autocomplete_job_with_data_sets():
    """Persist an 'fr' instance plus one finished job that owns two datasets.

    The job carries a 'fusio' dataset (family 'pt') and a 'cosmogony' dataset
    (family 'autocomplete_cosmogony'). Everything is committed inside an
    application context.
    """
    cosmogony_path = '/path/to/dataset_cosmogony/cosmogony_europe.jsonl.gz'

    with app.app_context():
        session = models.db.session

        fr_instance = models.Instance('fr')
        session.add(fr_instance)
        session.commit()

        done_job = models.Job()
        done_job.instance = fr_instance

        # one dataset for fusio, one for autocomplete_cosmogony
        for index, kind in enumerate(['fusio', 'cosmogony']):
            is_fusio = kind == 'fusio'
            data_set = models.DataSet()
            data_set.type = kind
            data_set.family_type = 'pt' if is_fusio else 'autocomplete_cosmogony'
            data_set.name = '/path/to/dataset_{}'.format(index) if is_fusio else cosmogony_path
            session.add(data_set)
            done_job.data_sets.append(data_set)

        done_job.state = 'done'
        session.add(done_job)
        session.commit()
def create_and_get_dataset(ds_type, family_type, filename):
    """Return a new DataSet filled with the given type, family and name.

    The dataset is only built here; it is not added to the database session.

    :param ds_type: value stored in ``dataset.type``
    :param family_type: value stored in ``dataset.family_type``
    :param filename: value stored in ``dataset.name``
    """
    new_dataset = models.DataSet()
    # currently the name of a dataset is the path to it
    for attribute, value in (('family_type', family_type), ('type', ds_type), ('name', filename)):
        setattr(new_dataset, attribute, value)
    return new_dataset
def send_to_mimir(instance, filename, family_type):
    """
    Build the task signatures that inject a data file into mimir.

    :param instance: instance to receive the data
    :param filename: file to inject towards mimir
    :param family_type: dataset's family type; only 'pt' and 'poi' are handled
    :return: list of task signatures ending with ``finish_job``, or an empty
             list when there is nothing to do

    When something has to be imported, a 'running' job holding one
    'mimir'/'fusio' dataset is committed first so that its id can be handed
    to the tasks (ntfs2mimir, stops2mimir or poi2mimir).
    """
    # if mimir isn't setup do not try to import data for the autocompletion
    mimir_url = current_app.config.get('MIMIR_URL')
    if not mimir_url:
        return []

    # Bail out if the family type is not one that mimir deals with.
    if family_type not in ['pt', 'poi']:
        return []

    # Avoid creating a new job if there is no action on mimir.
    if not instance.import_ntfs_in_mimir and not instance.import_stops_in_mimir:
        return []

    session = models.db.session

    mimir_job = models.Job()
    mimir_job.instance = instance
    mimir_job.state = 'running'

    mimir_dataset = models.DataSet()
    mimir_dataset.family_type = 'mimir'
    mimir_dataset.type = 'fusio'
    # currently the name of a dataset is the path to it
    mimir_dataset.name = filename
    session.add(mimir_dataset)
    mimir_job.data_sets.append(mimir_dataset)

    session.add(mimir_job)
    session.commit()

    signatures = []
    if family_type != 'pt':
        # the only other accepted family is 'poi'
        signatures.append(
            poi2mimir.si(instance.name, filename, mimir_job.id, dataset_uid=mimir_dataset.uid)
        )
    elif instance.import_ntfs_in_mimir:
        # Import ntfs in Mimir
        signatures.append(
            ntfs2mimir.si(instance.name, filename, mimir_job.id, dataset_uid=mimir_dataset.uid)
        )
    elif instance.import_stops_in_mimir:
        # Import stops in Mimir, only when the ntfs import is not enabled.
        # This action is deprecated: https://github.com/CanalTP/mimirsbrunn/blob/4430eed1d81247fffa7cf32ba675a9c5ad8b1cbe/documentation/components.md#stops2mimir
        signatures.append(
            stops2mimir.si(instance.name, filename, mimir_job.id, dataset_uid=mimir_dataset.uid)
        )

    signatures.append(finish_job.si(mimir_job.id))
    return signatures
def create_dataset(dataset_type, dir):
    """Add a dataset and a matching metric to the session and return both.

    :param dataset_type: dataset type; also used to derive the family type
        ('<type>_family') and the metric type ('<type>2ed')
    :param dir: directory prefix used to build the dataset name
    :return: tuple ``(dataset, metric)``; nothing is committed here
    """
    session = models.db.session

    new_dataset = models.DataSet()
    new_dataset.type = dataset_type
    new_dataset.family_type = '{}_family'.format(dataset_type)
    new_dataset.name = '{}/dataset'.format(dir)
    session.add(new_dataset)

    new_metric = models.Metric()
    new_metric.type = '{}2ed'.format(dataset_type)
    new_metric.dataset = new_dataset
    session.add(new_metric)

    return new_dataset, new_metric
def import_autocomplete(files, autocomplete_instance, asynchronous=True, backup_file=True):
    """
    Import the autocomplete instance's data files.

    :param files: paths of the files to import
    :param autocomplete_instance: autocomplete instance receiving the data
    :param asynchronous: if True the task chain is started with ``delay()``,
        otherwise it is run with ``apply()`` and the call only returns once
        every task has finished
    :param backup_file: if True each file is moved to the instance backup
        directory before being imported
    :return: ``(chain result, job)``, or None when no file had a known type
    """
    job = models.Job()
    actions = []

    task = {'bano': bano2mimir, 'oa': openaddresses2mimir, 'osm': osm2mimir, 'cosmogony': cosmogony2mimir}
    autocomplete_dir = current_app.config['TYR_AUTOCOMPLETE_DIR']

    # it's important for the admin to be loaded first, then addresses, then street, then poi
    import_order = ['cosmogony', 'bano', 'oa', 'osm']
    rank = {ftype: position for position, ftype in enumerate(import_order)}
    unknown_rank = len(import_order)

    files_and_types = [(f, type_of_autocomplete_data(f)) for f in files]
    # Unknown types are ranked last instead of being looked up with
    # list.index(), which raised ValueError and made the "skip unknown type"
    # branch below unreachable.
    files_and_types.sort(key=lambda f_t: rank.get(f_t[1], unknown_rank))

    for f, ftype in files_and_types:
        if ftype not in task:
            # unknown type, we skip it
            current_app.logger.debug("unknown file type: {} for file {}".format(ftype, f))
            continue

        dataset = models.DataSet()
        dataset.type = ftype
        dataset.family_type = 'autocomplete_{}'.format(dataset.type)

        if backup_file:
            filename = move_to_backupdirectory(f, autocomplete_instance.backup_dir(autocomplete_dir))
        else:
            filename = f
        actions.append(
            task[dataset.type].si(autocomplete_instance, filename=filename, dataset_uid=dataset.uid)
        )

        # currently the name of a dataset is the path to it
        dataset.name = filename
        models.db.session.add(dataset)
        job.data_sets.append(dataset)
        job.autocomplete_params_id = autocomplete_instance.id

    if not actions:
        return None

    models.db.session.add(job)
    models.db.session.commit()
    # the job must be committed before its id can be handed to the tasks
    for action in actions:
        action.kwargs['job_id'] = job.id
    actions.append(finish_job.si(job.id))

    if asynchronous:
        return chain(*actions).delay(), job
    # all jobs are run in sequence; we only return when all of them are finished
    return chain(*actions).apply(), job
def send_to_mimir(instance, filename):
    """
    Build the task signatures that inject a data file into mimir.

    :param instance: instance to receive the data
    :param filename: file to inject towards mimir
    :return: list of task signatures ending with ``finish_job``, or an empty
             list when the instance has no mimir import enabled

    A 'running' job holding one 'mimir'/'fusio' dataset is committed first so
    that its id can be handed to ntfs2mimir or stops2mimir.
    """
    # Avoid creating a new job if there is no action on mimir.
    if not instance.import_ntfs_in_mimir and not instance.import_stops_in_mimir:
        return []

    session = models.db.session

    mimir_job = models.Job()
    instance_config = load_instance_config(instance.name)
    mimir_job.instance = instance
    mimir_job.state = 'running'

    mimir_dataset = models.DataSet()
    mimir_dataset.family_type = 'mimir'
    mimir_dataset.type = 'fusio'
    # currently the name of a dataset is the path to it
    mimir_dataset.name = filename
    session.add(mimir_dataset)
    mimir_job.data_sets.append(mimir_dataset)

    session.add(mimir_job)
    session.commit()

    signatures = []
    if instance.import_ntfs_in_mimir:
        # Import ntfs in Mimir
        signatures.append(
            ntfs2mimir.si(instance_config, filename, mimir_job.id, dataset_uid=mimir_dataset.uid)
        )
    elif instance.import_stops_in_mimir:
        # Import stops in Mimir, only when the ntfs import is not enabled
        signatures.append(
            stops2mimir.si(instance_config, filename, mimir_job.id, dataset_uid=mimir_dataset.uid)
        )

    signatures.append(finish_job.si(mimir_job.id))
    return signatures
def create_cities_job(creation_date, path, state):
    """Add a job with a single 'cities' dataset to the session.

    :param creation_date: value stored in ``job.created_at``
    :param path: backup directory, used as the dataset name
    :param state: value stored in ``job.state``

    Nothing is committed and nothing is returned.
    """
    session = models.db.session

    cities_dataset = models.DataSet()
    cities_dataset.type = 'cities'
    cities_dataset.family_type = 'cities_family'
    cities_dataset.name = '{}'.format(path)
    session.add(cities_dataset)

    cities_job = models.Job()
    cities_job.state = state
    cities_job.data_sets.append(cities_dataset)
    cities_job.created_at = creation_date
    session.add(cities_job)
def create_dataset(dataset_type):
    """Add a dataset and a timed metric to the session and return both.

    :param dataset_type: dataset type; also used to derive the family type
        ('<type>_family'), the dataset name and the metric type ('<type>2ed')
    :return: tuple ``(dataset, metric)``; nothing is committed here
    """
    session = models.db.session

    new_dataset = models.DataSet()
    new_dataset.type = dataset_type
    new_dataset.family_type = '{}_family'.format(dataset_type)
    new_dataset.name = '/path/to/dataset_{}'.format(dataset_type)
    session.add(new_dataset)

    new_metric = models.Metric()
    new_metric.type = '{}2ed'.format(dataset_type)
    new_metric.duration = datetime.timedelta(seconds=9.0001)
    new_metric.dataset = new_dataset
    session.add(new_metric)

    return new_dataset, new_metric
def add_job_with_data_set_mimir(create_basic_job_with_data_sets):
    """Commit one more finished job for instance 'fr', holding a mimir dataset.

    :param create_basic_job_with_data_sets: not used in the body; it only
        appears in the signature
    """
    with app.app_context():
        session = models.db.session

        mimir_dataset = models.DataSet()
        mimir_dataset.family_type = 'mimir'
        mimir_dataset.type = 'stop2mimir'
        mimir_dataset.name = '/path/to/dataset_3'

        mimir_job = models.Job()
        mimir_job.instance = get_instance_from_db(name='fr')

        session.add(mimir_dataset)
        mimir_job.data_sets.append(mimir_dataset)
        mimir_job.state = 'done'
        session.add(mimir_job)
        session.commit()
def add_job_and_data_set_with_jobstate_running(create_basic_job_with_data_sets):
    """Commit a 'running' job for instance 'fr', holding one osm dataset.

    :param create_basic_job_with_data_sets: not used in the body; it only
        appears in the signature
    """
    with app.app_context():
        session = models.db.session

        osm_dataset = models.DataSet()
        osm_dataset.family_type = 'osm'
        osm_dataset.type = 'osm'
        osm_dataset.name = '/path/to/dataset_osm'

        running_job = models.Job()
        running_job.instance = get_instance_from_db(name='fr')

        session.add(osm_dataset)
        running_job.data_sets.append(osm_dataset)
        running_job.state = 'running'
        session.add(running_job)
        session.commit()
def create_autocomplete_parameter():
    """Commit an 'idf' autocomplete parameter and three finished jobs for it.

    Each job owns exactly one dataset: one 'bano' and two 'osm', named
    '/path/to/dataset_0' to '/path/to/dataset_2'.
    """
    with app.app_context():
        session = models.db.session

        idf_param = models.AutocompleteParameter('idf', 'OSM', 'BANO', 'FUSIO', 'OSM', [8, 9])
        session.add(idf_param)
        session.commit()

        # 3 datasets, one for bano, 2 for osm, each attached to its own job
        for index, kind in enumerate(['bano', 'osm', 'osm']):
            data_set = models.DataSet()
            data_set.type = kind
            data_set.family_type = 'autocomplete_{}'.format(kind)
            data_set.name = '/path/to/dataset_{}'.format(index)
            session.add(data_set)

            done_job = models.Job()
            done_job.autocomplete_params_id = idf_param.id
            done_job.data_sets.append(data_set)
            done_job.state = 'done'
            session.add(done_job)
            session.commit()
def create_basic_job_with_data_sets():
    """Persist an 'fr' instance plus one finished job that owns two datasets.

    The job carries a 'fusio' dataset (family 'pt') and a 'synonym' dataset
    (family 'synonym'). Everything is committed inside an application context.
    """
    with app.app_context():
        session = models.db.session

        fr_instance = models.Instance('fr')
        session.add(fr_instance)
        session.commit()

        done_job = models.Job()
        done_job.instance = fr_instance

        # one dataset for fusio, one for synonym
        for index, kind in enumerate(['fusio', 'synonym']):
            data_set = models.DataSet()
            data_set.type = kind
            # the family is the type itself, except for fusio which belongs to 'pt'
            data_set.family_type = 'pt' if kind == 'fusio' else kind
            data_set.name = '/path/to/dataset_{}'.format(index)
            session.add(data_set)
            done_job.data_sets.append(data_set)

        done_job.state = 'done'
        session.add(done_job)
        session.commit()
def import_data(files, instance, backup_file):
    """
    Import the data contained in the list of 'files' into the 'instance'.

    :param files: files to import
    :param instance: instance to receive the data
    :param backup_file: if True the files are moved to a backup directory,
        else they are left where they are

    Runs the whole data import process:
    - data import in bdd (fusio2ed, gtfs2ed, poi2ed, ...)
    - export bdd to nav file
    - update the jormungandr db with the new data for the instance
    - reload the krakens

    Nothing is scheduled when no file has a known type.
    """
    importers = {
        'gtfs': gtfs2ed,
        'fusio': fusio2ed,
        'osm': osm2ed,
        'geopal': geopal2ed,
        'fare': fare2ed,
        'poi': poi2ed,
        'synonym': synonym2ed,
    }
    session = models.db.session

    pipeline = []
    job = models.Job()
    instance_config = load_instance_config(instance.name)
    job.instance = instance
    job.state = 'pending'

    for source_file in files:
        dataset = models.DataSet()
        dataset.type = type_of_data(source_file)

        if dataset.type not in importers:
            # unknown type, we skip it
            current_app.logger.debug("unknwn file type: {} for file {}".format(
                dataset.type, source_file))
            continue

        stored_path = source_file
        if backup_file:
            stored_path = move_to_backupdirectory(
                source_file, instance_config.backup_directory)
        pipeline.append(importers[dataset.type].si(instance_config, stored_path))

        # currently the name of a dataset is the path to it
        dataset.name = stored_path
        session.add(dataset)
        job.data_sets.append(dataset)

    if not pipeline:
        return

    session.add(job)
    session.commit()
    # the job needs to be committed to have an id before we pass it to the tasks
    for signature in pipeline:
        signature.kwargs['job_id'] = job.id

    binarisation = chain(
        ed2nav.si(instance_config, job.id),
        nav2rt.si(instance_config, job.id),
    )
    aggregate = aggregate_places.si(instance_config, job.id)
    pipeline.extend([
        group(binarisation, aggregate),
        reload_data.si(instance_config, job.id),
        finish_job.si(job.id),
    ])
    chain(*pipeline).delay()
job.state = 'pending' task = { 'gtfs': gtfs2ed, 'fusio': fusio2ed, 'osm': osm2ed, 'geopal': geopal2ed, 'fare': fare2ed, 'poi': poi2ed, 'synonym': synonym2ed, 'shape': shape2ed, } for _file in files: filename = None dataset = models.DataSet() # NOTE: for the moment we do not use the path to load the data here # but we'll need to refactor this to take it into account dataset.type, _ = utils.type_of_data(_file) dataset.family_type = utils.family_of_data(dataset.type) if dataset.type in task: if backup_file: filename = move_to_backupdirectory(_file, instance_config.backup_directory) else: filename = _file actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid)) else: #unknown type, we skip it current_app.logger.debug("unknwn file type: {} for file {}" .format(dataset.type, _file))
def import_data(
    files, instance, backup_file, asynchronous=True, reload=True, custom_output_dir=None, skip_mimir=False
):
    """
    Import the data contained in the list of 'files' into the 'instance'.

    :param files: files to import
    :param instance: instance to receive the data
    :param backup_file: If True the files are moved to a backup directory, else they are not moved
    :param asynchronous: If True all jobs are run in background, else the jobs are run in sequence and the
                         function will only return when all of them are finished
    :param reload: If True kraken is reloaded at the end of the treatment
    :param custom_output_dir: subdirectory for the nav file created. If not given, the instance default one is taken
    :param skip_mimir: skip importing data into mimir
    :return: the result of running the task chain (``delay()`` or ``apply()``), or None when no file
             produced an import task

    Runs the whole data import process:
    - data import in bdd (fusio2ed, gtfs2ed, poi2ed, ...)
    - export bdd to nav file
    - update the jormungandr db with the new data for the instance
    - reload the krakens
    """
    actions = []
    job = models.Job()
    instance_config = load_instance_config(instance.name)
    job.instance = instance
    job.state = 'running'
    # dataset type -> task importing that kind of file
    task = {
        'gtfs': gtfs2ed,
        'fusio': fusio2ed,
        'osm': osm2ed,
        'geopal': geopal2ed,
        'fare': fare2ed,
        'poi': poi2ed,
        'synonym': synonym2ed,
        'shape': shape2ed,
    }

    for _file in files:
        filename = None

        dataset = models.DataSet()
        # NOTE: for the moment we do not use the path to load the data here
        # but we'll need to refactor this to take it into account
        try:
            dataset.type, _ = utils.type_of_data(_file)
            dataset.family_type = utils.family_of_data(dataset.type)
        except Exception:
            # Type detection failed: the file is treated as corrupted, moved away
            # (when backups are enabled) and ignored for the rest of the import.
            if backup_file:
                move_to_backupdirectory(_file, instance_config.backup_directory)
            current_app.logger.debug(
                "Corrupted source file : {} moved to {}".format(_file, instance_config.backup_directory)
            )
            continue

        if dataset.type in task:
            if backup_file:
                filename = move_to_backupdirectory(_file, instance_config.backup_directory)
            else:
                filename = _file
            # job_id is not known yet; it is injected into the kwargs after the commit below
            actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid))
        else:
            # unknown type, we skip it
            current_app.logger.debug("unknown file type: {} for file {}".format(dataset.type, _file))
            continue

        # currently the name of a dataset is the path to it
        dataset.name = filename
        models.db.session.add(dataset)
        job.data_sets.append(dataset)

    # The job is only persisted when at least one file has to be imported.
    if actions:
        models.db.session.add(job)
        models.db.session.commit()
        # We pass the job id to each task, but the job needs to be committed to have an id
        for action in actions:
            action.kwargs['job_id'] = job.id
        # Create binary file (New .nav.lz4)
        binarisation = [ed2nav.si(instance_config, job.id, custom_output_dir)]
        actions.append(chain(*binarisation))
        # Reload kraken with new data after binarisation (New .nav.lz4)
        if reload:
            actions.append(reload_data.si(instance_config, job.id))

        if not skip_mimir:
            # send_to_mimir returns a (possibly empty) list of signatures per dataset
            for dataset in job.data_sets:
                actions.extend(send_to_mimir(instance, dataset.name, dataset.family_type))
        else:
            current_app.logger.info("skipping mimir import")

        actions.append(finish_job.si(job.id))
        if asynchronous:
            return chain(*actions).delay()
        else:
            # all jobs are run in sequence and import_data will only return when all the jobs are finished
            return chain(*actions).apply()
def update_data():
    """Import every file waiting in the source directory of each instance.

    For each instance, every file of a known type is moved to the instance
    backup directory and an import task is queued for it. When at least one
    task was queued, a 'pending' job is committed and the chain
    import -> (ed2nav + nav2rt | aggregate_places) -> reload_data ->
    finish_job is started asynchronously. Files of unknown type are ignored.
    """
    importers = {
        'gtfs': gtfs2ed,
        'fusio': fusio2ed,
        'osm': osm2ed,
        'geopal': geopal2ed,
        'fare': fare2ed,
        'poi': poi2ed,
        'synonym': synonym2ed,
    }
    session = models.db.session

    for instance in models.Instance.query.all():
        current_app.logger.debug("Update data of : %s" % instance.name)
        instance_config = load_instance_config(instance.name)
        source_files = glob.glob(instance_config.source_directory + "/*")

        pipeline = []
        job = models.Job()
        job.instance = instance
        job.state = 'pending'

        for source_file in source_files:
            dataset = models.DataSet()
            dataset.type = type_of_data(source_file)

            importer = importers.get(dataset.type)
            if importer is None:
                # unknown type, we skip it
                continue

            stored_path = move_to_backupdirectory(
                source_file, instance_config.backup_directory)
            pipeline.append(importer.si(instance_config, stored_path))

            # currently the name of a dataset is the path to it
            dataset.name = stored_path
            session.add(dataset)
            job.data_sets.append(dataset)

        if not pipeline:
            continue

        session.add(job)
        session.commit()
        # the job needs to be committed to have an id before we pass it to the tasks
        for signature in pipeline:
            signature.kwargs['job_id'] = job.id

        binarisation = chain(
            ed2nav.si(instance_config, job.id),
            nav2rt.si(instance_config, job.id),
        )
        aggregate = aggregate_places.si(instance_config, job.id)
        pipeline.extend([
            group(binarisation, aggregate),
            reload_data.si(instance_config, job.id),
            finish_job.si(job.id),
        ])
        chain(*pipeline).delay()
def import_data(
    files,
    instance,
    backup_file,
    asynchronous=True,
    reload=True,
    custom_output_dir=None,
    skip_mimir=False,
    skip_2ed=False,
):
    """
    Import the data contained in the list of 'files' into the 'instance'.

    :param files: files to import
    :param instance: instance to receive the data
    :param backup_file: If True the files are moved to a backup directory, else they are not moved
    :param asynchronous: If True all jobs are run in background, else the jobs are run in sequence and the
                         function will only return when all of them are finished
    :param reload: If True kraken is reloaded at the end of the treatment
    :param custom_output_dir: subdirectory for the nav file created. If not given, the instance default one is taken
    :param skip_mimir: skip importing data into mimir
    :param skip_2ed: skip inserting last_load_dataset files into ed database
    :return: the result of running the task chain (``delay()`` or ``apply()``), or None when no file
             produced an import task and skip_2ed is False

    Runs the whole data import process:
    - data import in bdd (fusio2ed, gtfs2ed, poi2ed, ...)
    - export bdd to nav file
    - update the jormungandr db with the new data for the instance
    - reload the krakens
    """
    actions = []
    job = models.Job()
    instance_config = load_instance_config(instance.name)
    job.instance = instance
    job.state = 'running'
    # dataset type -> task importing that kind of file into ed
    task = {
        'gtfs': gtfs2ed,
        'fusio': fusio2ed,
        'osm': osm2ed,
        'geopal': geopal2ed,
        'fare': fare2ed,
        'poi': poi2ed,
        'synonym': synonym2ed,
        'shape': shape2ed,
    }

    def process_ed2nav():
        # Closure over `job` and `actions`: commits the job, appends the
        # post-import steps to `actions` and runs the resulting chain.
        models.db.session.add(job)
        models.db.session.commit()
        # We pass the job id to each task, but the job needs to be committed to have an id
        for action in actions:
            action.kwargs['job_id'] = job.id
        # Create binary file (New .nav.lz4)
        binarisation = [ed2nav.si(instance_config, job.id, custom_output_dir)]
        actions.append(chain(*binarisation))
        # Reload kraken with new data after binarisation (New .nav.lz4)
        if reload:
            actions.append(reload_data.si(instance_config, job.id))

        if not skip_mimir:
            # NOTE(review): on the skip_2ed path no dataset has been attached to
            # `job` by this function, so this loop looks like a no-op there - confirm.
            for dataset in job.data_sets:
                actions.extend(
                    send_to_mimir(instance, dataset.name, dataset.family_type))
        else:
            current_app.logger.info("skipping mimir import")

        actions.append(finish_job.si(job.id))

        # We should delete old backup directories related to this instance
        actions.append(
            purge_instance.si(
                instance.id, current_app.config['DATASET_MAX_BACKUPS_TO_KEEP']))
        if asynchronous:
            return chain(*actions).delay()
        else:
            # all jobs are run in sequence and import_data will only return when all the jobs are finished
            return chain(*actions).apply()

    if skip_2ed:
        # For skip_2ed, skip inserting last_load_dataset files into ed database
        return process_ed2nav()

    for _file in files:
        filename = None

        dataset = models.DataSet()
        # NOTE: for the moment we do not use the path to load the data here
        # but we'll need to refactor this to take it into account
        try:
            dataset.type, _ = utils.type_of_data(_file)
            dataset.family_type = utils.family_of_data(dataset.type)
        except Exception:
            # Type detection failed: the file is treated as corrupted, moved away
            # (when backups are enabled) and ignored for the rest of the import.
            if backup_file:
                move_to_backupdirectory(_file, instance_config.backup_directory)
            current_app.logger.debug(
                "Corrupted source file : {} moved to {}".format(
                    _file, instance_config.backup_directory))
            continue

        if dataset.type in task:
            if backup_file:
                filename = move_to_backupdirectory(
                    _file, instance_config.backup_directory, manage_sp_char=True)
            else:
                filename = _file

            # Extra handling when the instance declares a "loki" pt planner.
            has_pt_planner_loki = (
                hasattr(instance, 'pt_planners_configurations')
                and "loki" in instance.pt_planners_configurations)
            if has_pt_planner_loki:
                loki_data_source = instance.pt_planners_configurations.get(
                    'loki', {}).get('data_source')
                if loki_data_source is not None:
                    if loki_data_source == "minio":
                        # queue an extra task for fusio/gtfs files
                        if dataset.type == "fusio":
                            actions.append(
                                fusio2s3.si(instance_config, filename, dataset_uid=dataset.uid))
                        if dataset.type == "gtfs":
                            actions.append(
                                gtfs2s3.si(instance_config, filename, dataset_uid=dataset.uid))
                    elif loki_data_source == "local" and dataset.type in [
                            "fusio", "gtfs"
                    ]:
                        # Done immediately (not as a task): the file, zipped if needed,
                        # is copied next to the instance target file as "ntfs.zip".
                        zip_file = zip_if_needed(filename)
                        dest = os.path.join(
                            os.path.dirname(instance_config.target_file), "ntfs.zip")
                        shutil.copy(zip_file, dest)
                    else:
                        # Also reached for a "local" data source when the file is
                        # neither fusio nor gtfs.
                        current_app.logger.debug(
                            "unknown loki data_source '{}' for coverage '{}'".
                            format(loki_data_source, instance.name))

            # job_id is not known yet; it is injected by process_ed2nav after the commit
            actions.append(task[dataset.type].si(instance_config, filename, dataset_uid=dataset.uid))
        else:
            # unknown type, we skip it
            current_app.logger.debug(
                "unknown file type: {} for file {}".format(
                    dataset.type, _file))
            continue

        # currently the name of a dataset is the path to it
        dataset.name = filename
        dataset.state = "pending"
        models.db.session.add(dataset)
        job.data_sets.append(dataset)

    # Nothing is committed or scheduled when no file produced an import task.
    if actions:
        return process_ed2nav()