def _get_source_file(file_name):
    """Fetches the source file model stored under the given file name, or builds a new (un-saved) one

    :param file_name: The name of the source file
    :type file_name: string
    :returns: The stored source file model if one exists, otherwise a new un-saved model flagged as deleted
    :rtype: :class:`source.models.SourceFile`
    """

    try:
        return SourceFile.objects.get(file_name=file_name)
    except SourceFile.DoesNotExist:
        # Nothing stored under this name yet, so hand back a fresh model that has not been saved.
        # NOTE(review): is_deleted starts out True, presumably so the ingest treats the file as not yet
        # registered -- confirm against the callers.
        placeholder = SourceFile()
        placeholder.file_name = file_name
        placeholder.is_deleted = True
        return placeholder
def handle(self, *args, **options):
    """See :meth:`django.core.management.base.BaseCommand.handle`.

    This method migrates existing data files into scale. An ingest record is built for every file found under the
    local path. Unless ``no_commit`` is set, each record is saved and then turned into a source file, after which the
    ingest trigger handler is invoked for it. With ``no_commit`` set, the serialized ingest records are only logged.

    Options read: ``workspace`` (a workspace name, or an ID when no workspace has that name), ``workspace_path``,
    ``data_type``, ``local_path``, ``include``, ``exclude`` and ``no_commit``.

    :returns: False if ``workspace`` or ``workspace_path`` was not supplied, otherwise None
    :rtype: bool
    """

    logger.info(u'Command starting: migratedata')

    if options['workspace'] is None or options['workspace_path'] is None:
        logger.error('Must specify workspace and workspace-path.')
        return False

    workspace_path = options['workspace_path']

    # Look the workspace up by name first; if no workspace has that name, treat the value as an ID
    workspace = Workspace.objects.filter(name=options['workspace']).first()
    if workspace is None:
        workspace = Workspace.objects.get(id=int(options['workspace']))

    data_types = []
    if options['data_type'] is not None:
        data_types.extend(options['data_type'])

    mnt_dirs = None
    if options['local_path'] is not None:
        local_path = options['local_path']
    else:
        # No local copy was given, so have the workspace set up a download directory under a temp dir
        mnt_dirs = "/tmp", tempfile.mkdtemp()
        workspace.setup_download_dir(*mnt_dirs)
        local_path = os.path.join(mnt_dirs[1], workspace_path)

    try:
        logger.info("Ingesting files from %s/%s", workspace.name, workspace_path)
        filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
        logger.info("Found %d files", len(filenames))

        # prepare for ingest ala strike
        ingest_records = {}
        for filename in filenames:
            logger.info("Generating ingest record for %s", filename)
            ingest = Ingest()
            ingest.file_name = os.path.basename(filename)
            ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
            # NOTE(review): utcfromtimestamp() yields a naive datetime while timezone.now() is used for the other
            # timestamps -- confirm the naive value is stored as intended.
            ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
            ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
            ingest.transfer_ended = timezone.now()
            ingest.media_type = get_media_type(filename)
            ingest.workspace = workspace
            for data_type in data_types:
                ingest.add_data_type_tag(data_type)
            ingest.status = 'TRANSFERRED'
            if options['no_commit']:
                # Dry run: only log what would have been saved
                s = IngestDetailsSerializer()
                logger.info(s.to_representation(ingest))
            else:
                ingest.save()
                ingest_records[filename] = ingest.id
        logger.info("Ingests records created")

        # start ingest tasks for all the files
        if not options['no_commit']:
            logger.info("Starting ingest tasks")
            for filename in filenames:
                ingest = Ingest.objects.get(id=ingest_records[filename])
                logger.info("Processing ingest %s", ingest.file_name)
                # The source file, the ingest update and the trigger processing are committed together
                with transaction.atomic():
                    ingest.ingest_started = timezone.now()
                    sf = ingest.source_file = SourceFile.create()
                    sf.update_uuid(ingest.file_name)
                    for tag in ingest.get_data_type_tags():
                        sf.add_data_type_tag(tag)
                    sf.media_type = ingest.media_type
                    sf.file_name = ingest.file_name
                    sf.file_size = ingest.file_size
                    sf.file_path = ingest.file_path
                    sf.workspace = workspace
                    sf.is_deleted = False
                    sf.deleted = None
                    # Saved once before set_countries() and once after, as in the original flow
                    sf.save()
                    sf.set_countries()
                    sf.save()
                    ingest.status = 'INGESTED'
                    ingest.ingest_ended = timezone.now()
                    ingest.source_file = sf
                    ingest.save()
                    IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)
            logger.info("Ingests processed, monitor the queue for triggered jobs.")
    finally:
        # Always undo the download directory setup, even when the migration fails part way through
        if mnt_dirs is not None:
            workspace.cleanup_download_dir(*mnt_dirs)

    logger.info(u'Command completed: migratedata')
def perform_ingest(ingest_id):
    """Runs the ingest identified by the given ID

    An ingest already marked INGESTED or DUPLICATE is skipped with a warning. Otherwise the ingest is started and,
    if its status is INGESTING afterwards, the source file is copied to a new workspace, moved to a new path, or
    registered in place. Any exception marks the ingest ERRORED and is re-raised; success marks it INGESTED.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ('INGESTED', 'DUPLICATE'):
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        src = ingest.source_file

        if src.is_deleted:
            # The source file is still flagged as deleted, so it has yet to be copied/moved/registered
            src.set_basic_fields(file_name, ingest.file_size, ingest.media_type, ingest.get_data_type_tags())
            src.update_uuid(file_name)  # Stable identifier derived from the file name
            src.workspace = ingest.workspace
            src.file_path = ingest.file_path
            src.is_deleted = False
            src.is_parsed = False
            src.deleted = None
            src.parsed = None

            if ingest.new_workspace:
                # Copying needs a local path. Prefer a direct path from the broker; when none is available the
                # file has to be downloaded first and copied from there.
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the
                # extra download is not necessary
                direct_paths = ingest.workspace.get_file_system_paths([src])
                if not direct_paths:
                    local_path = os.path.join('/tmp', file_name)
                    ScaleFile.objects.download_files([FileDownload(src, local_path, False)])
                else:
                    local_path = direct_paths[0]

                src.file_path = ingest.new_file_path or ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                            ingest.workspace.name, src.file_path, ingest.new_workspace.name)
                ScaleFile.objects.upload_files(ingest.new_workspace, [FileUpload(src, local_path)])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                ScaleFile.objects.move_files([FileMove(src, ingest.new_file_path)])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)
                _save_source_file(src)

        if ingest.new_workspace:
            # The file now lives in the new workspace, so remove the copy in the old workspace (only possible when
            # the old workspace provides a local path)
            stale_file = SourceFile.create()
            stale_file.file_name = file_name
            stale_file.file_path = ingest.file_path
            stale_paths = ingest.workspace.get_file_system_paths([stale_file])
            if stale_paths:
                _delete_file(stale_paths[0])
    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise
    else:
        _complete_ingest(ingest, 'INGESTED')
        logger.info('Ingest successful for %s', file_name)
def perform_ingest(ingest_id):
    """Runs the ingest identified by the given ID

    An ingest already marked INGESTED or DUPLICATE is skipped with a warning. Otherwise the ingest is started and,
    if its status is INGESTING afterwards, the source file is copied to a new workspace, moved to a new path, or
    registered in place. Any exception marks the ingest ERRORED and is re-raised; success marks it INGESTED.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ('INGESTED', 'DUPLICATE'):
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        src = ingest.source_file

        if src.is_deleted:
            # The source file is still flagged as deleted, so it has yet to be copied/moved/registered
            src.set_basic_fields(file_name, ingest.file_size, ingest.media_type, ingest.get_data_type_tags())
            src.update_uuid(file_name)  # Stable identifier derived from the file name
            src.workspace = ingest.workspace
            src.file_path = ingest.file_path
            src.is_deleted = False
            src.is_parsed = False
            src.deleted = None
            src.parsed = None

            if ingest.new_workspace:
                # Copying needs a local path. Prefer a direct path from the broker; when none is available the
                # file has to be downloaded first and copied from there.
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the
                # extra download is not necessary
                direct_paths = ingest.workspace.get_file_system_paths([src])
                if not direct_paths:
                    local_path = os.path.join('/tmp', file_name)
                    ScaleFile.objects.download_files([FileDownload(src, local_path)])
                else:
                    local_path = direct_paths[0]

                src.file_path = ingest.new_file_path or ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                            ingest.workspace.name, src.file_path, ingest.new_workspace.name)
                ScaleFile.objects.upload_files(ingest.new_workspace, [FileUpload(src, local_path)])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                ScaleFile.objects.move_files([FileMove(src, ingest.new_file_path)])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)
                _save_source_file(src)

        if ingest.new_workspace:
            # The file now lives in the new workspace, so remove the copy in the old workspace (only possible when
            # the old workspace provides a local path)
            stale_file = SourceFile()
            stale_file.file_name = file_name
            stale_file.file_path = ingest.file_path
            stale_paths = ingest.workspace.get_file_system_paths([stale_file])
            if stale_paths:
                _delete_file(stale_paths[0])
    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise
    else:
        _complete_ingest(ingest, 'INGESTED')
        logger.info('Ingest successful for %s', file_name)