def set_basic_fields(self, file_name, file_size, media_type=None, data_type=None):
    """Sets the basic fields for the Scale file

    :param file_name: The name of the file
    :type file_name: string
    :param file_size: The size of the file in bytes
    :type file_size: long
    :param media_type: The IANA media type of the file
    :type media_type: string
    :param data_type: The set of data type tags for the file
    :type data_type: set
    """

    if not media_type:
        media_type = get_media_type(file_name)

    self.file_name = file_name
    self.file_size = file_size
    self.media_type = media_type
    if data_type:
        for tag in data_type:
            self.add_data_type_tag(tag)
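# Usage sketch for set_basic_fields() (illustrative; the file name, size, and tag
# values below are hypothetical). Only the name and size are required; the media
# type is derived from the file name when not supplied.
from storage.models import ScaleFile

scale_file = ScaleFile()
scale_file.set_basic_fields('capture_001.nitf', 1048576, data_type={'satellite'})
# scale_file.media_type now holds whatever get_media_type('capture_001.nitf') returned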
def create_ingest(self, file_name, workspace, scan_id=None, strike_id=None):
    """Creates a new ingest for the given file name. The database save is the caller's responsibility.

    :param file_name: The name of the file being ingested
    :type file_name: string
    :param workspace: The workspace where the file will be stored
    :type workspace: :class:`storage.models.Workspace`
    :param scan_id: The ID of the Scan that detected the file, if any
    :type scan_id: int
    :param strike_id: The ID of the Strike that detected the file, if any
    :type strike_id: int
    :returns: The new ingest model
    :rtype: :class:`ingest.models.Ingest`
    """

    ingest = Ingest()
    if scan_id:
        ingest.scan_id = scan_id
    if strike_id:
        ingest.strike_id = strike_id
    ingest.file_name = file_name
    ingest.media_type = get_media_type(ingest.file_name)
    ingest.workspace = workspace
    return ingest
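# Usage sketch for create_ingest() (illustrative). This assumes the method lives
# on the Ingest model manager; the workspace name and strike ID are hypothetical.
from ingest.models import Ingest
from storage.models import Workspace

workspace = Workspace.objects.get(name='raw')
ingest = Ingest.objects.create_ingest('capture_001.nitf', workspace, strike_id=1)
ingest.save()  # the database save is the caller's responsibility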
def upload_files(self, workspace, file_uploads):
    """Uploads the given files from the given local file system paths into the given workspace. Each ScaleFile
    model should have its file_path field populated with the relative location where the file should be stored
    within the workspace. This method will update the workspace and other fields (including possibly changing
    file_path) in each ScaleFile model and will save the models to the database.

    :param workspace: The workspace to upload files into
    :type workspace: :class:`storage.models.Workspace`
    :param file_uploads: List of files to upload
    :type file_uploads: [:class:`storage.brokers.broker.FileUpload`]
    :returns: The list of saved file models
    :rtype: [:class:`storage.models.ScaleFile`]

    :raises :class:`storage.exceptions.ArchivedWorkspace`: If one of the files has a workspace that is archived
    :raises :class:`storage.exceptions.MissingRemoteMount`: If a required mount location is missing
    """

    if not workspace.is_active:
        raise ArchivedWorkspace('%s is no longer active' % workspace.name)

    file_list = []
    for file_upload in file_uploads:
        scale_file = file_upload.file
        media_type = scale_file.media_type

        # Determine file properties
        file_name = os.path.basename(file_upload.local_path)
        if not media_type:
            media_type = get_media_type(file_name)
        file_size = os.path.getsize(file_upload.local_path)

        scale_file.file_name = file_name
        scale_file.media_type = media_type
        scale_file.file_size = file_size
        scale_file.workspace = workspace
        scale_file.is_deleted = False
        scale_file.deleted = None

        file_list.append(scale_file)

    # Store files in workspace
    workspace.upload_files(file_uploads)

    # Populate the country list for all files that were saved
    for file_upload in file_uploads:
        scale_file = file_upload.file
        if scale_file.pk:
            scale_file.set_countries()
            scale_file.save()

    return file_list
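# Usage sketch for upload_files() (illustrative). FileUpload pairs a ScaleFile
# model with its local path, per the file_upload.file and file_upload.local_path
# accesses above; the manager access and paths here are assumptions.
from storage.brokers.broker import FileUpload
from storage.models import ScaleFile, Workspace

workspace = Workspace.objects.get(name='products')  # hypothetical workspace name
scale_file = ScaleFile()
scale_file.file_path = 'results/capture_001.nitf'  # relative workspace location
uploads = [FileUpload(scale_file, '/tmp/capture_001.nitf')]
saved_files = ScaleFile.objects.upload_files(workspace, uploads)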
def _create_ingest(self, file_name):
    """Creates a new ingest for the given file name. The database save is the caller's responsibility.

    :param file_name: The name of the file being ingested
    :type file_name: string
    :returns: The new ingest model
    :rtype: :class:`ingest.models.Ingest`
    """

    ingest = Ingest()
    ingest.file_name = file_name
    ingest.strike_id = self.strike_id
    ingest.media_type = get_media_type(file_name)
    ingest.workspace = self._monitored_workspace

    logger.info('New file on %s: %s', ingest.workspace.name, file_name)
    return ingest
def add_file(self, file_name, workspace, scan_id=None, strike_id=None):
    """Adds the source file metadata to this ingest record

    :param file_name: The file name, excluding the full path
    :type file_name: string
    :param workspace: The workspace where the file was detected
    :type workspace: :class:`storage.models.Workspace`
    :param scan_id: The ID of the Scan that detected the file, if any
    :type scan_id: int
    :param strike_id: The ID of the Strike that detected the file, if any
    :type strike_id: int
    """

    if scan_id:
        self.scan_id = scan_id
    if strike_id:
        self.strike_id = strike_id
    self.file_name = file_name
    self.media_type = get_media_type(self.file_name)
    self.workspace = workspace

    logger.info('New file on %s: %s', self.workspace.name, self.file_name)
def _complete_transfer(self, ingest, size):
    """Completes the transfer for the given ingest and updates the database

    :param ingest: The ingest model
    :type ingest: :class:`ingest.models.Ingest`
    :param size: Total size of the file in bytes
    :type size: long
    """

    file_name = ingest.file_name
    file_path = os.path.join(self.strike_dir, file_name)
    if ingest.status != 'TRANSFERRING':
        msg = 'Completing transfer for %s requires TRANSFERRING status'
        raise Exception(msg % file_path)

    logger.info('Transfer complete: %s', file_path)
    last_modified = os.path.getmtime(file_path)
    ingest.transfer_ended = datetime.utcfromtimestamp(last_modified)
    ingest.media_type = get_media_type(file_name)
    ingest.file_size = size

    # Check configuration for what to do with this file
    file_config = self.configuration.match_file_name(file_name)
    if file_config:
        for data_type in file_config[0]:
            ingest.add_data_type_tag(data_type)
        today = now()
        # Store file within workspace at /configuration_path/current_year/current_month/current_day/file_name
        year_dir = str(today.year)
        month_dir = '%02d' % today.month
        day_dir = '%02d' % today.day
        ingest.file_path = os.path.join(file_config[1], year_dir, month_dir, day_dir, file_name)
        ingest.workspace = file_config[2]

    ingest.ingest_path = os.path.join(self.rel_ingest_dir, file_name)
    ingest.status = 'TRANSFERRED'
    ingest.save()
    logger.info('Ingest marked as TRANSFERRED: %s', file_name)
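# A minimal sketch of the date-based workspace path built above: files land at
# configuration_path/YYYY/MM/DD/file_name (the configuration path and file name
# here are hypothetical).
import os
from django.utils.timezone import now

today = now()
file_path = os.path.join('ingested', str(today.year), '%02d' % today.month,
                         '%02d' % today.day, 'capture_001.nitf')
# e.g. 'ingested/2016/06/09/capture_001.nitf'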
def handle(self, *args, **options):
    """See :meth:`django.core.management.base.BaseCommand.handle`.

    This method migrates existing data files into Scale.
    """

    logger.info(u'Command starting: migratedata')

    workspace, workspace_path, local_path, data_types = None, None, None, []
    if options['workspace'] is not None and options['workspace_path'] is not None:
        workspace, workspace_path = options['workspace'], options['workspace_path']
        tmp = Workspace.objects.filter(name=workspace)
        if tmp.count() > 0:
            workspace = tmp.first()
        else:
            workspace = Workspace.objects.get(id=int(workspace))
    else:
        logger.error('Must specify workspace and workspace-path.')
        return False
    if options['data_type'] is not None:
        data_types.extend(options['data_type'])

    mnt_dirs = None
    if options['local_path'] is not None:
        local_path = options['local_path']
    else:
        # Mount the workspace to a temporary download directory
        mnt_dirs = "/tmp", tempfile.mkdtemp()
        workspace.setup_download_dir(*mnt_dirs)
        local_path = os.path.join(mnt_dirs[1], workspace_path)

    logger.info("Ingesting files from %s/%s", workspace.name, workspace_path)
    filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
    logger.info("Found %d files", len(filenames))

    # Prepare for ingest, a la Strike
    ingest_records = {}
    for filename in filenames:
        logger.info("Generating ingest record for %s", filename)
        ingest = Ingest()
        ingest.file_name = os.path.basename(filename)
        ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
        ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
        ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
        ingest.transfer_ended = timezone.now()
        ingest.media_type = get_media_type(filename)
        ingest.workspace = workspace
        for data_type in data_types:
            ingest.add_data_type_tag(data_type)
        ingest.status = 'TRANSFERRED'
        if options['no_commit']:
            s = IngestDetailsSerializer()
            logger.info(s.to_representation(ingest))
        else:
            ingest.save()
            ingest_records[filename] = ingest.id
    logger.info("Ingest records created")

    # Start ingest tasks for all the files
    if not options['no_commit']:
        logger.info("Starting ingest tasks")
        for filename in filenames:
            ingest = Ingest.objects.get(id=ingest_records[filename])
            logger.info("Processing ingest %s", ingest.file_name)
            with transaction.atomic():
                ingest.ingest_started = timezone.now()
                sf = ingest.source_file = SourceFile.create()
                sf.update_uuid(ingest.file_name)
                for tag in ingest.get_data_type_tags():
                    sf.add_data_type_tag(tag)
                sf.media_type = ingest.media_type
                sf.file_name = ingest.file_name
                sf.file_size = ingest.file_size
                sf.file_path = ingest.file_path
                sf.workspace = workspace
                sf.is_deleted = False
                sf.deleted = None
                sf.save()
                sf.set_countries()
                sf.save()

                ingest.status = 'INGESTED'
                ingest.ingest_ended = timezone.now()
                ingest.source_file = sf
                ingest.save()
                IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)
        logger.info("Ingests processed, monitor the queue for triggered jobs.")

    if mnt_dirs is not None:
        workspace.cleanup_download_dir(*mnt_dirs)

    logger.info(u'Command completed: migratedata')
def handle(self, *args, **options):
    """See :meth:`django.core.management.base.BaseCommand.handle`.

    This method migrates existing data files into Scale.
    """

    logger.info(u'Command starting: migratedata')

    workspace, workspace_path, local_path, data_types = None, None, None, []
    if options['workspace'] is not None and options['workspace_path'] is not None:
        workspace, workspace_path = options['workspace'], options['workspace_path']
        tmp = Workspace.objects.filter(name=workspace)
        if tmp.count() > 0:
            workspace = tmp.first()
        else:
            workspace = Workspace.objects.get(id=int(workspace))
    else:
        logger.error('Must specify workspace and workspace-path.')
        return False
    if options['data_type'] is not None:
        data_types.extend(options['data_type'])

    mnt_dirs = None
    if options['local_path'] is not None:
        local_path = options['local_path']
    else:
        # Mount the workspace to a temporary download directory
        mnt_dirs = "/tmp", tempfile.mkdtemp()
        workspace.setup_download_dir(*mnt_dirs)
        local_path = os.path.join(mnt_dirs[1], workspace_path)

    logger.info("Ingesting files from %s/%s", workspace.name, workspace_path)
    filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
    logger.info("Found %d files", len(filenames))

    # Prepare for ingest, a la Strike
    ingest_records = {}
    for filename in filenames:
        logger.info("Generating ingest record for %s", filename)
        ingest = Ingest()
        ingest.file_name = os.path.basename(filename)
        ingest.transfer_path = filename
        ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
        ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
        ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
        ingest.transfer_ended = datetime.utcnow()
        ingest.media_type = get_media_type(filename)
        ingest.workspace = workspace
        for data_type in data_types:
            ingest.add_data_type_tag(data_type)
        ingest.status = 'TRANSFERRED'
        if options['no_commit']:
            s = IngestDetailsSerializer()
            logger.info(s.to_representation(ingest))
        else:
            ingest.save()
            ingest_records[filename] = ingest.id
    logger.info("Ingest records created")

    # Start ingest tasks for all the files
    if not options['no_commit']:
        logger.info("Starting ingest tasks")
        for filename in filenames:
            ingest = Ingest.objects.get(id=ingest_records[filename])
            logger.info("Processing ingest %s", ingest.file_name)
            with transaction.atomic():
                ingest.ingest_started = datetime.utcnow()
                sf = ingest.source_file = SourceFile()
                sf.update_uuid(ingest.file_name)
                for tag in ingest.get_data_type_tags():
                    sf.add_data_type_tag(tag)
                sf.media_type = ingest.media_type
                sf.file_name = ingest.file_name
                sf.file_size = ingest.file_size
                sf.file_path = ingest.file_path
                sf.workspace = workspace
                sf.is_deleted = False
                sf.deleted = None
                sf.save()
                sf.set_countries()
                sf.save()

                ingest.status = 'INGESTED'
                ingest.ingest_ended = datetime.utcnow()
                ingest.source_file = sf
                ingest.save()
                IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)
        logger.info("Ingests processed, monitor the queue for triggered jobs.")

    if mnt_dirs is not None:
        workspace.cleanup_download_dir(*mnt_dirs)

    logger.info(u'Command completed: migratedata')
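# Invocation sketch for the migratedata command (option names are inferred from
# the options dict used in handle() above and should be treated as assumptions;
# the workspace name, path, and tag are hypothetical).
from django.core.management import call_command

call_command('migratedata', workspace='raw', workspace_path='incoming',
             data_type=['satellite'], no_commit=True)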
def upload_files(self, upload_dir, work_dir, workspace, files_to_upload):
    """Uploads the given files in the given upload directory into the workspace. This method assumes that
    setup_upload_dir() has already been called with the same upload and work directories. The ScaleFile models
    will be saved in an atomic database transaction.

    :param upload_dir: Absolute path to the local directory of the files to upload
    :type upload_dir: str
    :param work_dir: Absolute path to a local work directory available to assist in uploading
    :type work_dir: str
    :param workspace: The workspace to upload files into
    :type workspace: :class:`storage.models.Workspace`
    :param files_to_upload: List of tuples (ScaleFile model, source path relative to upload directory, workspace
        path for storing the file)
    :type files_to_upload: list of (:class:`storage.models.ScaleFile`, str, str)
    :returns: The list of the saved file models
    :rtype: list of :class:`storage.models.ScaleFile`
    """

    upload_dir = os.path.normpath(upload_dir)
    work_dir = os.path.normpath(work_dir)
    workspace_work_dir = self._get_workspace_work_dir(work_dir, workspace)

    file_list = []
    wksp_upload_list = []  # Info to pass the workspace so it can upload files
    wksp_delete_list = []  # Info needed to delete the files if the database save fails
    for entry in files_to_upload:
        scale_file = entry[0]
        upload_path = entry[1]
        workspace_path = entry[2]
        full_upload_path = os.path.join(upload_dir, upload_path)
        media_type = scale_file.media_type

        # Determine file properties
        file_name = os.path.basename(full_upload_path)
        if not media_type:
            media_type = get_media_type(file_name)
        file_size = os.path.getsize(full_upload_path)

        scale_file.file_name = file_name
        scale_file.media_type = media_type
        scale_file.file_size = file_size
        scale_file.file_path = workspace_path
        scale_file.workspace = workspace
        scale_file.is_deleted = False
        scale_file.deleted = None

        file_list.append(scale_file)
        wksp_upload_list.append((upload_path, workspace_path))
        wksp_delete_list.append(workspace_path)

    try:
        # Store files in workspace
        workspace.upload_files(upload_dir, workspace_work_dir, wksp_upload_list)

        with transaction.atomic():
            for scale_file in file_list:
                # Save to create a primary key, update the country list, then save again
                scale_file.save()
                scale_file.set_countries()
                scale_file.save()

        return file_list
    except Exception as ex:
        # Attempt to clean up failed files before propagating exception
        try:
            delete_work_dir = self._get_delete_work_dir(work_dir, workspace)
            logger.info('Creating %s', delete_work_dir)
            os.makedirs(delete_work_dir, mode=0o755)
            workspace.delete_files(delete_work_dir, wksp_delete_list)
        except Exception:
            # Failure to delete should not override ex
            logger.exception('Error cleaning up files that failed to upload')
        raise ex
def upload_files(self, upload_dir, work_dir, workspace, files_to_upload):
    """Uploads the given files in the given upload directory into the workspace. This method assumes that
    setup_upload_dir() has already been called with the same upload and work directories. The ScaleFile models
    will be saved in an atomic database transaction.

    :param upload_dir: Absolute path to the local directory of the files to upload
    :type upload_dir: str
    :param work_dir: Absolute path to a local work directory available to assist in uploading
    :type work_dir: str
    :param workspace: The workspace to upload files into
    :type workspace: :class:`storage.models.Workspace`
    :param files_to_upload: List of tuples (ScaleFile model, source path relative to upload directory, workspace
        path for storing the file)
    :type files_to_upload: list of (:class:`storage.models.ScaleFile`, str, str)
    :returns: The list of the saved file models
    :rtype: list of :class:`storage.models.ScaleFile`
    """

    upload_dir = os.path.normpath(upload_dir)
    workspace_work_dir = self._get_workspace_work_dir(work_dir, workspace)

    file_list = []
    wksp_upload_list = []  # Info to pass the workspace so it can upload files
    wksp_delete_list = []  # Info needed to delete the files if the database save fails
    for entry in files_to_upload:
        scale_file = entry[0]
        upload_path = entry[1]
        workspace_path = entry[2]
        full_upload_path = os.path.join(upload_dir, upload_path)
        media_type = scale_file.media_type

        # Determine file properties
        file_name = os.path.basename(full_upload_path)
        if not media_type:
            media_type = get_media_type(file_name)
        file_size = os.path.getsize(full_upload_path)

        scale_file.file_name = file_name
        scale_file.media_type = media_type
        scale_file.file_size = file_size
        scale_file.file_path = workspace_path
        scale_file.workspace = workspace
        scale_file.is_deleted = False
        scale_file.deleted = None

        file_list.append(scale_file)
        wksp_upload_list.append((upload_path, workspace_path))
        wksp_delete_list.append(workspace_path)

    try:
        # Store files in workspace
        workspace.upload_files(upload_dir, workspace_work_dir, wksp_upload_list)

        with transaction.atomic():
            for scale_file in file_list:
                # Save to create a primary key, update the country list, then save again
                scale_file.save()
                scale_file.set_countries()
                scale_file.save()

        return file_list
    except Exception as ex:
        # Attempt to clean up failed files before propagating exception
        try:
            delete_work_dir = os.path.join(os.path.normpath(work_dir), 'delete', get_valid_filename(workspace.name))
            workspace.delete_files(delete_work_dir, wksp_delete_list)
        except Exception:
            # Failure to delete should not override ex
            logger.exception(u'Error cleaning up files that failed to upload')
        raise ex
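# Usage sketch for the directory-based upload_files() variants above
# (illustrative). Each entry is a (ScaleFile model, source path relative to
# upload_dir, workspace path) tuple as documented; the directories, workspace
# name, and manager access are assumptions.
from storage.models import ScaleFile, Workspace

workspace = Workspace.objects.get(name='products')  # hypothetical workspace name
files = [(ScaleFile(), 'capture_001.nitf', 'results/capture_001.nitf')]
saved = ScaleFile.objects.upload_files('/tmp/upload', '/tmp/work', workspace, files)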