예제 #1
0
def _get_source_file(file_name):
    """Looks up the source file model for the given file name, building a new (un-saved) model if none exists

    A newly built model is returned with its file name set and flagged as deleted; it is not saved here.

    :param file_name: The name of the source file
    :type file_name: string
    :returns: The source file model
    :rtype: :class:`source.models.SourceFile`
    """

    try:
        return SourceFile.objects.get(file_name=file_name)
    except SourceFile.DoesNotExist:
        # No model is stored under this name, so hand back a fresh one
        new_file = SourceFile()
        new_file.file_name = file_name
        new_file.is_deleted = True
        return new_file
예제 #2
0
파일: ingest_job.py 프로젝트: wong-j/scale
def _get_source_file(file_name):
    """Fetches the source file model with the given file name, or creates an un-saved one when the lookup fails

    The model created on a failed lookup carries the given file name and has ``is_deleted`` set to True. Saving it
    is left to the caller.

    :param file_name: The name of the source file
    :type file_name: string
    :returns: The source file model
    :rtype: :class:`source.models.SourceFile`
    """

    try:
        existing_file = SourceFile.objects.get(file_name=file_name)
    except SourceFile.DoesNotExist:
        existing_file = None

    if existing_file is not None:
        return existing_file

    # New file
    created_file = SourceFile()
    created_file.file_name = file_name
    created_file.is_deleted = True
    return created_file
예제 #3
0
    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method migrates existing data files into scale. An ingest record is
        generated for every file found under the workspace path. Unless the
        ``no_commit`` option is set, the records are saved and a source file is
        then created for each one and passed to the ingest trigger handler.

        When no ``local_path`` option is given, the workspace download directory
        is set up under a new temporary directory and is cleaned up again when
        the command finishes, whether or not it succeeded.

        :returns: False if the ``workspace`` or ``workspace_path`` option is
            missing, otherwise None
        :rtype: bool
        """
        logger.info(u'Command starting: migratedata')

        if options['workspace'] is None or options['workspace_path'] is None:
            logger.error('Must specify workspace and workspace-path.')
            return False

        workspace_path = options['workspace_path']
        # The workspace option is matched against workspace names first and is
        # treated as a numeric ID when no workspace has that name
        workspace = Workspace.objects.filter(name=options['workspace']).first()
        if workspace is None:
            workspace = Workspace.objects.get(id=int(options['workspace']))

        data_types = []
        if options['data_type'] is not None:
            data_types.extend(options['data_type'])

        mnt_dirs = None
        if options['local_path'] is not None:
            local_path = options['local_path']
        else:  # mount
            mnt_dirs = "/tmp", tempfile.mkdtemp()
            workspace.setup_download_dir(*mnt_dirs)
            local_path = os.path.join(mnt_dirs[1], workspace_path)

        try:
            logger.info("Ingesting files from %s/%s", workspace.name,
                        workspace_path)
            filenames = self.generate_file_list(local_path, options['include'],
                                                options['exclude'])
            logger.info("Found %d files", len(filenames))

            # prepare for ingest ala strike
            ingest_records = {}  # maps each file name to its saved ingest ID
            for filename in filenames:
                logger.info("Generating ingest record for %s", filename)
                ingest = Ingest()
                ingest.file_name = os.path.basename(filename)
                ingest.file_path = os.path.join(
                    workspace_path, os.path.relpath(filename, local_path))
                # NOTE(review): utcfromtimestamp() yields a naive datetime while
                # timezone.now() below is used for the end time -- confirm the
                # model accepts this mix
                ingest.transfer_started = datetime.utcfromtimestamp(
                    os.path.getatime(filename))
                ingest.file_size = ingest.bytes_transferred = os.path.getsize(
                    filename)
                ingest.transfer_ended = timezone.now()
                ingest.media_type = get_media_type(filename)
                ingest.workspace = workspace
                for data_type in data_types:
                    ingest.add_data_type_tag(data_type)
                ingest.status = 'TRANSFERRED'
                if options['no_commit']:
                    # Dry run: log the serialized record instead of saving it
                    s = IngestDetailsSerializer()
                    logger.info(s.to_representation(ingest))
                else:
                    ingest.save()
                    ingest_records[filename] = ingest.id
            logger.info("Ingests records created")

            # start ingest tasks for all the files
            if not options['no_commit']:
                logger.info("Starting ingest tasks")
                for filename in filenames:
                    ingest = Ingest.objects.get(id=ingest_records[filename])
                    logger.info("Processing ingest %s", ingest.file_name)
                    with transaction.atomic():
                        ingest.ingest_started = timezone.now()
                        sf = ingest.source_file = SourceFile.create()
                        sf.update_uuid(ingest.file_name)
                        for tag in ingest.get_data_type_tags():
                            sf.add_data_type_tag(tag)
                        sf.media_type = ingest.media_type
                        sf.file_name = ingest.file_name
                        sf.file_size = ingest.file_size
                        sf.file_path = ingest.file_path
                        sf.workspace = workspace
                        sf.is_deleted = False
                        sf.deleted = None
                        # Saved once before and once after set_countries()
                        sf.save()
                        sf.set_countries()
                        sf.save()
                        ingest.status = 'INGESTED'
                        ingest.ingest_ended = timezone.now()
                        # Re-assigned after the source file was saved;
                        # presumably so the relation picks up the saved
                        # primary key -- confirm before removing
                        ingest.source_file = sf
                        ingest.save()
                        IngestTriggerHandler().process_ingested_source_file(
                            ingest.source_file, ingest.ingest_ended)

            logger.info(
                "Ingests processed, monitor the queue for triggered jobs.")
        finally:
            # Release the download directory even if ingest generation or
            # processing raised
            if mnt_dirs is not None:
                workspace.cleanup_download_dir(*mnt_dirs)

        logger.info(u'Command completed: migratedata')
예제 #4
0
def perform_ingest(ingest_id):
    """Carries out the ingest identified by the given ingest ID

    Nothing is done when the ingest is already marked INGESTED or DUPLICATE, or when starting the ingest does not
    leave it in the INGESTING status. Otherwise, if the source file is still flagged as deleted, the file is copied
    to the new workspace, moved to the new file path, or registered where it is. After a copy to a new workspace the
    file in the old workspace is deleted if the old workspace provides a file system path for it. Any exception
    completes the ingest as ERRORED and is re-raised; success completes the ingest as INGESTED.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ('INGESTED', 'DUPLICATE'):
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        src = ingest.source_file
        if src.is_deleted:
            # Source file still marked as deleted, so we must copy/move/register the file
            src.set_basic_fields(file_name, ingest.file_size, ingest.media_type, ingest.get_data_type_tags())
            src.update_uuid(file_name)  # Add a stable identifier based on the file name
            src.workspace = ingest.workspace
            src.file_path = ingest.file_path
            src.is_deleted = False
            src.is_parsed = False
            src.deleted = None
            src.parsed = None

            if ingest.new_workspace:
                # We need a local path to copy the file, try to get a direct path from the broker, if that fails we must
                # download the file and copy from there
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the extra
                # download is not necessary
                broker_paths = ingest.workspace.get_file_system_paths([src])
                if not broker_paths:
                    # NOTE(review): the copy downloaded under /tmp is not removed anywhere in this function
                    local_path = os.path.join('/tmp', file_name)
                    download = FileDownload(src, local_path, False)
                    ScaleFile.objects.download_files([download])
                else:
                    local_path = broker_paths[0]
                # The path is switched to the destination only after the local path was resolved from the old one
                src.file_path = ingest.new_file_path or ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                            ingest.workspace.name, src.file_path, ingest.new_workspace.name)
                upload = FileUpload(src, local_path)
                ScaleFile.objects.upload_files(ingest.new_workspace, [upload])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                move = FileMove(src, ingest.new_file_path)
                ScaleFile.objects.move_files([move])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)
                _save_source_file(src)

        if ingest.new_workspace:
            # Copied file to new workspace, so delete file in old workspace (if workspace provides local path to do so)
            old_file = SourceFile.create()
            old_file.file_name = file_name
            old_file.file_path = ingest.file_path
            old_paths = ingest.workspace.get_file_system_paths([old_file])
            if old_paths:
                _delete_file(old_paths[0])
    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise
    else:
        _complete_ingest(ingest, 'INGESTED')
        logger.info('Ingest successful for %s', file_name)
예제 #5
0
def perform_ingest(ingest_id):
    """Runs the ingest that has the given ingest ID

    Returns without doing anything if the ingest is already marked INGESTED or DUPLICATE, or if it is not in the
    INGESTING status after being started. If the source file is still flagged as deleted, it is copied to the new
    workspace, moved to the new file path, or registered in place, depending on which of those the ingest specifies.
    The ingest is completed as ERRORED (and the exception re-raised) on failure, and as INGESTED on success.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    already_done = ingest.status in ('INGESTED', 'DUPLICATE')
    if already_done:
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if not ingest.status == 'INGESTING':
        return

    try:
        ingested_file = ingest.source_file
        if ingested_file.is_deleted:
            # Source file still marked as deleted, so we must copy/move/register the file
            ingested_file.set_basic_fields(file_name, ingest.file_size, ingest.media_type,
                                           ingest.get_data_type_tags())
            # Add a stable identifier based on the file name
            ingested_file.update_uuid(file_name)
            ingested_file.workspace = ingest.workspace
            ingested_file.file_path = ingest.file_path
            ingested_file.is_deleted = False
            ingested_file.is_parsed = False
            ingested_file.deleted = None
            ingested_file.parsed = None

            if ingest.new_workspace:
                # We need a local path to copy the file, try to get a direct path from the broker, if that fails we must
                # download the file and copy from there
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the extra
                # download is not necessary
                direct_paths = ingest.workspace.get_file_system_paths([ingested_file])
                if not direct_paths:
                    # NOTE(review): the copy downloaded under /tmp is not removed anywhere in this function
                    local_path = os.path.join('/tmp', file_name)
                    to_download = FileDownload(ingested_file, local_path)
                    ScaleFile.objects.download_files([to_download])
                else:
                    local_path = direct_paths[0]
                # Only now point the model at its destination; the lookup above needed the original path
                ingested_file.file_path = ingest.new_file_path or ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                            ingest.workspace.name, ingested_file.file_path, ingest.new_workspace.name)
                to_upload = FileUpload(ingested_file, local_path)
                ScaleFile.objects.upload_files(ingest.new_workspace, [to_upload])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                to_move = FileMove(ingested_file, ingest.new_file_path)
                ScaleFile.objects.move_files([to_move])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)
                _save_source_file(ingested_file)

        if ingest.new_workspace:
            # Copied file to new workspace, so delete file in old workspace (if workspace provides local path to do so)
            leftover = SourceFile()
            leftover.file_name = file_name
            leftover.file_path = ingest.file_path
            leftover_paths = ingest.workspace.get_file_system_paths([leftover])
            if leftover_paths:
                _delete_file(leftover_paths[0])
    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise
    else:
        _complete_ingest(ingest, 'INGESTED')
        logger.info('Ingest successful for %s', file_name)