Пример #1
0
def index_aip(job):
    """Write AIP information to ElasticSearch.

    Reads the SIP UUID, name, staging path and type from the job args,
    indexes the AIP record plus its file records, and returns the
    indexing helper's exit code (0 on success, 1 on error) or 0 when
    AIP indexing is disabled.
    """
    uuid = job.args[1]  # %SIPUUID%
    name = job.args[2]  # %SIPName%
    staging_path = job.args[3]  # %SIPDirectory%
    unit_type = job.args[4]  # %SIPType%

    # Bail out early when the "aips" search index is not enabled.
    if "aips" not in mcpclient_settings.SEARCH_ENABLED:
        logger.info("Skipping indexing: AIPs indexing is currently disabled.")
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    es_client = elasticSearchFunctions.get_client()

    package_info = storage_service.get_file_info(uuid=uuid)
    job.pyprint("AIP info:", package_info)
    package_info = package_info[0]

    mets_path = os.path.join(staging_path, "METS.{}.xml".format(uuid))
    identifiers = get_identifiers(job, staging_path)

    # For AICs, look up how many AIPs they contain so that count can be
    # indexed alongside the rest of the metadata.
    aips_in_aic = None
    if unit_type == "AIC":
        try:
            aips_in_aic = UnitVariable.objects.get(
                unittype="SIP", unituuid=uuid, variable="AIPsinAIC"
            ).variablevalue
        except UnitVariable.DoesNotExist:
            pass

    # On reingest, clear the stale documents before indexing anew.
    if "REIN" in unit_type:
        job.pyprint(
            "Deleting outdated entry for AIP and AIP files with UUID",
            uuid,
            "from archival storage",
        )
        elasticSearchFunctions.delete_aip(es_client, uuid)
        elasticSearchFunctions.delete_aip_files(es_client, uuid)

    job.pyprint("Indexing AIP and AIP files")
    # MODS identifiers are treated as SIP-level, but the archival storage
    # tab searches the aips/aipfile index, so they are indexed here too.
    status = elasticSearchFunctions.index_aip_and_files(
        client=es_client,
        uuid=uuid,
        aip_stored_path=package_info["current_full_path"],
        mets_staging_path=mets_path,
        name=name,
        aip_size=package_info["size"],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=package_info["encrypted"],
        printfn=job.pyprint,
    )
    if status == 1:
        job.pyprint("Error indexing AIP and AIP files", file=sys.stderr)
    return status
Пример #2
0
def sync_es_aip_status_with_storage_service(uuid, es_status):
    """Update AIP's status in ES indices to match Storage Service.

    This is a bit of a kludge that is made necessary by the fact that
    the Storage Service does not update ElasticSearch directly when
    a package's status has changed.

    Updates to ES are visible in Archival Storage after running a new
    search or refreshing the page.

    :param uuid: AIP UUID.
    :param es_status: Current package status in ES.

    :returns: Boolean indicating whether AIP should be kept in search
    results (i.e. has not been deleted from Storage Service).
    """
    keep_in_results = True

    amclient = setup_amclient()
    amclient.package_uuid = uuid
    api_results = amclient.get_package_details()

    if api_results in AMCLIENT_ERROR_CODES:
        logger.warning(
            "Package {} not found in Storage Service. AMClient error code: {}".format(
                uuid, api_results
            )
        )
        return keep_in_results

    aip_status = api_results.get("status")

    if not aip_status:
        # Fixed: this message previously logged a literal "{}" (the
        # .format(uuid) call was missing) and misspelled "retrieved".
        logger.warning(
            "Status for package {} could not be retrieved from Storage Service.".format(
                uuid
            )
        )
        return keep_in_results

    if (
        aip_status == es.STATUS_DELETE_REQUESTED
        and es_status != es.STATUS_DELETE_REQUESTED
    ):
        # Deletion was requested in Storage Service but ES still shows
        # the old status.
        es_client = es.get_client()
        es.mark_aip_deletion_requested(es_client, uuid)
    elif aip_status == es.STATUS_UPLOADED and es_status != es.STATUS_UPLOADED:
        # A pending deletion request was rejected/cancelled upstream.
        es_client = es.get_client()
        es.revert_aip_deletion_request(es_client, uuid)
    elif aip_status == es.STATUS_DELETED:
        # Package is gone from Storage Service: purge it from both
        # indices and drop it from the search results.
        keep_in_results = False
        es_client = es.get_client()
        es.delete_aip(es_client, uuid)
        es.delete_aip_files(es_client, uuid)

    return keep_in_results
Пример #3
0
 def test_delete_aip_files(self):
     """File documents for an AIP disappear after delete_aip_files()."""
     query = {"query": {"term": {"AIPUUID": self.aip_uuid}}}
     # Precondition: both file documents are indexed.
     hits = self.client.search(index="aipfiles", body=query)
     assert hits["hits"]["total"] == 2
     # Remove them and confirm nothing matches any more.
     elasticSearchFunctions.delete_aip_files(self.client, self.aip_uuid)
     hits = self.client.search(index="aipfiles", body=query)
     assert hits["hits"]["total"] == 0
Пример #4
0
def call(jobs):
    """Remove the indexed file records for each job's AIP.

    Returns early (skipping all remaining jobs) when the "aips" search
    index is disabled in the MCP client settings.
    """
    for job in jobs:
        with job.JobContext(logger=logger):
            uuid = job.args[1]

            if "aips" not in mcpclient_settings.SEARCH_ENABLED:
                logger.info("Skipping. AIPs indexing is currently disabled.")
                return

            elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
            es_client = elasticSearchFunctions.get_client()

            logger.info("Removing indexed files for AIP %s...", uuid)
            elasticSearchFunctions.delete_aip_files(es_client, uuid)
Пример #5
0
 def test_delete_aip_files(self):
     """AIP file documents are removed by delete_aip_files()."""
     body = {'query': {'term': {'AIPUUID': self.aip_uuid}}}
     # Precondition: both file documents exist in the index.
     response = self.client.search(index='aipfiles', body=body)
     assert response['hits']['total'] == 2
     # Delete them, then verify the index is empty for this AIP.
     elasticSearchFunctions.delete_aip_files(self.client, self.aip_uuid)
     response = self.client.search(index='aipfiles', body=body)
     assert response['hits']['total'] == 0
Пример #6
0
 def test_delete_aip_files(self):
     """All aipfile documents for an AIP are removed by delete_aip_files()."""
     search_kwargs = dict(
         index='aips',
         doc_type='aipfile',
         body={'query': {'term': {'AIPUUID': self.aip_uuid}}},
         fields='AIPUUID,FILEUUID',
     )
     # Precondition: three file documents, sorted by FILEUUID descending.
     results = self.client.search(sort='FILEUUID:desc', **search_kwargs)
     assert results['hits']['total'] == 3
     expected_file_uuids = [
         'b8bd3cdd-f224-4237-b0d7-99c217ff8e67',
         '68babd3e-7e6b-40e5-99f6-00ea724d4ce8',
         '547bbd92-d8a0-4624-a9d3-69ba706eacee',
     ]
     for hit, file_uuid in zip(results['hits']['hits'], expected_file_uuids):
         assert hit['fields']['AIPUUID'] == [self.aip_uuid]
         assert hit['fields']['FILEUUID'] == [file_uuid]
     # Deletion should report success and leave no matching documents.
     success = elasticSearchFunctions.delete_aip_files(
         self.client, self.aip_uuid)
     assert success is True
     results = self.client.search(**search_kwargs)
     assert results['hits']['total'] == 0
Пример #7
0
# Archivematica is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Archivematica.  If not, see <http://www.gnu.org/licenses/>.

# @package Archivematica
# @subpackage archivematicaClientScript
# @author Joseph Perry <*****@*****.**>
import sys

# elasticSearchFunctions requires Django to be set up
import django
django.setup()
# archivematicaCommon
from custom_handlers import get_script_logger
import elasticSearchFunctions

logger = get_script_logger("archivematica.mcp.client.removeAIPFilesFromIndex")

if __name__ == '__main__':
    # Script entry point: delete the indexed file records for the AIP
    # whose UUID is given as the first CLI argument.
    aip_uuid = sys.argv[1]

    elasticSearchFunctions.setup_reading_from_client_conf()
    client = elasticSearchFunctions.get_client()

    logger.info('Removing indexed files for AIP %s...', aip_uuid)
    # Fixed: previously passed the undefined name ``AIPUUID``, which
    # raised NameError before anything was deleted.
    elasticSearchFunctions.delete_aip_files(client, aip_uuid)
Пример #8
0
def processAIPThenDeleteMETSFile(path,
                                 temp_dir,
                                 es_client,
                                 delete_existing_data=False):
    """Index one stored AIP into the aips/aipfiles indices.

    Extracts the AIP's METS file into ``temp_dir``, gathers AIP metadata
    from the Storage Service, and indexes the AIP plus its files.

    :param path: Path to the stored AIP (archive file or directory).
    :param temp_dir: Directory the METS file is extracted into.
    :param es_client: Elasticsearch client.
    :param delete_existing_data: When True, delete any existing index
        entries for the AIP before reindexing.
    :returns: -1 when no UUID could be parsed from the filename, 1 when
        the Storage Service has no record of the AIP, otherwise the
        result of ``index_aip_and_files``.
    """
    archive_file = os.path.basename(path)

    # Regex match the UUID - AIP might end with .7z, .tar.bz2, or
    # something else.
    match = re.search(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        archive_file)
    if match is not None:
        aip_uuid = match.group()
    else:
        return -1

    print("Processing AIP", aip_uuid)

    if delete_existing_data is True:
        # Reindex from scratch: drop stale AIP and AIP-file documents.
        print("Deleting AIP", aip_uuid, "from aips/aip and aips/aipfile.")
        elasticSearchFunctions.delete_aip(es_client, aip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, aip_uuid)

    # AIP filenames are <name>-<uuid><extension>
    # Index of match end is right before the extension
    subdir = archive_file[:match.end()]
    # Strip "-<uuid>" (36 UUID chars + hyphen = 37) to recover the name.
    aip_name = subdir[:-37]
    mets_file = "METS." + aip_uuid + ".xml"
    mets_file_relative_path = os.path.join("data", mets_file)
    # For a packaged (single-file) AIP the METS sits below the
    # <name>-<uuid> subdirectory inside the archive.
    if os.path.isfile(path):
        mets_file_relative_path = os.path.join(subdir, mets_file_relative_path)
    path_to_mets = extract_file(
        archive_path=path,
        destination_dir=temp_dir,
        relative_path=mets_file_relative_path,
    )

    # If AIC, need to extract number of AIPs in AIC to index as well
    aips_in_aic = None
    root = etree.parse(path_to_mets)
    try:
        # AttributeError is raised (and ignored) when the dmdSec lacks a
        # dcterms:type element, i.e. the package is a plain AIP.
        aip_type = ns.xml_find_premis(
            root,
            "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:type"
        ).text
    except AttributeError:
        pass
    else:
        if aip_type == "Archival Information Collection":
            aips_in_aic = get_aips_in_aic(root, path, temp_dir)

    aip_info = storage_service.get_file_info(uuid=aip_uuid)

    if not aip_info:
        print("Information not found in Storage Service for AIP UUID: ",
              aip_uuid)
        return 1

    return elasticSearchFunctions.index_aip_and_files(
        client=es_client,
        uuid=aip_uuid,
        aip_stored_path=path,
        mets_staging_path=path_to_mets,
        name=aip_name,
        aip_size=aip_info[0]["size"],
        aips_in_aic=aips_in_aic,
        identifiers=[],  # TODO get these
    )
Пример #9
0
def list_display(request):
    """Render the archival storage AIP list page.

    Queries Elasticsearch for AIPs, paginates lazily, reconciles the
    deletion status of flagged AIPs with the Storage Service, and
    formats each entry for display.
    """
    # When the "aips" index is disabled, render the page without data.
    if 'aips' not in settings.SEARCH_ENABLED:
        return render(request, 'archival_storage/list.html')
    current_page_number = int(request.GET.get('page', 1))
    logger.debug('Current page: %s', current_page_number)

    # get count of AIP files
    es_client = elasticSearchFunctions.get_client()
    aip_indexed_file_count = aip_file_count(es_client)

    # get AIPs
    order_by = request.GET.get('order_by', 'name_unanalyzed')
    sort_by = request.GET.get('sort_by', 'up')

    if sort_by == 'down':
        sort_direction = 'desc'
    else:
        sort_direction = 'asc'

    sort_specification = order_by + ':' + sort_direction
    sort_params = 'order_by=' + order_by + '&sort_by=' + sort_by

    # get list of UUIDs of AIPs that are deleted or pending deletion
    aips_deleted_or_pending_deletion = []
    should_haves = [
        {'match': {'status': 'DEL_REQ'}},
        {'match': {'status': 'DELETED'}},
    ]
    query = {
        "query": {
            "bool": {
                "should": should_haves
            }
        }
    }
    deleted_aip_results = es_client.search(
        body=query,
        index='aips',
        doc_type='aip',
        fields='uuid,status'
    )
    # ES "fields" values come back as single-element lists; take [0].
    for deleted_aip in deleted_aip_results['hits']['hits']:
        aips_deleted_or_pending_deletion.append(deleted_aip['fields']['uuid'][0])

    # Fetch results and paginate
    def es_pager(page, page_size):
        """
        Fetch one page of normalized entries from Elasticsearch.

        :param page: 1-indexed page to fetch
        :param page_size: Number of entries on a page
        :return: List of dicts for each entry, where keys and values have been cleaned up
        """
        start = (page - 1) * page_size
        results = es_client.search(
            index='aips',
            doc_type='aip',
            body=elasticSearchFunctions.MATCH_ALL_QUERY,
            fields='origin,uuid,filePath,created,name,size,encrypted',
            sort=sort_specification,
            size=page_size,
            from_=start,
        )
        # normalize results - each of the fields contains a single value,
        # but is returned from the ES API as a single-length array
        # e.g. {"fields": {"uuid": ["abcd"], "name": ["aip"] ...}}
        return [elasticSearchFunctions.normalize_results_dict(d) for d in results['hits']['hits']]

    items_per_page = 10
    count = es_client.count(index='aips', doc_type='aip', body=elasticSearchFunctions.MATCH_ALL_QUERY)['count']
    # LazyPagedSequence only hits ES for the page actually displayed.
    results = LazyPagedSequence(es_pager, page_size=items_per_page, length=count)

    # Paginate
    page = helpers.pager(
        results,
        items_per_page,
        current_page_number
    )

    # process deletion, etc., and format results
    aips = []
    for aip in page.object_list:
        # If an AIP was deleted or is pending deletion, react if status changed
        if aip['uuid'] in aips_deleted_or_pending_deletion:
            # check with storage server to see current status
            api_results = storage_service.get_file_info(uuid=aip['uuid'])
            try:
                aip_status = api_results[0]['status']
            except IndexError:
                # Storage service does not know about this AIP
                # TODO what should happen here?
                logger.info("AIP not found in storage service: {}".format(aip))
                continue

            # delete AIP metadata in ElasticSearch if AIP has been deleted from the
            # storage server
            # TODO: handle this asynchronously
            if aip_status == 'DELETED':
                elasticSearchFunctions.delete_aip(es_client, aip['uuid'])
                elasticSearchFunctions.delete_aip_files(es_client, aip['uuid'])
            elif aip_status != 'DEL_REQ':
                # update the status in ElasticSearch for this AIP
                elasticSearchFunctions.mark_aip_stored(es_client, aip['uuid'])
        else:
            aip_status = 'UPLOADED'

        # Tweak AIP presentation and add to display array
        if aip_status != 'DELETED':
            aip['status'] = AIP_STATUS_DESCRIPTIONS[aip_status]

            try:
                # Size may be missing/non-numeric when the AIP was removed.
                size = '{0:.2f} MB'.format(float(aip['size']))
            except (TypeError, ValueError):
                size = 'Removed'

            aip['size'] = size

            aip['href'] = aip['filePath'].replace(AIPSTOREPATH + '/', "AIPsStore/")
            aip['date'] = aip['created']

            aips.append(aip)

    total_size = total_size_of_aips(es_client)
    # Find out which AIPs are encrypted

    return render(request, 'archival_storage/list.html',
                  {
                      'total_size': total_size,
                      'aip_indexed_file_count': aip_indexed_file_count,
                      'aips': aips,
                      'page': page,
                      'search_params': sort_params,
                  }
                  )
Пример #10
0
def index_aip():
    """Write AIP information to ElasticSearch.

    Reads SIP UUID, name, directory and type from sys.argv, then indexes
    the AIP record and its file records. Returns 0 on success or when
    indexing is disabled, 1 on a file-indexing error.
    """
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    # Check if ElasticSearch is enabled
    client_config_path = '/etc/archivematica/MCPClient/clientConfig.conf'
    # NOTE(review): ConfigParser.SafeConfigParser is the Python-2 API;
    # this block predates the settings-based configuration used elsewhere
    # in this file.
    config = ConfigParser.SafeConfigParser()
    config.read(client_config_path)
    elastic_search_disabled = False
    try:
        elastic_search_disabled = config.getboolean(
            'MCPClient', "disableElasticsearchIndexing")
    except ConfigParser.NoOptionError:
        # Option absent means indexing stays enabled.
        pass
    if elastic_search_disabled:
        print('Skipping indexing: indexing is currently disabled in',
              client_config_path)
        return 0

    elasticSearchFunctions.setup_reading_from_client_conf(config)
    client = elasticSearchFunctions.get_client()

    print('SIP UUID:', sip_uuid)
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    identifiers = get_identifiers(sip_path)

    # If this is an AIC, find the number of AIP stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    # Index AIP
    elasticSearchFunctions.index_aip(client,
                                     sip_uuid,
                                     sip_name,
                                     aip_info['current_full_path'],
                                     mets_path,
                                     size=aip_info['size'],
                                     aips_in_aic=aips_in_aic,
                                     identifiers=identifiers)

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.index_files(
        client,
        index='aips',
        type_='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1

    return 0
Пример #11
0
# along with Archivematica.  If not, see <http://www.gnu.org/licenses/>.

# @package Archivematica
# @subpackage archivematicaClientScript
# @author Joseph Perry <*****@*****.**>
import sys

# elasticSearchFunctions requires Django to be set up
import django

django.setup()
# archivematicaCommon
from custom_handlers import get_script_logger
import elasticSearchFunctions

from django.conf import settings as mcpclient_settings

logger = get_script_logger("archivematica.mcp.client.removeAIPFilesFromIndex")

if __name__ == '__main__':
    # Script entry point: delete the indexed file records for the AIP
    # whose UUID is given as the first CLI argument.
    aip_uuid = sys.argv[1]

    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping. Indexing is currently disabled.')
        # Fixed: previously this only logged and then fell through,
        # deleting from the index despite saying it would skip.
        sys.exit(0)

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    logger.info('Removing indexed files for AIP %s...', aip_uuid)
    elasticSearchFunctions.delete_aip_files(client, aip_uuid)
Пример #12
0
def list_display(request):
    """Render the archival storage AIP list page.

    Queries Elasticsearch for AIPs, paginates lazily, reconciles the
    deletion status of flagged AIPs with the Storage Service, and
    formats each entry for display.
    """
    # When the "aips" index is disabled, render the page without data.
    if "aips" not in settings.SEARCH_ENABLED:
        return render(request, "archival_storage/list.html")
    current_page_number = int(request.GET.get("page", 1))
    logger.debug("Current page: %s", current_page_number)

    # get count of AIP files
    es_client = elasticSearchFunctions.get_client()
    aip_indexed_file_count = aip_file_count(es_client)

    # get AIPs
    order_by = request.GET.get("order_by", "name")
    sort_by = request.GET.get("sort_by", "up")

    sort_params = "order_by=" + order_by + "&sort_by=" + sort_by

    # use raw subfield to sort by name
    if order_by == "name":
        order_by = order_by + ".raw"

    # change sort_by param to ES sort directions
    if sort_by == "down":
        sort_by = "desc"
    else:
        sort_by = "asc"

    sort_specification = order_by + ":" + sort_by

    # get list of UUIDs of AIPs that are deleted or pending deletion
    aips_deleted_or_pending_deletion = []
    should_haves = [{
        "match": {
            "status": "DEL_REQ"
        }
    }, {
        "match": {
            "status": "DELETED"
        }
    }]
    query = {"query": {"bool": {"should": should_haves}}}
    deleted_aip_results = es_client.search(body=query,
                                           index="aips",
                                           _source="uuid,status")
    for deleted_aip in deleted_aip_results["hits"]["hits"]:
        aips_deleted_or_pending_deletion.append(deleted_aip["_source"]["uuid"])

    # Fetch results and paginate
    def es_pager(page, page_size):
        """
        Fetch one page of normalized entries from Elasticsearch.

        :param page: 1-indexed page to fetch
        :param page_size: Number of entries on a page
        :return: List of dicts for each entry, where keys and values have been cleaned up
        """
        start = (page - 1) * page_size
        results = es_client.search(
            index="aips",
            body={"query": {
                "match_all": {}
            }},
            _source="origin,uuid,filePath,created,name,size,encrypted",
            sort=sort_specification,
            size=page_size,
            from_=start,
        )
        return [d["_source"] for d in results["hits"]["hits"]]

    items_per_page = 10
    count = es_client.count(index="aips", body={"query": {
        "match_all": {}
    }})["count"]
    # LazyPagedSequence only hits ES for the page actually displayed.
    results = LazyPagedSequence(es_pager,
                                page_size=items_per_page,
                                length=count)

    # Paginate
    page = helpers.pager(results, items_per_page, current_page_number)

    # process deletion, etc., and format results
    aips = []
    for aip in page.object_list:
        # If an AIP was deleted or is pending deletion, react if status changed
        if aip["uuid"] in aips_deleted_or_pending_deletion:
            # check with storage server to see current status
            api_results = storage_service.get_file_info(uuid=aip["uuid"])
            try:
                aip_status = api_results[0]["status"]
            except IndexError:
                # Storage service does not know about this AIP
                # TODO what should happen here?
                logger.info("AIP not found in storage service: {}".format(aip))
                continue

            # delete AIP metadata in ElasticSearch if AIP has been deleted from the
            # storage server
            # TODO: handle this asynchronously
            if aip_status == "DELETED":
                elasticSearchFunctions.delete_aip(es_client, aip["uuid"])
                elasticSearchFunctions.delete_aip_files(es_client, aip["uuid"])
            elif aip_status != "DEL_REQ":
                # update the status in ElasticSearch for this AIP
                elasticSearchFunctions.mark_aip_stored(es_client, aip["uuid"])
        else:
            aip_status = "UPLOADED"

        # Tweak AIP presentation and add to display array
        if aip_status != "DELETED":
            aip["status"] = AIP_STATUS_DESCRIPTIONS[aip_status]

            try:
                # Size may be missing/non-numeric when the AIP was removed.
                size = "{0:.2f} MB".format(float(aip["size"]))
            except (TypeError, ValueError):
                size = "Removed"

            aip["size"] = size

            aip["href"] = aip["filePath"].replace(AIPSTOREPATH + "/",
                                                  "AIPsStore/")
            aip["date"] = aip["created"]

            aips.append(aip)

    total_size = total_size_of_aips(es_client)
    # Find out which AIPs are encrypted

    return render(
        request,
        "archival_storage/list.html",
        {
            "total_size": total_size,
            "aip_indexed_file_count": aip_indexed_file_count,
            "aips": aips,
            "page": page,
            "search_params": sort_params,
        },
    )
Пример #13
0
def index_aip():
    """Write AIP information to ElasticSearch.

    Reads SIP UUID, name, directory and type from sys.argv, then indexes
    the AIP record and its file records. Returns 0 on success or when
    indexing is disabled, 1 on a file-indexing error.
    """
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    # Nothing to do when search indexing is switched off.
    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    print('SIP UUID:', sip_uuid)
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    identifiers = get_identifiers(sip_path)

    # If this is an AIC, find the number of AIP stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    # Index AIP
    elasticSearchFunctions.index_aip(client,
                                     sip_uuid,
                                     sip_name,
                                     aip_info['current_full_path'],
                                     mets_path,
                                     size=aip_info['size'],
                                     aips_in_aic=aips_in_aic,
                                     identifiers=identifiers,
                                     encrypted=aip_info['encrypted'])

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.index_files(
        client,
        index='aips',
        type_='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1

    return 0
Пример #14
0
    def process_package(
        self, es_client, package_info, temp_dir, delete_before_reindexing, is_aic=False
    ):
        """Index package in 'aips' and 'aipfiles' indices.

        :param es_client: Elasticsearch client.
        :param package_info: Package info dict returned by Storage
        Service.
        :param temp_dir: Path to tempdir for downloaded METS files.
        :param delete_before_reindexing: Boolean of whether to delete
        package from indices prior to reindexing.
        :is_aic: Optional boolean to indicate if package being indexed
        is an AIC.

        :returns: Boolean indicating success.
        """
        uuid = package_info["uuid"]

        # Download the AIP METS file to a temporary directory.
        mets_relative_path = am.relative_path_to_aip_mets_file(
            package_info["uuid"], package_info["current_path"]
        )
        mets_filename = os.path.basename(mets_relative_path)
        mets_download_path = os.path.join(temp_dir, mets_filename)
        storageService.extract_file(uuid, mets_relative_path, mets_download_path)

        # extract_file does not raise on failure, so check for the file.
        if not os.path.isfile(mets_download_path):
            error_message = "Unable to download AIP METS file from Storage Service"
            self.error(
                "Error indexing package {0}. Details: {1}".format(uuid, error_message)
            )
            return False

        # For AICs, also index how many AIPs the collection contains.
        aips_in_aic = None
        if is_aic:
            mets_root = etree.parse(mets_download_path)
            aips_in_aic = get_aips_in_aic(mets_root, temp_dir, uuid)

        package_name = am.package_name_from_path(
            package_info["current_path"], remove_uuid_suffix=True
        )

        # Resolve the human-readable description of the AIP's location.
        aip_location = package_info.get("current_location", "")
        location_description = storageService.retrieve_storage_location_description(
            aip_location
        )

        if delete_before_reindexing:
            self.info(
                "Deleting package {} from 'aips' and 'aipfiles' indices.".format(uuid)
            )
            es.delete_aip(es_client, uuid)
            es.delete_aip_files(es_client, uuid)

        # Index the AIP and then immediately delete the METS file.
        try:
            es.index_aip_and_files(
                client=es_client,
                uuid=uuid,
                aip_stored_path=package_info["current_full_path"],
                mets_staging_path=mets_download_path,
                name=package_name,
                aip_size=package_info["size"],
                aips_in_aic=aips_in_aic,
                encrypted=package_info.get("encrypted", False),
                location=location_description,
            )
            self.info("Successfully indexed package {}".format(uuid))
            os.remove(mets_download_path)
            return True
        except (ElasticsearchException, etree.XMLSyntaxError) as err:
            # Clean up the downloaded METS even when indexing fails.
            self.error("Error indexing package {0}. Details: {1}".format(uuid, err))
            os.remove(mets_download_path)
            return False
def processAIPThenDeleteMETSFile(path, temp_dir, es_client,
                                 delete_existing_data=False):
    """Index one stored AIP into the aips/aip and aips/aipfile types.

    Extracts the AIP's METS file into ``temp_dir``, gathers AIP metadata
    from the Storage Service, and indexes the AIP record plus its file
    metadata. Returns -1 when no UUID can be parsed from the filename.
    """
    archive_file = os.path.basename(path)

    # Regex match the UUID - AIP might end with .7z, .tar.bz2, or
    # something else.
    match = re.search(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        archive_file)
    if match is not None:
        aip_uuid = match.group()
    else:
        return -1

    print('Processing AIP', aip_uuid)

    if delete_existing_data is True:
        # Reindex from scratch: drop stale AIP and AIP-file documents.
        print('Deleting AIP', aip_uuid, 'from aips/aip and aips/aipfile.')
        elasticSearchFunctions.delete_aip(es_client, aip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, aip_uuid)

    # AIP filenames are <name>-<uuid><extension>
    # Index of match end is right before the extension
    subdir = archive_file[:match.end()]
    # Strip "-<uuid>" (36 UUID chars + hyphen = 37) to recover the name.
    aip_name = subdir[:-37]
    mets_file = "METS." + aip_uuid + ".xml"
    mets_file_relative_path = os.path.join("data", mets_file)
    # For a packaged (single-file) AIP the METS sits below the
    # <name>-<uuid> subdirectory inside the archive.
    if os.path.isfile(path):
        mets_file_relative_path = os.path.join(subdir, mets_file_relative_path)
    path_to_mets = extract_file(
        archive_path=path,
        destination_dir=temp_dir,
        relative_path=mets_file_relative_path)

    # If AIC, need to extract number of AIPs in AIC to index as well
    aips_in_aic = None
    root = etree.parse(path_to_mets)
    try:
        # AttributeError is raised (and ignored) when the dmdSec lacks a
        # dc:type element, i.e. the package is a plain AIP.
        aip_type = root.find(
            "m:dmdSec/m:mdWrap/m:xmlData/dc:dublincore/dc:type",
            namespaces=NSMAP).text
    except AttributeError:
        pass
    else:
        if aip_type == "Archival Information Collection":
            aips_in_aic = get_aips_in_aic(root, path, temp_dir)

    aip_info = storage_service.get_file_info(uuid=aip_uuid)

    # NOTE(review): unlike the other variant of this function above,
    # missing Storage Service info is silently skipped here.
    if aip_info:
        elasticSearchFunctions.index_aip(
            client=es_client,
            uuid=aip_uuid,
            name=aip_name,
            filePath=path,
            pathToMETS=path_to_mets,
            aips_in_aic=aips_in_aic,
            identifiers=[],  # TODO get these
            size=aip_info[0]['size'],
        )
        elasticSearchFunctions.index_mets_file_metadata(
            client=es_client,
            uuid=aip_uuid,
            metsFilePath=path_to_mets,
            index='aips',
            type_='aipfile',
            sipName=aip_name,
            identifiers=[],  # TODO get these
        )