예제 #1
0
def openInformationPackage(request, username):
    """Open the selected information package of a user as a tar archive.

    The archive is located below ``ip_data_path/<username>`` and named after
    the ``ip_filename`` of the package selected in the request environment.

    :param request: current HTTP request, handed to ``environment_variables``
    :param username: name of the user directory that contains the package
    :return: ``tarfile.TarFile`` opened in mode ``'r'``; it is returned open,
        so closing it is left to the caller
    """
    user_dir = os.path.join(ip_data_path, username)
    package = environment_variables(request)['selected_ip']
    return tarfile.open(os.path.join(user_dir, package.ip_filename), 'r')
예제 #2
0
def ip_structure(request, tab):
    """Render the structure page of the selected information package.

    :param request: current HTTP request
    :param tab: value exposed to the template under the ``tab`` key
    :return: ``HttpResponse`` holding the rendered
        ``ipviewer/ip_structure.html`` template, or an empty dict when no
        package is selected
    """
    if not environment_variables(request)['selected_ip']:
        return {}
    page = loader.get_template('ipviewer/ip_structure.html')
    structure = get_ip_structure(request)
    structure["tab"] = tab
    rendered = page.render(context=structure, request=request)
    return HttpResponse(rendered)
예제 #3
0
def file_from_ip(request, file_path):
    """Serve one member of the selected information package.

    The first 256 bytes of the member are inspected with ``magic`` to pick
    the response content type; the body is produced by
    ``ChunkedTarEntryReader.chunks``.

    :param request: current HTTP request (its user selects the data folder)
    :param file_path: name of the archive member to serve
    :return: ``HttpResponse`` with the member content, or an empty dict when
        no package is selected
    :raises KeyError: if ``file_path`` is not a member of the archive
        (raised by ``TarFile.getmember``)
    """
    user_data_path = os.path.join(ip_data_path, request.user.username)
    env = environment_variables(request)
    if not env['selected_ip']:
        return {}
    archive_file_path = os.path.join(user_data_path,
                                     env['selected_ip'].ip_filename)
    t = tarfile.open(archive_file_path, 'r')
    try:
        info = t.getmember(file_path)
        # extractfile() returns None for members without a data stream
        # (e.g. directories); that still fails here, as it did before.
        with t.extractfile(info) as member_stream:
            start_bytes = member_stream.read(256)
        mime = magic.Magic(mime=True).from_buffer(start_bytes)
    except Exception:
        # Nothing will consume the archive when the lookup fails, so
        # release it before propagating the error.
        t.close()
        raise

    # The archive stays open on purpose: the reader is handed the open
    # TarFile and supplies the response body from it.
    # NOTE(review): presumably the reader closes the archive once the body
    # has been sent - confirm in ChunkedTarEntryReader.
    inst = ChunkedTarEntryReader(t)
    return HttpResponse(inst.chunks(file_path), content_type=mime)
예제 #4
0
def get_basic_metadata(request, file_path):
    """Return basic metadata of a metadata file inside the selected package.

    EAD files (matching ``metadata_file_pattern_ead``) yield the unit title
    and unit date of the first dao element; PREMIS files (matching
    ``metadata_file_pattern_premis``) yield the events found in the package
    structure. Any other file is reported with type ``"Other"`` and empty
    values.

    :param request: current HTTP request (its user selects the data folder)
    :param file_path: name of the metadata file inside the archive
    :return: ``JsonResponse`` with ``success``, ``type``, ``title``,
        ``date``, ``events`` and ``file_path`` (status 200), a
        ``JsonResponse`` with ``success`` and ``error`` (status 500) when
        processing fails, or an empty dict when no package is selected
    """
    user_data_path = os.path.join(ip_data_path, request.user.username)
    env = environment_variables(request)
    if not env['selected_ip']:
        return {}
    selected_ip = env['selected_ip']
    logger.debug("Reading basic metadata from %s", file_path)
    archive_file_path = os.path.join(user_data_path, selected_ip.ip_filename)
    # The archive is only needed to read the file content; close it right
    # away instead of leaking the handle.
    with tarfile.open(archive_file_path, 'r') as t:
        file_content = read_textfile_from_tar(t, file_path)

    tmp_file_path = None
    res_events = []
    try:
        title = ""
        date = ""
        md_type = "Other"
        if fnmatch.fnmatch(file_path, metadata_file_pattern_ead):
            md_type = ead_md_type
            # ParsedEad is given a file path, so the content is written to a
            # temporary file - only for EAD, where it is actually needed.
            tmp_file_path = "/tmp/%s" % randomword(10)
            with open(tmp_file_path, 'w') as tmp_file:
                tmp_file.write(file_content)
            pead = ParsedEad("/tmp", tmp_file_path)
            # Only the first dao element is reported; an EAD without dao
            # elements keeps the empty defaults instead of failing.
            first_dao = next(iter(pead.get_dao_elements()), None)
            if first_dao is not None:
                title = pead._first_md_val_ancpath(first_dao, "unittitle")
                date = pead._first_md_val_ancpath(first_dao, "unitdate")
        elif fnmatch.fnmatch(file_path, metadata_file_pattern_premis):
            md_type = premis_md_type
            structure = get_ip_structure(request)
            logical_view = search(structure, "logical_view_data")
            events = search(logical_view, "events")
            for event in events:
                if len(event):
                    res_events.append({
                        'type': event[0]['type'],
                        'datetime': event[0]['datetime'],
                        'agent': event[0]['linking_agent_id']['value'],
                    })
            # NOTE(review): title and date are fixed values here, not read
            # from the PREMIS file.
            title = "Root PREMIS"
            date = "20.09.2017"

        return JsonResponse(
            {
                'success': True,
                'type': md_type,
                'title': title,
                'date': date,
                'events': res_events,
                'file_path': file_path
            },
            status=200)
    except Exception as error:
        logger.exception(error)
        return JsonResponse({
            'success': False,
            'error': str(error)
        },
                            status=500)
    finally:
        # Remove the temporary EAD copy so requests do not pile up files.
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            try:
                os.remove(tmp_file_path)
            except OSError:
                logger.warning("Could not remove temporary file %s",
                               tmp_file_path)
예제 #5
0
def representation_dependency_graph(request):  # noqa
    """Return the representation dependency graph as JSON.

    The PREMIS events of all representations are collected from the root
    METS physical structMap, but they are not part of the response yet: the
    returned nodes and edges are fixed sample data (see the TODO below).

    :param request: current HTTP request (its user selects the package)
    :return: ``JsonResponse`` with the keys ``nodes`` and ``edges``
    """
    mets_ns = '{http://www.loc.gov/METS/}'
    events = {}
    tarFile = openInformationPackage(request, request.user.username)
    try:
        # readRootMetsFromIP yields the parsed root METS together with its
        # base directory inside the archive.
        root_mets, root_mets_file_entry_base_dir = readRootMetsFromIP(tarFile)

        # TODO: read migration paths from PREMIS file

        try:
            if root_mets is not None:
                for root_structMap in root_mets.iter(mets_ns + 'structMap'):
                    if root_structMap.get('TYPE') != 'PHYSICAL':
                        continue
                    for div in root_structMap.find(mets_ns + 'div'):
                        representation = get_representation_section(
                            div, root_mets, tarFile,
                            root_mets_file_entry_base_dir)
                        if 'nodes' not in representation:
                            continue
                        for node in representation['nodes']:
                            if 'metadata' not in node['text']:
                                continue
                            if 'premis' not in node:
                                continue
                            premis = node['premis']
                            if 'events' not in premis:
                                continue
                            for event in premis['events']:
                                events[event['datetime']] = event['type']
        except Exception as error:
            # Event collection is best effort; the response does not depend
            # on it, so log the problem and carry on.
            logger.exception("Error: %s", error)
    finally:
        tarFile.close()

    nodes = [
        {
            "id": 1,
            "label": "MS Word 2003 XML Document (SIP)",
            "shape": "box"
        },
        {
            "id": 2,
            "label": "PDF document (ingest)",
            "shape": "box"
        },
        {
            "id": 3,
            "label": "PDF/A document (migration)",
            "shape": "box"
        },
    ]
    edges = [
        {
            "from": 1,
            "to": 2,
            "arrows": "to",
            "label": "Adobe Acrobat Office PDF Maker  v9.0"
        },
        {
            "from": 2,
            "to": 3,
            "arrows": "to",
            "label": "Ghostscript v1.3"
        },
    ]
    return JsonResponse({"nodes": nodes, "edges": edges}, status=200)
예제 #6
0
def _timeline_scale(delta):
    """Choose a display unit and magnitude for the span of a timeline.

    :param delta: non-negative ``datetime.timedelta`` between the first and
        the last event
    :return: tuple ``(unit, value)`` where ``unit`` is one of ``"seconds"``,
        ``"minutes"``, ``"hours"``, ``"days"``, ``"months"`` or ``"years"``
    """
    # total_seconds() covers the whole span; timedelta.seconds is only the
    # sub-day remainder and would never reach the day/month/year limits.
    total_seconds = int(delta.total_seconds())
    if total_seconds < 60:
        return "seconds", total_seconds
    if total_seconds < 3600:
        return "minutes", total_seconds // 60
    if total_seconds < 86400:
        return "hours", total_seconds // 3600
    if total_seconds < 2592000:  # 30 days
        return "days", delta.days
    if total_seconds < 31536000:  # 365 days
        return "months", delta.days // 30
    return "years", delta.days // 365


def representations(request, tab):
    """Render the representations page with a timeline of PREMIS events.

    The events of all representations listed in the physical structMap of
    the root METS are turned into timeline entries, numbered in the order
    in which they were found.

    :param request: current HTTP request (its user selects the package)
    :param tab: value exposed to the template under the ``tab`` key
    :return: ``HttpResponse`` holding the rendered
        ``ipviewer/representations.html`` template, or an empty dict when no
        package is selected
    """
    env = environment_variables(request)
    if not env['selected_ip']:
        return {}
    template = loader.get_template('ipviewer/representations.html')
    mets_ns = '{http://www.loc.gov/METS/}'
    events = {}  # event datetime string -> event type

    tarFile = openInformationPackage(request, request.user.username)
    try:
        root_mets, root_mets_file_entry_base_dir = readRootMetsFromIP(tarFile)
        if root_mets is not None:
            for root_structMap in root_mets.iter(mets_ns + 'structMap'):
                if root_structMap.get('TYPE') != 'PHYSICAL':
                    continue
                for div in root_structMap.find(mets_ns + 'div'):
                    representation = get_representation_section(
                        div, root_mets, tarFile, root_mets_file_entry_base_dir)
                    if 'nodes' not in representation:
                        continue
                    for node in representation['nodes']:
                        if 'metadata' not in node['text']:
                            continue
                        if 'premis' not in node:
                            continue
                        premis = node['premis']
                        if 'events' not in premis:
                            continue
                        for event in premis['events']:
                            events[event['datetime']] = event['type']
    finally:
        tarFile.close()

    version_timeline_data = [{
        "id": version,
        "content": "%s (%s)" % (event_type, version),
        "start": created,
        "className": "myClassName"
    } for version, (created, event_type) in enumerate(events.items())]

    times = sorted(events)
    if len(times) > 1:
        min_dtstr = times[0]
        max_dtstr = times[-1]
        min_dt = get_date_from_iso_str(min_dtstr, DT_ISO_FMT_SEC_PREC)
        max_dt = get_date_from_iso_str(max_dtstr, DT_ISO_FMT_SEC_PREC)
        scale_unit, scale_value = _timeline_scale(max_dt - min_dt)
    else:
        # Zero or one event: there is no span to derive a scale from.
        min_dtstr = max_dtstr = times[0] if times else ""
        scale_unit = "days"
        # Must be a number: multiplying the string "3" would repeat it.
        scale_value = 3
    context = {
        "version_timeline_data": version_timeline_data,
        "scale_unit": scale_unit,
        "scale_value": (scale_value * 10),
        "min_dt": min_dtstr,
        "max_dt": max_dtstr,
        "tab": tab,
        "demo": bool(request.GET.get('demo'))
    }
    return HttpResponse(template.render(context=context, request=request))
예제 #7
0
def get_ip_overview_context(request):
    """Collect overview information about the selected information package.

    Reads the root METS file of the package archive and, if present, the
    EAD file and the METS files of the representations.

    :param request: current HTTP request (its user selects the data folder)
    :return: dict with the keys ``object_id``, ``title``,
        ``representations`` and ``stats``; an empty dict when no package is
        selected or when the archive does not contain exactly one root METS
        file
    """
    user_data_path = os.path.join(ip_data_path, request.user.username)
    env = environment_variables(request)
    if not env['selected_ip']:
        return {}
    object_path = os.path.join(user_data_path, env['selected_ip'].ip_filename)

    mets_ns = '{http://www.loc.gov/METS/}'
    overview = {}
    with tarfile.open(object_path, 'r') as tarFile:
        members = tarFile.getmembers()
        mets_info_entries = [
            member for member in members
            if re.match(mets_entry_pattern, member.name)
        ]
        if len(mets_info_entries) != 1:
            # Without a unique root METS there is nothing to summarise.
            logger.warning(
                "Expected exactly one root METS file in container file, "
                "found %d", len(mets_info_entries))
            return overview

        logger.info("Root METS file found in container file")
        root_mets_file_entry = mets_info_entries[0].name
        root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
        root_mets_content = read_textfile_from_tar(tarFile,
                                                   root_mets_file_entry)
        root_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))

        all_schemas = []
        for root_structMap in root_mets.iter(mets_ns + 'structMap'):
            if root_structMap.get('TYPE') != 'PHYSICAL':
                continue
            for div in root_structMap.find(mets_ns + 'div'):
                if div.get('LABEL') == 'schemas':
                    schemas = get_schemas_section(
                        div, root_mets, root_mets_file_entry_base_dir)
                    all_schemas += [
                        schema['text'] for schema in schemas['nodes']
                    ]

        total_size = 0
        total_number_content_files = 0
        content_mime_types = set()
        representations = []
        overview['object_id'] = root_mets.attrib['OBJID']

        ead_info_entries = [
            member for member in members
            if re.match(ead_entry_pattern, member.name)
        ]
        if len(ead_info_entries) == 1:
            logger.info("EAD file found in container file")
            root_ead_content = read_textfile_from_tar(
                tarFile, ead_info_entries[0].name)
            root_ead = ET.fromstring(bytes(root_ead_content, 'utf-8'))
            found = [
                element.text for element in root_ead.iter(
                    '{http://ead3.archivists.org/schema/}titleproper')
            ]
            # An EAD without a titleproper element must not break the page.
            overview['title'] = (found[0] if found else
                                 "Unknown. No title found in EAD file.")
        else:
            overview['title'] = "Unknown. EAD file missing."

        for root_fileGrp in root_mets.iter(mets_ns + 'fileGrp'):
            # .get() tolerates file groups without a USE attribute.
            if root_fileGrp.get('USE') != 'representations':
                continue
            logger.debug("%s %s", root_fileGrp.tag, root_fileGrp.attrib)
            for root_file in root_fileGrp.iter(mets_ns + 'file'):
                FLocat = root_file.find(mets_ns + 'FLocat')
                rep_mets_file_entry = FLocat.get(
                    "{http://www.w3.org/1999/xlink}href")
                # The href is relative ("./..."); drop the dots and prepend
                # the directory of the root METS inside the archive.
                rep_mets_file_entry = (root_mets_file_entry_base_dir +
                                       rep_mets_file_entry.strip('.'))
                rep_mets_content = read_textfile_from_tar(
                    tarFile, rep_mets_file_entry)
                rep_mets = ET.fromstring(bytes(rep_mets_content, 'utf-8'))
                representation = {'identifier': rep_mets.get('OBJID')}
                for rep_fileGrp in rep_mets.iter(mets_ns + 'fileGrp'):
                    logger.debug("%s %s", rep_fileGrp.tag, rep_fileGrp.attrib)
                    for rep_file in rep_fileGrp.iter(mets_ns + 'file'):
                        # The label only depends on the root file entry, so
                        # look it up once per representation; as before it
                        # is only set when the representation has files.
                        if 'label' not in representation:
                            representation['label'] = \
                                get_representation_label_for_id(
                                    root_mets, root_file.get('ID'))
                        mimetype = rep_file.get('MIMETYPE')
                        if mimetype is not None:
                            content_mime_types.add(mimetype)
                        # A missing SIZE attribute counts as zero bytes.
                        total_size += int(rep_file.get('SIZE') or 0)
                        total_number_content_files += 1
                representations.append(representation)

        overview['representations'] = representations
        overview['stats'] = {
            "total_size": total_size,
            "total_number_content_files": total_number_content_files,
            "total_number_representations": len(representations),
            "schemas": ','.join(all_schemas),
            # Sorted so the listing is stable between requests.
            "content_mime_types": ", ".join(sorted(content_mime_types)),
        }
    return overview
예제 #8
0
def get_ip_structure(request):
    """Build the logical and physical tree views of the selected package.

    Both views are derived from the physical structMap of the root METS. The
    physical view keeps the original node texts; in the logical view EAD and
    PREMIS files in the ``metadata`` folder are relabelled and linked to the
    basic metadata view, while other metadata files and the ``schemas``
    folder are marked as hidden.

    :param request: current HTTP request (its user selects the package)
    :return: dict with the keys ``logical_view_data`` and
        ``physical_view_data``, or an empty dict when no package is selected
    """
    username = request.user.username
    env = environment_variables(request)
    if not env['selected_ip']:
        return {}
    selected_ip = env['selected_ip']

    mets_ns = '{http://www.loc.gov/METS/}'
    logical_view_data = []
    # Child nodes of the package; shared by the logical section and the
    # physical view, and left empty when no root METS could be read.
    package_nodes = []
    tarFile = openInformationPackage(request, username)
    try:
        root_mets, root_mets_file_entry_base_dir = readRootMetsFromIP(tarFile)
        if root_mets is not None:
            # iterate structMap get ids and reference in dmdSec/amdSec/fileSec
            logical_view_section = {
                "text": root_mets.attrib['OBJID'],
                "icon": "fa fa-archive fa-fw",
                "nodes": package_nodes
            }
            logical_view_data.append(logical_view_section)
            representations = {
                "text": "representations",
                "icon": "fa fa-inbox fa-fw",
                "nodes": []
            }
            for root_structMap in root_mets.iter(mets_ns + 'structMap'):
                if root_structMap.get('TYPE') != 'PHYSICAL':
                    continue
                for div in root_structMap.find(mets_ns + 'div'):
                    label = div.get('LABEL')
                    if label == 'schemas':
                        package_nodes.append(
                            get_schemas_section(
                                div, root_mets,
                                root_mets_file_entry_base_dir))
                    elif label == 'metadata':
                        package_nodes.append(
                            get_metadata_section(
                                div, root_mets,
                                root_mets_file_entry_base_dir))
                    else:
                        representations['nodes'].append(
                            get_representation_section(
                                div, root_mets, tarFile,
                                root_mets_file_entry_base_dir))
            package_nodes.append(representations)
    finally:
        tarFile.close()

    physical_view_data = [{
        "text":
        "Container files",
        "icon":
        "fa fa-boxes fa-fw",
        "nodes": [{
            "text": selected_ip.ip_filename,
            "icon": "fa fa-archive fa-fw",
            "nodes": package_nodes,
        }]
    }]
    # Copy before the logical nodes are relabelled below, so the physical
    # view keeps the original file names and links.
    new_physical_view_data = copy.deepcopy(physical_view_data)
    for logical_package_node in logical_view_data:
        for folder_node in logical_package_node['nodes']:
            if folder_node['text'] == 'metadata':
                for item_node in folder_node['nodes']:
                    if fnmatch.fnmatch(item_node['text'],
                                       metadata_file_pattern_ead):
                        item_node['text'] = ead_md_type
                        item_node['href'] = item_node['href'].replace(
                            "file-from-ip", "get-basic-metadata")
                    elif fnmatch.fnmatch(item_node['text'],
                                         metadata_file_pattern_premis):
                        item_node['text'] = premis_md_type
                        item_node['href'] = item_node['href'].replace(
                            "file-from-ip", "get-basic-metadata")
                    else:
                        item_node['class'] = "hidden"
            elif folder_node['text'] == 'schemas':
                folder_node['class'] = "hidden"
    return {
        "logical_view_data": logical_view_data,
        "physical_view_data": new_physical_view_data,
    }