def openInformationPackage(request, username):
    user_data_path = os.path.join(ip_data_path, username)
    vars = environment_variables(request)
    selected_ip = vars['selected_ip']
    object_path = os.path.join(user_data_path, selected_ip.ip_filename)
    return tarfile.open(object_path, 'r')

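# The tar handle returned above stays open; callers are expected to close it.
# A minimal usage sketch (the helper name `_list_ip_entries` is illustrative,
# not part of this module):
def _list_ip_entries(request):
    tar = openInformationPackage(request, request.user.username)
    try:
        return tar.getnames()
    finally:
        tar.close()
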
def ip_structure(request, tab):
    vars = environment_variables(request)
    if not vars['selected_ip']:
        # A view must return an HttpResponse; returning a dict would fail.
        return HttpResponse("No information package selected", status=404)
    template = loader.get_template('ipviewer/ip_structure.html')
    context = get_ip_structure(request)
    context["tab"] = tab
    return HttpResponse(template.render(context=context, request=request))

def file_from_ip(request, file_path):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return HttpResponse("No information package selected", status=404)
    archive_file_path = os.path.join(user_data_path, vars['selected_ip'].ip_filename)
    t = tarfile.open(archive_file_path, 'r')
    # Sniff the MIME type from the first 256 bytes of the entry.
    info = t.getmember(file_path)
    f = t.extractfile(info)
    start_bytes = f.read(256)
    magic_mime_detect = magic.Magic(mime=True)
    mime = magic_mime_detect.from_buffer(start_bytes)
    # Stream the entry in chunks rather than loading it into memory.
    inst = ChunkedTarEntryReader(t)
    return HttpResponse(inst.chunks(file_path), content_type=mime)

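# For reference, a reader along the following lines would satisfy the interface
# used above. This is a hedged sketch, not the project's actual
# ChunkedTarEntryReader; the class name and chunk size are assumptions.
class SimpleChunkedTarEntryReader:
    """Yield a tar entry's content in fixed-size chunks."""

    def __init__(self, tar, chunk_size=8192):
        self.tar = tar
        self.chunk_size = chunk_size

    def chunks(self, entry_path):
        f = self.tar.extractfile(self.tar.getmember(entry_path))
        while True:
            chunk = f.read(self.chunk_size)
            if not chunk:
                break
            yield chunk
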
def get_basic_metadata(request, file_path):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return JsonResponse(
            {'success': False, 'error': 'No information package selected'},
            status=404)
    selected_ip = vars['selected_ip']
    archive_file_path = os.path.join(user_data_path, selected_ip.ip_filename)
    t = tarfile.open(archive_file_path, 'r')
    file_content = read_textfile_from_tar(t, file_path)
    tmp_file_path = "/tmp/%s" % randomword(10)
    res_events = []
    try:
        title = ""
        date = ""
        # Write the entry to a temporary file so it can be parsed from disk.
        with open(tmp_file_path, 'w') as tmp_file:
            tmp_file.write(file_content)
        if fnmatch.fnmatch(file_path, metadata_file_pattern_ead):
            # EAD: take title and date from the first <dao> element's ancestors.
            pead = ParsedEad("/tmp", tmp_file_path)
            dao_elements = pead.get_dao_elements()
            unit_titles = []
            unit_dates = []
            for dao_elm in dao_elements:
                unit_titles.append(pead._first_md_val_ancpath(dao_elm, "unittitle"))
                unit_dates.append(pead._first_md_val_ancpath(dao_elm, "unitdate"))
            title = unit_titles[0]
            date = unit_dates[0]
        elif fnmatch.fnmatch(file_path, metadata_file_pattern_premis):
            # PREMIS: collect events from the logical view of the IP structure.
            structure = get_ip_structure(request)
            logical_view = search(structure, "logical_view_data")
            events = search(logical_view, "events")
            for event in events:
                if len(event):
                    res_events.append({
                        'type': event[0]['type'],
                        'datetime': event[0]['datetime'],
                        'agent': event[0]['linking_agent_id']['value']
                    })
            title = "Root PREMIS"
            date = "20.09.2017"
        md_type = ead_md_type if fnmatch.fnmatch(file_path, metadata_file_pattern_ead) \
            else premis_md_type if fnmatch.fnmatch(file_path, metadata_file_pattern_premis) \
            else "Other"
        return JsonResponse({
            'success': True,
            'type': md_type,
            'title': title,
            'date': date,
            'events': res_events,
            'file_path': file_path
        }, status=200)
    except Exception as error:
        logger.exception(error)
        return JsonResponse({'success': False, 'error': str(error)}, status=500)
    finally:
        # Clean up the temporary copy.
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)

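# The search() helper used above is assumed to find a key anywhere in a nested
# dict/list structure. A hedged sketch of such a lookup (the real helper may
# behave differently; `search_key` is an illustrative name):
def search_key(node, key):
    """Return the first value stored under `key` in a nested structure."""
    if isinstance(node, dict):
        if key in node:
            return node[key]
        children = list(node.values())
    elif isinstance(node, (list, tuple)):
        children = node
    else:
        return None
    for child in children:
        found = search_key(child, key)
        if found is not None:
            return found
    return None
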
def representation_dependency_graph(request):  # noqa
    events = {}
    tarFile = openInformationPackage(request, request.user.username)
    root_mets, root_mets_file_entry_base_dir = readRootMetsFromIP(tarFile)
    # TODO: read migration paths from PREMIS file
    try:
        if root_mets is not None:
            for root_structMap in root_mets.iter('{http://www.loc.gov/METS/}structMap'):
                if root_structMap.get('TYPE') == 'PHYSICAL':
                    for div in root_structMap.find('{http://www.loc.gov/METS/}div'):
                        representation = get_representation_section(
                            div, root_mets, tarFile, root_mets_file_entry_base_dir)
                        if 'nodes' in representation:
                            for node in representation['nodes']:
                                if 'metadata' in node['text'] and 'premis' in node:
                                    premis = node['premis']
                                    if 'events' in premis:
                                        for event in premis['events']:
                                            events[event['datetime']] = event['type']
    except Exception as error:
        logger.exception(error)
    # Placeholder graph until migration paths are read from PREMIS (see TODO).
    nodes = [
        {"id": 1, "label": "MS Word 2003 XML Document (SIP)", "shape": "box"},
        {"id": 2, "label": "PDF document (ingest)", "shape": "box"},
        {"id": 3, "label": "PDF/A document (migration)", "shape": "box"},
    ]
    edges = [
        {"from": 1, "to": 2, "arrows": "to",
         "label": "Adobe Acrobat Office PDF Maker v9.0"},
        {"from": 2, "to": 3, "arrows": "to", "label": "Ghostscript v1.3"},
    ]
    return JsonResponse({"nodes": nodes, "edges": edges}, status=200)

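# A hedged sketch of how the collected events could eventually feed the graph
# once the TODO above is implemented (`_events_to_nodes_and_edges` is an
# illustrative helper, not part of this module):
def _events_to_nodes_and_edges(events):
    """Turn an {iso_datetime: event_type} dict into sequential vis-style data."""
    ordered_types = [events[k] for k in sorted(events)]
    nodes = [{"id": i + 1, "label": etype, "shape": "box"}
             for i, etype in enumerate(ordered_types)]
    edges = [{"from": i, "to": i + 1, "arrows": "to"}
             for i in range(1, len(nodes))]
    return nodes, edges
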
def representations(request, tab):
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return HttpResponse("No information package selected", status=404)
    template = loader.get_template('ipviewer/representations.html')
    events = {}
    tarFile = openInformationPackage(request, request.user.username)
    root_mets, root_mets_file_entry_base_dir = readRootMetsFromIP(tarFile)
    if root_mets is not None:
        for root_structMap in root_mets.iter('{http://www.loc.gov/METS/}structMap'):
            if root_structMap.get('TYPE') == 'PHYSICAL':
                for div in root_structMap.find('{http://www.loc.gov/METS/}div'):
                    representation = get_representation_section(
                        div, root_mets, tarFile, root_mets_file_entry_base_dir)
                    if 'nodes' in representation:
                        for node in representation['nodes']:
                            if 'metadata' in node['text'] and 'premis' in node:
                                premis = node['premis']
                                if 'events' in premis:
                                    for event in premis['events']:
                                        events[event['datetime']] = event['type']
    # Build a version inventory from the collected PREMIS events.
    inventory = {"versions": {}}
    for version_label, datetime_str in enumerate(events):
        inventory['versions'][str(version_label)] = {
            "created": datetime_str,
            "message": events[datetime_str]
        }
    version_timeline_data = [{
        "id": int(key),
        "content": "%s (%s)" % (val["message"], key),
        "start": val["created"],
        "className": "myClassName"
    } for key, val in inventory["versions"].items()]
    times = sorted(val["created"] for val in inventory["versions"].values())
    if len(times) > 1:
        min_dtstr = times[0]
        max_dtstr = times[-1]
        min_dt = get_date_from_iso_str(min_dtstr, DT_ISO_FMT_SEC_PREC)
        max_dt = get_date_from_iso_str(max_dtstr, DT_ISO_FMT_SEC_PREC)
        delta = max_dt - min_dt
        # Use the full span; delta.seconds alone ignores the day component.
        total_seconds = int(delta.total_seconds())
        if total_seconds < 60:
            scale_unit, scale_value = "seconds", total_seconds
        elif total_seconds < 3600:
            scale_unit, scale_value = "minutes", total_seconds // 60
        elif total_seconds < 86400:
            scale_unit, scale_value = "hours", total_seconds // 3600
        elif total_seconds < 2592000:
            scale_unit, scale_value = "days", delta.days
        elif total_seconds < 31536000:
            scale_unit, scale_value = "months", delta.days // 30
        else:
            scale_unit, scale_value = "years", delta.days // 365
    else:
        min_dtstr = max_dtstr = times[0] if times else ""
        scale_unit = "days"
        scale_value = 3  # an int; the string "3" would be repeated by * 10 below
    context = {
        "version_timeline_data": version_timeline_data,
        "scale_unit": scale_unit,
        "scale_value": (scale_value * 10),
        "min_dt": min_dtstr,
        "max_dt": max_dtstr,
        "tab": tab,
        "demo": bool(request.GET.get('demo'))
    }
    return HttpResponse(template.render(context=context, request=request))

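# A standalone sketch of the same bucketing, handy for testing the timeline
# scale independently of the view (`timeline_scale` is an illustrative name;
# the import is placed here only to keep the sketch self-contained):
from datetime import timedelta

def timeline_scale(delta):
    """Map a timedelta to a (unit, value) pair for the timeline widget."""
    total = int(delta.total_seconds())
    if total < 60:
        return "seconds", total
    if total < 3600:
        return "minutes", total // 60
    if total < 86400:
        return "hours", total // 3600
    if total < 2592000:
        return "days", delta.days
    if total < 31536000:
        return "months", delta.days // 30
    return "years", delta.days // 365

# e.g. timeline_scale(timedelta(days=2, hours=3)) == ("days", 2)
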
def get_ip_overview_context(request):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return {}
    object_path = os.path.join(user_data_path, vars['selected_ip'].ip_filename)
    tarFile = tarfile.open(object_path, 'r')
    overview = {}
    total_size = 0
    total_number_content_files = 0
    content_mime_types = []
    representations = []
    all_schemas = []
    mets_info_entries = [
        member for member in tarFile.getmembers()
        if re.match(mets_entry_pattern, member.name)
    ]
    if len(mets_info_entries) == 1:
        logger.info("Root METS file found in container file")
        root_mets_file_entry = mets_info_entries[0].name
        root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
        root_mets_content = read_textfile_from_tar(tarFile, root_mets_file_entry)
        root_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))
        overview['object_id'] = root_mets.attrib['OBJID']
        # Collect schema file names from the physical structMap.
        for root_structMap in root_mets.iter('{http://www.loc.gov/METS/}structMap'):
            if root_structMap.get('TYPE') == 'PHYSICAL':
                for div in root_structMap.find('{http://www.loc.gov/METS/}div'):
                    if div.get('LABEL') == 'schemas':
                        schemas = get_schemas_section(
                            div, root_mets, root_mets_file_entry_base_dir)
                        all_schemas += [schema['text'] for schema in schemas['nodes']]
        # Package title from the EAD file, if present.
        ead_info_entries = [
            member for member in tarFile.getmembers()
            if re.match(ead_entry_pattern, member.name)
        ]
        if len(ead_info_entries) == 1:
            logger.info("EAD file found in container file")
            root_ead_file_entry = ead_info_entries[0].name
            root_ead_content = read_textfile_from_tar(tarFile, root_ead_file_entry)
            root_ead = ET.fromstring(bytes(root_ead_content, 'utf-8'))
            found = [
                element.text for element in root_ead.iter(
                    '{http://ead3.archivists.org/schema/}titleproper')
            ]
            # TODO: test for empty title list
            overview['title'] = found[0]
        else:
            overview['title'] = "Unknown. EAD file missing."
        # Walk the representation METS files referenced by the root fileSec.
        for root_fileGrp in root_mets.iter('{http://www.loc.gov/METS/}fileGrp'):
            if root_fileGrp.attrib['USE'] == 'representations':
                for root_file in root_fileGrp.iter('{http://www.loc.gov/METS/}file'):
                    FLocat = root_file.find('{http://www.loc.gov/METS/}FLocat')
                    rep_mets_file_entry = FLocat.get("{http://www.w3.org/1999/xlink}href")
                    rep_mets_file_entry = root_mets_file_entry_base_dir + \
                        rep_mets_file_entry.strip('.')
                    rep_mets_content = read_textfile_from_tar(tarFile, rep_mets_file_entry)
                    rep_mets = ET.fromstring(bytes(rep_mets_content, 'utf-8'))
                    representation = {
                        'identifier': rep_mets.get('OBJID'),
                        'label': get_representation_label_for_id(
                            root_mets, root_file.get('ID')),
                    }
                    # Accumulate content statistics over the representation files.
                    for rep_fileGrp in rep_mets.iter('{http://www.loc.gov/METS/}fileGrp'):
                        for rep_file in rep_fileGrp.iter('{http://www.loc.gov/METS/}file'):
                            content_mime_types.append(rep_file.get('MIMETYPE'))
                            total_size += int(rep_file.get('SIZE'))
                            total_number_content_files += 1
                    representations.append(representation)
    overview['representations'] = representations
    overview['stats'] = {
        "total_size": total_size,
        "total_number_content_files": total_number_content_files,
        "total_number_representations": len(representations),
        "schemas": ','.join(all_schemas),
        "content_mime_types": ", ".join(set(content_mime_types)),
    }
    return overview

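# For orientation, the overview dict assembled above looks roughly like this
# (all values illustrative):
#
# {
#     'object_id': 'urn:uuid:example-package',
#     'title': 'Example fonds title',
#     'representations': [{'identifier': 'rep-001', 'label': 'PDF version'}],
#     'stats': {
#         'total_size': 123456,
#         'total_number_content_files': 2,
#         'total_number_representations': 1,
#         'schemas': 'mets.xsd,premis.xsd',
#         'content_mime_types': 'application/pdf',
#     },
# }
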
def get_ip_structure(request):
    username = request.user.username
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return {}
    selected_ip = vars['selected_ip']
    logical_view_data = []
    new_physical_view_data = []
    tarFile = openInformationPackage(request, username)
    root_mets, root_mets_file_entry_base_dir = readRootMetsFromIP(tarFile)
    if root_mets is not None:
        # Iterate the structMap and resolve references into dmdSec/amdSec/fileSec.
        obj_id = root_mets.attrib['OBJID']
        logical_view_section = {
            "text": obj_id,
            "icon": "fa fa-archive fa-fw",
            "nodes": []
        }
        logical_view_data.append(logical_view_section)
        representations = {
            "text": "representations",
            "icon": "fa fa-inbox fa-fw",
            "nodes": []
        }
        for root_structMap in root_mets.iter('{http://www.loc.gov/METS/}structMap'):
            if root_structMap.get('TYPE') == 'PHYSICAL':
                for div in root_structMap.find('{http://www.loc.gov/METS/}div'):
                    label = div.get('LABEL')
                    if label == 'schemas':
                        schemas = get_schemas_section(
                            div, root_mets, root_mets_file_entry_base_dir)
                        logical_view_section['nodes'].append(schemas)
                        continue
                    if label == 'metadata':
                        metadata = get_metadata_section(
                            div, root_mets, root_mets_file_entry_base_dir)
                        logical_view_section['nodes'].append(metadata)
                        continue
                    representation = get_representation_section(
                        div, root_mets, tarFile, root_mets_file_entry_base_dir)
                    representations['nodes'].append(representation)
        logical_view_section['nodes'].append(representations)
        physical_view_data = [{
            "text": "Container files",
            "icon": "fa fa-boxes fa-fw",
            "nodes": [{
                "text": selected_ip.ip_filename,
                "icon": "fa fa-archive fa-fw",
                "nodes": logical_view_section['nodes'],
            }]
        }]
        # The physical view shares node dicts with the logical view, so take a
        # deep copy before relabelling the metadata items below.
        new_physical_view_data = copy.deepcopy(physical_view_data)
        for logical_package_node in logical_view_data:
            for folder_node in logical_package_node['nodes']:
                if folder_node['text'] == 'metadata':
                    for item_node in folder_node['nodes']:
                        if fnmatch.fnmatch(item_node['text'], metadata_file_pattern_ead):
                            item_node['text'] = ead_md_type
                            item_node['href'] = item_node['href'].replace(
                                "file-from-ip", "get-basic-metadata")
                        elif fnmatch.fnmatch(item_node['text'], metadata_file_pattern_premis):
                            item_node['text'] = premis_md_type
                            item_node['href'] = item_node['href'].replace(
                                "file-from-ip", "get-basic-metadata")
                        else:
                            item_node['class'] = "hidden"
                elif folder_node['text'] == 'schemas':
                    folder_node['class'] = "hidden"
    context = {
        "logical_view_data": logical_view_data,
        "physical_view_data": new_physical_view_data,
    }
    return context

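# The node dicts built above appear to follow a treeview convention of
# "text"/"icon"/"nodes" with optional "href" and "class". A hand-written
# example of the shape the templates consume (all values made up):
_EXAMPLE_LOGICAL_VIEW = [{
    "text": "urn:uuid:example-package",
    "icon": "fa fa-archive fa-fw",
    "nodes": [
        {"text": "metadata", "icon": "fa fa-folder fa-fw", "nodes": [
            {"text": "EAD", "href": "get-basic-metadata/metadata/ead.xml",
             "icon": "fa fa-file fa-fw"},
        ]},
        {"text": "representations", "icon": "fa fa-inbox fa-fw", "nodes": []},
    ],
}]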