def make_record(file_path, original_name=None):
    """Build a PremisRecord describing a single file.

    Constructs an Object node from the file, wraps it in a new
    PremisRecord, adds a creation Event, and links the object and the
    event to each other.

    __Args__

    1. file_path (str): The full path to a file

    __KWArgs__

    * original_name (str): The original name of the file, if it differs
      from the on-disk name; forwarded to _make_object. Defaults to None.

    __Returns__

    1. (PremisRecord): The populated record instance
    """
    obj = _make_object(file_path, original_name)
    rec = PremisRecord(objects=[obj])
    rec.add_event(_make_event())
    _link_obj_and_event(rec)
    return rec
def edit_a_premis_agent(dto):
    """Edit the name and/or type of an existing PREMIS agent record.

    Loads the agent record identified by the dto, applies the edits
    named in dto.edit_fields ("name" and/or "type") to the first agent
    in the record, then writes a record containing that agent back to
    the same path.

    __Args__

    1. dto: a data transfer object carrying .identifier, .edit_fields,
       and the replacement values as attributes (.name / .type)

    __Returns__

    1. (tuple): (True, identifier) on success, (False, None) if the
       record could not be written
    """
    identifier = dto.identifier
    path_to_agent_record = create_agent_path(dto, identifier)
    record_to_edit = PremisRecord(frompath=path_to_agent_record)
    agent_node = record_to_edit.get_agent_list()[0]
    for n_field in dto.edit_fields:
        if n_field == "name":
            agent_node.set_agentName(getattr(dto, n_field))
        elif n_field == "type":
            agent_node.set_agentType(getattr(dto, n_field))
    # NOTE(review): building a brand-new record here drops anything in
    # the loaded record other than this single agent node -- confirm
    # agent records never carry other content.
    record_to_edit = PremisRecord(agents=[agent_node])
    try:
        write_a_premis_record(record_to_edit, path_to_agent_record)
        return (True, identifier)
    except IOError:
        return (False, None)
def add_event_to_a_premis_agent(dto):
    """Link a PREMIS event to an agent record on disk.

    __Args__

    1. dto: a data transfer object carrying .root, .identifier, and
       .event (the event identifier value to link, recorded as a DOI)

    __Returns__

    1. (bool): True once the updated record has been written
    """
    path_to_agent_record = join(dto.root,
                                str(identifier_to_path(dto.identifier)),
                                "prf", "agent.xml")
    record_to_edit = PremisRecord(frompath=path_to_agent_record)
    agent = record_to_edit.get_agent_list()[0]
    new_linked_event = LinkingEventIdentifier("DOI", dto.event)
    # The agent node is mutated in place, so writing the loaded record
    # persists the new link.
    agent.add_linkingEventIdentifier(new_linked_event)
    write_a_premis_record(record_to_edit, path_to_agent_record)
    return True
def add_event_to_premis_record(path_to_record, new_event):
    """Append an event to a PREMIS record file and rewrite it in place.

    __Args__

    1. path_to_record (str): path to a PREMIS xml file on disk
    2. new_event (Event): the pypremis Event node to append

    __Returns__

    1. (bool): True on completion
    """
    the_record = PremisRecord(frompath=path_to_record)
    the_record.add_event(new_event)
    the_record.write_to_file(path_to_record)
    return True
def open_premis_record(premis_file_path):
    """Attempt to read a PREMIS record from disk.

    __Args__

    1. premis_file_path (str): a string pointing to the location of a
       premis xml file on-disk

    __Returns__

    1. (PremisRecord or None): the parsed record, or None when the file
       does not parse as valid PREMIS
    """
    try:
        return PremisRecord(frompath=premis_file_path)
    except ValueError:
        # Report the bad file but let callers handle the None result.
        stderr.write(
            "{} is not a valid premis record\n".format(premis_file_path))
        return None
def create_a_new_premis_agent(dto):
    """Mint a new PREMIS agent record and write it to disk.

    __Args__

    1. dto: a data transfer object carrying .type and .name for the
       new agent

    __Returns__

    1. (tuple): (True, identifier) on success, (False, None) if the
       record could not be written
    """
    new_identifier = uuid4().hex
    agent_record_path = create_agent_path(dto, new_identifier)
    fresh_agent = Agent(AgentIdentifier("DOI", new_identifier))
    fresh_agent.set_agentType(dto.type)
    fresh_agent.set_agentName(dto.name)
    fresh_record = PremisRecord(agents=[fresh_agent])
    try:
        write_out_a_complete_file_tree(dirname(agent_record_path))
        write_a_premis_record(fresh_record, agent_record_path)
        return (True, new_identifier)
    except IOError:
        return (False, None)
def post(self):
    """Accept a content file + PREMIS file pair and store both.

    Reads the object identifier out of the uploaded PREMIS record,
    saves the content into long-term storage under that identifier,
    stamps an ingestion event into the PREMIS record, and stores the
    updated PREMIS alongside it.
    """
    def add_ingest_event(rec):
        # Build an "ingestion" event, cross-link it with the record's
        # (single) object, and append it to the record.
        def _build_eventDetailInformation():
            return EventDetailInformation(
                eventDetail="bystream copied into " +
                "the long term storage environment.")

        def _build_eventIdentifier():
            return EventIdentifier("uuid4", uuid4().hex)

        def _build_event():
            e = Event(_build_eventIdentifier(), "ingestion",
                      datetime.now().isoformat())
            e.add_eventDetailInformation(_build_eventDetailInformation())
            return e

        event = _build_event()
        obj = rec.get_object_list()[0]
        event.add_linkingObjectIdentifier(
            LinkingObjectIdentifierFactory(obj).produce_linking_node())
        obj.add_linkingEventIdentifier(
            LinkingEventIdentifierFactory(event).produce_linking_node())
        rec.add_event(event)

    def get_md5_from_premis(rec):
        # Return the md5 fixity value recorded in the PREMIS object, or
        # None if no md5 fixity is present.
        # NOTE(review): defined but never called in this handler.
        obj = rec.get_object_list()[0]
        for objChar in obj.get_objectCharacteristics():
            for fixity in objChar.get_fixity():
                if fixity.get_messageDigestAlgorithm() == "md5":
                    return fixity.get_messageDigest()

    log.info("POST received @ AddMaterialSuite endpoint")
    log.debug("Parsing arguments")
    parser = reqparse.RequestParser()
    parser.add_argument("content", help="Specify the content file",
                        type=FileStorage, location='files',
                        required=True)
    parser.add_argument("premis", help="Specify the PREMIS file",
                        type=FileStorage, location='files',
                        required=True)
    args = parser.parse_args()
    log.debug("Arguments parsed")
    premis_rec = None
    log.debug("Instantiating and reading PREMIS")
    # The uploaded PREMIS must hit disk before pypremis can parse it;
    # the tmpdir is discarded once the record is in memory.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_premis_path = str(Path(tmpdir, uuid4().hex))
        args['premis'].save(tmp_premis_path)
        premis_rec = PremisRecord(frompath=tmp_premis_path)
    log.debug("Getting the identifier")
    identifier = premis_rec.get_object_list()[0].\
        get_objectIdentifier()[0].\
        get_objectIdentifierValue()
    # The identifier doubles as a storage key, so refuse anything that
    # would not survive secure_filename unchanged.
    if identifier != secure_filename(identifier):
        log.critical(
            "Insecure identifier detected! \n({})".format(identifier))
        abort(500)
    else:
        log.debug("Identifier Found: {}".format(identifier))
    log.debug("Creating containing dirs")
    log.debug("Saving content")
    content_target = BLUEPRINT.config['_LTS_FS'].new_file(_id=identifier)
    args['content'].save(content_target)
    content_target.close()
    log.debug("Content saved")
    log.debug("Adding ingest event to PREMIS record")
    add_ingest_event(premis_rec)
    log.debug("Ingest event added")
    log.debug("Writing PREMIS to tmp disk")
    # Round-trip through a tmp file again: pypremis writes to a path,
    # then the bytes are copied into the PREMIS storage backend.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_premis_path = str(Path(tmpdir, uuid4().hex))
        premis_rec.write_to_file(tmp_premis_path)
        premis_target = BLUEPRINT.config['_PREMIS_FS'].new_file(
            _id=identifier)
        with open(tmp_premis_path, 'rb') as f:
            premis_target.write(f.read())
        premis_target.close()
    log.debug("PREMIS written")
    return {"created": API.url_for(MaterialSuite, id=identifier)}
def test_duplicate_identifier_fails(self):
    """Two events sharing one identifier must be rejected by the record."""
    shared_id = EventIdentifier("stupid", "1")
    first = Event(shared_id, "something", "now")
    second = Event(shared_id, "something else", "later")
    with self.assertRaises(pypremislib.DuplicateIdentifierError):
        PremisRecord(events=[first, second])
def setUp(self):
    """Import the 'kitchen-sink' XML file.

    Parses the fixture into a module-level global so every test in the
    case can read the same record.
    """
    global kitchen_sink
    kitchen_sink = PremisRecord(frompath='kitchen-sink.xml')
def post(self):
    """Ingest a file: verify its md5, mint PREMIS for it, store both,
    and register it as a member of an accession.

    Expects multipart form arguments: md5, file, accession_id, and an
    optional name.

    __Returns__

    1. (dict): a status report containing the materialsuite ingest
       output and the idnest accession output
    """
    def retrieve_obj_id(rec):
        # Pull the first objectIdentifier value out of a PremisRecord.
        obj = rec.get_object_list()[0]
        obj_id = obj.get_objectIdentifier()[0]
        obj_id_value = obj_id.get_objectIdentifierValue()
        return obj_id_value

    log.info("POST received.")
    log.debug("Parsing arguments")
    parser = reqparse.RequestParser()
    parser.add_argument("md5", required=True,
                        help="The md5 checksum of the file.",
                        type=str)
    parser.add_argument("name", help="The name of the resource",
                        type=str, default=None, required=False)
    parser.add_argument(
        "file", required=True,
        help="The file to put into the Long Term Storage environment.",
        type=FileStorage, location="files")
    parser.add_argument("accession_id", required=True,
                        help="The accession to which this file belongs",
                        type=str)
    args = parser.parse_args()
    log.debug("Arguments parsed")

    # Set up a little working environment, a tmpdir to write files into
    log.debug("Creating a temporary directory to work in.")
    _tmpdir = tempfile.TemporaryDirectory()
    tmpdir = _tmpdir.name

    # Make a placeholder path - note we never use the client provided
    # filename to instantiate the file _ever_ in order to avoid security
    # considerations that would entail.
    in_file_path = str(Path(tmpdir, uuid4().hex))

    # Save the file to a tmp location
    log.debug("Saving file into tmpdir")
    args['file'].save(in_file_path)

    # Generate a baseline md5 of what we now have saved...
    log.info("Generating md5 of received file")
    md5 = None
    with open(in_file_path, 'rb') as f:
        hasher = _md5()
        data = f.read(65536)
        while data:
            hasher.update(data)
            data = f.read(65536)
        md5 = hasher.hexdigest()

    # Be sure it matches what the client provided off the bat
    # TODO: handle failure differently than raising an exception in the
    # future.
    log.info("md5 calculated for file: {}".format(md5))
    if md5 == args['md5']:
        log.debug("md5 matches provided md5")
    else:
        log.critical(
            "md5 mismatch. \n" +
            "Calculated: {} | Provided: {}".format(md5, args['md5']))
        abort(500)

    # Kick the file off the PREMISer, as defined in the config
    log.debug("Transmitting file to PREMISer")
    premis_str = None
    with open(in_file_path, 'rb') as f:
        data = {"md5": md5}
        if args.get("name"):
            data['originalName'] = args['name']
        data['file'] = ('file', f)
        premis_response_multipart_encoder = MultipartEncoder(data)
        premis_response = requests.post(
            BLUEPRINT.config['PREMIS_ENDPOINT'],
            data=premis_response_multipart_encoder,
            headers={
                "Content-Type":
                premis_response_multipart_encoder.content_type
            },
            stream=True)
        try:
            premis_response.raise_for_status()
        except Exception:
            log.critical("Error in transmission to or response from " +
                         "PREMISer")
            # Without a PREMIS record we cannot continue the ingest.
            abort(500)
        try:
            premis_str = premis_response.content.decode("utf-8")
        except Exception:
            log.critical("Response from PREMISer could not be " +
                         "decoded as utf-8")
            # premis_str would otherwise be unbound below.
            abort(500)

    # Instantiate the PREMIS file we got back, again as a random filename
    # in our working dir
    log.debug("Instantiating PREMIS file")
    premis_path = str(Path(tmpdir, uuid4().hex))
    with open(premis_path, 'w') as f:
        f.write(premis_str)
    log.debug("Reading PREMIS file...")
    # Grab the ID the PREMISer minted
    rec = PremisRecord(frompath=premis_path)
    objID = retrieve_obj_id(rec)
    log.debug("Retrieved PREMIS ID: {}".format(objID))

    # POST the file and the PREMIS up into the materialsuite endpoint
    log.debug("POSTing file to materialsuite endpoint")
    ingest_output = None
    with open(in_file_path, 'rb') as content_stream:
        with open(premis_path, 'rb') as premis_stream:
            materialsuite_multipart_encoder = MultipartEncoder({
                "content": ('content', content_stream),
                "premis": ('premis', premis_stream)
            })
            ms_response = requests.post(
                BLUEPRINT.config['MATERIALSUITE_ENDPOINT'],
                data=materialsuite_multipart_encoder,
                headers={
                    'Content-Type':
                    materialsuite_multipart_encoder.content_type
                },
                stream=True)
            try:
                ms_response.raise_for_status()
            except Exception:
                log.critical("Error in response from materialsuite " +
                             "endpoint")
                abort(500)
            try:
                ingest_output = ms_response.json()
            except Exception:
                log.critical("Response from materialsuite endpoint " +
                             "could not be interpreted as JSON")
                abort(500)

    # Check to see if the accession identifier exists
    log.debug("Checking the acc exists in the id nest")
    acc_output = {}
    target_acc_url = BLUEPRINT.config['ACCS_ENDPOINT'] + \
        args['accession_id'] + "/"
    acc_exists = requests.head(target_acc_url).status_code == 200
    if not acc_exists:
        log.critical("Acc specified ({}) doesn't exist".format(
            args['accession_id']))
        abort(500)
    else:
        log.debug("Acc identifier ({}) detected in id nest".format(
            args['accession_id']))

    # Add the id to the acc record
    log.debug("Adding member to acc")
    acc_response = requests.post(BLUEPRINT.config['ACCS_ENDPOINT'] +
                                 args['accession_id'] + "/",
                                 data={"member": objID})
    try:
        acc_response.raise_for_status()
    except Exception:
        log.critical("Problem with the response from the idnest")
        abort(500)
    try:
        acc_output["member_addition"] = acc_response.json()
    except Exception:
        log.critical("response from the idnest could not be " +
                     "interpreted as JSON")
        abort(500)

    log.debug("Cleaning up tmpdir")
    # Cleanup -- dropping the TemporaryDirectory object triggers its
    # finalizer, which removes the directory tree.
    del _tmpdir
    return {
        "status": "success",
        "ingest_output": ingest_output,
        "acc_output": acc_output
    }
def main(longterm, arkid, num_files_per_segment=None):
    """Render static HTML listing pages for an archive.

    Reads the archive identified by arkid out of longterm storage,
    chunks its materialsuites into segments of num_files_per_segment
    (default 5), writes one section_list.html page per segment into a
    new ./<arkid>/ directory, and finishes with an index.html landing
    page linking the segments and the accession/legalnote/adminnote
    records.

    __Args__

    1. longterm (str): path to the long-term storage root
    2. arkid (str): the ark identifier of the archive to render

    __KWArgs__

    * num_files_per_segment (int): files listed per segment page;
      falsy values fall back to 5

    __Returns__

    1. (int): 0 on completion
    """
    reader = FileSystemArchiveReader(longterm, arkid)
    archive = reader.read()
    # NOTE(review): this namedtuple type is created but never used below.
    landing_page = namedtuple(
        "landing_page",
        "accession_id collection_title description segments accession_record")
    msuites = archive.get_materialsuite_list()
    # NOTE(review): tally is incremented and reset but never read.
    tally = 0
    current_bunch = []
    all_bunches = []
    if not num_files_per_segment:
        num_files_per_segment = 5
    total_msuites = len(msuites)
    count = 0
    for msuite in msuites:
        count += 1
        precord = PremisRecord(frompath=str(msuite.premis.path))
        # NOTE(review): other code in this project reads objects via
        # get_object_list(); confirm objects_list is a real attribute
        # of PremisRecord.
        original_name = precord.objects_list[0].get_originalName()
        n_tuple = namedtuple(
            "an_item",
            "name contenturl premisurl")(original_name,
                                         URL_BASE + arkid + "/" +
                                         msuite.identifier + "/content",
                                         URL_BASE + arkid + "/" +
                                         msuite.identifier + "/premis")
        current_bunch.append(n_tuple)
        tally += 1
        if len(current_bunch) == num_files_per_segment:
            # Segment is full: bank it and start a new one.
            tally = 0
            all_bunches.append(current_bunch)
            current_bunch = []
        elif count == total_msuites:
            # Last msuite: bank the partial segment.
            all_bunches.append(current_bunch)
    tally_bunch = 0
    total = 0
    # Pre-seed a page record for every segment index (0..len inclusive).
    pages = [x for x in range(0, len(all_bunches) + 1)]
    pages_dict = {}
    for page in pages:
        pages_dict[str(page)] = {"active": True, "startPoint": True}
    # NOTE(review): json_string is serialized here, BEFORE the loop below
    # adds "numfiles" -- templates receive the pre-update snapshot.
    json_string = dumps(pages_dict)
    mkdir("./" + arkid)
    for n_bunch in all_bunches:
        # Segment files are named 0.html, 1.html, ... (pre-increment),
        # while pages_dict entries are keyed "1", "2", ... (post-increment).
        tally_id = "{}.html".format(tally_bunch)
        total += len(n_bunch)
        tally_bunch += 1
        pages_dict[str(tally_bunch)]['active'] = True
        pages_dict[str(tally_bunch)]['startPoint'] = True
        pages_dict[str(tally_bunch)]['numfiles'] = len(n_bunch)
        segment_template = ENV.get_template("section_list.html")
        segment_html = segment_template.render(arkid=arkid,
                                               label=str(tally_bunch),
                                               files=n_bunch,
                                               pagerecord=json_string)
        with open(join(arkid, tally_id), "w") as a_file_to_write:
            a_file_to_write.write(segment_html)
    landing_template = ENV.get_template("accession_landing.html")
    # Keep only pages that actually received files.
    # NOTE(review): keys are strings, so this sort is lexicographic
    # ("10" < "2") -- confirm ordering is acceptable past 9 segments.
    pages = sorted(
        [(x, pages_dict[x].get("numfiles")) for x in pages_dict.keys()
         if pages_dict[x].get("numfiles")],
        key=lambda x: x[0])
    landing_html = landing_template.render(
        arkid=arkid,
        pages=pages,
        accessions=[
            ACCESSION_URL_BASE + "/accession/" + basename(str(x.path))
            for x in archive.get_accessionrecord_list()
        ],
        legalnotes=[
            ACCESSION_URL_BASE + "/legalnote/" + basename(str(x.path))
            for x in archive.get_legalnote_list()
        ],
        adminnotes=[
            ACCESSION_URL_BASE + "/adminnote/" + basename(str(x.path))
            for x in archive.get_adminnote_list()
        ])
    with open(join(arkid, "index.html"), "w") as write_file:
        write_file.write(landing_html)
    return 0