Example #1
def make_record(file_path, original_name=None):
    """
    Build a PremisNode.Object from a file and use it to instantiate a record

    __Args__

    1. file_path (str): The full path to a file
    2. original_name (str): The original name of the file, if it differs
       from its current on-disk name (optional)

    __Returns__

    1. (PremisRecord): The populated record instance
    """
    obj = _make_object(file_path, original_name)
    rec = PremisRecord(objects=[obj])
    rec.add_event(_make_event())
    _link_obj_and_event(rec)
    return rec
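A minimal usage sketch for make_record, assuming placeholder paths; write_to_file is the same serialization call the later examples use.

# Hypothetical usage; the paths are placeholders.
rec = make_record("/data/staging/report.pdf", original_name="report.pdf")
rec.write_to_file("/data/staging/report_premis.xml")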
def edit_a_premis_agent(dto):
    identifier = dto.identifier
    path_to_agent_record = create_agent_path(dto, identifier)
    record_to_edit = PremisRecord(frompath=path_to_agent_record)
    agents_list = record_to_edit.get_agent_list()
    agent_node = agents_list[0]
    for n_field in dto.edit_fields:
        if n_field == "name":
            agent_node.set_agentName(getattr(dto, n_field))
        elif n_field == "type":
            agent_node.set_agentType(getattr(dto, n_field))
    # agent_node belongs to record_to_edit, so the edits above already
    # modified the loaded record in place; rebuilding a new PremisRecord
    # from the lone agent would discard the record's other content.
    try:
        write_a_premis_record(record_to_edit, path_to_agent_record)
        return (True, identifier)
    except IOError:
        return (False, None)
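edit_a_premis_agent only reads identifier, edit_fields, and the named fields from its dto, so a plain namedtuple is enough to drive it; this is a hypothetical sketch with the field set inferred from the function body (create_agent_path may read additional attributes).

from collections import namedtuple

# Hypothetical DTO; only the attributes the function reads are included.
AgentEditDTO = namedtuple("AgentEditDTO", "identifier edit_fields name type")
dto = AgentEditDTO(identifier="abc123", edit_fields=["name"],
                   name="New Name", type=None)
ok, edited_id = edit_a_premis_agent(dto)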
def add_event_to_a_premis_agent(dto):
    """a function to add a PREMIS event to a particular premis record

    __Args__
    1. premis_record (PremisRecord) an instance of pyremis.lib.PremisRecord
    2. an_event (Event): an instance of pypremis.nodes.Event
    """
    path_to_agent_record = join(dto.root,
                                str(identifier_to_path(dto.identifier)), "prf",
                                "agent.xml")
    record_to_edit = PremisRecord(frompath=path_to_agent_record)
    agents = record_to_edit.get_agent_list()
    agent = agents[0]
    new_linked_event = LinkingEventIdentifier("DOI", dto.event)
    agent.add_linkingEventIdentifier(new_linked_event)
    write_a_premis_record(record_to_edit, path_to_agent_record)
    return True
def add_event_to_premis_record(path_to_record, new_event):
    the_record = PremisRecord(frompath=path_to_record)
    the_record.add_event(new_event)
    the_record.write_to_file(path_to_record)
    return True
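A sketch of calling add_event_to_premis_record, building the Event node the same way the ingest handler below does; the path and event type are placeholders.

from datetime import datetime
from uuid import uuid4
from pypremis.nodes import Event, EventIdentifier

# Hypothetical event and path; mirrors the ingest-event construction below.
new_event = Event(EventIdentifier("uuid4", uuid4().hex),
                  "fixity check",
                  datetime.now().isoformat())
add_event_to_premis_record("/var/ldr/abc123/premis.xml", new_event)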
def open_premis_record(premis_file_path):
    """a function to attempt to create an instance of a PremisRecord

    __Args__
    1. premis_file_path (str): the path to a PREMIS XML file on disk
    """
    output = None
    try:
        output = PremisRecord(frompath=premis_file_path)
    except ValueError:
        stderr.write(
            "{} is not a valid premis record\n".format(premis_file_path))
    return output
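open_premis_record returns None on failure instead of raising, so callers should check before use; the path here is a placeholder.

rec = open_premis_record("/var/ldr/abc123/premis.xml")  # hypothetical path
if rec is not None:
    first_object = rec.get_object_list()[0]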
def create_a_new_premis_agent(dto):
    identifier = uuid4().hex
    path_to_agent = create_agent_path(dto, identifier)
    id_node = AgentIdentifier("DOI", identifier)
    new_agent = Agent(id_node)
    new_agent.set_agentType(dto.type)
    new_agent.set_agentName(dto.name)
    new_record = PremisRecord(agents=[new_agent])
    try:
        write_out_a_complete_file_tree(dirname(path_to_agent))
        write_a_premis_record(new_record, path_to_agent)
        return (True, identifier)
    except IOError:
        return (False, None)
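create_a_new_premis_agent mints its own identifier and reads only type and name from the dto (plus whatever create_agent_path needs); a hypothetical sketch:

from collections import namedtuple

# Hypothetical DTO; create_agent_path may read additional attributes
# (e.g. a storage root).
NewAgentDTO = namedtuple("NewAgentDTO", "root type name")
ok, new_id = create_a_new_premis_agent(
    NewAgentDTO(root="/var/ldr", type="software", name="ldr-ingester"))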
    def post(self):
        def add_ingest_event(rec):
            def _build_eventDetailInformation():
                return EventDetailInformation(
                    eventDetail="bytestream copied into " +
                    "the long term storage environment.")

            def _build_eventIdentifier():
                return EventIdentifier("uuid4", uuid4().hex)

            def _build_event():
                e = Event(_build_eventIdentifier(), "ingestion",
                          datetime.now().isoformat())
                e.add_eventDetailInformation(_build_eventDetailInformation())
                return e

            event = _build_event()
            obj = rec.get_object_list()[0]
            event.add_linkingObjectIdentifier(
                LinkingObjectIdentifierFactory(obj).produce_linking_node())
            obj.add_linkingEventIdentifier(
                LinkingEventIdentifierFactory(event).produce_linking_node())
            rec.add_event(event)

        def get_md5_from_premis(rec):
            obj = rec.get_object_list()[0]
            for objChar in obj.get_objectCharacteristics():
                for fixity in objChar.get_fixity():
                    if fixity.get_messageDigestAlgorithm() == "md5":
                        return fixity.get_messageDigest()

        log.info("POST received @ AddMaterialSuite endpoint")
        log.debug("Parsing arguments")
        parser = reqparse.RequestParser()
        parser.add_argument("content",
                            help="Specify the content file",
                            type=FileStorage,
                            location='files',
                            required=True)
        parser.add_argument("premis",
                            help="Specify the PREMIS file",
                            type=FileStorage,
                            location='files',
                            required=True)
        args = parser.parse_args()
        log.debug("Arguments parsed")

        premis_rec = None
        log.debug("Instantiating and reading PREMIS")
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_premis_path = str(Path(tmpdir, uuid4().hex))
            args['premis'].save(tmp_premis_path)
            premis_rec = PremisRecord(frompath=tmp_premis_path)
        log.debug("Getting the identifier")
        identifier = premis_rec.get_object_list()[0].\
            get_objectIdentifier()[0].\
            get_objectIdentifierValue()
        if identifier != secure_filename(identifier):
            log.critical(
                "Insecure identifier detected! ({})".format(identifier))
            abort(500)
        else:
            log.debug("Identifier Found: {}".format(identifier))

        log.debug("Creating containing dirs")

        log.debug("Saving content")
        content_target = BLUEPRINT.config['_LTS_FS'].new_file(_id=identifier)
        args['content'].save(content_target)
        content_target.close()
        log.debug("Content saved")
        log.debug("Adding ingest event to PREMIS record")
        add_ingest_event(premis_rec)
        log.debug("Ingest event added")
        log.debug("Writing PREMIS to tmp disk")
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_premis_path = str(Path(tmpdir, uuid4().hex))
            premis_rec.write_to_file(tmp_premis_path)
            premis_target = BLUEPRINT.config['_PREMIS_FS'].new_file(
                _id=identifier)
            with open(tmp_premis_path, 'rb') as f:
                premis_target.write(f.read())
                premis_target.close()
        log.debug("PREMIS written")
        return {"created": API.url_for(MaterialSuite, id=identifier)}
    def test_duplicate_identifier_fails(self):
        identifier = EventIdentifier("stupid", "1")
        event_one = Event(identifier, "something", "now")
        event_two = Event(identifier, "something else", "later")
        with self.assertRaises(pypremislib.DuplicateIdentifierError):
            PremisRecord(events=[event_one, event_two])
    def setUp(self):
        """Import the 'kitchen-sink' XML file"""

        global kitchen_sink
        kitchen_sink = PremisRecord(frompath='kitchen-sink.xml')
Example #10
    def post(self):
        def retrieve_obj_id(rec):
            obj = rec.get_object_list()[0]
            obj_id = obj.get_objectIdentifier()[0]
            obj_id_value = obj_id.get_objectIdentifierValue()
            return obj_id_value

        log.info("POST received.")
        log.debug("Parsing arguments")
        parser = reqparse.RequestParser()
        parser.add_argument("md5",
                            required=True,
                            help="The md5 checksum of the file.",
                            type=str)
        parser.add_argument("name",
                            help="The name of the resource",
                            type=str,
                            default=None,
                            required=False)
        parser.add_argument(
            "file",
            required=True,
            help="The file to put into the Long Term Storage environment.",
            type=FileStorage,
            location="files")
        parser.add_argument("accession_id",
                            required=True,
                            help="The accession to which this file belongs",
                            type=str)
        args = parser.parse_args()
        log.debug("Arguments parsed")

        # Set up a little working environment, a tmpdir to write files into
        log.debug("Creating a temporary directory to work in.")
        _tmpdir = tempfile.TemporaryDirectory()
        tmpdir = _tmpdir.name

        # Make a placeholder path - note we never use the client provided
        # filename to instantiate the file _ever_ in order to avoid security
        # considerations that would entail.
        in_file_path = str(Path(tmpdir, uuid4().hex))

        # Save the file to a tmp location
        log.debug("Saving file into tmpdir")
        args['file'].save(in_file_path)

        # Generate a baseline md5 of what we now have saved...
        log.info("Generating md5 of received file")
        md5 = None
        with open(in_file_path, 'rb') as f:
            hasher = _md5()
            data = f.read(65536)
            while data:
                hasher.update(data)
                data = f.read(65536)
            md5 = hasher.hexdigest()

        # Be sure it matches what the client provided off the bat
        # TODO: handle failure differently than raising an exception in the
        # future.
        log.info("md5 calculated for file: {}".format(md5))
        if md5 == args['md5']:
            log.debug("md5 matches provided md5")
        else:
            log.critical(
                "md5 mismatch. " +
                "Calculated: {} | Provided: {}".format(md5, args['md5']))
            abort(500)

        # Kick the file off the PREMISer, as defined in the config
        log.debug("Transmitting file to PREMISer")
        with open(in_file_path, 'rb') as f:
            data = {"md5": md5}
            if args.get("name"):
                data['originalName'] = args['name']
            # The file part must be attached unconditionally, not only when
            # a name was provided.
            data['file'] = ('file', f)
            premis_response_multipart_encoder = MultipartEncoder(data)
            premis_response = requests.post(
                BLUEPRINT.config['PREMIS_ENDPOINT'],
                data=premis_response_multipart_encoder,
                headers={
                    "Content-Type":
                    premis_response_multipart_encoder.content_type
                },
                stream=True)
            try:
                premis_response.raise_for_status()
            except requests.HTTPError:
                log.critical("Error in transmission to or response from " +
                             "PREMISer")
                abort(500)
            try:
                premis_str = premis_response.content.decode("utf-8")
            except UnicodeDecodeError:
                log.critical("Response from PREMISer could not be " +
                             "decoded as utf-8")
                abort(500)

        # Instantiate the PREMIS file we got back, again as a random filename in
        # our working dir
        log.debug("Instantiating PREMIS file")
        premis_path = str(Path(tmpdir, uuid4().hex))
        with open(premis_path, 'w') as f:
            f.write(premis_str)

        log.debug("Reading PREMIS file...")
        # Grab the ID the PREMISer minted
        rec = PremisRecord(frompath=premis_path)
        objID = retrieve_obj_id(rec)
        log.debug("Retrieved PREMIS ID: {}".format(objID))

        # POST the file and the PREMIS up into the materialsuite endpoint
        log.debug("POSTing file to materialsuite endpoint")
        ingest_output = None
        with open(in_file_path, 'rb') as content_stream:
            with open(premis_path, 'rb') as premis_stream:
                materialsuite_multipart_encoder = MultipartEncoder({
                    "content": ('content', content_stream),
                    "premis": ('premis', premis_stream)
                })
                ms_response = requests.post(
                    BLUEPRINT.config['MATERIALSUITE_ENDPOINT'],
                    data=materialsuite_multipart_encoder,
                    headers={
                        'Content-Type':
                        materialsuite_multipart_encoder.content_type
                    },
                    stream=True)
                try:
                    ms_response.raise_for_status()
                except requests.HTTPError:
                    log.critical("Error in response from materialsuite " +
                                 "endpoint")
                    abort(500)
                try:
                    ingest_output = ms_response.json()
                except ValueError:
                    log.critical("Response from materialsuite endpoint " +
                                 "could not be interpreted as JSON")
                    abort(500)

        # Check to see if the accession identifier exists
        log.debug("Checking the acc exists in the id nest")
        acc_output = {}
        target_acc_url = BLUEPRINT.config['ACCS_ENDPOINT'] + args[
            'accession_id'] + "/"
        acc_exists = requests.head(target_acc_url).status_code == 200
        if not acc_exists:
            log.critical("Acc specified ({}) doesn't exist".format(
                args['accession_id']))
            abort(500)
        else:
            log.debug("Acc identifier ({}) detected in id nest".format(
                args['accession_id']))

        # Add the id to the acc record

        log.debug("Adding member to acc")
        acc_response = requests.post(BLUEPRINT.config['ACCS_ENDPOINT'] +
                                     args['accession_id'] + "/",
                                     data={"member": objID})
        try:
            acc_response.raise_for_status()
        except requests.HTTPError:
            log.critical("Problem with the response from the idnest")
            abort(500)
        try:
            acc_output["member_addition"] = acc_response.json()
        except ValueError:
            log.critical("Response from the idnest could not be " +
                         "interpreted as JSON")
            abort(500)

        log.debug("Cleaning up tmpdir")
        # Cleanup
        del _tmpdir

        return {
            "status": "success",
            "ingest_output": ingest_output,
            "acc_output": acc_output
        }
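A hypothetical client call against the ingest endpoint above, supplying the four parsed arguments (md5, name, file, accession_id); the URL and values are placeholders.

import hashlib
import requests

# Hypothetical client for the ingest endpoint; URL and values are placeholders.
with open("report.pdf", "rb") as f:
    file_md5 = hashlib.md5(f.read()).hexdigest()
    f.seek(0)
    resp = requests.post("http://localhost:5000/ingest",
                         data={"md5": file_md5,
                               "name": "report.pdf",
                               "accession_id": "acc-0001"},
                         files={"file": f})
resp.raise_for_status()
print(resp.json()["status"])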
Example #11
def main(longterm, arkid, num_files_per_segment=None):
    reader = FileSystemArchiveReader(longterm, arkid)
    archive = reader.read()
    landing_page = namedtuple(
        "landing_page",
        "accession_id collection_title description segments accession_record")
    msuites = archive.get_materialsuite_list()
    tally = 0
    current_bunch = []
    all_bunches = []
    if not num_files_per_segment:
        num_files_per_segment = 5
    total_msuites = len(msuites)
    count = 0
    for msuite in msuites:
        count += 1
        precord = PremisRecord(frompath=str(msuite.premis.path))
        original_name = precord.get_object_list()[0].get_originalName()
        an_item = namedtuple("an_item", "name contenturl premisurl")
        n_tuple = an_item(
            original_name,
            URL_BASE + arkid + "/" + msuite.identifier + "/content",
            URL_BASE + arkid + "/" + msuite.identifier + "/premis")

        current_bunch.append(n_tuple)
        tally += 1
        if len(current_bunch) == num_files_per_segment:
            tally = 0
            all_bunches.append(current_bunch)
            current_bunch = []
        elif count == total_msuites:
            all_bunches.append(current_bunch)
    tally_bunch = 0
    total = 0
    pages = list(range(len(all_bunches) + 1))
    pages_dict = {}
    for page in pages:
        pages_dict[str(page)] = {"active": True, "startPoint": True}
    json_string = dumps(pages_dict)
    mkdir("./" + arkid)
    for n_bunch in all_bunches:
        tally_id = "{}.html".format(tally_bunch)
        total += len(n_bunch)
        tally_bunch += 1
        pages_dict[str(tally_bunch)]['active'] = True
        pages_dict[str(tally_bunch)]['startPoint'] = True
        pages_dict[str(tally_bunch)]['numfiles'] = len(n_bunch)
        segment_template = ENV.get_template("section_list.html")
        segment_html = segment_template.render(arkid=arkid,
                                               label=str(tally_bunch),
                                               files=n_bunch,
                                               pagerecord=json_string)
        with open(join(arkid, tally_id), "w") as a_file_to_write:
            a_file_to_write.write(segment_html)
    landing_template = ENV.get_template("accession_landing.html")
    pages = sorted(
        [(x, pages_dict[x].get("numfiles"))
         for x in pages_dict.keys() if pages_dict[x].get("numfiles")],
        key=lambda x: int(x[0]))
    landing_html = landing_template.render(
        arkid=arkid,
        pages=pages,
        accessions=[
            ACCESSION_URL_BASE + "/accession/" + basename(str(x.path))
            for x in archive.get_accessionrecord_list()
        ],
        legalnotes=[
            ACCESSION_URL_BASE + "/legalnote/" + basename(str(x.path))
            for x in archive.get_legalnote_list()
        ],
        adminnotes=[
            ACCESSION_URL_BASE + "/adminnote/" + basename(str(x.path))
            for x in archive.get_adminnote_list()
        ])
    with open(join(arkid, "index.html"), "w") as write_file:
        write_file.write(landing_html)
    return 0
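A hypothetical invocation of main; the long-term storage root and archive identifier are placeholders.

# Hypothetical invocation; the storage root and identifier are placeholders.
if __name__ == "__main__":
    raise SystemExit(main("/data/longterm", "exampleark123",
                          num_files_per_segment=10))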