예제 #1
0
 def test00_dump_zip_resource_list(self):
     """Dump a two-resource ResourceDumpManifest to a zip file.

     Calls ``Dump.write_zip`` with keyword arguments and checks that the
     result exists, is a valid zip archive and holds three entries
     (presumably the two resources plus a manifest -- only the count is
     asserted here).
     """
     rl = ResourceDumpManifest()
     rl.add(Resource('http://ex.org/a', length=7, path='tests/testdata/a'))
     rl.add(Resource('http://ex.org/b', length=21, path='tests/testdata/b'))
     d = Dump()
     zipf = os.path.join(self.tmpdir, "test00_dump.zip")
     d.write_zip(resources=rl, dumpfile=zipf)  # named args
     self.assertTrue(os.path.exists(zipf))
     self.assertTrue(zipfile.is_zipfile(zipf))
     # The context manager closes the archive even when the assertion
     # fails, so no open handle is left behind on a failing run.
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(len(zo.namelist()), 3)
     os.unlink(zipf)
예제 #2
0
 def test01_dump_zip_change_list(self):
     """Dump a two-resource ChangeDumpManifest to a zip file.

     Calls ``Dump.write_zip`` with positional arguments and checks that the
     result exists, is a valid zip archive and holds three entries
     (presumably the two resources plus a manifest -- only the count is
     asserted here).
     """
     cl = ChangeDumpManifest()
     cl.add(Resource('http://ex.org/a', length=7, path='resync/test/testdata/a', change="updated"))
     cl.add(Resource('http://ex.org/b', length=21, path='resync/test/testdata/b', change="updated"))
     d = Dump()
     zipf = os.path.join(self.tmpdir, "test01_dump.zip")
     d.write_zip(cl, zipf)  # positional args
     self.assertTrue(os.path.exists(zipf))
     self.assertTrue(zipfile.is_zipfile(zipf))
     # The context manager closes the archive even when the assertion
     # fails, so no open handle is left behind on a failing run.
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(len(zo.namelist()), 3)
     os.unlink(zipf)
예제 #3
0
 def test00_dump_zip_resource_list(self):
     """Dump a two-resource ResourceDumpManifest to a zip file.

     Calls ``Dump.write_zip`` with keyword arguments and checks that the
     result exists, is a valid zip archive and holds three entries
     (presumably the two resources plus a manifest -- only the count is
     asserted here).
     """
     rl = ResourceDumpManifest()
     rl.add(Resource('http://ex.org/a', length=7, path='resync/test/testdata/a'))
     rl.add(Resource('http://ex.org/b', length=21, path='resync/test/testdata/b'))
     d = Dump()
     zipf = os.path.join(self.tmpdir, "test00_dump.zip")
     d.write_zip(resources=rl, dumpfile=zipf)  # named args
     self.assertTrue(os.path.exists(zipf))
     self.assertTrue(zipfile.is_zipfile(zipf))
     # The context manager closes the archive even when the assertion
     # fails, so no open handle is left behind on a failing run.
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(len(zo.namelist()), 3)
     os.unlink(zipf)
예제 #4
0
 def test01_dump_zip_change_list(self):
     """Dump a two-resource ChangeDumpManifest to a zip file.

     Calls ``Dump.write_zip`` with positional arguments and checks that the
     result exists, is a valid zip archive and holds three entries
     (presumably the two resources plus a manifest -- only the count is
     asserted here).
     """
     cl = ChangeDumpManifest()
     cl.add(Resource('http://ex.org/a', length=7,
                     path='tests/testdata/a', change="updated"))
     cl.add(Resource('http://ex.org/b', length=21,
                     path='tests/testdata/b', change="updated"))
     d = Dump()
     zipf = os.path.join(self.tmpdir, "test01_dump.zip")
     d.write_zip(cl, zipf)  # positional args
     self.assertTrue(os.path.exists(zipf))
     self.assertTrue(zipfile.is_zipfile(zipf))
     # The context manager closes the archive even when the assertion
     # fails, so no open handle is left behind on a failing run.
     with zipfile.ZipFile(zipf, 'r') as zo:
         self.assertEqual(len(zo.namelist()), 3)
     os.unlink(zipf)
예제 #5
0
        def generator() -> [SitemapData, Resource]:
            """Yield one Resource per zip dump written from the resource stream.

            Resources produced by ``self.resource_generator()`` are collected
            in a ResourceDump. Each time the running count reported by the
            resource generator is a multiple of
            ``self.param.max_items_in_list`` the batch is written to
            ``rd_<ordinal>.zip`` (path from ``self.param.abs_metadata_path``)
            and a Resource whose uri is that path is yielded. A trailing
            partial batch is written and yielded after the loop.

            In both cases the zip is written *before* the Resource is yielded,
            so the file exists by the time the consumer sees it.
            """
            resourcedump = None
            ordinal = self.find_ordinal(Capability.resourcedump.name)
            resource_generator = self.resource_generator()
            for resource_count, resource in resource_generator(resource_metadata):
                # Start a fresh batch lazily, on the first resource it receives.
                if resourcedump is None:
                    resourcedump = ResourceDump()
                    resourcedump.md_at = defaults.w3c_now()

                resourcedump.add(resource)

                # Batch is full: write it out and hand the zip to the consumer.
                if resource_count % self.param.max_items_in_list == 0:
                    ordinal += 1
                    doc_end = defaults.w3c_now()
                    resourcedump.md_completed = doc_end
                    d = Dump(resources = resourcedump)
                    zipf = self.param.abs_metadata_path("rd_" + str(ordinal) + ".zip")

                    print (str(zipf))
                    d.write_zip(resources=resourcedump, dumpfile=zipf)
                    dumpResource = Resource(uri=str(zipf))
                    yield dumpResource
                    resourcedump = None

            # Trailing partial batch, if any.
            if resourcedump:
                ordinal += 1
                doc_end = defaults.w3c_now()
                resourcedump.md_completed = doc_end
                d = Dump()
                zipf = self.param.abs_metadata_path("rd_" + str(ordinal) + ".zip")
                print (str(zipf))
                # Write first, then yield: previously the zip was written only
                # after the consumer resumed the generator, so the yielded
                # Resource pointed at a file that did not exist yet (and never
                # would if the generator was not resumed).
                d.write_zip(resources=resourcedump, dumpfile=zipf)
                dumpResource = Resource(uri=str(zipf))
                yield dumpResource
예제 #6
0
    def create_zip(self,
                   resourcelist,
                   prefix,
                   write_list=False,
                   write_manifest=True):
        """
        Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
        will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the
        zip.
        --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
            resourcelist into paths relative in zip.
        :param resourcelist: resources to zip
        :param prefix: prefix of the zip file
        :param write_list: True if resourcelist should be written to local disc. Default: False
        :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
        :return: the created zip as a resync.Resource.
        """

        # md_at / md_completed are deliberately left unset: the attributes get
        # lost in a read > write cycle with the resync library.
        md_at = None
        md_completed = None

        # Continue numbering after the last existing zip with this prefix.
        # The index is the first run of digits found in that file's basename.
        index = -1
        zipfiles = sorted(
            glob(os.path.join(self.publish_dir, prefix + "*.zip")))
        if zipfiles:
            basename = os.path.basename(zipfiles[-1])
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            index = int(re.findall(r'\d+', basename)[0])

        zip_name = "%s%05d" % (prefix, index + 1)
        if write_list:
            # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
            # Must happen before write_zip, which rewrites the paths.
            rl_path = os.path.join(self.publish_dir, zip_name + ".xml")
            with open(rl_path, "w") as rl_file:
                rl_file.write(resourcelist.as_xml())

        zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
        dump = Dump()
        dump.path_prefix = self.resource_dir
        dump.write_zip(resourcelist,
                       zip_path)  # paths in resourcelist will be stripped.

        loc = self.publish_url + zip_name + ".zip"  # mandatory
        lastmod = self.last_modified(resourcelist)  # optional
        md_type = "application/zip"  # recommended
        md_length = os.stat(zip_path).st_size
        md5 = compute_md5_for_file(zip_path)

        zip_resource = Resource(uri=loc,
                                lastmod=lastmod,
                                length=md_length,
                                md5=md5,
                                mime_type=md_type,
                                md_at=md_at,
                                md_completed=md_completed)
        if write_manifest:
            # Separate manifest on disc, built from the (now zip-relative)
            # resources and linked from the zip resource as rel="content".
            rdm = ResourceDumpManifest(resources=resourcelist.resources)
            rdm_path = os.path.join(self.publish_dir,
                                    PREFIX_MANIFEST + zip_name + ".xml")
            rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
            with open(rdm_path, "w") as rdm_file:
                rdm_file.write(rdm.as_xml())
            zip_resource.link_set(rel="content", href=rdm_url)

        return zip_resource
예제 #7
0
        def generator(changedump=None) -> [SitemapData, ChangeDump]:
            """Yield ``(sitemap_data, ChangeDump)`` pairs for zipped change batches.

            Current resources (from ``self.resource_generator()``) are compared
            with ``self.previous_resources`` by uri and md5 and classified as
            created, updated, deleted or unchanged. Observers are informed of
            the counts. Created/updated/deleted resources are then added to a
            ChangeDump; each time the running count is a multiple of
            ``self.param.max_items_in_list`` the batch is written to
            ``cd_<ordinal>.zip`` and yielded. A trailing partial batch is
            written and yielded after the loop.

            :param changedump: optional existing change dump to continue
                filling; when it already holds ``max_items_in_list`` or more
                items a new one is started instead.
            """

            resource_generator = self.resource_generator()
            self.update_previous_state()
            prev_r = self.previous_resources
            curr_r = {
                resource.uri: resource
                for count, resource in resource_generator(resource_metadata)
            }
            created = [r for r in curr_r.values() if r.uri not in prev_r]
            updated = [
                r for r in curr_r.values()
                if r.uri in prev_r and r.md5 != prev_r[r.uri].md5
            ]
            deleted = [r for r in prev_r.values() if r.uri not in curr_r]
            unchang = [
                r for r in curr_r.values()
                if r.uri in prev_r and r.md5 == prev_r[r.uri].md5
            ]

            # remove lastmod from deleted resource metadata
            for resource in deleted:
                resource.lastmod = None

            num_created = len(created)
            num_updated = len(updated)
            num_deleted = len(deleted)
            tot_changes = num_created + num_updated + num_deleted
            self.observers_inform(self,
                                  ExecutorEvent.found_changes,
                                  created=num_created,
                                  updated=num_updated,
                                  deleted=num_deleted,
                                  unchanged=len(unchang))
            all_changes = {
                "created": created,
                "updated": updated,
                "deleted": deleted
            }

            ordinal = self.find_ordinal(Capability.changedump.name)

            resource_count = 0
            if changedump:
                # Continuing an existing dump: step the ordinal back so the
                # increment at write time lands on the same number again,
                # unless that dump is already full.
                ordinal -= 1
                resource_count = len(changedump)
                if resource_count >= self.param.max_items_in_list:
                    changedump = None
                    ordinal += 1
                    resource_count = 0

            for change, resources in all_changes.items():
                for resource in resources:
                    if changedump is None:
                        changedump = ChangeDump()
                        changedump.md_from = self.date_changedump_from

                    # type of change: created, updated or deleted
                    resource.change = change
                    resource.md_datetime = self.date_start_processing
                    changedump.add(resource)

                    resource_count += 1

                    # under conditions: yield the current changedump
                    if resource_count % self.param.max_items_in_list == 0:
                        ordinal += 1
                        d = Dump(resources=changedump)
                        zipf = self.param.abs_metadata_path("cd_" +
                                                            str(ordinal) +
                                                            ".zip")
                        print(str(zipf))
                        d.write_zip(resources=changedump, dumpfile=zipf)
                        doc_end = defaults.w3c_now()

                        sitemap_data = self.finish_sitemap(
                            ordinal,
                            changedump,
                            doc_start=self.date_start_processing,
                            doc_end=doc_end)
                        dumpResource = ChangeDump(uri=str(zipf))
                        yield sitemap_data, dumpResource
                        changedump = None

            # under conditions: yield the current and last changedump
            if changedump and tot_changes > 0:
                ordinal += 1
                doc_end = defaults.w3c_now()
                changedump.md_completed = doc_end
                d = Dump()
                zipf = self.param.abs_metadata_path("cd_" + str(ordinal) +
                                                    ".zip")
                print(str(zipf))
                sitemap_data = self.finish_sitemap(
                    ordinal,
                    changedump,
                    doc_start=self.date_start_processing,
                    doc_end=doc_end)
                # Write first, then yield: previously the zip was written only
                # after the consumer resumed the generator, so the yielded
                # ChangeDump pointed at a file that did not exist yet (and
                # never would if the generator was not resumed).
                d.write_zip(resources=changedump, dumpfile=zipf)
                dumpResource = ChangeDump(uri=str(zipf))
                yield sitemap_data, dumpResource
예제 #8
0
    def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
        """
        Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
        will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the
        zip.
        --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
            resourcelist into paths relative in zip.
        :param resourcelist: resources to zip
        :param prefix: prefix of the zip file
        :param write_list: True if resourcelist should be written to local disc. Default: False
        :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
        :return: the created zip as a resync.Resource.
        """

        # md_at / md_completed are deliberately left unset: the attributes get
        # lost in a read > write cycle with the resync library.
        md_at = None
        md_completed = None

        # Continue numbering after the last existing zip with this prefix.
        # The index is the first run of digits found in that file's basename.
        index = -1
        zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
        if zipfiles:
            basename = os.path.basename(zipfiles[-1])
            # Raw string: "\d" in a plain literal is an invalid escape sequence.
            index = int(re.findall(r"\d+", basename)[0])

        zip_name = "%s%05d" % (prefix, index + 1)
        if write_list:
            # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
            # Must happen before write_zip, which rewrites the paths.
            rl_path = os.path.join(self.publish_dir, zip_name + ".xml")
            with open(rl_path, "w") as rl_file:
                rl_file.write(resourcelist.as_xml())

        zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
        dump = Dump()
        dump.path_prefix = self.resource_dir
        dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.

        loc = self.publish_url + zip_name + ".zip"  # mandatory
        lastmod = self.last_modified(resourcelist)  # optional
        md_type = "application/zip"  # recommended
        md_length = os.stat(zip_path).st_size
        md5 = compute_md5_for_file(zip_path)

        zip_resource = Resource(
            uri=loc,
            lastmod=lastmod,
            length=md_length,
            md5=md5,
            mime_type=md_type,
            md_at=md_at,
            md_completed=md_completed,
        )
        if write_manifest:
            # Separate manifest on disc, built from the (now zip-relative)
            # resources and linked from the zip resource as rel="content".
            rdm = ResourceDumpManifest(resources=resourcelist.resources)
            rdm_path = os.path.join(self.publish_dir, PREFIX_MANIFEST + zip_name + ".xml")
            rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
            with open(rdm_path, "w") as rdm_file:
                rdm_file.write(rdm.as_xml())
            zip_resource.link_set(rel="content", href=rdm_url)

        return zip_resource