예제 #1
0
    def create_index(self, sitemap_data_iter: iter) -> "SitemapData | None":
        """Rebuild ``changelist-index.xml`` from the changelists on disk.

        An existing index file is removed first. A new index is only produced
        when more than one ``changelist_*.xml`` file is found in the metadata
        directory; it gets one entry per changelist carrying that changelist's
        ``md_from`` / ``md_until``. When sitemaps are being saved, every
        changelist that has no ``index`` link yet is given one pointing at the
        index and is written back.

        :param sitemap_data_iter: not used by this implementation
        :return: the value returned by ``self.finish_sitemap`` for the index,
            or ``None`` when no index was needed
        """
        changelist_index_path = self.param.abs_metadata_path(
            "changelist-index.xml")
        changelist_index_uri = self.param.uri_from_path(changelist_index_path)

        # Remove a stale index; tolerate its absence instead of checking
        # first, so there is no window between the check and the removal.
        try:
            os.remove(changelist_index_path)
        except FileNotFoundError:
            pass

        changelist_files = sorted(
            glob(self.param.abs_metadata_path("changelist_*.xml")))
        if len(changelist_files) > 1:
            changelist_index = ChangeList()
            changelist_index.sitemapindex = True
            changelist_index.md_from = self.date_resourcelist_completed
            for cl_file in changelist_files:
                changelist = self.read_sitemap(cl_file, ChangeList())
                uri = self.param.uri_from_path(cl_file)
                changelist_index.resources.append(
                    Resource(uri=uri,
                             md_from=changelist.md_from,
                             md_until=changelist.md_until))

                # Link the changelist back to the index, but only rewrite
                # files that do not carry such a link yet.
                if (self.param.is_saving_sitemaps
                        and changelist.link("index") is None):
                    changelist.link_set(rel="index",
                                        href=changelist_index_uri)
                    self.save_sitemap(changelist, cl_file)

            # NOTE(review): ordinal -1 presumably marks "index, no ordinal"
            # for finish_sitemap - confirm against its implementation.
            return self.finish_sitemap(-1, changelist_index)

        return None
예제 #2
0
    def all_resources(self):
        """Collect the resources described by the sitemaps on disk.

        All ``resourcelist_*.xml`` files in the metadata directory are read
        first, in sorted filename order. Then all ``changelist_*.xml`` files
        are replayed on top of them: ``created`` and ``updated`` entries set
        the resource for their uri, ``deleted`` entries remove it.

        :return: dict mapping resource uri to the last resource seen for it
        """
        def load(path, container):
            # Parse the sitemap document at *path* into *container*.
            with open(path, "r", encoding="utf-8") as handle:
                Sitemap().parse_xml(handle, resources=container)
            return container

        by_uri = {}

        # baseline: everything listed in the resourcelists
        rl_pattern = self.paras.abs_metadata_path("resourcelist_*.xml")
        for path in sorted(glob(rl_pattern)):
            for entry in load(path, ResourceList()).resources:
                by_uri[entry.uri] = entry

        # replay the changelists on top of the baseline
        cl_pattern = self.paras.abs_metadata_path("changelist_*.xml")
        for path in sorted(glob(cl_pattern)):
            for entry in load(path, ChangeList()).resources:
                if entry.change in ("created", "updated"):
                    by_uri[entry.uri] = entry
                elif entry.change == "deleted":
                    by_uri.pop(entry.uri, None)

        return by_uri
예제 #3
0
 def post_process_documents(self, sitemap_data_iter: iter):
     """Set ``md:until`` on older changelists that are still open.

     Only acts when new documents were produced (``sitemap_data_iter`` is
     not empty) and sitemaps are being saved. Each file in
     ``self.changelist_files`` whose changelist has no ``md_until`` gets
     ``self.date_start_processing`` as its ``md_until`` and is saved again.

     :param sitemap_data_iter: sized collection of the sitemap data created
         in this run
     """
     if len(sitemap_data_iter) == 0 or not self.param.is_saving_sitemaps:
         return

     # self.changelist_files was globbed before the new documents were
     # generated (self.update_previous_state), so it only names older files.
     for path in self.changelist_files:
         older = self.read_sitemap(path, ChangeList())
         if older.md_until is not None:
             continue
         older.md_until = self.date_start_processing
         self.save_sitemap(older, path)
예제 #4
0
    def generate_rs_documents(self, resource_metadata: iter):
        """Generate changelists for ``resource_metadata`` and collect their data.

        Refreshes the previous state, sets ``self.date_changelist_from`` to
        ``self.date_resourcelist_completed`` and, when changelist files
        exist, reads the last one so the generator can continue with it.

        :param resource_metadata: passed on to ``self.changelist_generator``
        :return: list with the sitemap data of every changelist yielded
        """
        self.update_previous_state()
        self.date_changelist_from = self.date_resourcelist_completed

        # continue with the most recent changelist on disk, if there is one
        latest = (self.read_sitemap(self.changelist_files[-1], ChangeList())
                  if len(self.changelist_files) > 0 else None)

        produce = self.changelist_generator(resource_metadata)
        return [sitemap_data for sitemap_data, _ in produce(changelist=latest)]
예제 #5
0
    def update_previous_state(self):
        """Load the state recorded by the sitemaps on disk, once.

        Does nothing when ``self.previous_resources`` is already set.
        Otherwise it reads every ``resourcelist_*.xml`` and then every
        ``changelist_*.xml`` in the metadata directory, in sorted filename
        order, and fills:

        - ``self.previous_resources``: uri -> resource, after applying the
          ``created`` / ``updated`` / ``deleted`` entries of the changelists
        - ``self.resourcelist_files`` and ``self.changelist_files``: the
          sorted file names that were found
        - ``self.date_resourcelist_completed``: ``md_completed`` of the last
          resourcelist read, or its ``md_at`` when ``md_completed`` is None
        """
        if self.previous_resources is not None:
            return

        def parse_into(path, container):
            # Parse the sitemap document at *path* into *container*.
            with open(path, "r", encoding="utf-8") as handle:
                Sitemap().parse_xml(handle, resources=container)
            return container

        self.previous_resources = {}

        # baseline: everything listed in the resourcelists
        rl_pattern = self.param.abs_metadata_path("resourcelist_*.xml")
        self.resourcelist_files = sorted(glob(rl_pattern))
        for path in self.resourcelist_files:
            resourcelist = parse_into(path, ResourceList())

            completed = resourcelist.md_completed
            if completed is None:
                completed = resourcelist.md_at
            self.date_resourcelist_completed = completed

            for entry in resourcelist.resources:
                self.previous_resources[entry.uri] = entry

        # replay the changelists on top of the baseline
        cl_pattern = self.param.abs_metadata_path("changelist_*.xml")
        self.changelist_files = sorted(glob(cl_pattern))
        for path in self.changelist_files:
            for entry in parse_into(path, ChangeList()).resources:
                if entry.change in ("created", "updated"):
                    self.previous_resources[entry.uri] = entry
                elif entry.change == "deleted":
                    self.previous_resources.pop(entry.uri, None)
예제 #6
0
        def generator(changelist=None) -> [SitemapData, ChangeList]:
            """Yield ``(sitemap_data, changelist)`` for every changelist produced.

            The current resources (from ``self.resource_generator()`` applied
            to ``resource_metadata``) are compared with
            ``self.previous_resources`` by uri and md5 and split into
            created, updated, deleted and unchanged. Observers are informed
            of the four counts. Created, updated and deleted resources are
            then added, in that order, to changelists holding at most
            ``self.param.max_items_in_list`` entries each.

            :param changelist: an existing changelist to continue with; a
                non-empty one is extended under its own ordinal unless it
                already holds ``max_items_in_list`` entries
            """
            make_resources = self.resource_generator()
            self.update_previous_state()
            previous = self.previous_resources
            current = {
                res.uri: res
                for _, res in make_resources(resource_metadata)
            }

            created = [
                res for res in current.values() if res.uri not in previous
            ]
            known = [res for res in current.values() if res.uri in previous]
            updated = [
                res for res in known if res.md5 != previous[res.uri].md5
            ]
            deleted = [
                res for res in previous.values() if res.uri not in current
            ]
            unchanged = [
                res for res in known if res.md5 == previous[res.uri].md5
            ]

            # remove lastmod from deleted resource metadata
            for gone in deleted:
                gone.lastmod = None

            total_changes = len(created) + len(updated) + len(deleted)
            self.observers_inform(self,
                                  ExecutorEvent.found_changes,
                                  created=len(created),
                                  updated=len(updated),
                                  deleted=len(deleted),
                                  unchanged=len(unchanged))

            ordinal = self.find_ordinal(Capability.changelist.name)

            # A changelist handed in is continued under its own ordinal
            # (one step back), unless it is already full.
            filled = 0
            if changelist:
                filled = len(changelist)
                if filled >= self.param.max_items_in_list:
                    changelist = None
                    filled = 0
                else:
                    ordinal -= 1

            changes_by_type = (
                ("created", created),
                ("updated", updated),
                ("deleted", deleted),
            )
            for change_type, resources in changes_by_type:
                for res in resources:
                    if changelist is None:
                        changelist = ChangeList()
                        changelist.md_from = self.date_changelist_from

                    res.change = change_type
                    res.md_datetime = self.date_start_processing
                    changelist.add(res)
                    filled += 1

                    # the changelist is full: finish it and hand it out
                    if filled % self.param.max_items_in_list == 0:
                        ordinal += 1
                        yield self.finish_sitemap(ordinal,
                                                  changelist), changelist
                        changelist = None

            # hand out the last, partly filled changelist if anything changed
            if changelist and total_changes > 0:
                ordinal += 1
                yield self.finish_sitemap(ordinal, changelist), changelist