def post_pipeline(self):
    """Report the ``seen_count`` and ``changed_count`` counters.

    The counters are logged (and also attached to the log record through
    ``extra``) and echoed to stdout behind a banner of 500 asterisks.
    """
    seen, changed = self.seen_count, self.changed_count
    logger.info("Seen: %s, changed: %s", seen, changed,
                extra=dict(seen_count=seen, changed_count=changed))
    # A single pre-formatted argument in parentheses prints the same line
    # as the former ``print a, b, c`` statement on Python 2 and is also
    # valid syntax on Python 3.
    print("%s %s %s" % ("*" * 500, seen, changed))
Пример #2
0
 def post_pipeline(self):
     """Report the ``seen_count`` and ``changed_count`` counters.

     The counters are logged (and also attached to the log record through
     ``extra``) and echoed to stdout behind a banner of 500 asterisks.
     """
     seen, changed = self.seen_count, self.changed_count
     logger.info("Seen: %s, changed: %s", seen, changed,
                 extra=dict(seen_count=seen, changed_count=changed))
     # A single pre-formatted argument in parentheses prints the same line
     # as the former ``print a, b, c`` statement on Python 2 and is also
     # valid syntax on Python 3.
     print("%s %s %s" % ("*" * 500, seen, changed))
Пример #3
0
    def transmogrify(self, item):
        """Convert a pipeline item into a NITF item, splitting off its image.

        The item is mutated in place. When ``self.upload_prefix`` is set,
        the prefix is prepended to the item path and the result is stored
        under ``"_path"``. Items without a type key, or whose type is not
        listed in ``self.from_types``, are passed through unconverted.

        Returns:
            An ``(item, image)`` pair. ``image`` is ``None`` for items that
            were not converted, an empty dict for converted items without
            image data, and otherwise a new ``"Image"`` item whose path is
            placed below the converted item's path.
        """
        typekey = self.typekey(*item.keys())[0]
        path = self.get_path(item)
        if self.upload_prefix:
            path = self.upload_prefix + path
            item["_path"] = path
        if not typekey or item[typekey] not in self.from_types:
            return item, None
        item[typekey] = TO_TYPE
        # Description in NITF should contain an abstract of the article
        # on the contents being moved it usually contains a subtitle
        if self.subtitle_from_description:
            item["subtitle"] = item.pop("description", u"")
        # "creators" may be missing, None or empty: fall back to an empty
        # byline instead of failing on the [0] lookup.
        creators = item.get("creators") or [u""]
        item["byline"] = creators[0]
        item["genre"] = self.nitf_genre
        if self.pick_section_from_path != -1:
            item["section"] = path.split("/")[self.pick_section_from_path]
        elif self.nitf_section:
            item["section"] = self.nitf_section
        # NOTE(review): if typekey is "_type", the value compared here has
        # already been replaced by TO_TYPE above - confirm this is intended.
        elif item["_type"] == "Story":
            if item.get("section", None) not in self.SECTION_NAMES:
                item["section"] = u"Other"
            self.seen_sections.add(item["section"])

        item["urgency"] = self.nitf_urgency
        item["location"] = item.get("location", u"")
        item["subjects"] = item.get("subject", [])
        item["language"] = self.nitf_language

        # clears default page, since we are changing types
        # maybe add a configure option for this?
        item.pop("_defaultpage", None)

        transition_from_history(item)
        #fix_creators(item)
        logger.info("Converted item at %s to NITF", path)

        image = {}
        source_image = item.get("image", None)
        if source_image and source_image.get("data", ""):
            image["_type"] = "Image"
            image["description"] = item.pop("imageCaption", u"")
            name = source_image.get("filename", "")
            if not name:
                # "description" is always set just above, so a .get()
                # default would never apply; fall back explicitly when the
                # caption is empty.
                name = (image["description"] or "image")[:15] + ".jpg"
            name = normalize_name(name).strip("_")
            image["_path"] = path + "/" + name
            # NOTE(review): this runs on the article item a second time;
            # possibly it was meant to receive ``image`` - confirm.
            transition_from_history(item)
            image["creation_date"] = item.get("creation_date", None)
            image["modification_date"] = item.get("modification_date", None)
            image["image"] = source_image["data"]
            del item["image"]
            logger.info("Yielding separate image for item at %s", path)

        return item, image
 def __iter__(self):
     """Yield upstream items, draining the wormhole queue before each one.

     Whatever ``self.storage["wormhole"].iterpop()`` produces is logged
     and yielded ahead of every item coming from ``self.previous``. After
     the upstream is exhausted, the queue is drained one last time.
     """
     emerged = "Item %s emerged from the entrails of Deep Space"
     collapsing = ("Item %s emerged from the entrails of "
                   "Deep Space, even as the Universe collapses!")
     for upstream_item in self.previous:
         for traveler in self.storage["wormhole"].iterpop():
             origin = traveler.get("_path", u"<unknown>")
             logger.info(emerged % origin)
             yield traveler
         yield upstream_item
     # Final drain: items queued while the last upstream item was processed
     for traveler in self.storage["wormhole"].iterpop():
         origin = traveler.get("_path", u"<unknown>")
         logger.info(collapsing % origin)
         yield traveler
def get_remote_image(url, item, img_title="", pathkey="_path",
                     jsonmigrator=False):
    """Download the image at ``url`` and wrap it as a pipeline item.

    Args:
        url: address of the image; it is passed through ``_strip_view``
            before being fetched.
        item: the article item the image belongs to. Used for logging
            and, for non-JSON fetches, as the source of the dates and
            ``_transitions`` copied onto the image.
        img_title: title for the image. When empty, the file name up to
            its first dot is used. Not applied to JSON fetches.
        pathkey: key under which ``item`` stores its path (logging only).
        jsonmigrator: when true, ``<url>/get_item`` is fetched and the
            response decoded as JSON to become the image item, instead of
            wrapping the raw bytes. Switched off when ``_strip_view``
            reports that it did not work.

    Returns:
        ``(real_url, image, post_parts)`` on success, or
        ``(None, None, [])`` when the download or the JSON decoding fails.
    """
    # FIXME: not making an extra call to get the real pathkey
    logger.info("Fetching image %s for article %s ",
                url, item.get(pathkey, ""))
    # Strip plone view from URL:
    url, it_worked = _strip_view(url)
    # it won't work if the image url does not have a proper image
    if not it_worked:
        jsonmigrator = False
    json_suffix = "/get_item"
    if jsonmigrator:
        url += json_suffix
    try:
        http = urllib.urlopen(url)
        try:
            if http.code > 399:
                # we can't get the image here
                raise IOError("HTTP status %s" % http.code)
            image_data = http.read()
            real_url = http.url
        finally:
            # Always release the connection, even on a bad status
            http.close()
    except Exception as error:
        logger.error("Could not retrieve image at %s: %s - skipping",
                     url, error)
        return None, None, []

    if jsonmigrator and real_url.endswith(json_suffix):
        # Drop the trailing "/get_item"; a positive slice bound would keep
        # only the first characters of the URL instead.
        real_url = real_url[:-len(json_suffix)]
    image_filename, post_parts = _get_filename(real_url)

    if jsonmigrator:
        try:
            image = json.loads(image_data)
        except ValueError:
            logger.warning(
                "Could not retrieve image json contents at %s ", url)
            # Same failure result as a failed download
            return None, None, []
    else:
        # build object item for the pipeline
        image = {}
        image["_type"] = "Image"
        image["image"] = image_data

        image["creation_date"] = item.get("creation_date", None)
        image["modification_date"] = item.get("modification_date", None)
        image["_transitions"] = item.get("_transitions", "published")

        if not img_title:
            img_title = image_filename.split(".")[0].encode("utf-8")
        image["title"] = img_title

    image["_filename"] = image_filename
    return real_url, image, post_parts
Пример #6
0
 def __iter__(self):
     """Yield upstream items, draining the wormhole queue before each one.

     Whatever ``self.storage["wormhole"].iterpop()`` produces is logged
     and yielded ahead of every item coming from ``self.previous``. After
     the upstream is exhausted, the queue is drained one last time.
     """
     emerged = "Item %s emerged from the entrails of Deep Space"
     collapsing = ("Item %s emerged from the entrails of "
                   "Deep Space, even as the Universe collapses!")
     for upstream_item in self.previous:
         for traveler in self.storage["wormhole"].iterpop():
             origin = traveler.get("_path", u"<unknown>")
             logger.info(emerged % origin)
             yield traveler
         yield upstream_item
     # Final drain: items queued while the last upstream item was processed
     for traveler in self.storage["wormhole"].iterpop():
         origin = traveler.get("_path", u"<unknown>")
         logger.info(collapsing % origin)
         yield traveler
Пример #7
0
 def transmogrify(self, item):
     """Rewrite the item's path with the configured regular expression.

     The first match of ``self.from_regexp`` in the path is replaced by
     ``self.to``. The path the item arrived with is kept under
     ``"_orig_path"`` unless that key is already present.

     Raises:
         NothingToDoHere: when no regexp is configured, the item has no
             path key, or the substitution leaves the path unchanged.
     """
     pathkey = self.pathkey(*item.keys())[0]
     if not (self.from_regexp and pathkey):
         raise NothingToDoHere
     old_path = item[pathkey]
     rewritten = re.sub(self.from_regexp, self.to, old_path, count=1)
     if rewritten == old_path:
         raise NothingToDoHere
     if "_orig_path" not in item:
         item["_orig_path"] = old_path
     message = "Item %s path changed to %s" % (old_path, rewritten)
     logger.info(message)
     item[pathkey] = rewritten
     return item
Пример #8
0
 def __iter__(self):
     """Pass every item through, then set intids for the collected paths.

     The path of each upstream item is remembered while the item is
     yielded unchanged. After the upstream is exhausted the transaction
     is committed, every remembered path is traversed from
     ``self.transmogrifier.context`` and ``set_intid`` is called on each
     object found.
     """
     context = self.transmogrifier.context
     collected = []
     for item in self.previous:
         collected.append(self.get_path(item))
         yield item
     logger.info("Start setting intids")
     transaction.commit()
     for location in collected:
         # retrieve object; paths that no longer resolve are skipped
         target = context.unrestrictedTraverse(
             str(location).lstrip('/'), None)
         if target is None:
             continue
         intid = set_intid(target)
         logger.info("intid of %s set to %s" % (target, intid))
 def transmogrify(self, item):
     """Run ``normalize_string`` over the configured fields of an item.

     Only keys present in both ``self.fields`` and the item are touched.
     A tuple or list value is replaced by a list of its normalized
     members; any other value is normalized directly. A log line is
     written for every field whose stored value ends up different.

     Raises:
         NothingToDoHere: when the item's type is not in ``self.types``.
     """
     if self.get_type(item) not in self.types:
         raise NothingToDoHere
     for field in self.fields.intersection(item.keys()):
         before = item[field]
         if isinstance(before, (tuple, list)):
             after = [normalize_string(member) for member in before]
         else:
             after = normalize_string(before)
         item[field] = after
         if after != before:
             details = (field, self.get_path(item), after)
             logger.info(
                 "Field %s of item at %s modified to 7bit as %s" % details)
     return item
 def transmogrify(self, item):
     """Run ``normalize_string`` over the configured fields of an item.

     Only keys present in both ``self.fields`` and the item are touched.
     A tuple or list value is replaced by a list of its normalized
     members; any other value is normalized directly. A log line is
     written for every field whose stored value ends up different.

     Raises:
         NothingToDoHere: when the item's type is not in ``self.types``.
     """
     if self.get_type(item) not in self.types:
         raise NothingToDoHere
     for field in self.fields.intersection(item.keys()):
         before = item[field]
         if isinstance(before, (tuple, list)):
             after = [normalize_string(member) for member in before]
         else:
             after = normalize_string(before)
         item[field] = after
         if after != before:
             details = (field, self.get_path(item), after)
             logger.info(
                 "Field %s of item at %s modified to 7bit as %s" % details)
     return item
Пример #11
0
    def set_options(self):
        """Turn the class-level ``OPTIONS`` into attributes of this object.

        ``OPTIONS`` holds 1- to 4-tuples ``(name[, default[, type[, doc]]])``.
        ``type`` is ``"string"`` (the default) or ``"literal"``; string
        values of "literal" options are parsed with ``ast.literal_eval``.
        ``doc`` is carried along but not used here.
        TODO: generate blueprint docs from option docs.

        Each option is looked up in ``self.options`` by its name, then by
        the name with every "_" replaced by "-", and finally falls back to
        its default. The resolved values are logged and set as attributes;
        an error is logged when an attribute of that name already exists.
        """
        # Pad every option tuple to (name, default, type, doc)
        specs = []
        for opt in self.__class__.OPTIONS:
            size = len(opt)
            if size == 1:
                spec = (opt[0], None, "string", "")
            elif size == 2:
                spec = (opt[0], opt[1], "string", "")
            elif size == 3:
                spec = (opt[0], opt[1], opt[2], "")
            else:
                spec = (opt[0], opt[1], opt[2], opt[3])
            specs.append(spec)

        resolved = {}
        for name, default, type_, _doc in specs:
            dashed_name = name.replace("_", "-")
            fallback = self.options.get(dashed_name, default)
            value = self.options.get(name, fallback)
            if type_ == "literal" and isinstance(value, basestring):
                value = ast.literal_eval(value)
            resolved[name] = value
        summary = (self.name, pformat(resolved))
        logger.info(
            "Transmogrifier section %s configured with options:\n %s"
            % summary)

        for opt_name in sorted(resolved):
            value = resolved[opt_name]
            if hasattr(self, opt_name):
                clash = (self.name, opt_name, value)
                logger.error("Attention: Blueprint object in "
                             "section %s already has an attribute named %s - "
                             "overriding with option value %s - but this is "
                             "probably broken" % clash)
            setattr(self, opt_name, value)
Пример #12
0
    def set_options(self):
        """Turn the class-level ``OPTIONS`` into attributes of this object.

        ``OPTIONS`` holds 1- to 4-tuples ``(name[, default[, type[, doc]]])``.
        ``type`` is ``"string"`` (the default) or ``"literal"``; string
        values of "literal" options are parsed with ``ast.literal_eval``.
        ``doc`` is carried along but not used here.
        TODO: generate blueprint docs from option docs.

        Each option is looked up in ``self.options`` by its name, then by
        the name with every "_" replaced by "-", and finally falls back to
        its default. The resolved values are logged and set as attributes;
        an error is logged when an attribute of that name already exists.
        """
        # Pad every option tuple to (name, default, type, doc)
        specs = []
        for opt in self.__class__.OPTIONS:
            size = len(opt)
            if size == 1:
                spec = (opt[0], None, "string", "")
            elif size == 2:
                spec = (opt[0], opt[1], "string", "")
            elif size == 3:
                spec = (opt[0], opt[1], opt[2], "")
            else:
                spec = (opt[0], opt[1], opt[2], opt[3])
            specs.append(spec)

        resolved = {}
        for name, default, type_, _doc in specs:
            dashed_name = name.replace("_", "-")
            fallback = self.options.get(dashed_name, default)
            value = self.options.get(name, fallback)
            if type_ == "literal" and isinstance(value, basestring):
                value = ast.literal_eval(value)
            resolved[name] = value
        summary = (self.name, pformat(resolved))
        logger.info(
            "Transmogrifier section %s configured with options:\n %s"
            % summary)

        for opt_name in sorted(resolved):
            value = resolved[opt_name]
            if hasattr(self, opt_name):
                clash = (self.name, opt_name, value)
                logger.error("Attention: Blueprint object in "
                             "section %s already has an attribute named %s - "
                             "overriding with option value %s - but this is "
                             "probably broken" % clash)
            setattr(self, opt_name, value)
    def transmogrify(self, item):
        """Fill an item with the JSON document fetched from its remote URL.

        The URL stored under ``"__remote_url_fetch"`` gets
        ``self.json_posfix`` appended as a further path component, and
        ``self.remote_url_prefix`` prepended when there is no ":" within
        its first 7 characters. The decoded JSON document is merged into
        the item and the marker key is removed.

        Raises:
            NothingToDoHere: the item has no ``"__remote_url_fetch"`` key.
            ThouShallNotPass: the remote item could not be fetched or
                decoded.
        """
        if "__remote_url_fetch" not in item:
            raise NothingToDoHere
        remote_url = item["__remote_url_fetch"]

        # endswith() also copes with an empty URL, where [-1] would raise
        if not remote_url.endswith("/"):
            remote_url += "/"
        remote_url += self.json_posfix
        if self.remote_url_prefix and ":" not in remote_url[:7]:
            remote_url = (self.remote_url_prefix.rstrip("/") + "/"
                          + remote_url.lstrip("/"))
        try:
            logger.info("Fetching remote item at %s ", remote_url)
            response = urllib2.urlopen(remote_url)
            try:
                new_item = json.loads(response.read())
            finally:
                response.close()
        except Exception as error:
            # Keep the cause in the log: the exception itself is replaced
            # by ThouShallNotPass below.
            logger.error("Could not retrieve and decode remote item "
                         "at %s, skipping: %s", remote_url, error)
            raise ThouShallNotPass
        if self.pop_path_prefix and "_path" in new_item:
            # Drop the first ``pop_path_prefix`` components of the path
            pathcomps = new_item["_path"].lstrip("/").split("/")
            pathcomps = pathcomps[self.pop_path_prefix:]
            new_item["_path"] = "/" + "/".join(pathcomps)
        item.update(new_item)
        item.pop("__remote_url_fetch", "")
        return item
    def __iter__(self):
        """Pass items through, then apply their dates to the stored objects.

        While iterating, the path and the ``modification_date`` /
        ``creation_date`` values of every item carrying at least one of
        those keys are recorded; items without a path key are yielded
        untouched. Once the upstream is exhausted, each recorded path is
        traversed from ``self.transmogrifier.context`` and the dates are
        set on the object found, which is then reindexed with the list of
        touched indexes.
        """
        context = self.transmogrifier.context
        paths_and_dates = []

        for item in self.previous:
            pathkey = self.pathkey(*item.keys())[0]
            if not pathkey:
                yield item
                continue
            path = item[pathkey]
            # Both membership tests must be spelled out: the former
            # ``"modification_date" or "creation_date" in item`` was always
            # true, because a non-empty string literal is truthy.
            if "modification_date" in item or "creation_date" in item:
                paths_and_dates.append(
                    (path,
                     item.get("modification_date", ""),
                     item.get("creation_date", "")))
            yield item

        # Commit newly created objects to the persistence before proceeding
        transaction.savepoint(True)
        logger.info("Start setting modification dates")

        for counter, (path, modification_date, creation_date) in \
                enumerate(paths_and_dates):
            obj = context.unrestrictedTraverse(str(path).lstrip('/'), None)
            if obj is None:
                continue
            # No creation date anywhere: reuse the modification date
            if not creation_date and not obj.creation_date:
                creation_date = modification_date

            idx = []
            if modification_date:
                obj.setModificationDate(DateTime(modification_date))
                logger.info("Modification date of %s set to %s",
                            path, modification_date)
                idx.append("modified")

            if creation_date:
                obj.creation_date = DateTime(creation_date)
                logger.info("Creation date of %s set to %s",
                            path, creation_date)
                idx.append("created")

            if not obj.effective_date:
                obj.effective_date = obj.creation_date
                idx.append("effective")

            obj.reindexObject(idxs=idx)

            # Savepoint every 50 objects (including the very first one)
            if not (counter % 50):
                transaction.savepoint(True)
    def __iter__(self):
        """Pass items through, then apply their dates to the stored objects.

        While iterating, the path and the ``modification_date`` /
        ``creation_date`` values of every item carrying at least one of
        those keys are recorded; items without a path key are yielded
        untouched. Once the upstream is exhausted, each recorded path is
        traversed from ``self.transmogrifier.context`` and the dates are
        set on the object found, which is then reindexed with the list of
        touched indexes.
        """
        context = self.transmogrifier.context
        paths_and_dates = []

        for item in self.previous:
            pathkey = self.pathkey(*item.keys())[0]
            if not pathkey:
                yield item
                continue
            path = item[pathkey]
            # Both membership tests must be spelled out: the former
            # ``"modification_date" or "creation_date" in item`` was always
            # true, because a non-empty string literal is truthy.
            if "modification_date" in item or "creation_date" in item:
                paths_and_dates.append(
                    (path,
                     item.get("modification_date", ""),
                     item.get("creation_date", "")))
            yield item

        # Commit newly created objects to the persistence before proceeding
        transaction.savepoint(True)
        logger.info("Start setting modification dates")

        for counter, (path, modification_date, creation_date) in \
                enumerate(paths_and_dates):
            obj = context.unrestrictedTraverse(str(path).lstrip('/'), None)
            if obj is None:
                continue
            # No creation date anywhere: reuse the modification date
            if not creation_date and not obj.creation_date:
                creation_date = modification_date

            idx = []
            if modification_date:
                obj.setModificationDate(DateTime(modification_date))
                logger.info("Modification date of %s set to %s",
                            path, modification_date)
                idx.append("modified")

            if creation_date:
                obj.creation_date = DateTime(creation_date)
                logger.info("Creation date of %s set to %s",
                            path, creation_date)
                idx.append("created")

            if not obj.effective_date:
                obj.effective_date = obj.creation_date
                idx.append("effective")

            obj.reindexObject(idxs=idx)

            # Savepoint every 50 objects (including the very first one)
            if not (counter % 50):
                transaction.savepoint(True)
    def transmogrify(self, item):
        """Ensure the containers of ``item`` exist before the item proceeds.

        Background: the NITF converter pipeline we are using yields the
        news item (or blog post, or whatever) and then its image
        attribute, as a separate content item, in the same "yield" loop.
        This separate content is yielded before the whitehole blueprint
        is ever reached (and obviously there would be no container for it
        in this run of the pipeline loop). So we "freeze" the other items
        here, and just let through the things we created ourselves.

        This should actually be a common scenario - therefore this has
        to be factored out and made simpler to use.

        What the code below does, in order:

        * An item wrapping another one under ``COCOON_KEY`` is unwrapped:
          ``self.count_cocoons`` is decremented and the wrapped item is
          returned as a one-element list.
        * While ``self.count_cocoons`` is non-zero, items flagged with
          ``"__time_traveler"`` pass (the flag is removed); any other
          item is wrapped in a cocoon, queued in
          ``self.storage["wormhole"]`` and ``ThouShallNotPass`` is raised.
        * Walking the item's container path from the top, a folder item
          is built for the first level ``self.traverse`` cannot find and
          for every level below it (levels already in ``self.seen`` are
          skipped when ``self.cache`` is set). With ``self.remote_fetch``
          set, each folder item also gets a ``"__remote_url_fetch"`` URL.
        * With ``self.use_wormhole`` set and folders to create, the
          folders and the cocooned item are pushed onto the left of the
          wormhole and ``ThouShallNotPass`` is raised; otherwise the
          folder items followed by the item itself are returned.

        Returns:
            A list of items to continue down the pipeline.

        Raises:
            NothingToDoHere: the item's container is already in
                ``self.seen``.
            ThouShallNotPass: the item was queued in the wormhole instead
                of being passed on.
        """


        # FIXME:
        # Factor this out into whitehole/wormhole blueprint framework


        if COCOON_KEY in item:
            # A cocoon coming back from the wormhole: release its payload
            self.count_cocoons -= 1
            logger.info("Unthawning item %s to proceed on the pipeline" %
                        item[COCOON_KEY].get("_path", "<unknown>"))
            return [item[COCOON_KEY]]
        if self.count_cocoons:
            if "__time_traveler" in item:
                # This is one of ours - let it pass!
                item.pop("__time_traveler")
            else:
                wormhole = self.storage["wormhole"]
                # Some item scheduled in the pipeline is trying to get ahead
                # of our time travelers!
                # THAT COULD GET OUR GRANDFATHER KILLED!! DELAY IT!
                logger.info("Delaying item %s - it will proceed the pipeline"
                            " when the wormhole queue is emptied" %
                            item.get("_path", "<unknown>"))
                cocoon = {COCOON_KEY: item}
                self.count_cocoons += 1
                # Deques have no insert :-(
                # we have to mangle with space time weaving itself:
                # rotate until the last element is a cocoon, append the new
                # cocoon right after it, then rotate back by the same amount.
                # NOTE(review): this loops forever if the wormhole holds no
                # cocoon and fails on wormhole[-1] if it is empty; presumably
                # a non-zero count_cocoons guarantees one is queued - confirm.
                position = 0
                while True:
                    if  COCOON_KEY in wormhole[-1]:
                        wormhole.append(cocoon)
                        break
                    position += 1
                    wormhole.rotate(1)
                wormhole.rotate(-position)
                raise ThouShallNotPass

        traverse = self.traverse

        # Folder items to be created ahead of ``item``
        items = []
        path = self.get_path(item)

        newPathKey = self.newPathKey or self.pathkey(*item.keys())[0]
        newTypeKey = self.newTypeKey

        # Split the path into its container and the item's own id
        stripped_path = path.strip("/")
        elems = stripped_path.rsplit('/', 1)
        container, id = elems if len(elems) > 1 else ("", elems[0])

        container_path_items = container.split('/')

        # Container components of the path the item had before any rewrite
        original_container_parts = item.get("_orig_path", "").strip("/").split("/")[:-1]

        # This may be a new container
        # NOTE(review): str.split('/') never returns an empty list, so the
        # second half of this test cannot be true - confirm the intent.
        if container in self.seen or not container_path_items:
            raise NothingToDoHere

        checked_elements = []

        # Check each possible parent folder
        path_exists = True
        for element in container_path_items:
            checked_elements.append(element)
            currentPath = '/'.join(checked_elements)

            if self.cache:
                if currentPath in self.seen:
                    continue
                self.seen.add(currentPath)

            # Stop traversing once a missing level has been found: every
            # level below it is missing as well.
            if path_exists and traverse(currentPath, None) is None:
                # Path does not exist from here on
                path_exists = False

            if path_exists:
                continue

            # We don't have this path - yield to create a
            # skeleton folder
            new_folder = {}
            new_folder[newPathKey] = '/' + currentPath
            new_folder[newTypeKey] = self.folderType
            # Set folder to be published if item is to be as well:
            # FIXME - maybe check the "_review_state" key
            # rather than "_transitions" /
            # even further - have a *utils function to ensure
            # proper "_transitions" and "_review_state"
            # from a _workflow_history item key.
            if "_transitions" in item:
                new_folder["_transitions"] = item["_transitions"]

            remote_url = self.remote_prefix.rstrip("/") + "/"
            if self.remote_fetch:
                if self.use_original_path and "_orig_path" in item:
                    # think of it this way:
                    # if item["_orig_path"] == "/vanishing/old/path/item"
                    # and item["_path"] == "/new/path/item"
                    # we need to schedule for fetching
                    # /vanishing/old and /vanishing/old/path from the remote
                    # (there must be some other blueprint to change
                    # /vanishing/old to /new in the pipeline)

                    # following this use case. if element == "new"
                    index =  len(checked_elements) - len(container_path_items)
                    # will yield "-1". we should have
                    # original_container_parts ==["vanishing", "old", "path"]
                    # An index of 0 would slice to an empty list; None
                    # keeps all the original container parts instead.
                    if index == 0: index = None
                    remote_url += \
                        "/".join(original_container_parts[:index]).lstrip("/")
                else:
                    remote_url += currentPath
                # FIXME: should use the transmogrifier
                # mechanism to target these to a specific
                # blueprint
                new_folder["__remote_url_fetch"] = remote_url
            logger.info("Schedulling %s folder to be created"
                        " to contain %s" % ("/" + currentPath, path))
            items.append(new_folder)

        if self.cache:
            # Remember the item's own path too
            self.seen.add("%s/%s" % (container, id,))

        if self.use_wormhole and items:
            # Send our folders back to the beginning of the pipeline -
            # and put our item in a cocoon from where we will
            # free it again in the future
            # TODO: refactor the "cocoon" mechanism to
            # be more integrated in the wormhole engine

            # can't simply push our stuff to the end of the
            # wormhole - if the wormhole is not empty, it may
            # contain other items scheduled to be built after
            # the current item. And the current item needs
            # these folders to go first:
            cocoon = {COCOON_KEY: item}
            self.storage["wormhole"].appendleft(cocoon)
            self.count_cocoons += 1
            # appendleft in reverse order leaves the folders outermost
            # first, ahead of the cocoon
            for new_folder in reversed(items):
                new_folder["__time_traveler"] = True
                self.storage["wormhole"].appendleft(new_folder)



            # And... hyperjump back to the beginning of the pipeline, where
            # our item will be happily yielded by the whitehole blueprint
            raise ThouShallNotPass

        items.append(item)

        return items