def post_pipeline(self):
    """Log how many items this section saw and how many it changed.

    Reads ``self.seen_count`` and ``self.changed_count`` and emits a single
    INFO record. Both counters are also attached to the record through
    ``extra`` so handlers can read them as record attributes.
    """
    # No stdout echo here: the counters are fully reported by the log
    # record, and writing a 500-character banner to stdout on every run
    # was debugging residue.
    logger.info(
        "Seen: %s, changed: %s",
        self.seen_count,
        self.changed_count,
        extra=dict(seen_count=self.seen_count,
                   changed_count=self.changed_count))
def transmogrify(self, item):
    """Turn an item of one of ``self.from_types`` into a NITF item.

    The item is modified in place. When ``self.upload_prefix`` is set it is
    prepended to the item path and stored under ``"_path"``, whether or not
    the item ends up being converted.

    Returns:
        A 2-tuple ``(item, image)``:

        * ``(item, None)`` when the item has no type key or its type is not
          in ``self.from_types``;
        * ``(item, {})`` when the item was converted and carries no image
          data;
        * ``(item, image)`` when the item was converted and had image data:
          the ``"image"`` key is removed from the item and ``image`` is a
          new ``"Image"`` item whose ``"_path"`` is below the item path.
    """
    typekey = self.typekey(*item.keys())[0]
    path = self.get_path(item)
    if self.upload_prefix:
        path = self.upload_prefix + path
        item["_path"] = path

    if not typekey or item[typekey] not in self.from_types:
        return item, None

    item[typekey] = TO_TYPE
    # Description in NITF should contain an abstract of the article;
    # on the contents being moved it usually contains a subtitle.
    if self.subtitle_from_description:
        item["subtitle"] = item.pop("description", u"")

    # "creators" may be present but empty; indexing it directly would
    # raise IndexError, so fall back to an empty byline in that case too.
    creators = item.get("creators") or [u""]
    item["byline"] = creators[0]
    item["genre"] = self.nitf_genre

    if self.pick_section_from_path != -1:
        item["section"] = path.split("/")[self.pick_section_from_path]
    elif self.nitf_section:
        item["section"] = self.nitf_section
    elif item["_type"] == "Story":
        if item.get("section", None) not in self.SECTION_NAMES:
            item["section"] = u"Other"
        # NOTE(review): nesting reconstructed from a whitespace-mangled
        # source; sections are recorded for "Story" items only - confirm.
        self.seen_sections.add(item["section"])

    item["urgency"] = self.nitf_urgency
    item["location"] = item.get("location", u"")
    item["subjects"] = item.get("subject", [])
    item["language"] = self.nitf_language
    # Clears default page, since we are changing types.
    # Maybe add a configure option for this?
    item.pop("_defaultpage", None)
    transition_from_history(item)
    logger.info("Converted item at %s to NITF", path)

    image = {}
    if item.get("image", None) and item["image"].get("data", ""):
        image["_type"] = "Image"
        image["description"] = item.pop("imageCaption", u"")
        name = item["image"].get("filename", "")
        if not name:
            # "description" is always set just above, so a dict default
            # could never kick in; use ``or`` so that an empty caption
            # really falls back to "image".
            name = (image["description"] or "image")[:15] + ".jpg"
        name = normalize_name(name).strip("_")
        image["_path"] = path + "/" + name
        # NOTE(review): second call on ``item`` kept from the original;
        # possibly meant for ``image`` - confirm against the helper.
        transition_from_history(item)
        image["creation_date"] = item.get("creation_date", None)
        image["modification_date"] = item.get("modification_date", None)
        image["image"] = item["image"]["data"]
        del item["image"]
        logger.info("Yielding separate image for item at %s", path)
    return item, image
def __iter__(self):
    """Yield upstream items, emptying the wormhole queue before each one.

    Before every item coming from ``self.previous`` all items popped from
    ``self.storage["wormhole"]`` are logged and yielded; once the upstream
    is exhausted the queue is emptied one last time.
    """
    def emerging(message):
        # The queue is looked up again on every drain, as items may be
        # pushed into it while the pipeline is running.
        for traveler in self.storage["wormhole"].iterpop():
            logger.info(message % traveler.get("_path", u"<unknown>"))
            yield traveler

    for upstream in self.previous:
        for traveler in emerging("Item %s emerged from the entrails of "
                                 "Deep Space"):
            yield traveler
        yield upstream

    for traveler in emerging("Item %s emerged from the entrails of "
                             "Deep Space, even as the Universe collapses!"):
        yield traveler
def get_remote_image(url, item, img_title="", pathkey="_path",
                     jsonmigrator=False):
    """Download the image at ``url`` and wrap it as a pipeline item.

    Args:
        url: address of the image; it is first passed through
            ``_strip_view``.
        item: the article item the image belongs to; its dates and
            ``"_transitions"`` are copied to a locally built image item.
        img_title: title for a locally built image item; when empty, the
            file name up to its first dot is used.
        pathkey: key under which ``item`` stores its path (only used for
            logging).
        jsonmigrator: when true, ``"/get_item"`` is appended to the URL and
            the response is decoded as the JSON description of the image.
            It is switched off when ``_strip_view`` reports failure.

    Returns:
        ``(real_url, image, post_parts)`` on success, where ``real_url`` is
        the URL reported by the response (without the ``"/get_item"``
        suffix) and ``post_parts`` is the second value returned by
        ``_get_filename``. ``(None, None, [])`` when the image cannot be
        retrieved or its JSON cannot be decoded.
    """
    json_suffix = "/get_item"
    # FIXME: not making an extra call to get the real pathkey
    logger.info("""Fetching image %s for article %s """,
                url, item.get(pathkey, ""))
    # Strip plone view from URL:
    url, it_worked = _strip_view(url)
    # it won't work if the image url does not have a proper image
    if not it_worked:
        jsonmigrator = False
    if jsonmigrator:
        url += json_suffix

    try:
        http = urllib.urlopen(url)
        try:
            image_data = http.read()
            if http.code > 399:
                # we can't get the image here
                raise IOError("HTTP status %s" % http.code)
            real_url = http.url
        finally:
            # Always release the connection, also on error statuses.
            http.close()
    except Exception as error:
        logger.error("Could not retrieve image at %s: %s - skipping",
                     url, error)
        return None, None, []

    if jsonmigrator and real_url.endswith(json_suffix):
        # Drop the suffix itself; slicing with a positive length kept only
        # the first characters of the URL.
        real_url = real_url[:-len(json_suffix)]
    image_filename, post_parts = _get_filename(real_url)

    if jsonmigrator:
        try:
            image = json.loads(image_data)
        except ValueError:
            logger.warning(
                "Could not retrieve image json contents at %s ", url)
            # Same failure result as a download error (this used to be a
            # ``raise`` of a tuple, which is itself an error).
            return None, None, []
    else:
        # build object item for the pipeline
        image = {
            "_type": "Image",
            "image": image_data,
            "creation_date": item.get("creation_date", None),
            "modification_date": item.get("modification_date", None),
            "_transitions": item.get("_transitions", "published"),
        }
        if not img_title:
            img_title = image_filename.split(".")[0].encode("utf-8")
        image["title"] = img_title
    # NOTE(review): nesting reconstructed from a whitespace-mangled source;
    # the title is set only on locally built items, the file name on both
    # kinds - confirm against the callers.
    image["_filename"] = image_filename
    return real_url, image, post_parts
def __iter__(self):
    """Interleave queued wormhole items with the upstream items.

    Everything popped from ``self.storage["wormhole"]`` is logged and
    yielded ahead of each item of ``self.previous``; whatever is queued
    when the upstream ends is yielded afterwards.
    """
    in_flight = ("Item %s emerged from the entrails of "
                 "Deep Space")
    at_the_end = ("Item %s emerged from the entrails of "
                  "Deep Space, even as the Universe collapses!")

    for current in self.previous:
        queue = self.storage["wormhole"]
        for emerged in queue.iterpop():
            origin = emerged.get("_path", u"<unknown>")
            logger.info(in_flight % origin)
            yield emerged
        yield current

    # Upstream is exhausted: flush what is still queued.
    queue = self.storage["wormhole"]
    for emerged in queue.iterpop():
        origin = emerged.get("_path", u"<unknown>")
        logger.info(at_the_end % origin)
        yield emerged
def transmogrify(self, item):
    """Rewrite the item path with a single regular expression substitution.

    The first match of ``self.from_regexp`` in the path is replaced by
    ``self.to``. The previous path is kept under ``"_orig_path"`` unless
    that key is already present.

    Returns:
        The same item, with its path key updated.

    Raises:
        NothingToDoHere: if no regexp is configured, the item has no path
            key, or the substitution leaves the path unchanged.
    """
    key_for_path = self.pathkey(*item.keys())[0]
    if not (self.from_regexp and key_for_path):
        raise NothingToDoHere

    old_path = item[key_for_path]
    rewritten = re.sub(self.from_regexp, self.to, old_path, count=1)
    if rewritten == old_path:
        raise NothingToDoHere

    # Only the first recorded original path is kept.
    item.setdefault("_orig_path", old_path)
    logger.info("Item %s path changed to %s" % (old_path, rewritten))
    item[key_for_path] = rewritten
    return item
def __iter__(self):
    """Pass every item through, then set intids on the collected paths.

    Item paths are recorded while the items are yielded unchanged. After
    the upstream is exhausted the transaction is committed and, for each
    recorded path that can be traversed from the transmogrifier context,
    ``set_intid`` is called and its result logged.
    """
    site = self.transmogrifier.context
    collected = []
    for entry in self.previous:
        collected.append(self.get_path(entry))
        yield entry

    logger.info("Start setting intids")
    transaction.commit()
    for location in collected:
        # Paths that do not resolve to an object are silently skipped.
        target = site.unrestrictedTraverse(str(location).lstrip('/'), None)
        if target is None:
            continue
        assigned = set_intid(target)
        logger.info("intid of %s set to %s" % (target, assigned))
def transmogrify(self, item):
    """Run ``normalize_string`` over the configured fields of an item.

    Only the fields of ``self.fields`` that are present in the item are
    touched. A tuple or list value is replaced by a list of its normalized
    members; any other value is replaced by its normalized form.

    Returns:
        The same item, modified in place.

    Raises:
        NothingToDoHere: if the item type is not in ``self.types``.
    """
    if self.get_type(item) not in self.types:
        raise NothingToDoHere

    for field in self.fields.intersection(item.keys()):
        before = item[field]
        if isinstance(before, (tuple, list)):
            after = [normalize_string(entry) for entry in before]
        else:
            after = normalize_string(before)
        item[field] = after
        if after != before:
            logger.info("Field %s of item at %s modified to 7bit as %s" %
                        (field, self.get_path(item), after))
    return item
def transmogrify(self, item):
    """Normalize the values of the selected fields of a matching item.

    Each field named in ``self.fields`` that the item has is passed through
    ``normalize_string``: member by member (yielding a list) for tuples and
    lists, directly for everything else. Changed fields are logged.

    Returns:
        The item itself, updated in place.

    Raises:
        NothingToDoHere: when the item type is not one of ``self.types``.
    """
    if self.get_type(item) not in self.types:
        raise NothingToDoHere

    wanted = self.fields.intersection(item.keys())
    for name in wanted:
        previous_value = item[name]
        is_sequence = isinstance(previous_value, (tuple, list))
        candidates = previous_value if is_sequence else (previous_value,)
        cleaned = list(map(normalize_string, candidates))
        item[name] = cleaned if is_sequence else cleaned[0]
        if item[name] != previous_value:
            message = "Field %s of item at %s modified to 7bit as %s"
            logger.info(message % (name, self.get_path(item), item[name]))
    return item
def set_options(self):
    """Read the options declared in the class ``OPTIONS`` attribute.

    ``OPTIONS`` is a list of 1-, 2-, 3- or 4-tuples holding, in order: the
    option name, its default value (``None`` if omitted), its type - one
    of "string" (default) or "literal" - and a documentation string.

    Every "_" in an option name matches either "_" or "-" in the
    transmogrifier.cfg file. String values of "literal" options are
    evaluated with ``ast.literal_eval``. Each resulting value is set as an
    attribute of the blueprint; an error is logged when an attribute of
    that name already exists.

    TODO: generate blueprint docs from option docs.
    """
    # Normalize the declarations, filling in the omitted trailing members.
    declared = []
    for opt in self.__class__.OPTIONS:
        arity = len(opt)
        if arity == 1:
            entry = (opt[0], None, "string", "")
        elif arity == 2:
            entry = (opt[0], opt[1], "string", "")
        elif arity == 3:
            entry = (opt[0], opt[1], opt[2], "")
        else:
            entry = (opt[0], opt[1], opt[2], opt[3])
        declared.append(entry)

    resolved = {}
    for name, default, type_, doc in declared:
        dashed = name.replace("_", "-")
        fallback = self.options.get(dashed, default)
        value = self.options.get(name, fallback)
        if type_ == "literal" and isinstance(value, basestring):
            value = ast.literal_eval(value)
        resolved[name] = value

    logger.info("Transmogrifier section %s configured with options:\n %s" %
                (self.name, pformat(resolved)))

    for opt_name in sorted(resolved):
        value = resolved[opt_name]
        if hasattr(self, opt_name):
            logger.error("Attention: Blueprint object in "
                         "section %s already has an attribute named %s - "
                         "overriding with option value %s - but this is "
                         "probably broken" % (self.name, opt_name, value))
        setattr(self, opt_name, value)
def set_options(self):
    """Configure the blueprint from the class ``OPTIONS`` declarations.

    Each entry of ``OPTIONS`` is a tuple of one to four members: name,
    default value, type ("string", the default, or "literal") and a
    documentation string. The value is looked up in ``self.options`` by
    name, then by the name with "_" replaced by "-", then the default is
    used. String values of "literal" options go through
    ``ast.literal_eval``. Values end up as attributes of ``self``; an
    already existing attribute is overridden and an error is logged.

    TODO: generate blueprint docs from option docs.
    """
    def complete(opt):
        # Fill in the members a short declaration leaves out.
        size = len(opt)
        if size == 1:
            return (opt[0], None, "string", "")
        if size == 2:
            return (opt[0], opt[1], "string", "")
        if size == 3:
            return (opt[0], opt[1], opt[2], "")
        return (opt[0], opt[1], opt[2], opt[3])

    values = {}
    for name, default, type_, doc in map(complete, self.__class__.OPTIONS):
        alias = name.replace("_", "-")
        value = self.options.get(name, self.options.get(alias, default))
        needs_eval = type_ == "literal" and isinstance(value, basestring)
        values[name] = ast.literal_eval(value) if needs_eval else value

    summary = "Transmogrifier section %s configured with options:\n %s"
    logger.info(summary % (self.name, pformat(values)))

    clash = ("Attention: Blueprint object in "
             "section %s already has an attribute named %s - "
             "overriding with option value %s - but this is "
             "probably broken")
    for opt_name, value in sorted(values.items()):
        if hasattr(self, opt_name):
            logger.error(clash % (self.name, opt_name, value))
        setattr(self, opt_name, value)
def transmogrify(self, item):
    """Replace a placeholder item by the JSON item fetched from a remote.

    Items carrying a ``"__remote_url_fetch"`` key are completed with the
    JSON document found at that URL followed by ``self.json_posfix``. URLs
    with no ":" in their first seven characters are considered relative
    and are joined to ``self.remote_url_prefix`` when it is set. When
    ``self.pop_path_prefix`` is set, that many leading components are
    removed from the fetched ``"_path"``.

    Returns:
        The same item, updated with the fetched keys and without the
        ``"__remote_url_fetch"`` marker.

    Raises:
        NothingToDoHere: if the item has no ``"__remote_url_fetch"`` key.
        ThouShallNotPass: if the remote item cannot be fetched or decoded.
    """
    if "__remote_url_fetch" not in item:
        raise NothingToDoHere

    remote_url = item["__remote_url_fetch"]
    # ``endswith`` also copes with an empty URL, where indexing the last
    # character would raise IndexError.
    if not remote_url.endswith("/"):
        remote_url += "/"
    remote_url += self.json_posfix
    if self.remote_url_prefix and ":" not in remote_url[:7]:
        remote_url = (self.remote_url_prefix.rstrip("/") + "/" +
                      remote_url.lstrip("/"))

    try:
        logger.info("Fetching remote item at %s ", remote_url)
        response = urllib2.urlopen(remote_url)
        try:
            new_item = json.loads(response.read())
        finally:
            response.close()
    except Exception as error:
        # Report why the fetch failed instead of discarding the cause.
        logger.error("Could not retrieve and decode remote item "
                     "at %s: %s - skipping", remote_url, error)
        raise ThouShallNotPass

    if self.pop_path_prefix and "_path" in new_item:
        pathcomps = new_item["_path"].lstrip("/").split("/")
        pathcomps = pathcomps[self.pop_path_prefix:]
        new_item["_path"] = "/" + "/".join(pathcomps)

    item.update(new_item)
    item.pop("__remote_url_fetch", "")
    return item
def __iter__(self):
    """Pass items through, then set the recorded dates on their objects.

    While the items are yielded unchanged, the path and the
    ``"modification_date"`` / ``"creation_date"`` values of every item that
    has a path key and at least one of those keys are recorded. Once the
    upstream is exhausted, each recorded path is traversed from the
    transmogrifier context and the dates are written to the object:

    * the creation date falls back to the modification date when neither
      the item nor the object has one;
    * the effective date is set to the creation date when the object has
      none;
    * only the indexes of the dates actually written are reindexed.

    A savepoint is taken before the second phase and then on every 50th
    recorded entry whose object was found.
    """
    context = self.transmogrifier.context
    paths_and_dates = []
    for item in self.previous:
        pathkey = self.pathkey(*item.keys())[0]
        if not pathkey:
            yield item
            continue
        # Both keys must be tested explicitly: a bare string literal on
        # the left side of ``or`` is always true, which used to record
        # every single item.
        if "modification_date" in item or "creation_date" in item:
            paths_and_dates.append((item[pathkey],
                                    item.get("modification_date", ""),
                                    item.get("creation_date", "")))
        yield item

    # Commit newly created objects to the persistence before proceeding
    transaction.savepoint(True)
    logger.info("Start setting modification dates")
    for counter, (path, modification_date, creation_date) in \
            enumerate(paths_and_dates):
        obj = context.unrestrictedTraverse(str(path).lstrip('/'), None)
        if obj is None:
            continue
        if not creation_date and not obj.creation_date:
            creation_date = modification_date
        idx = []
        if modification_date:
            obj.setModificationDate(DateTime(modification_date))
            logger.info("Modification date of %s set to %s",
                        path, modification_date)
            idx.append("modified")
        if creation_date:
            obj.creation_date = DateTime(creation_date)
            logger.info("Creation date of %s set to %s",
                        path, creation_date)
            idx.append("created")
            # NOTE(review): nesting reconstructed from a whitespace-mangled
            # source; the effective date is only defaulted when a creation
            # date was written - confirm.
            if not obj.effective_date:
                obj.effective_date = obj.creation_date
                idx.append("effective")
        if idx:
            # Nothing to reindex when no date was written.
            # NOTE(review): an empty ``idxs`` list is presumably taken as
            # "reindex everything" by the catalog - confirm.
            obj.reindexObject(idxs=idx)
        if not (counter % 50):
            transaction.savepoint(True)
def __iter__(self):
    """Yield every item, then apply the collected dates to the objects.

    First phase: items are yielded untouched; for each one that has a path
    key and a ``"modification_date"`` or ``"creation_date"`` key, the path
    and both date values are remembered.

    Second phase (after the upstream is exhausted): each remembered path is
    traversed from the transmogrifier context; paths that do not resolve
    are skipped. The modification and creation dates are written to the
    object, the creation date defaulting to the modification date when
    neither the item nor the object has one, and the effective date
    defaulting to the creation date when the object has none. Only the
    indexes of the dates actually written are reindexed. Savepoints are
    taken before the phase starts and on every 50th remembered entry whose
    object was found.
    """
    context = self.transmogrifier.context
    paths_and_dates = []
    for item in self.previous:
        pathkey = self.pathkey(*item.keys())[0]
        if not pathkey:
            yield item
            continue
        has_dates = ("modification_date" in item or
                     "creation_date" in item)
        # The membership test has to be spelled out for each key: the
        # previous ``"modification_date" or ... in item`` form was always
        # true and remembered items without any date.
        if has_dates:
            paths_and_dates.append((item[pathkey],
                                    item.get("modification_date", ""),
                                    item.get("creation_date", "")))
        yield item

    # Commit newly created objects to the persistence before proceeding
    transaction.savepoint(True)
    logger.info("Start setting modification dates")
    for counter, (path, modification_date, creation_date) in \
            enumerate(paths_and_dates):
        obj = context.unrestrictedTraverse(str(path).lstrip('/'), None)
        if obj is None:
            continue
        if not creation_date and not obj.creation_date:
            creation_date = modification_date
        idx = []
        if modification_date:
            obj.setModificationDate(DateTime(modification_date))
            logger.info("Modification date of %s set to %s",
                        path, modification_date)
            idx.append("modified")
        if creation_date:
            obj.creation_date = DateTime(creation_date)
            logger.info("Creation date of %s set to %s",
                        path, creation_date)
            idx.append("created")
            # NOTE(review): nesting reconstructed from a whitespace-mangled
            # source; the effective date default is applied only together
            # with a creation date - confirm.
            if not obj.effective_date:
                obj.effective_date = obj.creation_date
                idx.append("effective")
        if idx:
            # Skip the catalog call when no date was written.
            # NOTE(review): an empty ``idxs`` list is presumably taken as
            # "reindex everything" by the catalog - confirm.
            obj.reindexObject(idxs=idx)
        if not (counter % 50):
            transaction.savepoint(True)
def transmogrify(self, item):
    """Schedule the creation of the missing parent folders of an item.

    Long story short: the NITF converter pipeline we are using yields the
    news item (or blog post, or whatever) and then its image attribute,
    as a separate content, in the same "yield" loop - this separate
    content is yielded before the whitehole blueprint is ever reached
    (and obviously there would be no container for it in this run of the
    pipeline loop).

    So we "freeze" the other items here, and just let through the
    things we created ourselves.

    This should actually be a common scenario - therefore this has to be
    factored out and made simpler to use.

    Returns:
        A list of items: the unwrapped item when ``item`` is a cocoon,
        otherwise the folder items to be created followed by ``item``.

    Raises:
        ThouShallNotPass: when the item was wrapped in a cocoon and queued
            in the wormhole (either to wait for pending cocoons or to let
            its new folders go first).
        NothingToDoHere: when the container of the item was already seen.
    """
    # FIXME:
    # Factor this out into whitehole/wormhole blueprint framework
    if COCOON_KEY in item:
        # A cocoon coming back from the wormhole: release the item it wraps.
        self.count_cocoons -= 1
        logger.info("Unthawning item %s to proceed on the pipeline" %
                    item[COCOON_KEY].get("_path", "<unknown>"))
        return [item[COCOON_KEY]]

    if self.count_cocoons:
        if "__time_traveler" in item:
            # This is one of ours - let it pass!
            item.pop("__time_traveler")
        else:
            wormhole = self.storage["wormhole"]
            # Some item scheduled in the pipeline is trying to get ahead
            # of our time_travelers!
            # THAT COULD GET OUR GRANDFATHER KILLED!! DELAY IT!
            logger.info("Delaying item %s - it will proceed the pipeline"
                        " when the wormhole queue is emptied" %
                        item.get("_path", "<unknown>"))
            cocoon = {COCOON_KEY: item}
            self.count_cocoons += 1
            # Deques have no insert :-(
            # we have to mangle with space time weaving itself:
            # rotate until a cocoon is the last element, append the new
            # cocoon after it, then rotate back to the original order.
            # NOTE(review): assumes the wormhole holds at least one cocoon
            # whenever count_cocoons is non-zero; otherwise this loop does
            # not terminate (or fails on an empty queue) - confirm.
            position = 0
            while True:
                if COCOON_KEY in wormhole[-1]:
                    wormhole.append(cocoon)
                    break
                position += 1
                wormhole.rotate(1)
            wormhole.rotate(-position)
            raise ThouShallNotPass

    traverse = self.traverse
    items = []
    path = self.get_path(item)
    newPathKey = self.newPathKey or self.pathkey(*item.keys())[0]
    newTypeKey = self.newTypeKey

    # Split the path into its container and the id of the item itself.
    stripped_path = path.strip("/")
    elems = stripped_path.rsplit('/', 1)
    # NOTE(review): ``id`` shadows the builtin of the same name.
    container, id = elems if len(elems) > 1 else ("", elems[0])
    container_path_items = container.split('/')
    original_container_parts = item.get("_orig_path",
                                        "").strip("/").split("/")[:-1]

    # This may be a new container
    # NOTE(review): ``"".split('/')`` returns ``['']``, so the second test
    # below is never true, even for items with no container.
    if container in self.seen or not container_path_items:
        raise NothingToDoHere

    checked_elements = []

    # Check each possible parent folder
    path_exists = True
    for element in container_path_items:
        checked_elements.append(element)
        currentPath = '/'.join(checked_elements)

        if self.cache:
            if currentPath in self.seen:
                continue
            self.seen.add(currentPath)

        if path_exists and traverse(currentPath, None) is None:
            # Path does not exist from here on
            path_exists = False

        if path_exists:
            continue

        # We don't have this path - yield to create a
        # skeleton folder
        new_folder = {}
        new_folder[newPathKey] = '/' + currentPath
        new_folder[newTypeKey] = self.folderType
        # Set folder to be published if item is to be as well:
        # FIXME - maybe check the "_review_state" key
        # rather than "_transitions" /
        # even further - have a *utils function to ensure
        # proper "_transitions" and "_review_state"
        # from a _workflow_history item key.
        if "_transitions" in item:
            new_folder["_transitions"] = item["_transitions"]

        remote_url = self.remote_prefix.rstrip("/") + "/"
        if self.remote_fetch:
            if self.use_original_path and "_orig_path" in item:
                # think of it this way:
                # if item["_orig_path"] == "/vanishing/old/path/item"
                # and item["_path"] == "/new/path/item"
                # we need to schedule for fetching
                # /vanishing/old and /vanishing/old/path from the remote
                # (there must be some other blueprint to change
                # /vanishing/old to /new in the pipeline)
                # following this use case, if element == "new"
                index = len(checked_elements) - len(container_path_items)
                # will yield "-1". we should have
                # original_container_parts == ["vanishing", "old", "path"]
                if index == 0:
                    # Last container element: take all the original parts.
                    index = None
                remote_url += \
                    "/".join(original_container_parts[:index]).lstrip("/")
            else:
                remote_url += currentPath
            # FIXME: should use the transmogrifier
            # mechanism to target these to a specific
            # blueprint
            new_folder["__remote_url_fetch"] = remote_url
        logger.info("Schedulling %s folder to be created"
                    " to contain %s" % ("/" + currentPath, path))
        items.append(new_folder)

    if self.cache:
        self.seen.add("%s/%s" % (container, id,))

    if self.use_wormhole and items:
        # Send our folders back to the beginning of the pipeline -
        # and put our item on a cocoon from where we will
        # free it again in the future
        # TODO: refactor the "cocoon" mechanism to
        # be more integrated in the wormhole engine

        # can't simply push our stuff to the end of the
        # wormhole - if the wormhole is not empty, it may
        # contain other items scheduled to be built after
        # the current item. And the current item needs
        # these folders to go first:
        cocoon = {COCOON_KEY: item}
        self.storage["wormhole"].appendleft(cocoon)
        self.count_cocoons += 1
        # Pushed in reverse so that the outermost folder ends up first.
        for new_folder in reversed(items):
            new_folder["__time_traveler"] = True
            self.storage["wormhole"].appendleft(new_folder)
        # And... hyperjump back to the beginning of the pipeline, where
        # our item will be happily yielded by the whitehole blueprint
        raise ThouShallNotPass

    items.append(item)
    return items