def ScrubMeta(self, itemd):
    """Return a shallow copy of the feed metadata using owner*/date* keys.

    The caller's mapping is left alone; all edits happen on the copy.
    A missing "title" is filled in with "[Untitled]".

    When self.AtomLike() is true the copy is additionally rewritten:

    - "author", "author.uri" and "author.email" (as read through
      bm_extract.as_string) become "ownerName", "ownerId" and
      "ownerEmail" when non-empty, and the "author" entry is dropped;
    - "created" / "updated" are moved to "dateCreated" / "dateModified"
      after going through bm_extract.coerce_datetime(rfc822=True).
    """
    meta = dict(itemd)
    meta.setdefault("title", "[Untitled]")

    if not self.AtomLike():
        return meta

    # Author is close enough to owner: copy across whichever parts
    # as_string reports as non-empty, in name / uri / email order.
    for author_path, owner_key in (
        ("author", "ownerName"),
        ("author.uri", "ownerId"),
        ("author.email", "ownerEmail"),
    ):
        text = bm_extract.as_string(meta, author_path)
        if text:
            meta[owner_key] = text

    meta.pop("author", None)

    # Rename the timestamp keys. `otherwise=raw` presumably makes
    # coerce_datetime hand back the input when it cannot be parsed
    # -- TODO confirm against bm_extract.
    for old_key, new_key in (
        ("created", "dateCreated"),
        ("updated", "dateModified"),
    ):
        try:
            raw = meta.pop(old_key)
            meta[new_key] = bm_extract.coerce_datetime(raw, otherwise=raw, rfc822=True)
        except KeyError:
            pass

    return meta
def Fetch(self):
    """Run the parser once and cache its output on this instance.

    If self._parser is already set (truthy) nothing happens. Otherwise:

    1. self.CustomizeValidate() is called;
    2. a parser is built from self._parser_class with page_uri=self.uri,
       at_prefix='@@' and the keyword arguments held in self._parserd,
       stored as self._parser, and its PragmaCLI() is invoked;
    3. self._items receives a list of everything Iterate() produces;
    4. self._meta receives a dict with "link" and "title", plus
       "updated" (an isoformat() string) when the parser reports a
       document date.

    Returns None in every case.
    """
    if self._parser:
        return

    self.CustomizeValidate()

    parser = self._parser_class(page_uri=self.uri, at_prefix='@@', **self._parserd)
    # Stored before PragmaCLI() runs, so the attribute is set even if
    # that call raises.
    self._parser = parser
    parser.PragmaCLI()

    self._items = list(parser.Iterate())

    meta = {
        "link": self.uri,
        "title": parser.document_title,
    }
    self._meta = meta

    document_date = parser.document_date
    if document_date:
        meta['updated'] = bm_extract.coerce_datetime(document_date).isoformat()
def CustomizeAtomMeta(self, itemd):
    """Return a shallow copy of the feed metadata reshaped for Atom.

    The caller's mapping is not modified. On the copy:

    - "dateCreated" / "dateModified" are moved to "created" / "updated"
      after going through bm_extract.coerce_datetime(atom=True);
    - the OPML window/expansion state keys are removed;
    - "ownerName", "ownerEmail" and "ownerId" are removed and, if any of
      them is truthy, folded into an "author" dict of the form
      {"@": name-or-"", "email": ..., "uri": ...} (the optional keys are
      present only when their value is truthy).
    """
    itemd = dict(itemd)

    #
    #   datetimes
    #
    for k_to, k_from in (
        ('created', 'dateCreated'),
        ('updated', 'dateModified'),
    ):
        # Only the key lookup is guarded. A KeyError raised inside the
        # coercion itself must not be mistaken for "key absent", which
        # would silently drop the popped value.
        if k_from in itemd:
            value = itemd.pop(k_from)
            # `otherwise=value` presumably returns the input unchanged
            # when it cannot be coerced -- TODO confirm in bm_extract.
            itemd[k_to] = bm_extract.coerce_datetime(value, otherwise=value, atom=True)

    #
    #   OPML Garbage
    #
    for key in (
        'expansionState',
        'vertScrollState',
        'windowBottom',
        'windowLeft',
        'windowRight',
        'windowTop',
    ):
        itemd.pop(key, None)

    #
    #   Atom author
    #
    author_name = itemd.pop("ownerName", None)
    author_email = itemd.pop("ownerEmail", None)
    author_href = itemd.pop("ownerId", None)

    if author_name or author_email or author_href:
        authord = {
            "@": author_name or "",
        }
        if author_email:
            authord["email"] = author_email
        if author_href:
            authord["uri"] = author_href

        itemd["author"] = authord

    return itemd
def ScrubItem(self, itemd):
    """Note: *not* CustomizeAtomItem

    When self.AtomLike() is false the item is returned as-is (the very
    same object). Otherwise a shallow copy is returned in which:

    - "text" is moved to "title" when there is no truthy title;
    - "created" is passed through bm_extract.coerce_datetime(atom=True);
    - "tags" is split with bm_extract.coerce_list on "," and replaced by
      "category", a list of {"term": tag} dicts.
    """
    if not self.AtomLike():
        return itemd

    entry = dict(itemd)

    # Atom title: fall back to "text" only when it is actually present
    if not entry.get("title") and "text" in entry:
        entry["title"] = entry.pop("text")

    # Atom datetimes
    try:
        stamp = entry.pop("created")
        entry["created"] = bm_extract.coerce_datetime(stamp, otherwise=stamp, atom=True)
    except KeyError:
        pass

    # Atom categories
    try:
        raw_tags = entry.pop("tags")
        terms = bm_extract.coerce_list(raw_tags, separator=",", strip=True)
        entry["category"] = [{"term": term} for term in terms]
    except KeyError:
        pass

    return entry
def as_datetime(self, path, **ad):
    """Look up `path` with self.get() and coerce the result to a datetime.

    Any extra keyword arguments are forwarded unchanged to
    bm_extract.coerce_datetime, whose return value is passed straight
    back to the caller.
    """
    raw_value = self.get(path)
    return bm_extract.coerce_datetime(raw_value, **ad)
def ScrubEntry(self, itemd):
    """Recursively convert an item into '@'-prefixed attribute dicts.

    Dispatch is on the type tests provided by bm_extract:

    - dict: a new dict is built. Each atomic value is stored under
      '@<key>'; "items" / "outline" values are recursed into and stored
      under "outline"; None values and non-atomic values are dropped.
      When self.AtomLike() is true, keys are first renamed
      (link -> htmlUrl, feeds -> rssUrl, content -> description,
      title -> text, category -> tags joined with ", "), a "links" list
      contributes the href of its rel="alternate" entry as rssUrl (the
      last such entry wins), and "created" is passed through
      bm_extract.coerce_datetime(rfc822=True).
      "@type" defaults to "rss" if an rssUrl was kept, else "link" if an
      htmlUrl or url was seen; "@text" defaults to "".
    - atomic value: {"@title": <value coerced to a string>}.
    - list or list-like: a list with ScrubEntry applied to each element.
    - anything else is returned unchanged.

    The input is never modified.
    """
    if bm_extract.is_dict(itemd):
        nd = {}

        seen_html = False
        seen_rss = False
        seen_url = False

        atom_like = self.AtomLike()

        for key, value in itemd.items():
            if atom_like:
                if key == "link":
                    key = "htmlUrl"
                elif key == "feeds":
                    key = "rssUrl"
                elif key == "content":
                    key = "description"
                elif key == "title":
                    key = "text"
                elif key == "category":
                    key = "tags"
                    value = ", ".join(d["term"] for d in value)
                elif key == "links":
                    for ld in bm_extract.coerce_list(value):
                        if bm_extract.as_string(ld, "rel") == "alternate":
                            key = "rssUrl"
                            value = bm_extract.as_string(ld, "href")
                elif key == "created":
                    # datetimes: coerce the value being emitted. The
                    # dict under iteration must not be popped from or
                    # re-inserted into here: that alters the caller's
                    # data and makes the result order-dependent.
                    value = bm_extract.coerce_datetime(value, otherwise=value, rfc822=True)

            if key == "rssUrl":
                value = self.FirstInListLikeObject(value, value)
                if value is None:
                    continue

                seen_rss = True
            elif key == "htmlUrl":
                value = self.FirstInListLikeObject(value, value)
                if value is None:
                    continue

                seen_html = True
            elif key == "url":
                seen_url = True

            if key in ("items", "outline"):
                nd["outline"] = self.ScrubEntry(value)
            elif value is not None and bm_extract.is_atomic(value):
                nd['@%s' % key] = value

        if seen_rss:
            nd.setdefault("@type", "rss")
        elif seen_html or seen_url:
            nd.setdefault("@type", "link")

        nd.setdefault("@text", "")

        return nd
    elif bm_extract.is_atomic(itemd):
        return {
            "@title": bm_extract.coerce_string(itemd),
        }
    elif bm_extract.is_list(itemd) or bm_extract.is_list_like(itemd):
        # A real list on both Python 2 and Python 3 (map() is lazy on 3).
        return [self.ScrubEntry(child) for child in itemd]

    return itemd