def prep(items, schema=None, strict=False):
    '''
    Prep a raw JSON set of items so it's more useful for simile

    schema is a description in code of the expectations for items, including
    descriptions to be applied to keys or values

    import string
    from amara.lib.util import mcompose, first_item
    from zen import exhibit

    PIPELINES = {
        u'atom:entry': {
            u"type": None,
            u"author": None,
            u"updated": None,
            #u"title": mcompose(first_item, string.strip),
            u"title": None,
            u"alternate_link": first_item,
            u"summary": (first_item, exhibit.REQUIRE),
            u"content": None,
            u"published": None,
            u"id": None, #Note: ID is always automatically required
            u"label": None,
        },
        u'atom:feed': None,
    }
    prepped = exhibit.prep(obj, schema=PIPELINES)
    '''
    remove_list = []
    for item in items:
        #print item
        if not u'id' in item:
            raise ValueError('Missing ID')
        if schema:
            match = schema.get(first_item(item.get(u'type')))
            if strict and match is None:
                remove_list.append(item)
                continue
            schema_for_item = match or {}
            #print schema_for_item
            for key in schema_for_item.keys():
                #Extract the unit transforms for the entry key and entry value
                if isinstance(schema_for_item[key], tuple):
                    value_unit, key_unit = schema_for_item[key]
                else:
                    value_unit, key_unit = schema_for_item[key], None
                #import sys; print >> sys.stderr, (key, value_unit, key_unit)
                if key_unit and key_unit is REQUIRE:
                    if key not in item:
                        raise ValueError('Missing required field: %s'%key)
                if key in item:
                    #result = pipeline_stage(schema_for_item[key], item[key]).next()
                    value = pipeline_stage(value_unit, item[key])
                    #FIXME: Now supports either REQUIRE *or* transformation, and not both. Maybe make a 3-tuple
                    if key_unit and key_unit is not REQUIRE:
                        new_key = pipeline_stage(key_unit, key)
                        del item[key]
                        key = new_key
                    item[key] = value
    for item in remove_list:
        items.remove(item)
    return items
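#A minimal usage sketch for prep(), assuming zen.exhibit exposes prep, REQUIRE
#and the unit transforms as shown in the docstring above; the sample items and
#schema here are illustrative only, not from the original source.
def _example_prep_usage():
    from amara.lib.util import first_item
    from zen import exhibit
    items = [
        {u'id': u'e1', u'type': [u'atom:entry'],
         u'title': [u'Hello'], u'summary': [u'First post']},
    ]
    schema = {
        u'atom:entry': {
            u'title': first_item,                      #unwrap the single-item list
            u'summary': (first_item, exhibit.REQUIRE), #unwrap, and require the field
        },
        u'atom:feed': None,
    }
    #With strict=True, items whose type has no schema entry would be dropped
    return exhibit.prep(items, schema=schema)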
def rss2translate(url=None, format=None):
    """Convert RSS 2.0 feed to Atom or RSS 1.0

    Sample request:
    * curl "http://localhost:8880/akara.rss2translate?url=http://feeds.delicious.com/v2/rss/recent"

    This is a demo and is not meant as an industrial-strength converter.
    """
    # Support content negotiation in addition to the query parameter
    if not format:
        accepted_imts = request.environ.get('HTTP_ACCEPT', '').split(',')
        imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
        if imt == RDF_IMT:
            format = 'rss1'
        else:
            format = 'atom'

    if not url:
        raise AssertionError("The 'url' query parameter is mandatory.")

    import feedparser # From http://www.feedparser.org/
    feed = feedparser.parse(url)
    # Note: bad URLs might mean the feed doesn't have headers
    logger.debug('Feed info: ' + repr((url, feed.version, feed.encoding, feed.headers.get('Content-type'))))

    updated = getattr(feed.feed, 'updated_parsed', None)
    if updated:
        #Convert the feedparser time struct to an ISO 8601 string
        updated = datetime(*updated[:6]).isoformat()

    f = atomtools.feed(title=feed.feed.title, updated=updated, id=feed.feed.link)
    for e in feed.entries:
        updated = getattr(e, 'updated_parsed', None)
        if updated:
            #Convert the feedparser time struct to an ISO 8601 string
            updated = datetime(*updated[:6]).isoformat()
        links = [
            #FIXME: self?
            (e.link, u'alternate'),
        ]
        f.append(
            e.link,
            e.title,
            updated=updated,
            summary=e.description,
            #e.author_detail.name
            #authors=authors,
            links=links,
        )

    if format == 'atom':
        result = f.xml_encode()
        response.add_header("Content-Type", ATOM_IMT)
    else:
        result = f.rss1format()
        response.add_header("Content-Type", RDF_IMT)
    return result
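#Content-negotiation sketch for rss2translate (illustrative, not from the
#original source): assuming RDF_IMT is the usual 'application/rdf+xml', an
#Accept header can select RSS 1.0 output instead of the format query parameter.
#
#  curl -H "Accept: application/rdf+xml" \
#    "http://localhost:8880/akara.rss2translate?url=http://feeds.delicious.com/v2/rss/recent"
#
#Any other (or missing) non-wildcard Accept value falls through to Atom output.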
def requested_imt(environ):
    # Choose a preferred media type from the Accept header, using application/json as presumed
    # default, and stripping out any wildcard types and type parameters
    #
    # FIXME Ideally, this should use the q values and pick the best media type, rather than
    # just picking the first non-wildcard type. Perhaps: http://code.google.com/p/mimeparse/
    accepted_imts = []
    accept_header = environ.get('HTTP_ACCEPT')
    if accept_header:
        accepted_imts = [ mt.split(';')[0] for mt in accept_header.split(',') ]
    accepted_imts.append('application/json')
    #if logger: logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    return imt
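#A small behavioral sketch of requested_imt(), using hand-built WSGI-style
#environ dicts (illustrative values only).
def _example_requested_imt():
    #The first non-wildcard type wins; type parameters such as q values are stripped
    assert requested_imt({'HTTP_ACCEPT': 'application/rdf+xml;q=0.9, */*'}) == 'application/rdf+xml'
    #No Accept header: the presumed default applies
    assert requested_imt({}) == 'application/json'
    #A wildcard-only Accept header also falls through to the default
    assert requested_imt({'HTTP_ACCEPT': '*/*'}) == 'application/json'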
def atom_moin(body, ctype, maxcount=None, folder=None, feed=None):
    #Sample query:
    #curl --request POST "http://localhost:8880/atom.moin?feed=http://bitworking.org/news/feed/&maxcount=10&folder=foo091023"
    #You can set ...&maxcount=100 or whatever number, if you like
    maxcount = int(maxcount if maxcount else DEFAULT_MAX)

    H = httplib2.Http('.cache')
    if USER:
        H.add_credentials(USER, PASSWD)

    #Prepare the envelope for the output (POST response)
    w = structencoder()
    output = w.cofeed(ROOT(E_CURSOR(u'updates', {u'feed': feed})))
    logger.debug('Feed: ' + feed)

    entries = atomtools.ejsonize(feed)
    for entry in islice(entries, 0, maxcount):
        try:
            logger.debug('ENTRY: ' + repr(entry))
            aid = entry[u'label']
            slug = atomtools.slug_from_title(aid)
            #logger.debug('GRIPPO' + repr((id,)))
            dest = folder + '/' + slug
            chunks = [ ' title:: ' + entry[u'title'] ]
            chunks.append(' last changed:: ' + entry[u'updated'])
            chunks.append(' link:: ' + (first_item(entry[u'link']) or ''))

            if u'summary' in entry: chunks.append('= Summary =\n' + entry[u'summary'])
            if u'content_src' in entry: chunks.append('= Content =\n' + entry[u'content_src'])
            if u'content_text' in entry: chunks.append('= Content =\n' + entry[u'content_text'])
            #logger.debug("Result IDs: " + ids)
            if u'categories' in entry:
                chunks.append(u'= Categories =')
                for categories in entry[u'categories']:
                    chunks.append(' * ' + categories)

            chunks.append(' id:: ' + entry[u'id'])
            chunks.append('= akara:metadata =\n akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed\n')

            url = absolutize(dest, MOINBASE)
            headers = {'Content-Type' : 'text/plain'}
            resp, content = H.request(url, "PUT", body='\n'.join(chunks).encode('utf-8'), headers=headers)
            logger.debug("Result: " + repr((resp, content)))
            output.send(E(u'update', {u'entry-id': entry[u'id'], u'page': url}))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            logger.info('Exception handling Entry page: ' + repr(e))
            output.send(E(u'failure', {u'entry-id': entry[u'id']}))
def get_page(environ, start_response):
    #logger.debug('get_page: ' + repr((environ['SCRIPT_NAME'], environ['PATH_INFO'])))
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    upstream_handler = None
    status = httplib.OK
    params = cgi.parse_qs(environ['QUERY_STRING'])
    #Note: probably a better solution here: http://code.google.com/p/mimeparse/
    accepted_imts = environ.get('HTTP_ACCEPT', '').split(',')
    #logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    #logger.debug('imt: ' + repr(imt))
    params_for_moin = {}
    cache_max_age = CACHE_MAX_AGE # max-age of this response. If set to None, it will not be used
    if NO_CACHE_PATHS and first_item(dropwhile(lambda x: x not in page, NO_CACHE_PATHS)):
        cache_max_age = None

    if 'rev' in params:
        #XXX: Not compatible with search
        #params_for_moin = {'rev' : params['rev'][0], 'action': 'recall'}
        params_for_moin = {'rev' : params['rev'][0]}
    if 'search' in params:
        searchq = params['search'][0]
        query = urllib.urlencode({'value' : searchq, 'action': 'fullsearch', 'context': '180', 'fullsearch': 'Text'})
        #?action=fullsearch&context=180&value=foo&=Text
        url = absolutize('?'+query, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
        cache_max_age = None
    #elif 'action' in params and params['action'][0] == 'recall':
    elif moin.HTML_IMT in environ.get('HTTP_ACCEPT', ''):
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page+'?'+params, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.HTML_IMT
    elif moin.RDF_IMT in environ.get('HTTP_ACCEPT', ''):
        #FIXME: Make unique flag optional
        #url = base + '/RecentChanges?action=rss_rc&unique=1&ddiffs=1'
        url = absolutize('RecentChanges?action=rss_rc&unique=1&ddiffs=1', base)
        #print >> sys.stderr, (url, base, '/RecentChanges?action=rss_rc&unique=1&ddiffs=1', )
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
    elif moin.ATTACHMENTS_IMT in environ.get('HTTP_ACCEPT', ''):
        url = absolutize(page + '?action=AttachFile', base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.ATTACHMENTS_IMT
        def upstream_handler():
            #Sigh. Sometimes you have to break some tag soup eggs to make a RESTful omelette
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            doc = htmlparse(rbody)
            raise_embedded_error(doc)
            attachment_nodes = doc.xml_select(u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
            targets = []
            for node in attachment_nodes:
                target = [ param.split('=', 1)[1] for param in node.href.split(u'&') if param.startswith('target=') ][0]
                targets.append(target)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'attachments'),
                        (E(u'attachment', {u'href': unicode(t)}) for t in targets)
                    )
                ))
            return output.read(), ctype
    #Notes on use of URI parameters - http://markmail.org/message/gw6xbbvx4st6bksw
    elif ';attachment=' in page:
        page, attachment = page.split(';attachment=', 1)
        url = absolutize(page + '?action=AttachFile&do=get&target=' + attachment, base)
        request = urllib2.Request(url, None, req_headers)
        def upstream_handler():
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            return rbody, dict(resp.info())['content-type']
    #
    elif ';history' in page:
        cache_max_age = None
        page, discard = page.split(';history', 1)
        ctype = moin.XML_IMT
        def upstream_handler():
            revs = scrape_page_history(page, base, opener, req_headers)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'history'),
                        (E(u'rev', {u'id': unicode(r['rev']), u'editor': unicode(r['editor']), u'date': unicode(r['date']).replace(' ', 'T')}) for r in revs)
                    )
                ))
            return output.read(), ctype
    elif imt:
        params_for_moin.update({'mimetype': imt})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.DOCBOOK_IMT
    else:
        params_for_moin.update({'action': 'raw'})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.WIKITEXT_IMT

    try:
        if upstream_handler:
            rbody, ctype = upstream_handler()
        else:
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
        #headers = {moin.ORIG_BASE_HEADER: base}
        #moin_base = absolutize(wiki_id, base)
        moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
        response_headers = [("Content-Type", ctype),
                            ("Vary", "Accept"),
                            (moin.ORIG_BASE_HEADER, moin_base_info)]
        if cache_max_age:
            response_headers.append(("Cache-Control", "max-age=" + str(cache_max_age)))
        start_response(status_response(status), response_headers)
        return rbody
    except urllib2.URLError, e:
        if e.code == 401:
            raise HTTPAuthorizationError(url=request.get_full_url())
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness. ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record
    for the first item in the collection/query, and so on until all the items
    are returned, or the limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    For a nice-sized collection to try:

    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:
    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****
    '''
    #[...gap in the available source: the set-up that builds cdmsite, seen_links,
    # seen_ids and resultsdoc is missing; the surviving fragment resumes below,
    # after a select ending in ...@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')...]

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)
            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
            try:
                entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
            except AttributeError:
                logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                #Rough estimate: a trailing '-' becomes '5' (e.g. "195-" -> "1955") and '?' is dropped
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '')
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey+'.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass
        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
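#A minimal consumption sketch, following the doctest above: the first yielded
#dict is collection-level metadata, later dicts are item records (keys such as
#u'Title' and 'link' per the extraction code). The collection URL comes from
#the docstring; capping at five records here is arbitrary.
def _example_read_contentdm():
    from itertools import islice
    results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom')
    header = results.next()   #e.g. {'basequeryurl': ...}
    for record in islice(results, 5):
        print record.get(u'Title'), record.get('link')
    return header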
def rule(self, node):
    #Record (context, resource selector, actions) for this element when it
    #declares an ak:resource child, then dispatch over the child elements
    context_elem = first_item(node.xml_select(u'ak:resource'))
    if context_elem:
        resource = U(context_elem.select)
        self.rules.append((U(node.context), resource, []))
    list(chain(*imap(self.dispatch, node.xml_children)))