def doc_metadata (repo, response, params):
    """
    Return the metadata for the specified document.

    :param doc_id: the document to fetch the info for
    :type doc_id: an UpLib doc ID string
    :param format: optional; can be specified as "xml" to return the results as \
    an XML document instead of the default plain-text form.  Or you can specify the \
    HTTP Accept header as "application/xml" to obtain the same result.
    :type format: string constant "xml"
    :return: the metadata for the specified document
    :rtype: an XML data structure, if the "Accept: application/xml" header \
    was passed in the request, otherwise a value of MIME type "text/rfc822-headers"
    """
    id = params.get("doc_id")
    if not id:
        response.error(HTTPCodes.BAD_REQUEST, "No doc_id parameter specified for request.\n")
        return
    if not repo.valid_doc_id(id):
        response.error(HTTPCodes.NOT_FOUND, "Invalid doc_id parameter %s specified for request.\n" % id)
        return
    doc = repo.get_document(id)
    if response.xml_request or (params.get("format") == "xml"):
        # XML form: one <document> element carrying the metadata as attributes
        retval = getDOMImplementation().createDocument(None, "result", None)
        d = retval.createElement('document')
        d.setAttribute('id', doc.id)
        title = doc.get_metadata("title") or u""
        title = title.replace("\r", " ")
        d.setAttribute('title', title)
        md = retval.createElement('metadata')
        dmd = doc.get_metadata()
        for element in dmd:
            md.setAttribute(element, dmd[element])
        d.appendChild(md)
        retval.documentElement.appendChild(d)
        fp = response.open("application/xml;charset=utf-8")
        fp.write(retval.toxml("UTF-8") + "\n")
        fp.close()
    else:
        # plain-text form: one RFC 822 style header line per metadata element
        fp = response.open("text/rfc822-headers")
        write_metadata(fp, doc.get_metadata())
        fp.close()
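
# A client-side usage sketch for the handler above (not part of the original
# module).  It assumes the function is exposed as an UpLib action URL of the
# form /action/externalAPI/doc_metadata; the host, port, and doc ID below are
# placeholders -- adjust them to your deployment.
def _example_fetch_doc_metadata_xml(host, port, doc_id):
    import urllib2
    # ask for the XML form via both the format parameter and the Accept header
    url = "https://%s:%s/action/externalAPI/doc_metadata?doc_id=%s&format=xml" % (host, port, doc_id)
    req = urllib2.Request(url)
    req.add_header("Accept", "application/xml")
    return urllib2.urlopen(req).read()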

def repo_properties (repo, response, params):
    """
    Return the properties of the repository.  These include values like
    `name`, `port`, `uplib-home`, `uplib-bin`, `uplib-lib`, `uplib-version`,
    `categories` (a comma-separated list of category names),
    `docs` (a comma-separated list of doc IDs),
    `collections` (a comma-separated list of collection IDs), and
    `last-modified-time` (the last-modified time of the repository,
    as a floating-point string giving seconds past the Unix epoch).

    :return: the repository properties specified above
    :rtype: either an XML-formatted data set, if "Accept: application/xml" is specified, \
    or a plain-text list of properties, one per line (lines can be very long)
    """
    conf = configurator.default_configurator()
    d = {}
    d['name'] = repo.name()
    d['port'] = repo.port()
    d['uplib-home'] = conf.get("uplib-home")
    d['uplib-bin'] = conf.get("uplib-bin")
    d['uplib-lib'] = conf.get("uplib-lib")
    d['uplib-version'] = conf.get("UPLIB_VERSION")
    c = repo.categories()
    c.sort(key=lambda x: x.lower())     # case-insensitive sort
    d['categories'] = ','.join(c)
    d['docs'] = ','.join([doc.id for doc in repo.generate_docs()])
    d['collections'] = ','.join([x.id for x in repo.list_collections()])
    d['last-modified-time'] = str(repo.mod_time())
    if response.xml_request or (params.get("format") == "xml"):
        # XML form: a single <properties> element with one attribute per property
        retval = getDOMImplementation().createDocument(None, "repository", None)
        e = retval.createElement('properties')
        for element in d:
            e.setAttribute(element, str(d[element]))
        retval.documentElement.appendChild(e)
        fp = response.open("application/xml;charset=utf-8")
        fp.write(retval.toxml("UTF-8") + "\n")
        fp.close()
    else:
        fp = response.open("text/plain")
        write_metadata(fp, d)
        fp.close()
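
# A client-side sketch (not part of the original module) showing how the XML
# form produced by repo_properties can be read back into a dict: every
# property is an attribute of the single <properties> element.
def _example_parse_repo_properties(xml_text):
    from xml.dom.minidom import parseString
    props = parseString(xml_text).getElementsByTagName("properties")[0]
    return dict(props.attributes.items())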

def main(argv):
    global _IGNORE_KEYBOARD_INTERRUPTS
    try:
        import feedparser
    except ImportError:
        sys.stderr.write("RSSReader: Python feedparser module not available -- can't run RSS scanner.\n")
        sys.exit(1)
    if not argv:
        sys.stderr.write("RSSReader: no subcommand specified (expected 'run' or 'scan')\n")
        sys.exit(1)
    if argv[0] == "run":
        # run the scanner as a long-lived process, optionally against a repository
        sys.path.append("/local/share/UpLib-1.7.9/code")
        from uplib.plibUtil import set_verbosity, set_note_sink, uthread
        from uplib.repository import Repository
        uthread.initialize()
        set_note_sink(sys.stderr)
        set_verbosity(4)
        _IGNORE_KEYBOARD_INTERRUPTS = False
        if len(argv) > 1:
            repo = Repository("1.7.9", argv[1], {})
        else:
            repo = None
        _scan_rss_sites(repo)
    elif argv[0] == "scan":
        # one-shot mode: just report what would be fetched from each site
        sys.path.append("/local/share/UpLib-1.7.9/code")
        from uplib.plibUtil import write_metadata
        for arg in argv[1:]:
            for feed in find_feeds(arg):
                print feed.feed.title, feed.href, len(feed.entries)
                for entry in feed.entries:
                    d = process_entry(entry)
                    if d:
                        print (u'%s, by %s, at %s' % (
                            d.get("title"), d.get("authors"),
                            time.ctime(int(d.get("rss-timestamp"))))).encode("UTF-8", "strict")
                        if "'" in d.get("title"):
                            mdoutput = StringIO.StringIO()
                            write_metadata(mdoutput, d)
                            md = mdoutput.getvalue()
                            mdoutput.close()
                            for line in md.split("\n"):
                                print '    ' + line.strip()
    else:
        sys.stderr.write("RSSReader: unknown subcommand %s\n" % argv[0])
        sys.exit(1)
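
# Hedged sketch of the usual entry point (the original file's __main__ block
# is not shown here): hand the subcommand and its arguments straight to
# main(), so that, e.g., "python RSSReader.py scan http://example.com/blog/"
# reports the entries found at that site without touching a repository.
if __name__ == "__main__":
    main(sys.argv[1:])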

def _scan_rss_sites(repo):
    global _ADDED_SITES, _REMOVED_SITES
    try:
        from uplib.plibUtil import configurator, note, write_metadata, id_to_time, create_new_id
        from uplib.extensions import find_and_load_extension
        conf = configurator.default_configurator()
        if repo:
            sys_inits_path = os.path.join(conf.get('uplib-lib'), 'site-extensions')
            repo_inits_path = os.path.join(repo.root(), "overhead", "extensions", "active")
            upload_m = find_and_load_extension("UploadDocument", "%s|%s" % (repo_inits_path, sys_inits_path), None, True)
            if not upload_m:
                note(0, "Can't load UploadDocument extension!")
                sys.exit(1)
            else:
                note("UploadDocument extension is %s", upload_m)
        scan_period = conf.get_int("rss-scan-period", 60 * 2)
        startup_delay = conf.get_int("rss-startup-delay", 0)
        del conf
        import feedparser
        if startup_delay > 0:
            note(3, "startup delay is %d", startup_delay)
            time.sleep(startup_delay)
    except:
        note(0, "RSSReader: exception starting RSS scan thread:\n%s",
             ''.join(traceback.format_exception(*sys.exc_info())))
        return
    rss_sites = -1
    while True:
        try:
            conf = configurator()       # re-read the uplibrc file
            old_rss_sites = rss_sites
            rss_sites = conf.get("rss-sites")
            if (old_rss_sites == -1) or (old_rss_sites != rss_sites):
                note(2, "rss_sites are %s", rss_sites)
            scan_period = conf.get_int("rss-scan-period", scan_period)
            expiration_period = conf.get_int("rss-expiration-period", 30 * 24 * 60 * 60)    # 30 days
            # merge the configured sites with dynamically added/removed ones
            if rss_sites:
                rss_sites = rss_sites.split() + _ADDED_SITES
            else:
                rss_sites = _ADDED_SITES[:]
            for site in _REMOVED_SITES:
                if site in rss_sites:
                    rss_sites.remove(site)
            if rss_sites:
                feeds = []
                for site in rss_sites:
                    if site.startswith("feed:"):
                        feeds.append(feedparser.parse(site))
                    elif site.startswith("http:") or site.startswith("https:"):
                        feeds += find_feeds(site)
                note("feeds are:\n%s", [(x.feed.title, x.href, len(x.entries)) for x in feeds])
                for feed in feeds:
                    note("RSSReader: %s: %s entries in feed %s", time.ctime(), len(feed.entries), feed.feed.title)
                    for entry in feed.entries:
                        d = process_entry(entry)
                        if not d:
                            continue
                        id = d.get("rss-id")
                        if repo:
                            if repo.do_query('+rss-id:"%s"' % id):
                                # already in the repository
                                continue
                            response = FakeResponse(repo)
                            mdoutput = StringIO.StringIO()
                            write_metadata(mdoutput, d)
                            md = mdoutput.getvalue()
                            mdoutput.close()
                            upload_m.add(repo, response, {
                                'URL': d.get("original-url"),
                                'wait': "true",
                                'no-redirect': "true",
                                'metadata': md,
                                'md-categories': "RSSReader/%s" % feed.feed.title,
                                })
                            if response.thread:
                                while response.thread.isAlive():
                                    response.thread.join(1.0)
                            note("RSSReader: %s: %s (%s: %s)", time.ctime(), repr(d.get("title")),
                                 response.code, response.message)
                        else:
                            note("RSSReader: %s: %s (%s)\n    %s", time.ctime(), repr(d.get("title")),
                                 d.get("date"), d.get("summary"))
                if repo:
                    # now do expiries: anything older than the expiration period
                    # that hasn't been marked no-expire is a candidate
                    old_id = create_new_id(time.time() - expiration_period)[:-5]
                    hits = repo.do_query("categories:RSSReader AND id:[00000-00-0000-000 TO %s] "
                                         "AND NOT categories:RSSReader/_noexpire_" % old_id)
                    for score, doc in hits:
                        if os.path.exists(os.path.join(doc.folder(), "activity")):
                            # the user has looked at it, so keep it around
                            doc.add_category("RSSReader/_noexpire_", True)
                        else:
                            # untouched and stale, so remove it
                            repo.delete_document(doc.id)
            time.sleep(scan_period)
        except KeyboardInterrupt:
            if _IGNORE_KEYBOARD_INTERRUPTS:
                note(0, "RSSReader: %s", ''.join(traceback.format_exception(*sys.exc_info())))
            else:
                sys.exit(0)
        except:
            note(0, "RSSReader: %s", ''.join(traceback.format_exception(*sys.exc_info())))
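
# FakeResponse is used above but defined elsewhere in this module.  The sketch
# below (an assumption, not the original class) shows the minimal shape the
# code above relies on: a .thread to wait on, plus .code and .message fields
# that UploadDocument.add() can fill in through the usual response methods.
class _ExampleFakeResponse(object):
    def __init__(self, repo):
        self.repo = repo
        self.thread = None      # set if the upload runs in a worker thread
        self.code = None
        self.message = None
    def error(self, code, message):
        self.code, self.message = code, message
    def reply(self, message, code=200):
        self.code, self.message = code, message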