import os
import sys

# (set_verbosity, read_metadata, and build_index_1_0 are defined or
# imported elsewhere in this module)

def main (argv):
    # argv[0] names the root directory of an UpLib repository
    if len(argv) < 1 or (not os.path.isdir(argv[0])):
        sys.stderr.write("Invalid directory specified.\n")
        sys.exit(1)
    set_verbosity(4)
    files = os.listdir(argv[0])
    # a repository root holds both a "docs" and an "overhead" subdirectory
    if ("docs" in files) and ("overhead" in files):
        from uplib.repository import Repository
        from uplib.plibUtil import configurator
        uplib_version = configurator().get("UPLIB_VERSION")
        r = Repository(uplib_version, argv[0],
                       read_metadata(os.path.join(argv[0], "overhead", "metadata.txt")))
        build_index_1_0(r)
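# A minimal entry point for running the indexer above as a standalone
# script; this guard is an illustrative addition, not part of the
# original module.
if __name__ == "__main__":
    main(sys.argv[1:])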
import StringIO
import sys
import time

def main(argv):
    global _IGNORE_KEYBOARD_INTERRUPTS
    try:
        import feedparser
    except ImportError:
        sys.stderr.write("RSSReader: Python feedparser module not available -- can't run RSS scanner.\n")
        sys.exit(1)
    if argv[0] == "run":
        # scan the RSS sites against a repository
        sys.path.append("/local/share/UpLib-1.7.9/code")
        from uplib.plibUtil import set_verbosity, set_note_sink, uthread
        from uplib.repository import Repository
        uthread.initialize()
        set_note_sink(sys.stderr)
        set_verbosity(4)
        _IGNORE_KEYBOARD_INTERRUPTS = False
        if len(argv) > 1:
            repo = Repository("1.7.9", argv[1], {})
        else:
            repo = None
        _scan_rss_sites(repo)
    elif argv[0] == "scan":
        # parse the named feeds and print what would be captured
        sys.path.append("/local/share/UpLib-1.7.9/code")
        from uplib.plibUtil import write_metadata
        for arg in argv[1:]:
            for feed in find_feeds(arg):
                print feed.feed.title, feed.href, len(feed.entries)
                for entry in feed.entries:
                    d = process_entry(entry)
                    if d:
                        print (u'%s, by %s, at %s' % (
                            d.get("title"), d.get("authors"),
                            time.ctime(int(d.get("rss-timestamp"))))).encode("UTF-8", "strict")
                        if "'" in d.get("title"):
                            mdoutput = StringIO.StringIO()
                            write_metadata(mdoutput, d)
                            md = mdoutput.getvalue()
                            mdoutput.close()
                            for line in md.split("\n"):
                                line = line.strip()
                                print ' ' + line
    else:
        sys.exit(0)
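# For reference, a minimal sketch of what a find_feeds() helper could look
# like on top of the feedparser API used above; the real implementation in
# this module may differ (e.g. it might also discover feeds linked from an
# HTML page rather than treating each argument as a feed URL).
def find_feeds_sketch(url):
    import feedparser
    parsed = feedparser.parse(url)     # fetch and parse the feed
    if parsed.get("entries"):          # only yield feeds that have entries
        return [parsed]
    return []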
        dstats = update_stats("", doc_stats)
        update_metadata(os.path.join(dirpath, "metadata.txt"),
                        { "wordbbox-stats-pagewise": stats,
                          "wordbbox-stats-docwise": dstats })

def calculate_page_bboxes (repo, path, doc_id=None):
    # only process folders that have non-empty OCR text, thumbnails,
    # and a wordbboxes file
    if (os.path.isdir(path)
        and os.path.exists(os.path.join(path, "contents.txt"))
        and (os.path.getsize(os.path.join(path, "contents.txt")) > 0)
        and os.path.isdir(os.path.join(path, "thumbnails"))
        and os.path.exists(os.path.join(path, "wordbboxes"))):
        do_page_bounding_boxes(path)

class BboxesRipper (Ripper):

    def rip (self, location, doc_id):
        try:
            calculate_page_bboxes(self.repository(), location)
        except:
            msg = ''.join(traceback.format_exception(*sys.exc_info()))
            note("Exception processing %s:\n%s\n" % (doc_id, msg))
            note("No page bounding boxes generated.")
            raise

if __name__ == "__main__":
    # little test
    from uplib.plibUtil import set_verbosity
    set_verbosity(4)
    do_page_bounding_boxes(sys.argv[1])
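# To show how a Ripper subclass like BboxesRipper is typically driven,
# here is a hypothetical loop over a repository's document folders;
# document_ids() and doc_location() are illustrative names, not the
# actual uplib repository API.
def rip_all_documents(repo, ripper):
    for doc_id in repo.document_ids():        # hypothetical accessor
        location = repo.doc_location(doc_id)  # hypothetical accessor
        ripper.rip(location, doc_id)          # may raise; see rip() above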
        (os.path.exists(os.path.join(path, "document.tiff"))
         or os.path.isdir(os.path.join(path, "page-images")))):
        do_thumbnails(path, os.path.join(path, "thumbnails"))
    else:
        note("Either path %s or document.tiff or page-images is missing!", path)

class ThumbnailRipper (Ripper):

    def rip(self, location, doc_id):
        thumbnail_folder(self.repository(), location)

    def rerun_after_metadata_changes (self, changed_fields=None):
        # re-thumbnail if any field affecting page rendering has changed
        return (changed_fields and
                ("images-dpi" in changed_fields or
                 "page-numbers" in changed_fields or
                 "first-page-number" in changed_fields or
                 "document-icon-legend" in changed_fields))

update_configuration()

if __name__ == "__main__":
    if len(sys.argv) < 2 or not os.path.isdir(sys.argv[1]):
        sys.stderr.write("Usage: python createThumbnails.py FOLDERPATH\n")
        sys.exit(1)
    path = sys.argv[1]
    update_configuration()
    set_verbosity(5)
    do_thumbnails(path, os.path.join(path, "thumbnails"))
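# Sketch of how a caller might consult rerun_after_metadata_changes()
# when a document's metadata is edited; the way the ripper instance is
# obtained and constructed is assumed from the Ripper base class and
# may differ.
def maybe_rethumbnail(ripper, location, doc_id, changed_fields):
    # e.g. changed_fields = ["images-dpi"] forces a re-rip here
    if ripper.rerun_after_metadata_changes(changed_fields):
        ripper.rip(location, doc_id)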
import os
import socket
import sys
from getpass import getpass

# (mailcrawler, IMAPSourceSink, and set_verbosity are imported or defined
# elsewhere in this module)

def main():

    def usage():
        sys.stderr.write('Usage: %s [-v] [-repo REPOSITORY] [-folders FOLDERS] [-server HOST[:PORT]] [-account ACCOUNTNAME] [-mailbox MBOXNAME]\n' % sys.argv[0])
        sys.stderr.write('Args were: %s\n' % sys.argv)
        sys.stderr.write('-v causes the program to run in "verbose" mode\n')
        sys.stderr.write('-repo gives the directory of the local repository, or the URL of the remote repository\n')
        sys.stderr.write('-folders gives a comma-separated list of folders to add to the repo\n')
        sys.stderr.write('-server HOST[:PORT] specifies the IMAP server to talk to, and optionally the port\n')
        sys.stderr.write('-account ACCOUNTNAME specifies the account (pass the password as the value of the IMAP_PASSWORD env var)\n')
        sys.stderr.write('-mailbox MBOXNAME specifies the mailbox to read, defaults to INBOX\n')
        sys.stderr.write('-config CONFIGFILE gives the config file to read\n')
        sys.exit(1)

    verbose = False
    repo_port = 0
    repo = None
    folders = None
    accountname = None
    host = None
    mailbox = None

    i = 1
    while (i < len(sys.argv) and sys.argv[i][0] == '-'):
        if (sys.argv[i] == '-v'):
            verbose = True
            set_verbosity(3)
        elif ((sys.argv[i] == '-server') and ((i + 1) < len(sys.argv))):
            i = i + 1
            host = sys.argv[i]
        elif ((sys.argv[i] == '-mailbox') and ((i + 1) < len(sys.argv))):
            i = i + 1
            mailbox = sys.argv[i]
        elif ((sys.argv[i] == '-account') and ((i + 1) < len(sys.argv))):
            i = i + 1
            accountname = sys.argv[i]
        elif ((sys.argv[i] == '-repo') and ((i + 1) < len(sys.argv))):
            i = i + 1
            repo = sys.argv[i]
        elif ((sys.argv[i] == '-folders') and ((i + 1) < len(sys.argv))):
            i = i + 1
            folders = sys.argv[i].split(",")
        else:
            usage()
        i = i + 1

    if ':' in host:
        host, port = host.split(':')
        port = int(port)
    else:
        port = 143

    accountpassword = os.environ.get("IMAP_PASSWORD")
    if not accountpassword:
        accountpassword = getpass("Account password: ")

    # 1. Figure out the repository to talk to
    if repo.startswith("http:") or repo.startswith("https:"):
        repo = mailcrawler.UpLibRepo(folders, url=repo)
    elif os.path.isdir(repo):
        portfile = os.path.join(repo, "overhead", "angel.port")
        if not os.path.isfile(portfile):
            raise ValueError("Repository argument '%s' should be the root of an UpLib repository." % repo)
        repo_port = int(open(portfile, "r").read())
        socket.setdefaulttimeout(600)
        repo = mailcrawler.UpLibRepo(folders, host='127.0.0.1', port=repo_port)
    else:
        raise ValueError("Don't understand 'repo' argument '%s'" % repo)
    if repo and verbose:
        sys.stdout.write(str(repo) + "\n")

    # 2. Get a message source (and sink)
    sourcesink = IMAPSourceSink(host, port, accountname, accountpassword, mailbox,
                                use_ssl=(port == 993))

    # 3. Create a MailCrawler...
    crawler = mailcrawler.MailCrawler(repo, sourcesink, sourcesink,
                                      verbose and sys.stdout, sys.stderr)

    # 4. And run it.
    crawler.run()
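# Example invocation (the script name and argument values are illustrative):
#
#   IMAP_PASSWORD=secret python imapcrawler.py -v \
#       -repo ~/my-uplib-repo \
#       -server imap.example.com:993 \
#       -account alice@example.com -mailbox INBOX
#
# Note that port 993 switches the IMAPSourceSink to SSL (the use_ssl flag
# above), while the default port of 143 uses a plain IMAP connection.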