def test_from_url__posix(without_pkg_resources, tmpdir, monkeypatch):
    if without_pkg_resources and not IRON_PYTHON:
        monkeypatch.delattr('pkg_resources.iter_entry_points')
    url = 'file://' + str(tmpdir)
    fs = from_url(url)
    assert isinstance(fs, FileSystemRepository)
    assert fs.path == str(tmpdir)
    with raises(LookupError):
        from_url('unregistered-scheme://')


def test_from_url(without_pkg_resources, tmpdir, monkeypatch):
    if without_pkg_resources and not IRON_PYTHON:
        monkeypatch.delattr('pkg_resources.iter_entry_points')
    url = 'file://' + str(tmpdir)
    fs = from_url(url)
    assert isinstance(fs, FileSystemRepository)
    assert fs.path == str(tmpdir)
    with raises(LookupError):
        from_url('unregistered-scheme://')


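# A minimal sketch of the scaffolding the two tests above rely on: the
# without_pkg_resources fixture, the IRON_PYTHON flag, and the imports for
# from_url()/FileSystemRepository.  The fixture parameters, the IRON_PYTHON
# detection, and the module path are assumptions, not the project's actual
# conftest.
import platform

from pytest import fixture, raises

from libearth.repository import FileSystemRepository, from_url

IRON_PYTHON = platform.python_implementation() == 'IronPython'


@fixture(params=[True, False])
def without_pkg_resources(request):
    # When True, the test removes pkg_resources.iter_entry_points via
    # monkeypatch, exercising the path where from_url() must resolve the
    # file:// scheme without consulting entry points.
    return request.param

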
def get_stage():
    try:
        return app.config['STAGE']
    except KeyError:
        session_id = app.config['SESSION_ID']
        if request.environ['wsgi.multiprocess']:
            # Stage doesn't offer safe synchronization between multiple
            # processes.  Unique session identifiers are needed to
            # distinguish different "installations," which here effectively
            # means "processes," so we append the pid to the session
            # identifier configured by the user to make it unique per
            # process.  Note that this probably multiplies disk usage by N,
            # where N is the number of processes, so the docs should
            # discourage web servers of the prefork/worker model.
            session_id = '{0}.{1}'.format(session_id, os.getpid())
        session = Session(session_id)
        url = urlparse.urlparse(app.config['REPOSITORY'])
        if url.scheme == 'file':
            repository = FileSystemRepository(
                url.path,
                atomic=request.environ['wsgi.multithread']
            )
        else:
            repository = from_url(app.config['REPOSITORY'])
        stage = Stage(session, repository)
        app.config['STAGE'] = stage
        return stage


def get_stage():
    try:
        return current_app.config['STAGE']
    except KeyError:
        session_id = current_app.config['SESSION_ID']
        if request.environ['wsgi.multiprocess']:
            # Stage doesn't offer safe synchronization between multiple
            # processes.  Unique session identifiers are needed to
            # distinguish different "installations," which here effectively
            # means "processes," so we append the pid to the session
            # identifier configured by the user to make it unique per
            # process.  Note that this probably multiplies disk usage by N,
            # where N is the number of processes, so the docs should
            # discourage web servers of the prefork/worker model.
            session_id = '{0}.{1}'.format(
                session_id or uuid.getnode(),
                os.getpid()
            )
        session = Session(session_id)
        url = urllib.parse.urlparse(current_app.config['REPOSITORY'])
        if url.scheme == 'file':
            repository = FileSystemRepository(
                url.path,
                atomic=request.environ['wsgi.multithread']
            )
        else:
            repository = from_url(current_app.config['REPOSITORY'])
        stage = Stage(session, repository)
        current_app.config['STAGE'] = stage
        return stage


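# A hedged usage sketch for get_stage() above: a Flask view that opens the
# cached Stage as a transaction and reads the subscription list.  The
# blueprint, route, and response shape are illustrative assumptions, not the
# application's actual endpoints.
from flask import Blueprint, jsonify

bp = Blueprint('subscriptions', __name__)


@bp.route('/feeds/')
def list_feeds():
    stage = get_stage()
    with stage:
        opml = stage.subscriptions
        feed_uris = ([sub.feed_uri for sub in opml.recursive_subscriptions]
                     if opml else [])
    return jsonify(feeds=feed_uris)

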
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    threads_count = args.threads if args.threads is not None else cpu_count()
    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = hashlib.sha1(feed_url).hexdigest()
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)


def main(args):
    stage = Stage(
        Session(args.session_id),
        from_url(args.repository_dir)
    )
    read_loop(stage)


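# A minimal argparse sketch for main() above.  The program description, flag
# names, and the session-id default are assumptions; only the attribute names
# main() reads (session_id, repository_dir) come from the code above.
import argparse


def parse_args(argv=None):
    parser = argparse.ArgumentParser(
        description='Read feeds stored in a repository')
    parser.add_argument('repository_dir',
                        help='repository URL, e.g. file:///path/to/repo')
    parser.add_argument('-s', '--session-id', dest='session_id',
                        default='reader', help='session identifier')
    return parser.parse_args(argv)


if __name__ == '__main__':
    main(parse_args())

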
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions
                        if sub.feed_id == feed_id)
        if not feed_map:
            print('There is no such feed:', feed_id, file=sys.stderr)
            return
    else:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return
    threads_count = args.threads if args.threads is not None else cpu_count()
    iterator = iter(crawl(feed_map.keys(), threads_count))
    while 1:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as e:
            if isinstance(e, CrawlError):
                print('Something went wrong with', e.feed_uri,
                      file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(e, file=sys.stderr)
        except StopIteration:
            break


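# A hedged sketch of the command-line wiring crawl_command() expects.  The
# program name, flags, and defaults are assumptions; only the attribute names
# it reads (repository, session_id, feed_id, threads, verbose) are taken from
# the function body above.
import argparse


def build_parser():
    parser = argparse.ArgumentParser(prog='crawl')
    parser.add_argument('repository',
                        help='repository URL, e.g. file:///path/to/repo')
    parser.add_argument('-s', '--session-id', dest='session_id',
                        default='crawl', help='session identifier')
    parser.add_argument('-f', '--feed-id', dest='feed_id', default=None,
                        help='crawl only the feed with this id')
    parser.add_argument('-t', '--threads', type=int, default=None,
                        help='number of crawler threads (default: CPU count)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print per-feed progress and tracebacks')
    return parser


if __name__ == '__main__':
    crawl_command(build_parser().parse_args())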