def crawl_command(args):
    """Crawl every subscribed feed and store each result in the stage.

    Reads ``args.repository``, ``args.session_id``, ``args.threads`` and
    ``args.verbose``.  When ``args.threads`` is ``None`` the worker count
    falls back to ``cpu_count()``.

    Each crawled feed is stored under the SHA-1 hex digest of its URL.
    A ``CrawlError`` raised while iterating the crawl results is printed
    to standard error and ends the crawl.

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    threads_count = args.threads if args.threads is not None else cpu_count()
    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            # hashlib only digests bytes: a text URL would raise TypeError
            # on Python 3 (or UnicodeEncodeError on Python 2 when it is
            # non-ASCII), so encode text explicitly.  Byte strings are
            # hashed unchanged, which keeps existing feed ids stable.
            if isinstance(feed_url, bytes):
                url_bytes = feed_url
            else:
                url_bytes = feed_url.encode('utf-8')
            feed_id = hashlib.sha1(url_bytes).hexdigest()
            with stage:
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
def fx_stages():
    """Build two stages with different sessions over one shared repository.

    Returns a ``(stage, other_stage)`` pair; both are backed by the same
    ``MemoryRepository`` instance, using the session identifiers
    ``'SESSID'`` and ``'SESSID2'`` respectively.

    """
    shared_repo = MemoryRepository()
    first = Stage(Session('SESSID'), shared_repo)
    second = Stage(Session('SESSID2'), shared_repo)
    return first, second
def test_stage_subscription_list(fx_repo, fx_session):
    """A subscription list written in one ``with stage:`` block can be
    read back in a later one.

    """
    stage = Stage(fx_session, fx_repo)
    with stage:
        # Store an empty list, read it back, extend it and store it again.
        stage.subscriptions = SubscriptionList()
        stored = stage.subscriptions
        stored.add(Category(label='Test'))
        stage.subscriptions = stored
    with stage:
        found = frozenset(stage.subscriptions)
        wanted = frozenset([Category(label='Test')])
        assert found == wanted
def test_stage_subscription_list(fx_repo, fx_session):
    """Check that the category added to the stage's subscription list is
    the only thing found when the list is read again.

    """
    stage = Stage(fx_session, fx_repo)

    # First block: create the list, then add one category and re-assign it.
    with stage:
        stage.subscriptions = SubscriptionList()
        subscription_list = stage.subscriptions
        test_category = Category(label='Test')
        subscription_list.add(test_category)
        stage.subscriptions = subscription_list

    # Second block: the list must contain exactly that category.
    with stage:
        actual_items = frozenset(stage.subscriptions)
        expected_items = frozenset([Category(label='Test')])
        assert actual_items == expected_items
def fx_stages(tmpdir):
    """Build two stages (sessions ``'a'`` and ``'b'``) over one repository.

    The repository is a ``MemoryRepository`` when ``IRON_PYTHON`` is
    truthy; otherwise it is a ``FileSystemRepository`` rooted at
    ``tmpdir``.

    Returns the pair ``(stage_a, stage_b)``.

    """
    repository = (MemoryRepository() if IRON_PYTHON
                  else FileSystemRepository(str(tmpdir)))
    first_session = Session(identifier='a')
    second_session = Session(identifier='b')
    return Stage(first_session, repository), Stage(second_session, repository)
def get_stage():
    """Return the stage cached in ``app.config['STAGE']``, building and
    caching it first when it is not there yet.

    The stage is built from ``app.config['SESSION_ID']`` and
    ``app.config['REPOSITORY']``, consulting the current request's WSGI
    environ for the ``wsgi.multiprocess`` and ``wsgi.multithread`` flags.

    """
    try:
        return app.config['STAGE']
    except KeyError:
        pass  # nothing cached yet; build the stage below

    identifier = app.config['SESSION_ID']
    environ = request.environ
    if environ['wsgi.multiprocess']:
        # Stage doesn't offer safe synchronization between multiple
        # processes.  Unique session identifiers are actually needed to
        # distinguish different "installations", which technically means
        # "processes", hence we append the pid to the session identifier
        # configured by the user to make them unique.
        # Note that it probably causes N times more disk usage
        # where N = the number of processes.  So we should discourage
        # using web servers of the prefork/worker model in the docs.
        identifier = '{0}.{1}'.format(identifier, os.getpid())
    session = Session(identifier)

    repository_url = app.config['REPOSITORY']
    parsed = urlparse.urlparse(repository_url)
    if parsed.scheme == 'file':
        repository = FileSystemRepository(
            parsed.path,
            atomic=environ['wsgi.multithread']
        )
    else:
        repository = from_url(repository_url)

    new_stage = Stage(session, repository)
    app.config['STAGE'] = new_stage
    return new_stage
def crawl_command(args):
    """Crawl subscribed feeds and store every crawled feed in the stage.

    Reads ``args.repository``, ``args.session_id``, ``args.feed_id``,
    ``args.threads`` and ``args.verbose``.  When ``args.feed_id`` is
    truthy only subscriptions with that feed id are crawled; otherwise
    all recursive subscriptions are.  ``args.threads`` falls back to
    ``cpu_count()`` when it is ``None``.

    ``CrawlError`` and ``SchemaError`` raised while fetching or storing a
    result are reported on standard error and the loop carries on with
    the next result.

    """
    repository = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repository)
    with stage:
        subscription_list = stage.subscriptions
    if not subscription_list:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    requested_id = args.feed_id
    if requested_id:
        feed_map = dict(
            (sub.feed_uri, sub.feed_id)
            for sub in subscription_list.recursive_subscriptions
            if sub.feed_id == requested_id
        )
        if not feed_map:
            print('There is no such feed:', requested_id, file=sys.stderr)
            return
    else:
        feed_map = dict(
            (sub.feed_uri, sub.feed_id)
            for sub in subscription_list.recursive_subscriptions
        )
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return

    if args.threads is None:
        workers = cpu_count()
    else:
        workers = args.threads

    results = iter(crawl(feed_map.keys(), workers))
    # Pull results by hand rather than with a ``for`` loop so that an
    # error raised while fetching one result is handled inside the loop.
    while True:
        try:
            url, data, _hints = next(results)
            if args.verbose:
                summary = '{0.title} - {1} entries'.format(
                    data, len(data.entries)
                )
                print(summary)
            with stage:
                key = feed_map[url]
                stage.feeds[key] = data
        except (CrawlError, SchemaError) as error:
            if isinstance(error, CrawlError):
                print('Something went wrong with', error.feed_uri,
                      file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(error, file=sys.stderr)
        except StopIteration:
            break
def crawl_command(args):
    """Run the crawler over the subscriptions kept in the repository.

    ``args`` supplies ``repository``, ``session_id``, ``feed_id``,
    ``threads`` and ``verbose``.  A truthy ``feed_id`` restricts the
    crawl to subscriptions carrying that id.  Each crawled feed is stored
    in the stage under the feed id mapped from its URL.

    Nothing is crawled (a message is printed to standard error instead)
    when the stage has no subscriptions or no feed matches.

    """
    repo = from_url(args.repository)
    stage = Stage(Session(args.session_id), repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    # Map feed URL -> feed id, optionally narrowed to the requested feed.
    wanted = args.feed_id
    subscriptions = opml.recursive_subscriptions
    if wanted:
        feed_map = dict((s.feed_uri, s.feed_id)
                        for s in subscriptions
                        if s.feed_id == wanted)
    else:
        feed_map = dict((s.feed_uri, s.feed_id) for s in subscriptions)
    if not feed_map:
        if wanted:
            print('There is no such feed:', wanted, file=sys.stderr)
        else:
            print('No feeds to crawl', file=sys.stderr)
        return

    threads_count = cpu_count() if args.threads is None else args.threads
    crawled = iter(crawl(feed_map.keys(), threads_count))
    while True:
        try:
            feed_url, feed, hints = next(crawled)
            if args.verbose:
                entry_count = len(feed.entries)
                print('{0.title} - {1} entries'.format(feed, entry_count))
            with stage:
                target_id = feed_map[feed_url]
                stage.feeds[target_id] = feed
        except (CrawlError, SchemaError) as exc:
            # Report the failure and keep going with the next result.
            if isinstance(exc, CrawlError):
                print('Something went wrong with', exc.feed_uri,
                      file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(exc, file=sys.stderr)
        except StopIteration:
            break
def get_stage():
    """Create a new stage from a fresh ``DataStoreRepository`` and the
    session returned by ``get_session()``.

    """
    store = DataStoreRepository()
    current_session = get_session()
    stage = Stage(current_session, store)
    return stage
def fx_test_stage(tmpdir):
    """Create a stage on a file system repository rooted at ``tmpdir``,
    register it as ``app.config['STAGE']`` and return it.

    """
    new_session = Session()
    repository = FileSystemRepository(str(tmpdir))
    test_stage = Stage(new_session, repository)
    app.config['STAGE'] = test_stage
    return test_stage