Пример #1
0
def crawl_command(args):
    """Crawl every subscribed feed and store the results in the stage.

    Reads the subscription list from the repository, crawls the feed URI
    of each subscription, and saves each crawled feed under the SHA-1
    hex digest of its URL.

    :param args: parsed command-line arguments; this function reads its
                 ``repository``, ``session_id``, ``threads`` and
                 ``verbose`` attributes

    If the subscription list is missing or empty a message is printed to
    standard error and nothing is crawled.  A :exc:`CrawlError` raised
    while iterating the crawl results is printed to standard error and
    ends the crawl.

    """
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    # Fall back to the number of CPUs when no thread count was given.
    threads_count = args.threads if args.threads is not None else cpu_count()

    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            # hashlib digests accept only bytes; a text URL has to be
            # encoded first, otherwise sha1() raises TypeError on Python 3
            # (and on Python 2 for non-ASCII unicode URLs).
            if isinstance(feed_url, bytes):
                url_bytes = feed_url
            else:
                url_bytes = feed_url.encode('utf-8')
            feed_id = hashlib.sha1(url_bytes).hexdigest()
            with stage:
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
Пример #2
0
def fx_stages():
    """Build two stages that share one in-memory repository.

    Each stage gets its own session (``'SESSID'`` and ``'SESSID2'``).

    :returns: a pair ``(stage, other_stage)``

    """
    shared_repo = MemoryRepository()
    # The generator creates each session right before its stage, so the
    # objects are constructed in the order: session, stage, session, stage.
    return tuple(
        Stage(Session(identifier), shared_repo)
        for identifier in ('SESSID', 'SESSID2')
    )
Пример #3
0
def test_stage_subscription_list(fx_repo, fx_session):
    """A subscription list written to a stage can be read back later."""
    stage = Stage(fx_session, fx_repo)
    # First transaction: create an empty list, add a category to the
    # copy read back from the stage, then write it back.
    with stage:
        stage.subscriptions = SubscriptionList()
        subscription_list = stage.subscriptions
        subscription_list.add(Category(label='Test'))
        stage.subscriptions = subscription_list
    # Second transaction: the category has to be visible.
    with stage:
        stored = frozenset(stage.subscriptions)
        expected = frozenset([Category(label='Test')])
        assert stored == expected
Пример #4
0
def test_stage_subscription_list(fx_repo, fx_session):
    """Round-trip a subscription list holding one category via a stage."""
    stage = Stage(fx_session, fx_repo)

    with stage:
        # Start from an empty list, modify what the stage hands back,
        # and store the modified list again.
        stage.subscriptions = SubscriptionList()
        current = stage.subscriptions
        current.add(Category(label='Test'))
        stage.subscriptions = current

    with stage:
        found = frozenset(stage.subscriptions)
        assert found == frozenset([Category(label='Test')])
Пример #5
0
def fx_stages(tmpdir):
    """Build two stages, for sessions ``'a'`` and ``'b'``, on one repository.

    The repository is a :class:`MemoryRepository` when ``IRON_PYTHON`` is
    true, and a :class:`FileSystemRepository` rooted at ``tmpdir``
    otherwise.

    :param tmpdir: directory used for the file system repository
    :returns: a pair ``(stage_a, stage_b)``

    """
    repo = (MemoryRepository() if IRON_PYTHON
            else FileSystemRepository(str(tmpdir)))
    # Both sessions are created before either stage, as in the original.
    sessions = [Session(identifier=name) for name in ('a', 'b')]
    return tuple(Stage(session, repo) for session in sessions)
Пример #6
0
def get_stage():
    """Return the stage cached in ``app.config``, creating it on first use.

    When ``app.config['STAGE']`` is not set yet, a stage is built from
    the ``SESSION_ID`` and ``REPOSITORY`` configuration values, stored
    under ``app.config['STAGE']`` and returned.

    """
    config = app.config
    try:
        return config['STAGE']
    except KeyError:
        pass  # nothing cached yet; build the stage below

    session_id = config['SESSION_ID']
    environ = request.environ
    if environ['wsgi.multiprocess']:
        # Stage offers no safe synchronization between processes, so
        # every process has to look like a separate "installation" with
        # its own session identifier.  Appending the pid to the
        # user-configured identifier makes it unique per process.
        # This probably multiplies disk usage by the number of
        # processes, so the docs should discourage prefork/worker model
        # web servers.
        session_id = '{0}.{1}'.format(session_id, os.getpid())
    session = Session(session_id)

    repository_url = config['REPOSITORY']
    parsed = urlparse.urlparse(repository_url)
    if parsed.scheme != 'file':
        repository = from_url(repository_url)
    else:
        repository = FileSystemRepository(
            parsed.path,
            atomic=environ['wsgi.multithread']
        )

    new_stage = Stage(session, repository)
    config['STAGE'] = new_stage
    return new_stage
Пример #7
0
def crawl_command(args):
    """Crawl subscribed feeds and store each result in the stage.

    :param args: parsed command-line arguments; this function reads its
                 ``repository``, ``session_id``, ``feed_id``, ``threads``
                 and ``verbose`` attributes

    When ``args.feed_id`` is set only the subscriptions with that feed id
    are crawled, otherwise every subscription is.  Problems (missing
    subscription list, no matching feed, crawl or schema errors) are
    reported on standard error.

    """
    repository = from_url(args.repository)
    stage = Stage(Session(args.session_id), repository)
    with stage:
        subscriptions = stage.subscriptions
    if not subscriptions:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    # Choose which subscriptions to crawl and what to say if none match.
    wanted_id = args.feed_id
    candidates = subscriptions.recursive_subscriptions
    if wanted_id:
        candidates = (sub for sub in candidates if sub.feed_id == wanted_id)
        empty_message = ('There is no such feed:', wanted_id)
    else:
        empty_message = ('No feeds to crawl',)
    # Maps each feed URI to the feed id it should be stored under.
    feed_map = dict((sub.feed_uri, sub.feed_id) for sub in candidates)
    if not feed_map:
        print(*empty_message, file=sys.stderr)
        return

    if args.threads is not None:
        threads_count = args.threads
    else:
        threads_count = cpu_count()

    results = iter(crawl(feed_map.keys(), threads_count))
    # next() is called by hand, rather than through a ``for`` loop, so
    # that an error on one step is reported and the loop keeps asking
    # for further results.
    while True:
        try:
            feed_url, feed_data, crawler_hints = next(results)
            if args.verbose:
                entry_count = len(feed_data.entries)
                print('{0.title} - {1} entries'.format(feed_data,
                                                       entry_count))
            with stage:
                target_id = feed_map[feed_url]
                stage.feeds[target_id] = feed_data
        except StopIteration:
            break
        except (CrawlError, SchemaError) as error:
            if isinstance(error, CrawlError):
                print('Something went wrong with', error.feed_uri,
                      file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(error, file=sys.stderr)
Пример #8
0
def crawl_command(args):
    """Crawl the feeds in the subscription list and save them to the stage.

    :param args: parsed command-line arguments; this function reads its
                 ``repository``, ``session_id``, ``feed_id``, ``threads``
                 and ``verbose`` attributes

    Only feeds whose id equals ``args.feed_id`` are crawled when it is
    given; all subscribed feeds are crawled otherwise.  Every failure is
    reported on standard error instead of being raised.

    """
    repository = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repository)
    with stage:
        subscription_list = stage.subscriptions
    if not subscription_list:
        print('OPML does not exist in the repository', file=sys.stderr)
        return

    # Collect feed URI -> feed id, restricted to the requested id if any.
    requested_id = args.feed_id
    feed_map = {}
    for sub in subscription_list.recursive_subscriptions:
        if requested_id and not (sub.feed_id == requested_id):
            continue
        feed_map[sub.feed_uri] = sub.feed_id
    if not feed_map:
        if requested_id:
            print('There is no such feed:', requested_id, file=sys.stderr)
        else:
            print('No feeds to crawl', file=sys.stderr)
        return

    threads_count = cpu_count() if args.threads is None else args.threads
    crawled = iter(crawl(feed_map.keys(), threads_count))
    while True:
        # The whole step sits inside ``try`` so that a crawl or schema
        # error from any part of it is reported without leaving the loop.
        try:
            feed_url, feed_data, crawler_hints = next(crawled)
            if args.verbose:
                summary = '{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                )
                print(summary)
            with stage:
                stored_id = feed_map[feed_url]
                stage.feeds[stored_id] = feed_data
        except (CrawlError, SchemaError) as exc:
            if isinstance(exc, CrawlError):
                print('Something went wrong with', exc.feed_uri,
                      file=sys.stderr)
            if not args.verbose:
                print(exc, file=sys.stderr)
            else:
                traceback.print_exc()
        except StopIteration:
            break
Пример #9
0
def get_stage():
    """Create a stage for the current session on a new data store repository.

    :returns: a new :class:`Stage`

    """
    # The repository is created before the session is looked up.
    repo = DataStoreRepository()
    return Stage(get_session(), repo)
Пример #10
0
def fx_test_stage(tmpdir):
    """Create a file-system-backed stage and install it in ``app.config``.

    :param tmpdir: directory used as the repository root
    :returns: the stage stored under ``app.config['STAGE']``

    """
    test_session = Session()
    test_stage = Stage(test_session, FileSystemRepository(str(tmpdir)))
    app.config['STAGE'] = test_stage
    return test_stage