Example #1
def test_from_url__posix(without_pkg_resources, tmpdir, monkeypatch):
    # Simulate an environment without pkg_resources entry points.
    if without_pkg_resources and not IRON_PYTHON:
        monkeypatch.delattr('pkg_resources.iter_entry_points')
    url = 'file://' + str(tmpdir)
    fs = from_url(url)
    assert isinstance(fs, FileSystemRepository)
    assert fs.path == str(tmpdir)
    with raises(LookupError):
        from_url('unregistered-scheme://')
Example #2
def test_from_url(without_pkg_resources, tmpdir, monkeypatch):
    # Simulate an environment without pkg_resources entry points.
    if without_pkg_resources and not IRON_PYTHON:
        monkeypatch.delattr('pkg_resources.iter_entry_points')
    url = 'file://' + str(tmpdir)
    fs = from_url(url)
    assert isinstance(fs, FileSystemRepository)
    assert fs.path == str(tmpdir)
    with raises(LookupError):
        from_url('unregistered-scheme://')
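The two tests above exercise from_url()'s scheme-based dispatch: a file:// URL yields a FileSystemRepository whose path matches the URL's path, and an unregistered scheme raises LookupError. A minimal sketch of such a dispatcher, assuming a built-in scheme table plus a pkg_resources entry-point fallback (the table, the group name 'example.repositories', and the helper name from_url_sketch are all hypothetical, not the library's actual internals):

import pkg_resources

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

from libearth.repository import FileSystemRepository  # assumed module path

# Hypothetical built-in scheme table; the real from_url() internals
# may differ.
BUILTIN_SCHEMES = {'file': FileSystemRepository}


def from_url_sketch(url):
    parsed = urlparse(url)
    repository_cls = BUILTIN_SCHEMES.get(parsed.scheme)
    if repository_cls is None:
        # Plugin fallback; the tests delete pkg_resources.iter_entry_points
        # to prove the dispatch still works without it.
        iter_eps = getattr(pkg_resources, 'iter_entry_points', None)
        if iter_eps is not None:
            for entry_point in iter_eps('example.repositories'):
                if entry_point.name == parsed.scheme:
                    repository_cls = entry_point.load()
                    break
    if repository_cls is None:
        raise LookupError('unregistered scheme: {0!r}'.format(parsed.scheme))
    return repository_cls(parsed.path)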
Example #3
def get_stage():
    try:
        return app.config['STAGE']
    except KeyError:
        session_id = app.config['SESSION_ID']
        if request.environ['wsgi.multiprocess']:
            # Stage doesn't offer safe synchronization across multiple
            # processes.  Unique session identifiers are needed to
            # distinguish different "installations," which here effectively
            # means "processes," so we append the pid to the user-configured
            # session identifier to make it unique per process.
            # Note that this roughly multiplies disk usage by the number
            # of processes, so the docs should discourage web servers
            # using the prefork/worker model.
            session_id = '{0}.{1}'.format(session_id, os.getpid())

        session = Session(session_id)
        url = urlparse.urlparse(app.config['REPOSITORY'])
        if url.scheme == 'file':
            repository = FileSystemRepository(
                url.path, atomic=request.environ['wsgi.multithread'])
        else:
            repository = from_url(app.config['REPOSITORY'])

        stage = Stage(session, repository)
        app.config['STAGE'] = stage
        return stage
Example #4
def get_stage():
    try:
        return current_app.config['STAGE']
    except KeyError:
        session_id = current_app.config['SESSION_ID']
        if request.environ['wsgi.multiprocess']:
            # Stage doesn't offer safe synchronization across multiple
            # processes.  Unique session identifiers are needed to
            # distinguish different "installations," which here effectively
            # means "processes," so we append the pid to the user-configured
            # session identifier to make it unique per process.
            # Note that this roughly multiplies disk usage by the number
            # of processes, so the docs should discourage web servers
            # using the prefork/worker model.
            session_id = '{0}.{1}'.format(
                session_id or uuid.getnode(), os.getpid())
        session = Session(session_id)
        url = urllib.parse.urlparse(current_app.config['REPOSITORY'])
        if url.scheme == 'file':
            repository = FileSystemRepository(
                url.path,
                atomic=request.environ['wsgi.multithread']
            )
        else:
            repository = from_url(current_app.config['REPOSITORY'])
        stage = Stage(session, repository)
        current_app.config['STAGE'] = stage
        return stage
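A hypothetical wiring for the get_stage() above, assuming it is defined in the same module; the route and config values below are illustrative only:

from flask import Flask

app = Flask(__name__)
# Illustrative settings: REPOSITORY accepts any URL that from_url()
# understands, and file:// URLs take the FileSystemRepository shortcut.
app.config.update(
    REPOSITORY='file:///var/lib/reader/',
    SESSION_ID='web',
)


@app.route('/subscriptions/')
def subscriptions():
    stage = get_stage()  # cached in the app config after the first call
    with stage:
        opml = stage.subscriptions
    return repr(opml)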
Example #5
File: command.py Project: klutzy/web
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    threads_count = args.threads if args.threads is not None else cpu_count()

    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                # sha1() requires bytes, so encode the URL before hashing.
                feed_id = hashlib.sha1(feed_url.encode('utf-8')).hexdigest()
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
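crawl_command() only assumes that args carries repository, session_id, threads, and verbose attributes; a hypothetical argparse setup that would satisfy it (every flag name here is illustrative):

import argparse

parser = argparse.ArgumentParser(description='Crawl all subscribed feeds')
parser.add_argument('repository',
                    help='repository URL understood by from_url(), '
                         'e.g. file:///path/to/repo/')
parser.add_argument('--session-id', default='crawl',
                    help='session identifier (available as args.session_id)')
parser.add_argument('--threads', type=int, default=None,
                    help='crawler thread count; defaults to cpu_count()')
parser.add_argument('--verbose', action='store_true',
                    help='print each crawled feed')

if __name__ == '__main__':
    crawl_command(parser.parse_args())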
Example #6
def main(args):
    stage = Stage(
        Session(args.session_id),
        from_url(args.repository_dir)
    )

    read_loop(stage)
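read_loop() is not shown in this example; a hypothetical minimal stand-in that uses only the stage.subscriptions and stage.feeds accesses seen in the other examples (item access on stage.feeds is assumed to mirror the item assignment shown above):

from __future__ import print_function

import sys


def read_loop(stage):
    # Hypothetical sketch: print the title of every crawled feed once,
    # then return.  A real read loop would presumably be interactive.
    with stage:
        opml = stage.subscriptions
        if not opml:
            print('OPML does not exist in the repository', file=sys.stderr)
            return
        for sub in opml.recursive_subscriptions:
            try:
                feed = stage.feeds[sub.feed_id]
            except KeyError:
                continue  # this subscription has not been crawled yet
            print(feed.title)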
Example #7
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions
                        if sub.feed_id == feed_id)
        if not feed_map:
            print('There is no such feed:', feed_id, file=sys.stderr)
            return
    else:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return
    threads_count = args.threads if args.threads is not None else cpu_count()
    iterator = iter(crawl(feed_map.keys(), threads_count))
    # Drive the generator manually with next() so that a CrawlError
    # raised mid-iteration can be reported and the crawl can continue
    # with the remaining feeds; a plain for loop would stop at the
    # first error.
    while True:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as e:
            if isinstance(e, CrawlError):
                print('Something went wrong with', e.feed_uri, file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(e, file=sys.stderr)
        except StopIteration:
            break
Example #8
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions
                        if sub.feed_id == feed_id)
        if not feed_map:
            print('There is no such feed:', feed_id, file=sys.stderr)
            return
    else:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return
    threads_count = args.threads if args.threads is not None else cpu_count()
    iterator = iter(crawl(feed_map.keys(), threads_count))
    # Drive the generator manually with next() so that a CrawlError
    # raised mid-iteration can be reported and the crawl can continue
    # with the remaining feeds; a plain for loop would stop at the
    # first error.
    while True:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                print('{0.title} - {1} entries'.format(feed_data,
                                                       len(feed_data.entries)))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as e:
            if isinstance(e, CrawlError):
                print('Something went wrong with', e.feed_uri, file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(e, file=sys.stderr)
        except StopIteration:
            break