Example #1
def test_crawl_error():
    # broken feed
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://brokenrss.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        next(iter(generator))
    # unreachable url
    feeds = ['http://not-exists.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        next(iter(generator))
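Note that crawl() is a generator: the CrawlError for a broken or unreachable feed is raised only when the generator is advanced, which is why the test calls next(iter(generator)) inside the raises() block instead of expecting the error from the crawl() call itself.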
Example #2
def add_feed(category_id):
    cursor = Cursor(category_id)
    url = request.form['url']
    try:
        f = urllib.request.urlopen(url)
        document = f.read()
        f.close()
    except Exception:
        r = jsonify(
            error='unreachable-url',
            message='Cannot connect to given url'
        )
        r.status_code = 400
        return r
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError:
        r = jsonify(
            error='unreachable-feed-url',
            message='Cannot find feed url'
        )
        r.status_code = 400
        return r
    feed_url = feed_links[0].url
    feed_url, feed, hints = next(iter(crawl([feed_url], 1)))
    with stage:
        sub = cursor.subscribe(feed)
        stage.subscriptions = cursor.subscriptionlist
        stage.feeds[sub.feed_id] = feed
    return feeds(category_id)
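This add_feed view (and its variants further down) fetches a single feed by passing a one-element list and a pool size of 1 to crawl(), then taking the first result with next(iter(...)). A minimal standalone sketch of that pattern, assuming crawl() and CrawlError come from libearth.crawler (not shown in the imports above); the URL is a placeholder, not taken from the original code:

from libearth.crawler import crawl, CrawlError

feed_url = 'http://example.com/feed/atom'  # placeholder URL
try:
    # one URL and one worker gives a synchronous single-feed fetch;
    # CrawlError is raised when the generator is advanced
    url, feed, hints = next(iter(crawl([feed_url], 1)))
except CrawlError as e:
    print('could not crawl', e.feed_uri)
else:
    print(feed.title.value, '-', len(feed.entries), 'entries')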
Example #3
 def crawl_category(self):
     running = True
     while running:
         priority, arguments = self.crawling_queue.get()
         if priority == 0:
             if arguments == 'terminate':
                 running = False
             self.crawling_queue.task_done()
         elif priority == 1:
             cursor, feed_id = arguments
             if not feed_id:
                 urls = dict((sub.feed_uri, sub.feed_id)
                             for sub in cursor.recursive_subscriptions)
             else:
                 urls = dict((sub.feed_uri, sub.feed_id)
                             for sub in cursor.recursive_subscriptions
                             if sub.feed_id == feed_id)
             iterator = iter(crawl(urls, self.worker_num))
             while True:
                 try:
                     feed_url, feed_data, crawler_hints = next(iterator)
                     with stage:
                         stage.feeds[urls[feed_url]] = feed_data
                 except CrawlError:
                     continue
                 except StopIteration:
                     break
             self.crawling_queue.task_done()
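In this crawl_category worker and in the variants below, the crawling queue carries (priority, arguments) pairs: priority 0 with the 'terminate' sentinel shuts the worker down, while priority 1 carries a (cursor, feed_id) pair, where a falsy feed_id means every recursive subscription should be recrawled and a concrete feed_id restricts the crawl to that single feed.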
Example #4
def crawl_category():
    running = True
    while running:
        priority, arguments = crawling_queue.get()
        if priority == 0:
            if arguments == 'terminate':
                running = False
            crawling_queue.task_done()
        elif priority == 1:
            cursor, feed_id = arguments
            urls = {}
            if not feed_id:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions)
            else:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions
                            if sub.feed_id == feed_id)
            iterator = iter(crawl(urls, app.config['CRAWLER_THREAD']))
            while True:
                try:
                    feed_url, feed_data, crawler_hints = next(iterator)
                    with get_stage() as stage:
                        stage.feeds[urls[feed_url]] = feed_data
                except CrawlError:
                    continue
                except StopIteration:
                    break
            crawling_queue.task_done()
Example #5
def test_crawler():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://vio.atomtest.com/feed/atom',
             'http://rsstest.com/rss.xml',
             'http://favicontest.com/atom.xml']
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result.feed
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
            assert result.hints is None
            assert result.icon_url == 'http://vio.atomtest.com/favicon.ico'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
            assert result.icon_url == 'http://rsstest.com/images/favicon.ico'
            assert result.hints == {
                'ttl': '10',
                'lastBuildDate': datetime.datetime(2002, 9, 7, 0, 0, 1,
                                                   tzinfo=utc)
            }
        elif feed_data.title.value == 'Favicon Test':
            assert result.icon_url == 'http://favicontest.com/favicon.ico'
Example #6
def test_sort_entries():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://reversedentries.com/feed/atom']
    crawler = iter(crawl(feeds, 4))
    url, feed, hints = next(crawler)
    assert feed.entries[0].updated_at > feed.entries[1].updated_at
Example #7
def add_feed(category_id):
    cursor = Cursor(category_id)
    url = request.form['url']
    try:
        rq = urllib.request.Request(url)
        rq.add_header('User-Agent', '{0}/{1}'.format(version.__package__,
                                                     version.VERSION))
        f = urllib.request.urlopen(rq)
        document = f.read()
        f.close()
    except Exception:
        r = jsonify(
            error='unreachable-url',
            message='Cannot connect to given url'
        )
        r.status_code = 400
        return r
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError:
        r = jsonify(
            error='unreachable-feed-url',
            message='Cannot find feed url'
        )
        r.status_code = 400
        return r
    feed_url = feed_links[0].url
    feed_url, feed, hints = next(iter(crawl([feed_url], 1)))
    with stage:
        sub = cursor.subscribe(feed)
        stage.subscriptions = cursor.subscriptionlist
        stage.feeds[sub.feed_id] = feed
    return feeds(category_id)
Example #8
def add_feed(category_id):
    stage = get_stage()
    cursor = Cursor(category_id)
    url = request.form['url']
    try:
        f = urllib2.urlopen(url)
        document = f.read()
        f.close()
    except Exception:
        r = jsonify(error='unreachable-url',
                    message='Cannot connect to given url')
        r.status_code = 400
        return r
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError:
        r = jsonify(error='unreachable-feed-url',
                    message='Cannot find feed url')
        r.status_code = 400
        return r
    feed_url = feed_links[0].url
    feed_url, feed, hints = next(iter(crawl([feed_url], 1)))
    with stage:
        sub = cursor.subscribe(feed)
        stage.subscriptions = cursor.subscriptionlist
        stage.feeds[sub.feed_id] = feed
    return feeds(category_id)
Example #9
def test_crawler():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = [
        'http://vio.atomtest.com/feed/atom', 'http://rsstest.com/rss.xml',
        'http://favicontest.com/atom.xml'
    ]
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result.feed
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
            assert result.hints is None
            assert result.icon_url == 'http://vio.atomtest.com/favicon.ico'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
            assert result.icon_url == 'http://rsstest.com/images/favicon.ico'
            assert result.hints == {
                'ttl': '10',
                'lastBuildDate': datetime.datetime(2002,
                                                   9,
                                                   7,
                                                   0,
                                                   0,
                                                   1,
                                                   tzinfo=utc)
            }
        elif feed_data.title.value == 'Favicon Test':
            assert result.icon_url == 'http://favicontest.com/favicon.ico'
Example #10
File: app.py  Project: hodduc/web
def crawl_category():
    running = True
    while running:
        priority, arguments = crawling_queue.get()
        if priority == 0:
            if arguments == 'terminate':
                running = False
            crawling_queue.task_done()
        elif priority == 1:
            cursor, feed_id = arguments
            urls = {}
            if not feed_id:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions)
            else:
                urls = dict((sub.feed_uri, sub.feed_id)
                            for sub in cursor.recursive_subscriptions
                            if sub.feed_id == feed_id)
            iterator = iter(crawl(urls, app.config['CRAWLER_THREAD']))
            while True:
                try:
                    feed_url, feed_data, crawler_hints = next(iterator)
                    with get_stage() as stage:
                        stage.feeds[urls[feed_url]] = feed_data
                except CrawlError:
                    continue
                except StopIteration:
                    break
            crawling_queue.task_done()
Example #11
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    urllist = [subscription.feed_uri
               for subscription in opml.recursive_subscriptions]
    threads_count = args.threads if args.threads is not None else cpu_count()

    generator = crawl(urllist, threads_count)
    try:
        for feed_url, feed_data, crawler_hints in generator:
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = hashlib.sha1(feed_url).hexdigest()
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
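Note that this crawl_command variant wraps the whole loop in a single try/except, so the first CrawlError stops crawling the remaining feeds; the later crawl_command examples catch CrawlError (and SchemaError) inside the loop and continue with the next feed.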
Example #12
def test_sort_entries(fx_opener):
    feeds = ['http://reversedentries.com/feed/atom']
    crawler = iter(crawl(feeds, 4))
    result = next(crawler)
    url, feed, hints = result
    assert url == result.url
    assert feed is result.feed
    assert hints == result.hints
    assert feed.entries[0].updated_at > feed.entries[1].updated_at
Example #13
def test_crawl_error(fx_opener):
    # broken feed
    feeds = ['http://brokenrss.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        try:
            next(iter(generator))
        except CrawlError as e:
            assert e.feed_uri == feeds[0]
            raise
    # unreachable url
    feeds = ['http://not-exists.com/rss']
    generator = crawl(feeds, 2)
    with raises(CrawlError):
        try:
            next(iter(generator))
        except CrawlError as e:
            assert e.feed_uri == feeds[0]
            raise
Example #14
def fx_non_exist_opml(fx_test_stage):
    feed_urls = ['http://feedone.com/feed/atom/']
    generator = crawl(feed_urls, 1)
    for result in generator:
        feed_data = result[1]
        feed_url = result[0]
        feed_id = get_hash(feed_url)
        with fx_test_stage as stage:
            stage.feeds[feed_id] = feed_data
    with fx_test_stage as stage:
        stage.subscriptions = read(SubscriptionList, opml_with_non_exist_feed)
Example #15
def test_crawler():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://vio.atomtest.com/feed/atom', 'http://rsstest.com/rss.xml']
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result[1]
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
Example #16
def test_crawler():
    my_opener = urllib2.build_opener(TestHTTPHandler)
    urllib2.install_opener(my_opener)
    feeds = ['http://vio.atomtest.com/feed/atom',
             'http://rsstest.com/rss.xml']
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result[1]
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
Example #17
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions
                        if sub.feed_id == feed_id)
        if not feed_map:
            print('There is no such feed:', feed_id, file=sys.stderr)
            return
    else:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return
    threads_count = args.threads if args.threads is not None else cpu_count()
    iterator = iter(crawl(feed_map.keys(), threads_count))
    while 1:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                print('{0.title} - {1} entries'.format(
                    feed_data, len(feed_data.entries)
                ))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as e:
            if isinstance(e, CrawlError):
                print('Something went wrong with', e.feed_uri, file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(e, file=sys.stderr)
        except StopIteration:
            break
Example #18
def crawl_command(args):
    repo = from_url(args.repository)
    session = Session(args.session_id)
    stage = Stage(session, repo)
    with stage:
        opml = stage.subscriptions
    if not opml:
        print('OPML does not exist in the repository', file=sys.stderr)
        return
    feed_id = args.feed_id
    if feed_id:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions
                        if sub.feed_id == feed_id)
        if not feed_map:
            print('There is no such feed:', feed_id, file=sys.stderr)
            return
    else:
        feed_map = dict((sub.feed_uri, sub.feed_id)
                        for sub in opml.recursive_subscriptions)
        if not feed_map:
            print('No feeds to crawl', file=sys.stderr)
            return
    threads_count = args.threads if args.threads is not None else cpu_count()
    iterator = iter(crawl(feed_map.keys(), threads_count))
    while 1:
        try:
            feed_url, feed_data, crawler_hints = next(iterator)
            if args.verbose:
                print('{0.title} - {1} entries'.format(feed_data,
                                                       len(feed_data.entries)))
            with stage:
                feed_id = feed_map[feed_url]
                stage.feeds[feed_id] = feed_data
        except (CrawlError, SchemaError) as e:
            if isinstance(e, CrawlError):
                print('Something went wrong with', e.feed_uri, file=sys.stderr)
            if args.verbose:
                traceback.print_exc()
            else:
                print(e, file=sys.stderr)
        except StopIteration:
            break
Example #19
def xmls(request, fx_test_stage):
    stage = fx_test_stage
    subscriptions = SubscriptionList()
    categoryone = Category(label='categoryone', _title='categoryone')
    categorytwo = Category(label='categorytwo', _title='categorytwo')
    categorythree = Category(label='categorythree', _title='categorythree')
    subscriptions.add(categoryone)
    subscriptions.add(categorythree)
    categoryone.add(categorytwo)
    pair = {
        'http://feedone.com/feed/atom/': categoryone,
        'http://feedtwo.com/feed/atom/': categorytwo,
        'http://feedthree.com/feed/atom/': subscriptions,
        'http://feedfour.com/feed/atom/': categorythree
    }
    generator = crawl(pair.keys(), 4)
    with stage:
        for feed_url, feed, hints in generator:
            sub = pair[feed_url].subscribe(feed)
            stage.feeds[sub.feed_id] = feed
        stage.subscriptions = subscriptions
Example #20
def test_crawler(fx_opener):
    feeds = [
        'http://vio.atomtest.com/feed/atom', 'http://rsstest.com/rss.xml',
        'http://favicontest.com/atom.xml', 'http://nofavicontest.com/atom.xml'
    ]
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result.feed
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
            assert result.hints is None
            assert result.icon_url == 'http://vio.atomtest.com/favicon.ico'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
            assert result.icon_url == 'http://rsstest.com/images/favicon.ico'
        elif feed_data.title.value == 'Favicon Test':
            assert result.icon_url == 'http://favicontest.com/favicon.ico'
        elif feed_data.title.value == 'No Favicon Test':
            assert result.icon_url is None
Example #21
 def crawlWithSender_(self, sender):
     try:
         subs = self.subscriptions.recursive_subscriptions
         logger.debug('len(subs) = %d', len(subs))
         feeds_dict = {s.feed_uri: s for s in subs}
         cput_count = NSProcessInfo.processInfo().activeProcessorCount()
         logger.debug('len(feeds_dict) = %d', len(feeds_dict))
         for result in crawl(feeds_dict, cput_count):
             assert isinstance(result, CrawlResult)
             logger.info('Crawled %d entries from %s',
                         len(result.feed.entries), result.url)
             sub = feeds_dict[result.url]
             with self.stage:
                 if result.icon_url:
                     sub.icon_uri = result.icon_url
                     self.stage.subscriptions = self.subscriptions
                 self.stage.feeds[sub.feed_id] = result.feed
         logger.info('Finished crawling %d feeds', len(feeds_dict))
     except Exception as e:
         logger.exception(e)
     finally:
         self.pyobjc_performSelectorOnMainThread_withObject_(
             'stopRefreshWithSender:', sender
         )
Example #22
def test_crawler(fx_opener):
    feeds = ['http://vio.atomtest.com/feed/atom',
             'http://rsstest.com/rss.xml',
             'http://favicontest.com/atom.xml',
             'http://nofavicontest.com/atom.xml']
    generator = crawl(feeds, 4)
    for result in generator:
        feed_data = result.feed
        if feed_data.title.value == 'Atom Test':
            entries = feed_data.entries
            assert entries[0].title.value == 'xml base test'
            assert entries[1].title.value == 'Title One'
            assert result.hints is None
            assert result.icon_url == 'http://vio.atomtest.com/favicon.ico'
        elif feed_data.title.value == 'Vio Blog':
            entries = feed_data.entries
            assert entries[0].title.value == 'test one'
            source = feed_data.entries[0].source
            assert source.title.value == 'Source Test'
            assert result.icon_url == 'http://rsstest.com/images/favicon.ico'
        elif feed_data.title.value == 'Favicon Test':
            assert result.icon_url == 'http://favicontest.com/favicon.ico'
        elif feed_data.title.value == 'No Favicon Test':
            assert result.icon_url is None
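Taken together, the examples share one consumption pattern: iterate over crawl(), read each result either as a (url, feed, hints) triple or through its attributes, skip feeds that raise CrawlError, and persist the parsed feeds into the stage keyed by feed id. A minimal sketch of that pattern, assuming a stage object like the ones used above and crawl()/CrawlError from libearth.crawler; the helper name is an assumption:

import hashlib

from libearth.crawler import crawl, CrawlError


def crawl_into_stage(feed_urls, stage, pool_size=2):
    """Crawl feed_urls and store every successfully parsed feed."""
    iterator = iter(crawl(feed_urls, pool_size))
    while True:
        try:
            # results unpack as (url, feed, hints), as in the tests above
            url, feed, hints = next(iterator)
        except CrawlError as e:
            # e.feed_uri names the feed that failed; skip it and keep going
            print('failed to crawl', e.feed_uri)
            continue
        except StopIteration:
            break
        # the same sha1-of-URL feed id scheme used by crawl_command above
        feed_id = hashlib.sha1(url.encode('utf-8')).hexdigest()
        with stage:
            stage.feeds[feed_id] = feed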