Example #1
import evy

def crawl(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of
    urls that were found."""
    pool = evy.GreenPool()
    seen = set()
    fetch(start_url, seen, pool)
    pool.waitall()
    return seen
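# The crawl() function above relies on a fetch() helper that is not shown in
# this snippet.  Below is a minimal sketch of what such a helper might look
# like; the url_regex pattern is an illustrative assumption, not part of the
# original code.
import re

from evy.patched import urllib2

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+)')


def fetch(url, seen, pool):
    """Fetch *url* and spawn a green thread for every new url it links to."""
    print "fetching", url
    body = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(body):
        new_url = url_match.group(0)
        # stay on evy.net so the crawler doesn't wander across the internet
        if new_url not in seen and 'evy.net' in new_url:
            seen.add(new_url)
            # not stack-recursive: each spawned green thread has its own stack
            pool.spawn_n(fetch, new_url, seen, pool)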
Example #2
def launch_green_threads():
    from evy.patched import socket
    import evy

    def green_accepter(server_sock, pool):
        # accept one connection per writer and hand each socket to a reader
        for i in xrange(CONCURRENCY):
            sock, addr = server_sock.accept()
            pool.spawn_n(reader, sock)

    # room for CONCURRENCY readers, CONCURRENCY writers and the accepter
    pool = evy.GreenPool(CONCURRENCY * 2 + 1)
    server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_sock.bind(('localhost', 0))
    server_sock.listen(50)
    addr = ('localhost', server_sock.getsockname()[1])
    pool.spawn_n(green_accepter, server_sock, pool)
    for i in xrange(CONCURRENCY):
        pool.spawn_n(writer, addr, socket.socket)
    pool.waitall()
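# CONCURRENCY, reader() and writer() come from the surrounding benchmark
# module and are not shown above.  The sketch below is a guess at minimal
# implementations; the constants and the 'x' payload are illustrative
# assumptions.
from socket import AF_INET, SOCK_STREAM

CONCURRENCY = 50
BYTES = 1000
CHUNK_SIZE = 100


def reader(sock):
    """Receive BYTES bytes from *sock*."""
    expect = BYTES
    while expect > 0:
        data = sock.recv(min(expect, CHUNK_SIZE))
        if not data:
            break
        expect -= len(data)


def writer(addr, socket_impl):
    """Connect to *addr* using *socket_impl* and send BYTES bytes."""
    sock = socket_impl(AF_INET, SOCK_STREAM)
    sock.connect(addr)
    sent = 0
    while sent < BYTES:
        chunk = 'x' * min(CHUNK_SIZE, BYTES - sent)
        sock.sendall(chunk)
        sent += len(chunk)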
Example #3
import evy

def producer(start_url):
    """Crawl the web starting from *start_url*, using a queue of pending urls
    and a pool of fetch workers.  Returns a set of urls that were found."""
    pool = evy.GreenPool()
    seen = set()
    q = evy.Queue()
    q.put(start_url)
    # keep looping if there are new urls, or workers that may produce more urls
    while True:
        while not q.empty():
            url = q.get()
            # limit requests to evy.net so we don't crash all over the internet
            if url not in seen and 'evy.net' in url:
                seen.add(url)
                pool.spawn_n(fetch, url, q)
        pool.waitall()
        if q.empty():
            break

    return seen
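# Each url taken off the queue is handed to a fetch(url, q) worker that pushes
# any newly discovered urls back onto the same queue.  A minimal sketch of
# such a worker follows; the url_regex pattern is an illustrative assumption.
import re

from evy.patched import urllib2

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+)')


def fetch(url, outq):
    """Fetch *url* and put every url found in its body onto *outq*."""
    print "fetching", url
    body = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(body):
        outq.put(url_match.group(0))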
Example #4
"""A simple web server that accepts POSTS containing a list of feed urls,
and returns the titles of those feeds.
"""
import evy

feedparser = evy.import_patched('feedparser')

# the pool provides a safety limit on our concurrency
pool = evy.GreenPool()


def fetch_title(url):
    d = feedparser.parse(url)
    return d.feed.get('title', '')


def app(environ, start_response):
    if environ['REQUEST_METHOD'] != 'POST':
        start_response('403 Forbidden', [])
        return []

    # the pile collects the result of a concurrent operation -- in this case,
    # the collection of feed titles
    pile = evy.GreenPile(pool)
    for line in environ['wsgi.input'].readlines():
        url = line.strip()
        if url:
            pile.spawn(fetch_title, url)
    # since the pile is an iterator over the results,
    # you can use it in all sorts of great Pythonic ways
    titles = '\n'.join(pile)
    start_response('200 OK', [('Content-type', 'text/plain')])
    return [titles]
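# To actually serve app() you need a WSGI server.  Assuming evy mirrors
# eventlet's listen()/wsgi.server() API (an assumption; the original snippet
# stops at building the titles), a launcher could look like this:
if __name__ == '__main__':
    from evy import wsgi
    wsgi.server(evy.listen(('localhost', 9010)), app)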
Example #5
#! /usr/bin/env python
"""
This is a simple web "crawler" that fetches a bunch of urls using a pool to 
control the number of outbound connections. It has as many simultaneously open
connections as coroutines in the pool.

The prints in the body of the fetch function are there to demonstrate that the
requests are truly made in parallel.
"""

urls = ["http://www.google.com/intl/en_ALL/images/logo.gif",
        "https://wiki.secondlife.com/w/images/secondlife.jpg",
        "http://us.i1.yimg.com/us.yimg.com/i/ww/beta/y3.gif"]

import evy
from evy.patched import urllib2

def fetch (url):
    print "opening", url
    body = urllib2.urlopen(url).read()
    print "done with", url
    return url, body

pool = evy.GreenPool(200)
for url, body in pool.imap(fetch, urls):
    print "got body from", url, "of length", len(body)