Exemplo n.º 1
0
def spider():
    """ Create the httpclient pool which default size is 1200 """
    httpclient_pool = HttpClientPool()
    
    """
    Allocate a task to this spider node,
    skip is the task's start point in queue,
    and limit is the number of data which will be snatched in this spider node.
    """
    queue_size = queue.count()
    limit = queue_size / spider_total_number 
    skip = limit * spider_serial_number
    if spider_total_number - 1 == spider_serial_number:
        limit += queue_size % spider_total_number
        
    print "skip = ", skip, ", limit = ", limit
    
    with gevent.Timeout(None, False):        
        print "This spider is start."
        
        orders = queue.find(skip, limit)
        while orders.count() > 0:
            for order in orders:
                thead_pool.spawn(httpclient_pool.request, order)
            thead_pool.join()
            
            print "Start =", httpclient.start,", End =", httpclient.end,", Error =", httpclient.error
            httpclient.start = 0
            httpclient.end = 0
            httpclient.error = 0
            
            orders = queue.find(skip, limit)
            
        print "This spider is finished."
Exemplo n.º 2
0
def snatch():
    global running
    """ If this node is running, pass """
    if running == True:
        return

    """ Create the httpclient pool which default size is 10 """
    httpclient_pool = HttpClientPool()
    
    """
    Allocate a task to this spider node,
    skip is the task's start point in queue,
    and limit is the number of data which will be snatched in this spider node.
    """
    queue_size = queue.count()
    limit = queue_size / node_total_number 
    skip = limit * node_serial_number
    if node_total_number - 1 == node_serial_number:
        limit += queue_size % node_total_number
        
    print "skip = ", skip, ", limit = ", limit
    
    with gevent.Timeout(None, False):        
        print "This spider is start."
        running = True
        
        orders = queue.find(skip, limit)
        #while orders.count() > 0:
        for order in orders:
            thread_pool.spawn(httpclient_pool.request, order)
            thread_pool.join()
            
        print "Start =", httpclientpool.start,", End =", httpclientpool.end,", Error =", httpclientpool.error
        httpclientpool.start = 0
        httpclientpool.end = 0
        httpclientpool.error = 0
            
        orders = queue.find(skip, limit)

        running = False
        print "This spider is finished."