Example no. 1
0
    def gtaskManager(self, urls, extractSearchResults, proxy_flag=0, ua_flag=0):
        """Configure global proxy/user-agent state and run the URL crawl tasks.

        Args:
            urls: iterable of URLs handed to AccessUrls.taskGenerator.
            extractSearchResults: callback forwarded to the task generator.
            proxy_flag: when 1, build the module-global ProxyManager from the
                proxy-list service; otherwise no proxy manager is used.
            ua_flag: when 1, fetch a user-agent list from the UA service;
                otherwise a single default (None) user agent is used.

        Side effects: rebinds the module-level globals ``proxymgr`` and
        ``useragents``, then blocks in gtaskpool.runtasks until done.
        """
        task_log = None  # no per-task result log file
        gtaskpool.setlogging(logging.INFO, task_log)
        proxy_urls = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
        ua_url = "http://192.168.120.17:8014/proxy/get_useragent_list"
        # (url regex, limit) pairs; presumably 0 means no concurrency cap on
        # matching URLs -- confirm against ProxyManager's documentation.
        limited_urls = [
            ('^https{0,1}://', 0)
        ]

        global proxymgr
        if proxy_flag == 1:
            # Refresh the proxy list every 30 minutes after an 8 minute delay.
            proxymgr = ProxyManager(get_http_proxies, limited_urls,
                                    {'refresh': True, 'interval': 30 * 60, 'delay': 8 * 60},
                                    *proxy_urls)
        else:
            proxymgr = None
        # Fixed: removed leftover debug `print proxymgr` (Python-2 print
        # statement polluting stdout); log at debug level instead.
        logging.debug("proxymgr: %s", proxymgr)

        global useragents
        if ua_flag == 1:
            useragents = get_useragents(ua_url)
        else:
            useragents = [None]
        if useragents == []:
            # Empty UA list from the service: fall back to a single default.
            useragents = [None]

        gtaskpool.runtasks(AccessUrls.taskGenerator(self, urls, extractSearchResults))
    def gtaskmanager(self,engine_type):
        """Build global proxy/user-agent state and run the engine's tasks.

        NOTE(review): an identical ``gtaskmanager`` is defined again right
        after this one in the same scope, so this copy is shadowed (dead
        code) -- one of the two should be removed.

        NOTE(review): ``self.task_generator(self, engine_type)`` passes
        ``self`` explicitly through a bound-method call, so the generator
        receives it twice; confirm task_generator's signature.

        Side effects: rebinds the module-level globals ``proxymgr`` and
        ``useragents``.
        """
        #task_log = 'task_log.log'
        task_log = None  # no per-task result log file
        gtaskpool.setlogging(logging.INFO,task_log)
        purl1 = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
        uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
        # (url regex, limit) pairs; presumably caps matching URLs at 1
        # in-flight request -- confirm against ProxyManager's documentation.
        limited_urls = [
            ('^https://search\.disconnect\.me', 1)
        ]
        global proxymgr

        # Refresh the proxy list every 30 minutes after an 8 minute delay.
        proxymgr = ProxyManager(get_http_proxies, limited_urls,
                                {'refresh': True, 'interval': 30 * 60, 'delay': 8 * 60}, *purl1)
        global useragents
        useragents = get_useragents(uurl1)
        if useragents == []:
            # Empty UA list from the service: fall back to a single default.
            useragents = [None]

        gtaskpool.runtasks(self.task_generator(self,engine_type))
    def gtaskmanager(self, engine_type):
        """Set up the global proxy manager and user-agent list, then run the
        tasks produced for *engine_type* through gtaskpool.

        Side effects: rebinds the module-level globals ``proxymgr`` and
        ``useragents``; blocks in gtaskpool.runtasks until all tasks finish.

        NOTE(review): ``self.task_generator(self, engine_type)`` passes
        ``self`` twice through a bound-method call -- confirm the
        generator's signature.
        """
        # No per-task result log file -- gtaskpool logs at INFO only.
        gtaskpool.setlogging(logging.INFO, None)

        proxy_service_urls = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
        ua_service_url = "http://192.168.120.17:8014/proxy/get_useragent_list"
        rate_limited = [('^https://search\.disconnect\.me', 1)]

        global proxymgr
        # Proxies refresh every 30 minutes, starting after an 8 minute delay.
        refresh_opts = {'refresh': True, 'interval': 30 * 60, 'delay': 8 * 60}
        proxymgr = ProxyManager(get_http_proxies, rate_limited, refresh_opts,
                                *proxy_service_urls)

        global useragents
        useragents = get_useragents(ua_service_url)
        if useragents == []:
            # Service returned an empty list: use a single default UA.
            useragents = [None]

        gtaskpool.runtasks(self.task_generator(self, engine_type))
Example no. 4
0
    # Route gtaskpool's internal messages through logging at INFO level.
    gtaskpool.setlogging(logging.INFO)

    # Two alternative proxy-list / user-agent services; only the *2 pair
    # (192.168.1.14) is actually used below.
    purl1 = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
    purl2 = ["http://192.168.1.14:5500/get_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    uurl2 = "http://192.168.1.14:5500/get_useragent_list"

    # Create a ProxyManager if you need one.
    # (url regex, 1): presumably caps matching URLs at 1 in-flight request
    # -- confirm against ProxyManager's documentation.
    limited_urls = [("^http://www\.baidu\.com/s\?wd=apple&pn=\d+$", 1)]
    # Refresh the proxy list every 30 minutes after an 8 minute delay.
    proxymgr = ProxyManager(
        get_http_proxies, limited_urls, {"refresh": True, "interval": 30 * 60, "delay": 8 * 60}, *purl2
    )
    # Or, if you don't want to refresh proxies periodically:
    # proxymgr = ProxyManager(get_http_proxies, *purl2, limited_urls, \
    #        {'refresh': False}, *purl2)

    # A user-agent list for HTTP requests if you need one; fall back to a
    # single default (None) when the service returns an empty list.
    useragents = get_useragents(uurl2)
    if useragents == []:
        useragents = [None]

    fresult = open("result.txt", "w")
    fleft = open("left.txt", "w")

    # Optional args:
    #   @max_ongoing_tasks (default to 1000)
    gtaskpool.runtasks(task_generator())

    # NOTE(review): these handles leak if runtasks raises; a with-block or
    # try/finally would be safer.
    fresult.close()
    fleft.close()
Example no. 5
0
    # Retry loop: run *task* up to *max_try* times, logging every attempt,
    # and stop early as soon as an attempt reports finish=True.
    # (trycnt, task, max_try, task_log come from the enclosing scope, which
    # is not visible in this fragment.)
    while trycnt != max_try:
        res = task()
        res['try_idx'] = trycnt + 1  # 1-based attempt number
        # Mark the attempt that can no longer be retried (either the cap is
        # reached or the task declared itself finished).
        if trycnt+1 == max_try or res['finish']:
            res['last_try'] = True
        else:
            res['last_try'] = False

        log_task_result(res, task_log)
        if res['finish']:
            return
        trycnt += 1

def log_task_result(result, filehandle):
    """Stamp *result* with the current time and append it to *filehandle*
    as a single UTF-8 encoded JSON line.

    NOTE(review): mutates the caller's *result* dict (adds a 'ts' key).
    Python 2 assumed: on Python 3 the bytes + "\\n" concatenation below
    would raise TypeError.
    """
    result['ts'] = str(datetime.now())
    jstr = json.dumps(result, ensure_ascii=False).encode('utf-8')
    filehandle.write(jstr + "\n")

def runtasks(task_generator, task_log, max_try=10):
    """Wrap each item from *task_generator* in a retrying gtaskpool Task and
    run them all to completion.

    Args:
        task_generator: iterable/generator yielding callable tasks.
        task_log: file handle passed through to retry_task for result logging.
        max_try: maximum attempts per task (default 10).
    """
    def gen_task():
        # Iterating the generator directly replaces the manual
        # .next()/StopIteration handling (which also used the Python-2-only
        # `except StopIteration, e` syntax and never used `e`); behavior is
        # identical and the code now parses on both Python 2 and 3.
        for task in task_generator:
            yield gtaskpool.Task(retry_task, [task, task_log, max_try])

    gtaskpool.runtasks(gen_task())


Example no. 6
0
    # Retry loop: run *task* up to *max_try* times, logging every attempt,
    # and stop early as soon as an attempt reports finish=True.
    # (trycnt, task, max_try, task_log come from the enclosing scope, which
    # is not visible in this fragment.)
    while trycnt != max_try:
        res = task()
        res['try_idx'] = trycnt + 1  # 1-based attempt number
        # Mark the attempt that can no longer be retried (either the cap is
        # reached or the task declared itself finished).
        if trycnt + 1 == max_try or res['finish']:
            res['last_try'] = True
        else:
            res['last_try'] = False

        log_task_result(res, task_log)
        if res['finish']:
            return
        trycnt += 1


def log_task_result(result, filehandle):
    """Timestamp *result* and write it to *filehandle* as one UTF-8 JSON line.

    Mutates the caller's *result* dict by setting its 'ts' key to the
    current wall-clock time before serializing.
    """
    result['ts'] = str(datetime.now())
    encoded = json.dumps(result, ensure_ascii=False).encode('utf-8')
    filehandle.write(encoded + "\n")


def runtasks(task_generator, task_log, max_try=10):
    """Wrap each item from *task_generator* in a retrying gtaskpool Task and
    run them all to completion.

    Args:
        task_generator: iterable/generator yielding callable tasks.
        task_log: file handle passed through to retry_task for result logging.
        max_try: maximum attempts per task (default 10).
    """
    def gen_task():
        # Iterating the generator directly replaces the manual
        # .next()/StopIteration handling (which also used the Python-2-only
        # `except StopIteration, e` syntax and never used `e`); behavior is
        # identical and the code now parses on both Python 2 and 3.
        for task in task_generator:
            yield gtaskpool.Task(retry_task, [task, task_log, max_try])

    gtaskpool.runtasks(gen_task())
Example no. 7
0
#!/usr/bin/env python
# encoding: utf-8

import gtaskpool

import requests

import logging


def task(n1, n2):
    """Demo task: fetch www.baidu.com and print a preview of the response.

    n1 and n2 are only used to label the log/print output.

    NOTE(review): requests.get has no timeout -- a stalled server would hang
    this task indefinitely; consider requests.get(url, timeout=...).
    """
    logging.info("task(%s, %s): called", n1, n2)
    r = requests.get("http://www.baidu.com")
    # Python-2 print statement; shows at most the first 100 chars of the body.
    print "task(%s, %s): response (len=%s): %s..." % \
            (n1, n2, len(r.text), r.text[:min(100, len(r.text))])
    logging.info("task(%s, %s): finished", n1, n2)


def task_generator(task_num=10):
    """Yield *task_num* demo gtaskpool Tasks, each running task(i, i).

    Args:
        task_num: number of tasks to generate. Previously a hard-coded
            local; now a parameter with the same default, so existing
            no-argument callers are unaffected.

    range() replaces the Python-2-only xrange() -- iteration is identical
    and the function now works under both Python 2 and 3.
    """
    for i in range(1, task_num + 1):
        yield gtaskpool.Task(task, [i, i])


if __name__ == "__main__":
    # Configure gtaskpool logging, then drive all generated tasks to completion.
    gtaskpool.setlogging(logging.INFO)
    gtaskpool.runtasks(task_generator())