def worker(ftype, queue):
    """Worker loop: drain *queue*, running the job selected by *ftype* per task.

    :param ftype: ``'fetch_proxy'`` hands each task to ``fetch_proxy``;
        ``'proxy_test'`` checks the task with ``LazyFW.test_proxy`` and stores
        a successful result through ``proxy_insert``. Any other value only
        consumes the tasks.
    :param queue: queue of pending tasks; the loop ends as soon as it reports
        itself empty.
    :return: None
    """
    while True:
        # Queue is empty: stop.
        if queue.empty():
            LazyFW.log('''TaskEmpty: break''')
            break
        try:
            task = queue.get_nowait()
            if ftype == 'fetch_proxy':
                fetch_proxy(task)
            elif ftype == 'proxy_test':
                proxy, speed = LazyFW.test_proxy(
                    task, PROXY_TIMEOUT,
                    'http://esf.sh.fang.com/agenthome/', '搜房网', 'gbk')
                if proxy is not None:
                    proxy_insert(proxy, speed)
        except Exception as e:
            # Best effort: one failing task (including a get_nowait() that
            # lost the race with another worker) must not kill the loop.
            LazyFW.log('''TaskError(%s)''' % (e,))
def _run_workers(ftype, queue, count):
    """Start *count* daemon ``worker`` processes on *queue* and join them all."""
    workers = []
    for _ in range(count):
        p = Process(target=worker, args=(ftype, queue))
        p.daemon = True
        p.start()
        workers.append(p)
    for p in workers:
        p.join()


def main():
    """Refill the proxy table when it runs low, then test the stored proxies.

    Steps:
      1. Count today's rows whose ``speed`` exceeds ``PROXY_TIMEOUT``.
      2. If fewer than 10 exist, queue every URL from ``get_proxy_urls()`` and
         process them with ``PROXY_THREAD_FETCH_MAX`` ``'fetch_proxy'`` workers.
      3. Re-read the matching ``host``/``port`` rows and process them with
         ``PROXY_THREAD_TEST_PROXY_MAX`` ``'proxy_test'`` workers.

    :return: None
    """
    proxy_queue = Queue()
    proxy_hosts = Queue()
    create_db()

    # Look up how many candidate rows exist.
    conn = get_conn()
    try:
        c = conn.cursor()
        # Build the statement once so the logged text is exactly what runs.
        count_sql = r'''SELECT count(*) as `cnt` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
        LazyFW.log(count_sql)
        c.execute(count_sql)
        proxys = c.fetchone()
        c.close()

        if proxys[0] < 10:
            for url in get_proxy_urls():
                proxy_queue.put_nowait(url)
            _run_workers('fetch_proxy', proxy_queue, PROXY_THREAD_FETCH_MAX)

        conn.commit()
    finally:
        # Release the connection even if a query or a worker start-up failed.
        conn.close()

    # Query the rows again, now including anything the fetch workers added.
    conn = get_conn()
    try:
        select_sql = r'''SELECT `host`,`port` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
        LazyFW.log(select_sql)
        c = conn.cursor()
        c.execute(select_sql)
        for row in c.fetchall():
            proxy_hosts.put_nowait(row)
        c.close()
        conn.commit()
    finally:
        conn.close()

    _run_workers('proxy_test', proxy_hosts, PROXY_THREAD_TEST_PROXY_MAX)
# Listing pages that can be scraped with a single regex yielding
# (host, port) tuples, keyed by the hostname of the listing URL.
_PROXY_LIST_PATTERNS = {
    # www.cz88.net
    'www.cz88.net':
        r'''<tr><td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''',
    # www.xici.net.co, e.g. <td>119.233.255.24</td> <td>80</td>
    'www.xici.net.co':
        r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>.+?<td>HTTP<\/td>''',
    # proxy.com.ru, e.g. <td>41.222.196.52</td><td>8080</td>
    'proxy.com.ru':
        r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''',
    # free-proxy.cz, e.g.
    # </div> 117.166.75.36</td><td><span class="fport">8123</span></td><td><small>HTTP</small></td>
    'free-proxy.cz':
        r'''</div>\s*([\w\d\.]+?)<\/td><td><span\s+class="fport">(\d+)<\/span><\/td><td><small>HTTP<\/small><\/td>''',
    # checkerproxy.net, e.g. 122.227.8.190:80
    'checkerproxy.net':
        r'''([\w\d\.]+?)\:(\d+)''',
}


def _parse_cnproxy(html):
    """Return the (host, port) tuples found on a www.cnproxy.com listing page.

    Each port is written on the page as ``document.write(":"+a+b+...)`` where
    the letters are JavaScript variables assigned in an inline script, so the
    variables are collected first and then substituted into every expression.
    """
    # presumably LazyFW.mid returns the text between the two markers -- confirm
    script = LazyFW.mid(html, '<SCRIPT type="text/javascript">', '</SCRIPT>')
    block = LazyFW.mid(html, '<div id="proxylisttb">', '<div class="proxylistnav">')

    var_re = re.compile(r'''(^[a-z])\="([^"]+)"''', re.I)
    js_vars = {}
    for line in script.split(';'):
        found = var_re.findall(line)
        # Skip fragments that are not an `x="value"` assignment (blank tail,
        # stray whitespace, unrelated statements) instead of failing the page.
        if found:
            js_vars[found[0][0]] = found[0][1]

    host_re = re.compile(
        r'''<td>([^<]+)<SCRIPT[^>]*>document\.write\("\:"([^)]+)\)</SCRIPT></td>''',
        re.I)
    matches = []
    for host, expr in host_re.findall(block):
        keys = expr.lstrip('+').split('+')
        matches.append((host, ''.join(js_vars[k] for k in keys)))
    return matches


def fetch_proxy(url):
    """Download one proxy-listing page and store every proxy found on it.

    The parser is chosen by the URL's hostname; pages from unknown hosts, or
    responses whose status is neither 200 nor 304, are ignored. Every match
    is stored through ``proxy_insert`` with the placeholder speed 999999.

    :param url: address of the listing page.
    :return: ``False`` if any exception was raised while fetching, parsing or
        inserting; ``True`` otherwise (including when nothing was parsed).
    """
    LazyFW.log("Fetch URL: %s" % (url))
    try:
        matches = None
        urls = urlparse(url)
        r = requests.get(url, timeout=PROXY_TIMEOUT, headers={
            'User-Agent': LazyFW.USER_AGENT,
            'Referer': 'http://%s' % (urls.hostname)
        })
        if r.status_code == 200 or r.status_code == 304:
            html = LazyFW.clear_space(r.text)
            if urls.hostname == 'www.cnproxy.com':
                matches = _parse_cnproxy(html)
            elif urls.hostname in _PROXY_LIST_PATTERNS:
                pattern = _PROXY_LIST_PATTERNS[urls.hostname]
                matches = re.compile(pattern, re.I).findall(html)

        if matches is not None:
            length = 0
            for proxy in matches:
                # `== True` is kept on purpose: proxy_insert's return type is
                # not visible here, so plain truthiness could count differently.
                if proxy_insert(proxy, 999999) == True:
                    length += 1
            LazyFW.log("Add Proxy: %d/%d %s" % (length, len(matches), url))
    except Exception as e:
        # Still best effort, but leave a trace instead of failing silently.
        LazyFW.log('''FetchError(%s): %s''' % (url, e))
        return False
    return True