def worker(ftype, queue):
    """Worker loop: drain `queue`, dispatching each task according to `ftype`.

    :param ftype: task type, 'fetch_proxy' (download a proxy-list URL) or
                  'proxy_test' (speed-test a (host, port) row)
    :param queue: multiprocessing queue holding the tasks
    :return: None; returns when the queue is observed empty
    """
    while True:
        # Queue drained -- stop this worker.
        if queue.empty():
            LazyFW.log('''TaskEmpty: break''')
            break
        try:
            task = queue.get_nowait()
            # LazyFW.log('''TaskGet: %s''' % (task, ))
            if ftype == 'fetch_proxy':
                fetch_proxy(task)
            elif ftype == 'proxy_test':
                # NOTE(review): assumes LazyFW.test_proxy returns a
                # (proxy, speed) pair, proxy being None on failure -- confirm.
                proxy, speed = LazyFW.test_proxy(task, PROXY_TIMEOUT,
                                                 'http://esf.sh.fang.com/agenthome/',
                                                 '搜房网', 'gbk')
                if proxy is not None:  # was `proxy != None`
                    proxy_insert(proxy, speed)
        except Exception as e:  # was Py2-only `except Exception, e` syntax
            LazyFW.log('''TaskError(%s)''' % (e,))
def main():
    """Entry point: harvest proxy lists if too few are stored for today,
    then speed-test every stored proxy in parallel worker processes."""
    proxy_queue = Queue()
    proxy_hosts = Queue()
    create_db()

    # How many proxies are already recorded for today?
    conn = get_conn()
    count_sql = r'''SELECT count(*) as `cnt` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
    LazyFW.log(count_sql)
    cursor = conn.cursor()
    cursor.execute(count_sql)
    count_row = cursor.fetchone()
    cursor.close()

    if count_row[0] < 10:
        # Not enough proxies on record: fetch fresh lists concurrently.
        for list_url in get_proxy_urls():
            proxy_queue.put_nowait(list_url)
        fetchers = []
        for _ in range(PROXY_THREAD_FETCH_MAX):
            proc = Process(target=worker, args=('fetch_proxy', proxy_queue))
            proc.daemon = True
            proc.start()
            fetchers.append(proc)
        for proc in fetchers:
            proc.join()

    conn.commit()
    conn.close()

    # Re-read the stored proxies and queue them up for speed testing.
    conn = get_conn()
    select_sql = r'''SELECT `host`,`port` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
    LazyFW.log(select_sql)
    cursor = conn.cursor()
    cursor.execute(select_sql)
    for host_row in cursor.fetchall():
        proxy_hosts.put_nowait(host_row)
    cursor.close()
    conn.commit()
    conn.close()

    testers = []
    for _ in range(PROXY_THREAD_TEST_PROXY_MAX):
        proc = Process(target=worker, args=('proxy_test', proxy_hosts))
        proc.daemon = True
        proc.start()
        testers.append(proc)
    for proc in testers:
        proc.join()
def FetchLinksFromSource(self, url, htmlSource):
    """Extract well-voted comment blocks from a page and download any
    sinaimg.cn JPG images referenced inside them.

    :param url: page URL (used by LazyFW helpers to normalize links)
    :param htmlSource: raw HTML of the page
    :return: None; side effect is calling self.download_file per image
    """
    html = LazyFW.format_url(url, htmlSource)
    html = LazyFW.clear_space(html)
    list_body = LazyFW.mid(html, '<!-- begin comments -->', '<!-- end comments -->')

    # Wrap each <li id="comment-NNN"><div> ... </li> comment with
    # [BEGIN]/[END] markers so the block boundaries survive html2text.
    list_body = re.sub(r'''(<li\s+id="comment\-[\d]+">\s*<div>)''', r'''\1<h1>[BEGIN]</h1>''', list_body, flags=re.I)
    list_body = re.sub(r'''(</li>)''', r'''<h1>[END]</h1>\1''', list_body, flags=re.I)

    list_text = LazyFW.html2text(url, list_body, {'ignore_links': True, 'ignore_images': False, })

    # re.findall always returns a list (never None), so the old
    # `if list_tuple != None` guard was dead code -- iterate directly.
    for block in re.findall(r'''\[BEGIN\](.+?)\[END\]''', list_text, re.S):
        oo = int(LazyFW.mid(block, 'oo [', ']'))  # up-votes
        xx = int(LazyFW.mid(block, 'xx [', ']'))  # down-votes
        total = oo + xx
        # Explicit floor division; Py2 `/` on ints floored implicitly,
        # `//` keeps the same result and is Py3-safe.
        avg = total // 2
        if (oo > xx) and (oo > avg):
            images = re.findall(r'''(http\://[^\.]+\.sinaimg\.cn/(.+?)\.jpg)''', block, re.I)
            for image in images:
                self.download_file(image[0])
def fetch_proxy(url):
    """Download a proxy-list page and store every (host, port) pair found.

    The HTML parsing strategy is selected by the page's hostname; unknown
    hosts yield no matches. Each extracted pair is inserted with a
    placeholder speed of 999999 (i.e. "untested").

    :param url: URL of a known proxy-list page
    :return: True if the page was fetched and processed, False if the
             request or parsing raised
    """
    LazyFW.log("Fetch URL: %s" % (url))
    try:
        matches = None
        urls = urlparse(url)
        r = requests.get(url, timeout=PROXY_TIMEOUT, headers={
            'User-Agent': LazyFW.USER_AGENT,
            'Referer': 'http://%s' % (urls.hostname)
        })
        if r.status_code in (200, 304):
            html = r.text
            html = LazyFW.clear_space(html)
            # www.cz88.net: plain <tr><td>host</td><td>port</td> rows
            if urls.hostname == 'www.cz88.net':
                matches = re.compile(r'''<tr><td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''', re.I).findall(html)
            # www.cnproxy.com: ports are obfuscated by an inline JS script
            # that concatenates single-letter variables -- decode them first.
            elif urls.hostname == 'www.cnproxy.com':
                matches = []
                script = LazyFW.mid(html, '<SCRIPT type="text/javascript">', '</SCRIPT>')
                block = LazyFW.mid(html, '<div id="proxylisttb">', '<div class="proxylistnav">')
                js_vars = {}
                for line in script.split(';'):
                    if line != '':
                        var = re.compile(r'''(^[a-z])\="([^"]+)"''', re.I).findall(line)
                        js_vars[var[0][0]] = var[0][1]
                host_lists = re.compile(r'''<td>([^<]+)<SCRIPT[^>]*>document\.write\("\:"([^)]+)\)</SCRIPT></td>''', re.I).findall(block)
                for line in host_lists:
                    ports = []
                    for k in line[1].lstrip('+').split('+'):
                        ports.append(js_vars[k])
                    matches.append((line[0], ''.join(ports)))
            # www.xici.net.co: <td>119.233.255.24</td> <td>80</td> ... HTTP
            elif urls.hostname == 'www.xici.net.co':
                matches = re.compile(r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>.+?<td>HTTP<\/td>''', re.I).findall(html)
            # proxy.com.ru: <td>41.222.196.52</td><td>8080</td>
            elif urls.hostname == 'proxy.com.ru':
                matches = re.compile(r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''', re.I).findall(html)
            # free-proxy.cz: host in </div>, port in <span class="fport">
            elif urls.hostname == 'free-proxy.cz':
                matches = re.compile(
                    r'''</div>\s*([\w\d\.]+?)<\/td><td><span\s+class="fport">(\d+)<\/span><\/td><td><small>HTTP<\/small><\/td>''',
                    re.I).findall(html)
            # checkerproxy.net: bare "122.227.8.190:80" entries
            elif urls.hostname == 'checkerproxy.net':
                matches = re.compile(
                    r'''([\w\d\.]+?)\:(\d+)''',
                    re.I).findall(html)

        if matches is not None:  # was `matches != None`
            length = 0
            for proxy in matches:
                # proxy_insert returns truthy on a fresh insert
                if proxy_insert(proxy, 999999):
                    length += 1
                    # LazyFW.log("Add Proxy Server: %s:%s" % proxy);
            LazyFW.log("Add Proxy: %d/%d %s" % (length, len(matches), url))
    except Exception as e:
        # Previously the exception was swallowed silently; log it so
        # failures are diagnosable, but keep the best-effort contract.
        LazyFW.log('''FetchError(%s): %s''' % (url, e))
        return False
    return True
# Standard library
import os
import random
import re
import sys
import time
from multiprocessing import Process
from multiprocessing.queues import Queue
from urlparse import urlparse

# Third-party
import requests
import MySQLdb as mysql

# Project-local
import LazyFW

__author__ = 'Lukin'

# Number of worker processes downloading proxy lists
PROXY_THREAD_FETCH_MAX = int(LazyFW.config('Proxy', 'FETCH_THREAD_MAX'))
# Number of worker processes speed-testing proxies
PROXY_THREAD_TEST_PROXY_MAX = int(LazyFW.config('Proxy', 'TEST_THREAD_MAX'))
# Proxy timeout threshold
PROXY_TIMEOUT = int(LazyFW.config('Proxy', 'TIMEOUT'))
# Today's date, formatted as YYYYMMDD (used in the per-day table name)
CURR_DATE = LazyFW.t2date(time.time(), '%Y%m%d')

# DB config
DB_HOST = str(LazyFW.config('DB', 'HOST'))
DB_USER = str(LazyFW.config('DB', 'USER'))
DB_PASS = str(LazyFW.config('DB', 'PASS'))
DB_NAME = str(LazyFW.config('DB', 'NAME'))
DB_PORT = int(LazyFW.config('DB', 'PORT'))