Exemplo n.º 1
0
def worker(ftype, queue):
    '''
    Worker loop: take tasks from ``queue`` until the queue reports empty.

    :param ftype: kind of work to do for each task. 'fetch_proxy' passes the
        task to fetch_proxy(); 'proxy_test' passes it to LazyFW.test_proxy()
        and stores the result with proxy_insert() when the returned proxy is
        not None. Any other value just consumes the task.
    :param queue: queue object providing empty() and get_nowait().
    :return: None
    '''

    while True:
        # Queue is empty: stop.
        if queue.empty():
            LazyFW.log('''TaskEmpty: break''')
            break

        try:
            task = queue.get_nowait()

            if ftype == 'fetch_proxy':
                fetch_proxy(task)
            elif ftype == 'proxy_test':
                proxy, speed = LazyFW.test_proxy(task, PROXY_TIMEOUT, 'http://esf.sh.fang.com/agenthome/', '搜房网','gbk')
                if proxy is not None:
                    proxy_insert(proxy, speed)
        except Exception as e:
            # Best effort: one failing task must not stop the worker. This
            # also absorbs get_nowait() raising when the queue was drained
            # between the empty() check and the get; the next iteration then
            # leaves through the empty() branch.
            LazyFW.log('''TaskError(%s)''' % (e,))
Exemplo n.º 2
0
def _run_workers(ftype, queue, count):
    """Start ``count`` daemon worker processes on ``queue`` and wait for all of them."""
    workers = []
    for _ in range(count):
        p = Process(target=worker, args=(ftype, queue))
        p.daemon = True
        p.start()
        workers.append(p)

    for p in workers:
        p.join()


def main():
    """
    Entry point: refill today's proxy table if needed, then test its proxies.

    Steps, in order:
      1. create_db(), then count the rows of ``proxys_<CURR_DATE>`` whose
         ``speed`` is greater than PROXY_TIMEOUT.
      2. If fewer than 10 such rows exist, queue every URL returned by
         get_proxy_urls() and run PROXY_THREAD_FETCH_MAX 'fetch_proxy'
         workers over them.
      3. Select ``host``/``port`` of the rows matching the same condition,
         queue them, and run PROXY_THREAD_TEST_PROXY_MAX 'proxy_test'
         workers over them.

    :return: None
    """
    proxy_queue = Queue()
    proxy_hosts = Queue()

    create_db()

    # Count the candidate rows. The statement is built once and used for both
    # the log line and the query. The table name cannot be a bound parameter,
    # and both interpolated values are module-level constants.
    count_sql = r'''SELECT count(*) as `cnt` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
    conn = get_conn()
    try:
        c = conn.cursor()
        try:
            LazyFW.log(count_sql)
            c.execute(count_sql)
            proxys = c.fetchone()
        finally:
            c.close()

        if proxys[0] < 10:
            for url in get_proxy_urls():
                proxy_queue.put_nowait(url)

            _run_workers('fetch_proxy', proxy_queue, PROXY_THREAD_FETCH_MAX)
        conn.commit()
    finally:
        # Close the connection even when a query or a worker start fails.
        conn.close()

    # Query again on a fresh connection to pick up the current rows.
    select_sql = r'''SELECT `host`,`port` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
    conn = get_conn()
    try:
        LazyFW.log(select_sql)
        c = conn.cursor()
        try:
            c.execute(select_sql)
            for row in c.fetchall():
                proxy_hosts.put_nowait(row)
        finally:
            c.close()
        conn.commit()
    finally:
        conn.close()

    _run_workers('proxy_test', proxy_hosts, PROXY_THREAD_TEST_PROXY_MAX)
Exemplo n.º 3
0
    def FetchLinksFromSource(self, url, htmlSource):
        """
        Download the sinaimg.cn JPEGs found in up-voted comments of a page.

        The comment section (between the ``begin comments`` / ``end comments``
        HTML markers) is split into one text block per comment. For each
        block the ``oo [n]`` and ``xx [n]`` counters are read, and when ``oo``
        is the larger one every matching image URL in the block is passed to
        self.download_file().

        :param url: address the HTML was fetched from; passed on to
            LazyFW.format_url() and LazyFW.html2text().
        :param htmlSource: raw HTML of the page.
        :return: None
        """
        page = LazyFW.clear_space(LazyFW.format_url(url, htmlSource))
        comments = LazyFW.mid(page, '<!-- begin comments -->', '<!-- end comments -->')

        # Bracket every comment item (<li id="comment-2792510"> <div> ... </li>)
        # with [BEGIN]/[END] headings so the blocks can still be told apart
        # after the HTML has been converted to text.
        comments = re.sub(r'''(<li\s+id="comment\-[\d]+">\s*<div>)''', r'''\1<h1>[BEGIN]</h1>''', comments,
                          flags=re.I)
        comments = re.sub(r'''(</li>)''', r'''<h1>[END]</h1>\1''', comments, flags=re.I)

        text_options = {
            'ignore_links': True,
            'ignore_images': False,
        }
        comments_text = LazyFW.html2text(url, comments, text_options)

        for block in re.findall(r'''\[BEGIN\](.+?)\[END\]''', comments_text, re.S):
            oo = int(LazyFW.mid(block, 'oo [', ']'))
            xx = int(LazyFW.mid(block, 'xx [', ']'))
            avg = (oo + xx) / 2
            # Skip comments where "oo" does not beat both "xx" and the average.
            if oo <= xx or oo <= avg:
                continue

            for image in re.findall(r'''(http\://[^\.]+\.sinaimg\.cn/(.+?)\.jpg)''', block, re.I):
                # image[0] is the full URL; image[1] is only the path part.
                self.download_file(image[0])
Exemplo n.º 4
0
def fetch_proxy(url):
    """
    Download one proxy-list page and store every host/port pair found on it.

    The page is parsed with a site-specific rule chosen by the URL's
    hostname. A response whose status is neither 200 nor 304, or a hostname
    without a rule, yields nothing. Each pair found is passed to
    proxy_insert() with speed 999999 (presumably a "not yet measured"
    marker -- confirm against proxy_insert).

    :param url: address of the proxy-list page.
    :return: True when no exception was raised (even if nothing was found);
        False when any step raised. The failure is logged.
    """
    LazyFW.log("Fetch URL: %s" % (url,))
    try:
        matches = None
        urls = urlparse(url)
        r = requests.get(url, timeout=PROXY_TIMEOUT, headers={
            'User-Agent': LazyFW.USER_AGENT,
            'Referer': 'http://%s' % (urls.hostname)
        })

        if r.status_code == 200 or r.status_code == 304:
            html = r.text
            html = LazyFW.clear_space(html)

            # www.cz88.net
            if urls.hostname == 'www.cz88.net':
                matches = re.compile(r'''<tr><td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''', re.I).findall(html)
            # www.cnproxy.com
            elif urls.hostname == 'www.cnproxy.com':
                # Ports are obfuscated: a script block assigns digits to
                # one-letter variables (x="1";y="2";...) and every table cell
                # writes its port as document.write(":"+x+y+...).
                matches = []
                script = LazyFW.mid(html, '<SCRIPT type="text/javascript">', '</SCRIPT>')
                block = LazyFW.mid(html, '<div id="proxylisttb">', '<div class="proxylistnav">')
                var_re = re.compile(r'''(^[a-z])\="([^"]+)"''', re.I)
                js_vars = {}
                for line in script.split(';'):
                    if line != '':
                        # A non-empty statement that does not match raises
                        # IndexError here and fails the whole fetch.
                        var = var_re.findall(line)
                        js_vars[var[0][0]] = var[0][1]

                host_lists = re.compile(r'''<td>([^<]+)<SCRIPT[^>]*>document\.write\("\:"([^)]+)\)</SCRIPT></td>''',
                                        re.I).findall(block)
                for line in host_lists:
                    # line[1] looks like "+x+y+z": map each variable back to
                    # its digit and join the digits into the port string.
                    names = line[1].lstrip('+').split('+')
                    port = ''.join([js_vars[k] for k in names])
                    matches.append((line[0], port))
            # www.xici.net.co
            elif urls.hostname == 'www.xici.net.co':
                # <td>119.233.255.24</td> <td>80</td>
                matches = re.compile(r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>.+?<td>HTTP<\/td>''',
                                     re.I).findall(html)
            # proxy.com.ru
            elif urls.hostname == 'proxy.com.ru':
                # <td>41.222.196.52</td><td>8080</td>
                matches = re.compile(r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''',
                                     re.I).findall(html)

            # free-proxy.cz
            elif urls.hostname == 'free-proxy.cz':
                # </div> 117.166.75.36</td><td><span class="fport">8123</span></td><td><small>HTTP</small></td>
                matches = re.compile(
                    r'''</div>\s*([\w\d\.]+?)<\/td><td><span\s+class="fport">(\d+)<\/span><\/td><td><small>HTTP<\/small><\/td>''',
                    re.I).findall(html)

            # checkerproxy.net
            elif urls.hostname == 'checkerproxy.net':
                # 122.227.8.190:80
                matches = re.compile(
                    r'''([\w\d\.]+?)\:(\d+)''',
                    re.I).findall(html)

        # Store whatever was found and report how many inserts succeeded.
        if matches is not None:
            length = 0
            for proxy in matches:
                insertOk = proxy_insert(proxy, 999999)
                # NOTE(review): proxy_insert's return type is not visible
                # here, so the strict comparison is kept as it was.
                if insertOk == True:
                    length += 1

            LazyFW.log("Add Proxy: %d/%d %s" % (length, len(matches), url))

    except Exception as e:
        # Still best effort (the caller only gets False), but the cause is
        # logged instead of being discarded.
        LazyFW.log("Fetch Error(%s): %s" % (e, url))
        return False

    return True
Exemplo n.º 5
0
import os
import random
import time
from urlparse import urlparse
import re
import sys
import LazyFW
import MySQLdb as mysql
from multiprocessing import Process
from multiprocessing.queues import Queue
import requests

__author__ = 'Lukin'

# All values below are read once, when this module is imported.

# Number of workers used to download proxy lists
PROXY_THREAD_FETCH_MAX = int(LazyFW.config('Proxy', 'FETCH_THREAD_MAX'))
# Number of workers used to speed-test proxies
PROXY_THREAD_TEST_PROXY_MAX = int(LazyFW.config('Proxy', 'TEST_THREAD_MAX'))
# Proxy timeout (presumably in seconds -- confirm against the config file)
PROXY_TIMEOUT = int(LazyFW.config('Proxy', 'TIMEOUT'))

# Current date formatted as YYYYMMDD; fixed for the lifetime of the process
CURR_DATE = LazyFW.t2date(time.time(), '%Y%m%d')

# Database connection settings, taken from the 'DB' config section
DB_HOST = str(LazyFW.config('DB', 'HOST'))
DB_USER = str(LazyFW.config('DB', 'USER'))
DB_PASS = str(LazyFW.config('DB', 'PASS'))
DB_NAME = str(LazyFW.config('DB', 'NAME'))
DB_PORT = int(LazyFW.config('DB', 'PORT'))