Example #1
def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: 筛选出网站的内容url
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False

        hrefs = filterHrefs(website_url, xpath, html_selector)
        if len(hrefs) == 0:
            return False

        flag = False
        for href in hrefs:
            if not Cache.listItemExist(cache.oldContent_list, href) and \
                    not Cache.listItemExist(cache.unrecognized_contentUrl_dict, href):
                Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
                flag = True
        if not flag:
            # If no new links were found, push the next crawl back by 15 minutes
            incrDelay_time(website_id, 900)
        return True
    except Exception as e:
        log.logMsg(LogType.error, "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
    return False
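For context, a minimal driver sketch (the id, URL and XPath below are made-up illustration values) showing how this function is fed from a website record, mirroring the queue consumer in Example #7:

# hypothetical record values, for illustration only
website_id, website_url, xpath = "1", "https://example.com/news", "//div[@class='list']//a/@href"
if not filterContentUrlFunc(website_id, website_url, xpath):
    # sites that could not be parsed are parked for the retry thread (see Example #7)
    Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))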
Example #2
def initContentUrl_dict():
    """
    @summary: 初始化去重列表
    """
    items = mysql.Mysql.queryContentUrl()
    for item in items:
        Cache.appendList(cache.oldContent_list, item[0])
Example #3
 def initWebsite_delay_dict(self, record):
     """
     @summary: 初始化网站的等待更新时间
     :param record: 网站记录(id, url, xpath, delay_time)
     :return:
     """
     if not Cache.keyExist(cache.websiteDelay_dict, record[0]):
         Cache.setDict(cache.websiteDelay_dict, record[0], record[-1])
Example #4
def saveWebsiteDelaytime():
    """
    @summary: 保存网站爬取延迟到数据库中
    """
    try:
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
Example #5
def show_delay_time():
    """
    @summary: 显示各网站的爬取延迟
    """
    records = []
    keys = Cache.keys(cache.websiteDelay_dict) or []
    for website_id in keys:
        record = mysql.Mysql.queryWebsiteUrl(website_id)    # id,url,xpath,detail,delay_time
        records.append((record[0][0], record[0][3] or record[0][1], Cache.getDict(cache.websiteDelay_dict, website_id)))
    headers = ["id", "url", "delay-time(s)"]
    print(tabulate(records, headers=headers))
Example #6
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
             res = filterContentInfoFunc(website_id, url)
             if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                 Cache.appendList(cache.oldContent_list, url)
             else:
                 Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
         except Exception as e:
             if type(e) is not queue.Empty:
                 log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
Example #7
 def run(self):
     while not global_EXIT:
         website_url = ""
         try:
             website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False)
             if not filterContentUrlFunc(website_id, website_url, xpath):
                 Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))
         except Exception as e:
             if type(e) is not queue.Empty:
                 log.logMsg(LogType.error, "[FilterContentUrlThread.freshHandler] %s %s"%(website_url, traceback.format_exc()))
             else:
                 for i in range(10):
                     if global_EXIT: break
                     time.sleep(1)
Example #8
def resetDelay_time():
    """
    @summary: 重置各网站的爬取延迟
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            # the delay dict stores a plain number of seconds (see incrDelay_time),
            # so reset it to 0 rather than wrapping it in a tuple
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db: db.dispose()
Example #9
def main():
    thread_count = 3
    pre_threads = []

    initdb()                                            # initialize the redis database
    initGlobalArgs()
    initContentUrl_dict()                               # initialize the deduplication list

    log_thread = log.LogThread()                        # start the logging thread
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()    # start the thread that reads website URLs
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()  # start the thread that crawls content URLs
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for i in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)


    while not global_EXIT: time.sleep(1)    # wait for the shutdown flag without busy-spinning

    time.sleep(5)

    saveWebsiteDelaytime()              # save each website's delay time

    for t in pre_threads:
        t.join()

    log.logMsg(LogType.success, "--------------------bye---------------------\n")
    while not Cache.qempty(cache.log_queue): time.sleep(1)  # wait until every log entry has been written to file
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()

    if db: db.dispose()
Example #10
    def _requests_getPagesource(page_source_q,
                                url,
                                method,
                                data,
                                use_proxy=False):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            proxies, ip, port = None, None, None  # defined up front so the request call and except block never hit a NameError
            try:
                headers = {"User-agent": user_agent()}
                if use_proxy:
                    proxies, ip, port = Spider._getproxy()

                if method == "POST":
                    res = requests.post(url,
                                        data=data,
                                        proxies=proxies,
                                        headers=headers)
                elif method == "GET":
                    res = requests.get(url,
                                       data=data,
                                       proxies=proxies,
                                       headers=headers)
                if res.status_code == 200 and Spider._pagesourceLegal(
                        res.text):
                    page_source_q.put(res.text)
            except Exception as e:
                print(e)
                if ip: redis_client.delete(ip)
Example #11
 def run(self):
     while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
         try:
             info = Cache.getQueue(cache.log_queue, False)
             if os.path.exists(self.getFilename()):
                 log_size = os.path.getsize(
                     self.getFilename()) / 1024 / 1024  # start a new log file once the current one exceeds 1 MB
                 if log_size > 1:
                     self.index += 1
             with open(self.getFilename(), 'a') as f:
                 info += '<%s>\n' % (
                     datetime.datetime.now().strftime("%H:%M:%S"))
                 f.write(info)
         except Exception as e:
             if type(e) is not queue.Empty:
                 print("Log Error: %s" % e)
Example #12
 def putRecord(self, record):
     """
     @summary: 把record添加到正在等待的网站队列中
     """
     website_id, website_url, xpath = record[:3]
     if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
             not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
         Cache.appendList(cache.workingWebsite_list, website_id)
         Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
         sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
         for i in range(int(sleep_time)):
             if global_EXIT: return
             time.sleep(1)
         Cache.removeList(cache.workingWebsite_list, website_id)
Example #13
    def run(self):
        while not global_EXIT:
            website_url = ""
            if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
                try:
                    website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                    if not website_id:
                        # nothing to retry yet: back off for up to 30 seconds, then look again
                        for i in range(30):
                            if global_EXIT: break
                            time.sleep(1)
                        continue

                    website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                    if website_id and website_url and xpath:
                        Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)

                except Exception as e:
                    log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
Example #14
def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary:               把日志放到redis中(partialNone要放到数据库中)
    :param logType:         日志类型
    :param msg:             日志内容
    :param website_id:      网站id
    :param content_url:     内容url
    :return:
    """
    if logType == LogType.error and msg:
        msg = "》Error《:%s" % msg
    elif logType == LogType.htmlSelectorNone or logType == LogType.partialNone:
        msg = "?Warning?:%s" % msg
    elif logType == LogType.success:
        msg = "【Success】:%s" % msg
    else:
        msg = "--Other--:%s" % msg
    if logType == LogType.partialNone:
        Mysql.writeWebsiteMsg(website_id, content_url)
    Cache.putQueue(cache.log_queue, msg)
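A hypothetical call, for context (the id and URL are made-up values); per the docstring, a partialNone message is additionally written to MySQL:

# hypothetical values, for illustration only
logMsg(LogType.partialNone,
       "[FilterContentInfoThread] detail block missing",
       website_id="3",
       content_url="https://example.com/news/123")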
Example #15
    def run(self):
        while not global_EXIT:
            try:
                if Cache.qempty(cache.websiteUrl_queue):
                    records = mysql.Mysql.queryWebsiteUrl()
                    for record in records:  # record: id,url,xpath,detail,delay_time
                        record = [str(item) for item in record]
                        self.initWebsite_delay_dict(record)
                        t = threading.Thread(target=self.putRecord, args=(record,))
                        t.daemon = True
                        t.start()

            except Exception as e:
                log.logMsg(LogType.error, "[QueryWebsiteUrlThread] %s" % (traceback.format_exc()))
            for i in range(60):
                if global_EXIT: break
                time.sleep(1)
Example #16
    def _chrome_getPagesource(page_source_q, url, timeout):
        driver, ip, port = None, None, None
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            try:
                if system == "Linux":
                    chrome_options = Options()
                    chrome_options.add_argument('--headless')
                    chrome_options.add_argument('--disable-gpu')
                else:
                    os.environ["webdriver.chrome.driver"] = chromedriver
                    chrome_options = webdriver.ChromeOptions()

                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()

                if ip and port:
                    chrome_options.add_argument("--proxy-server=http://%s:%s" %
                                                (ip, port))

                if system == "Linux":
                    driver = webdriver.Chrome(chrome_options=chrome_options)
                else:
                    driver = webdriver.Chrome(chromedriver,
                                              chrome_options=chrome_options)

                driver.get(url)
                time.sleep(timeout)
                js = "document.body.scrollTop=1000"
                driver.execute_script(js)
                time.sleep(3)
                page_source = driver.page_source
                driver.close()
                driver.quit()
                if page_source and Spider._pagesourceLegal(page_source):
                    page_source_q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
                if driver:
                    driver.close()
                    driver.quit()
Example #17
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
             if url:
                 website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                 res = filterContentInfoFunc(website_id, url)
                 if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                     Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                     Cache.appendList(cache.oldContent_list, url)
             for i in range(300):
                 if global_EXIT: break
                 time.sleep(1)
         except Exception as e:
             log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
Example #18
    def _urllib_getPagesource(q, url):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and q.empty():
            proxies, ip, port = None, None, None
            try:
                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()
                if proxies:
                    proxy_handler = urllib.request.ProxyHandler(proxies)
                    opener = urllib.request.build_opener(proxy_handler)
                    opener.addheaders = [('User-agent', user_agent())]
                    res = opener.open(url, timeout=5)
                    page_source = res.read().decode("utf8")
                else:
                    req = urllib.request.Request(
                        url, headers={"User-agent": user_agent()})
                    resp = urllib.request.urlopen(req)
                    page_source = resp.read().decode("utf8")

                if page_source and Spider._pagesourceLegal(page_source):
                    q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
Example #19
def incrDelay_time(website_id, timeout):
    """
    @summary: 对网站增加timeout个时间延迟
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record) + timeout)
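For context, this is the back-off applied in Example #1 when a crawl finds nothing new; the accumulated value is what putRecord (Example #12) later sleeps on before re-queuing the site. Hypothetical values:

incrDelay_time("3", 900)                                   # push site "3" back by 15 minutes
sleep_time = Cache.getDict(cache.websiteDelay_dict, "3")   # later read back by the scheduler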
Example #20
def content_count():
    """
    @summary: 显示已爬取的内容地址数量
    """
    print("content'count: %s"%(Cache.listLength(cache.oldContent_list)))
Example #21
import threading
import os
import datetime
import queue

from Spider.cache import Cache
from Spider.config import LOG_DIR
from Spider.mysql import Mysql

cache = Cache()


class LogType(object):
    error = 0
    htmlSelectorNone = 1
    partialNone = 2
    success = 3
    other = 4


def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary:               把日志放到redis中(partialNone要放到数据库中)
    :param logType:         日志类型
    :param msg:             日志内容
    :param website_id:      网站id
    :param content_url:     内容url
    :return:
    """
    if logType == LogType.error and msg:
        msg = "》Error《:%s" % msg
Example #22
def initGlobalArgs():
    """
    @summary:  初始化全局变量
    """
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", False)
    Cache.setDict(cache.globalArgs_dict, "global_EXIT", False)
Example #23
def initdb():
    """
    @summary: 清空redis中的数据
    """
    Cache.flushdb(cache.websiteDelay_dict)
    Cache.flushdb(cache.workingWebsite_list)
    Cache.flushdb(cache.websiteUrl_queue)
    Cache.flushdb(cache.oldContent_list)
    Cache.flushdb(cache.freshContentUrl_queue)
    Cache.flushdb(cache.log_queue)
    Cache.flushdb(cache.unrecognized_websiteUrl_dict)
    Cache.flushdb(cache.unrecognized_contentUrl_dict)
    Cache.flushdb(cache.globalArgs_dict)
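The Spider.cache.Cache wrapper itself does not appear in these examples. Purely as an assumption, a minimal Redis-backed sketch consistent with how it is called above (structure-name-first static methods, queue.Empty raised by a non-blocking getQueue) might look like this:

# Hypothetical sketch only; the real Spider.cache.Cache is not shown in these examples.
import json
import queue
import random

import redis

redis_client = redis.Redis(decode_responses=True)  # assumed connection settings


class Cache(object):
    def __init__(self):
        # structure names (Redis keys) referenced throughout the examples
        self.oldContent_list = "oldContent_list"
        self.websiteUrl_queue = "websiteUrl_queue"
        self.websiteDelay_dict = "websiteDelay_dict"
        self.globalArgs_dict = "globalArgs_dict"
        # ... the remaining lists, queues and dicts follow the same pattern

    # list helpers
    @staticmethod
    def appendList(name, item):
        redis_client.rpush(name, item)

    @staticmethod
    def listItemExist(name, item):
        return item in redis_client.lrange(name, 0, -1)

    @staticmethod
    def listLength(name):
        return redis_client.llen(name)

    # queue helpers (a Redis list used FIFO)
    @staticmethod
    def putQueue(name, item):
        redis_client.rpush(name, json.dumps(item))

    @staticmethod
    def getQueue(name, block=True):
        raw = redis_client.lpop(name)
        if raw is None:
            raise queue.Empty()  # callers above catch queue.Empty
        return json.loads(raw)

    @staticmethod
    def qempty(name):
        return redis_client.llen(name) == 0

    # dict helpers (a Redis hash)
    @staticmethod
    def setDict(name, key, value):
        redis_client.hset(name, key, json.dumps(value))

    @staticmethod
    def getDict(name, key):
        raw = redis_client.hget(name, key)
        return json.loads(raw) if raw is not None else None

    @staticmethod
    def keyExist(name, key):
        return redis_client.hexists(name, key)

    @staticmethod
    def randomKey(name):
        keys = redis_client.hkeys(name)
        return random.choice(keys) if keys else None

    @staticmethod
    def flushdb(name):
        redis_client.delete(name)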
Example #24
import queue
from tabulate import tabulate
import traceback

from Spider import mysql
from Spider import log
from Spider.log import LogType
from Spider import thumbnail
from Spider.spider import Spider
from Spider.cache import Cache
from Spider.common import imgSrcHandler, hrefHandler, filterPureTag, incrDelay_time, brief, randomImg, spaceHandler, filterHrefs
from Spider.config import global_Chrome
from Spider.models import SpiderResType


cache = Cache()
db = mysql.Mysql()
global_EXIT = False


def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: 筛选出网站的内容url
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False