Example #1
def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out the content URLs of a website
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False

        hrefs = filterHrefs(website_url, xpath, html_selector)
        if len(hrefs) == 0:
            return False

        flag = False
        for href in hrefs:
            if not Cache.listItemExist(cache.oldContent_list, href) and \
                    not Cache.listItemExist(cache.unrecognized_contentUrl_dict, href):
                Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
                flag = True
        if not flag:
            # if there is no new data, delay the next crawl by 15 minutes (900 s)
            incrDelay_time(website_id, 900)
        return True
    except Exception as e:
        log.logMsg(LogType.error, "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
    return False
Example #2
def initContentUrl_dict():
    """
    @summary: Initialize the deduplication list
    """
    items = mysql.Mysql.queryContentUrl()
    for item in items:
        Cache.appendList(cache.oldContent_list, item[0])
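
Example #2 seeds the deduplication list from MySQL. If the underlying Cache is backed by Redis (an assumption; the Cache class itself is not shown here), a Redis set gives O(1) membership checks instead of scanning a list. A minimal redis-py sketch with a hypothetical key name and connection settings:

# Minimal sketch, not the project's Cache API; key name and connection are assumptions.
import redis

r = redis.Redis(host="localhost", port=6379, db=0)

def seed_dedup_set(urls):
    """Load already-crawled URLs into a Redis set (hypothetical key name)."""
    for url in urls:
        r.sadd("oldContent_set", url)

def already_crawled(url):
    """O(1) membership check against the dedup set."""
    return r.sismember("oldContent_set", url)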
Example #3
 def initWebsite_delay_dict(self, record):
     """
     @summary: Initialize the website's update delay time
     :param record: website record (id, url, xpath, delay_time)
     :return:
     """
     if not Cache.keyExist(cache.websiteDelay_dict, record[0]):
         Cache.setDict(cache.websiteDelay_dict, record[0], record[-1])
Example #4
def saveWebsiteDelaytime():
    """
    @summary: Save each website's crawl delay to the database
    """
    try:
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
Example #5
def show_delay_time():
    """
    @summary: Show each website's crawl delay
    """
    records = []
    keys = Cache.keys(cache.websiteDelay_dict) or []
    for website_id in keys:
        record = mysql.Mysql.queryWebsiteUrl(website_id)    # id,url,xpath,detail,delay_time
        records.append((record[0][0], record[0][3] or record[0][1], Cache.getDict(cache.websiteDelay_dict, website_id)))
    headers = ["id", "url", "delay-time(s)"]
    print(tabulate(records, headers=headers))
Example #6
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
             res = filterContentInfoFunc(website_id, url)
             if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                 Cache.appendList(cache.oldContent_list, url)
             else:
                 Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
         except Exception as e:
             if type(e) is not queue.Empty:
                 log.logMsg(LogType.error, "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
Example #7
 def run(self):
     while not global_EXIT:
         website_url = ""
         try:
             website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False)
             if not filterContentUrlFunc(website_id, website_url, xpath):
                 Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))
         except Exception as e:
             if type(e) is not queue.Empty:
                 log.logMsg(LogType.error, "[FilterContentUrlThread.freshHandler] %s %s"%(website_url, traceback.format_exc()))
             else:
                 for i in range(10):
                     if global_EXIT: break
                     time.sleep(1)
Example #8
def resetDelay_time():
    """
    @summary: Reset each website's crawl delay
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)  # delay is stored as an int (see incrDelay_time)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db: db.dispose()
Example #9
def main():
    thread_count = 3
    pre_threads = []

    initdb()                                            # initialize the Redis database
    initGlobalArgs()
    initContentUrl_dict()                               # initialize the deduplication table

    log_thread = log.LogThread()                        # start the logging thread
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()    # start the thread that reads website URLs
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()  # start the thread that crawls content URLs
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for i in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)


    while not global_EXIT: time.sleep(1)                # wait for the exit signal without busy-spinning

    time.sleep(5)

    saveWebsiteDelaytime()              # save each website's delay time

    for t in pre_threads:
        t.join()

    log.logMsg(LogType.success, "--------------------bye---------------------\n")
    while not Cache.qempty(cache.log_queue): time.sleep(0.1)  # wait until all logs have been written to file
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()

    if db: db.dispose()
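
The main loop in Example #9 polls a global exit flag. For reference, a minimal sketch of a cooperative shutdown using threading.Event, which lets worker threads and the main thread block on the same signal; the names below are illustrative and not part of this project:

# Minimal sketch, assuming workers poll a shared Event instead of a global flag.
import threading

exit_event = threading.Event()

def worker():
    while not exit_event.is_set():
        # ... one unit of crawling work ...
        exit_event.wait(timeout=1)   # sleeps up to 1 s, wakes early on shutdown

threads = [threading.Thread(target=worker, daemon=True) for _ in range(3)]
for t in threads:
    t.start()

try:
    while not exit_event.is_set():
        exit_event.wait(timeout=1)   # main thread sleeps instead of spinning
except KeyboardInterrupt:
    exit_event.set()                 # signal all workers to stop

for t in threads:
    t.join()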
Example #10
    def _requests_getPagesource(page_source_q,
                                url,
                                method,
                                data,
                                use_proxy=False):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            proxies, ip, port = None, None, None  # avoid NameError when no proxy is used
            try:
                headers = {"User-agent": user_agent()}
                if use_proxy:
                    proxies, ip, port = Spider._getproxy()

                if method == "POST":
                    res = requests.post(url,
                                        data=data,
                                        proxies=proxies,
                                        headers=headers)
                elif method == "GET":
                    res = requests.get(url,
                                       data=data,
                                       proxies=proxies,
                                       headers=headers)
                if res.status_code == 200 and Spider._pagesourceLegal(
                        res.text):
                    page_source_q.put(res.text)
            except Exception as e:
                print(e)
                if ip: redis_client.delete(ip)
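
Example #10 retries by looping until a valid page source shows up. For reference, a minimal sketch that pushes retries and backoff down into requests/urllib3 instead; the URL, header value, and retry policy are placeholders:

# Minimal sketch with assumed URL and headers; retry policy values are illustrative.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))

resp = session.get("https://example.com",
                   headers={"User-Agent": "my-spider"},
                   timeout=5)
if resp.status_code == 200:
    page_source = resp.text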
Example #11
 def run(self):
     while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
         try:
             info = Cache.getQueue(cache.log_queue, False)
             if os.path.exists(self.getFilename()):
                 log_size = os.path.getsize(
                     self.getFilename()) / 1024 / 1024  # start a new log file once the log exceeds 1 MB
                 if log_size > 1:
                     self.index += 1
             with open(self.getFilename(), 'a') as f:
                 info += '<%s>\n' % (
                     datetime.datetime.now().strftime("%H:%M:%S"))
                 f.write(info)
         except Exception as e:
             if type(e) is not queue.Empty:
                 print("Log Error: %s" % e)
Example #12
 def putRecord(self, record):
     """
     @summary: Add the record to the queue of websites waiting to be crawled
     """
     website_id, website_url, xpath = record[:3]
     if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
             not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
         Cache.appendList(cache.workingWebsite_list, website_id)
         Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
         sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
         for i in range(int(sleep_time)):
             if global_EXIT: return
             time.sleep(1)
         Cache.removeList(cache.workingWebsite_list, website_id)
Example #13
    def run(self):
        while not global_EXIT:
            website_url = ""
            if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
                try:
                    website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                    if not website_id:
                        # nothing to retry yet: wait up to 30 seconds, then check again
                        for i in range(30):
                            if global_EXIT: break
                            time.sleep(1)
                        continue

                    website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                    if website_id and website_url and xpath:
                        Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)

                except Exception as e:
                    log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
Example #14
def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary:               Push the log message to Redis (partialNone is also written to the database)
    :param logType:         log type
    :param msg:             log message
    :param website_id:      website id
    :param content_url:     content url
    :return:
    """
    if logType == LogType.error and msg:
        msg = "》Error《:%s" % msg
    elif logType == LogType.htmlSelectorNone or logType == LogType.partialNone:
        msg = "?Warning?:%s" % msg
    elif logType == LogType.success:
        msg = "【Success】:%s" % msg
    else:
        msg = "--Other--:%s" % msg
    if logType == LogType.partialNone:
        Mysql.writeWebsiteMsg(website_id, content_url)
    Cache.putQueue(cache.log_queue, msg)
Example #15
    def run(self):
        while not global_EXIT:
            try:
                if Cache.qempty(cache.websiteUrl_queue):
                    records = mysql.Mysql.queryWebsiteUrl()
                    for record in records:  # record: id,url,xpath,detail,delay_time
                        record = [str(item) for item in record]
                        self.initWebsite_delay_dict(record)
                        t = threading.Thread(target=self.putRecord, args=(record,))
                        t.daemon = True
                        t.start()

            except Exception as e:
                log.logMsg(LogType.error, "[QueryWebsiteUrlThread] %s" % (traceback.format_exc()))
            for i in range(60):
                if global_EXIT: break
                time.sleep(1)
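
Example #15 starts one daemon thread per website record on every query cycle. A bounded worker pool keeps the number of concurrent workers fixed regardless of how many records come back; a minimal sketch with an illustrative handler and record shape:

# Minimal sketch; handle_record and the record tuple shape are assumptions.
from concurrent.futures import ThreadPoolExecutor

def handle_record(record):
    # placeholder for the per-website work that putRecord performs
    print("processing website", record[0])

records = [("1", "http://example.com", "//a/@href", "", "0")]   # id, url, xpath, detail, delay

with ThreadPoolExecutor(max_workers=4) as pool:
    for record in records:
        pool.submit(handle_record, record)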
Example #16
    def _chrome_getPagesource(page_source_q, url, timeout):
        driver, ip, port = None, None, None
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            try:
                if system == "Linux":
                    chrome_options = Options()
                    chrome_options.add_argument('--headless')
                    chrome_options.add_argument('--disable-gpu')
                else:
                    os.environ["webdriver.chrome.driver"] = chromedriver
                    chrome_options = webdriver.ChromeOptions()

                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()

                if ip and port:
                    chrome_options.add_argument("--proxy-server=http://%s:%s" %
                                                (ip, port))

                if system == "Linux":
                    driver = webdriver.Chrome(chrome_options=chrome_options)
                else:
                    driver = webdriver.Chrome(chromedriver,
                                              chrome_options=chrome_options)

                driver.get(url)
                time.sleep(timeout)
                js = "document.body.scrollTop=1000"
                driver.execute_script(js)
                time.sleep(3)
                page_source = driver.page_source
                driver.close()
                driver.quit()
                if page_source and Spider._pagesourceLegal(page_source):
                    page_source_q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
                if driver:
                    driver.close()
                    driver.quit()
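
Example #16 targets the older Selenium API (chrome_options=, an explicit chromedriver path). Under Selenium 4.x the keyword is options= and the driver binary is resolved automatically by Selenium Manager; a minimal headless fetch sketch with a placeholder URL and proxy address:

# Minimal sketch for Selenium 4.x; URL and proxy settings are placeholders.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# options.add_argument("--proxy-server=http://127.0.0.1:8080")   # enable if a proxy is needed

driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.com")
    driver.execute_script("window.scrollTo(0, 1000);")
    page_source = driver.page_source
finally:
    driver.quit()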
Example #17
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
             if url:
                 website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                 res = filterContentInfoFunc(website_id, url)
                 if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                     Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                     Cache.appendList(cache.oldContent_list, url)
             for i in range(300):
                 if global_EXIT: break
                 time.sleep(1)
         except Exception as e:
             log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
Example #18
    def _urllib_getPagesource(q, url):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and q.empty():
            proxies, ip, port = None, None, None
            try:
                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()
                if proxies:
                    proxy_handler = urllib.request.ProxyHandler(proxies)
                    opener = urllib.request.build_opener(proxy_handler)
                    opener.addheaders = [('User-agent', user_agent())]
                    res = opener.open(url, timeout=5)
                    page_source = res.read().decode("utf8")
                else:
                    req = urllib.request.Request(
                        url, headers={"User-agent": user_agent()})
                    resp = urllib.request.urlopen(req)
                    page_source = resp.read().decode("utf8")

                if page_source and Spider._pagesourceLegal(page_source):
                    q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
Example #19
def incrDelay_time(website_id, timeout):
    """
    @summary: Add timeout seconds to the website's crawl delay
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record) + timeout)
Example #20
def content_count():
    """
    @summary: Show the number of content URLs already crawled
    """
    print("content'count: %s"%(Cache.listLength(cache.oldContent_list)))
Example #21
import threading
import os
import datetime
import queue

from Spider.cache import Cache
from Spider.config import LOG_DIR
from Spider.mysql import Mysql

cache = Cache()


class LogType(object):
    error = 0
    htmlSelectorNone = 1
    partialNone = 2
    success = 3
    other = 4


def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary:               Push the log message to Redis (partialNone is also written to the database)
    :param logType:         log type
    :param msg:             log message
    :param website_id:      website id
    :param content_url:     content url
    :return:
    """
    if logType == LogType.error and msg:
        msg = "》Error《:%s" % msg
Example #22
def initGlobalArgs():
    """
    @summary: Initialize global variables
    """
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", False)
    Cache.setDict(cache.globalArgs_dict, "global_EXIT", False)
Example #23
def initdb():
    """
    @summary: Clear the data stored in Redis
    """
    Cache.flushdb(cache.websiteDelay_dict)
    Cache.flushdb(cache.workingWebsite_list)
    Cache.flushdb(cache.websiteUrl_queue)
    Cache.flushdb(cache.oldContent_list)
    Cache.flushdb(cache.freshContentUrl_queue)
    Cache.flushdb(cache.log_queue)
    Cache.flushdb(cache.unrecognized_websiteUrl_dict)
    Cache.flushdb(cache.unrecognized_contentUrl_dict)
    Cache.flushdb(cache.globalArgs_dict)
Example #24
import queue
from tabulate import tabulate
import traceback

from Spider import mysql
from Spider import log
from Spider.log import LogType
from Spider import thumbnail
from Spider.spider import Spider
from Spider.cache import Cache
from Spider.common import imgSrcHandler, hrefHandler, filterPureTag, incrDelay_time, brief, randomImg, spaceHandler, filterHrefs
from Spider.config import global_Chrome
from Spider.models import SpiderResType


cache = Cache()
db = mysql.Mysql()
global_EXIT = False


def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out the content URLs of a website
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False