Example No. 1
    def __init__(self):
        """ Initialization """

        self.conf = ConfReader("crawler.conf", default_conf)
        self.logger = Logger()
        self.db = None

        self.thread_pool_size = self.conf.get("thread_pool_size")
        self.left_ip = self.conf.get("manager_ip")
        self.left_port = self.conf.get("manager_port")
        self._buffer_size_threshold = self.conf.get("buffer_size_threshold")
        # how many links we should return when the caller calls self.get_links()
        self._crawl_NR = self.conf.get("concurrent_crawl_NR")
        self.my_open = uopen if self.conf.get(
            "buffer_output") == "no" else open
        self.content_path = self.conf.get("content_path")
        self.crawling_timeout = self.conf.get("crawling_timeout")

        self.DB_url = self.conf.get("DB_url")
        self.DB_user = self.conf.get("DB_user")
        self.DB_passwd = self.conf.get("DB_passwd")
        self.crawler_DB = self.conf.get("crawler_DB")
        self.crawler_table = self.conf.get("crawler_table")
        self.db = DBHandler(self.crawler_DB, self.DB_user, self.DB_passwd,
                            self.DB_url)
        self.db.connect()
        # MySQL columns are case-insensitive for search operations (unlike Oracle);
        # the default behavior can be changed when creating the table by
        # specifying "BINARY" on the column
        self.db.update("CREATE TABLE IF NOT EXISTS `" + self.crawler_table +
                       "` ("
                       " `page_id` int(20) NOT NULL AUTO_INCREMENT,"
                       " `page_url` varchar(200) BINARY NOT NULL,"
                       " `domain_name` varchar(100) BINARY NOT NULL,"
                       " `inner_links` text,"
                       " `outer_links` text,"
                       " `title` varchar(1024),"
                       " `normal_content` text,"
                       " `emphasized_content` text,"
                       " `keywords` varchar(1024),"
                       " `description` varchar(1024),"
                       " `text` longtext,"
                       " `PR_score` double default 0.0,"
                       " `ad_NR` int default 0,"
                       " `tag1` varchar(20) default null,"
                       " `tag2` varchar(20) default null,"
                       " `tag3` varchar(20) default null,"
                       " INDEX (`page_url`),"
                       " PRIMARY KEY (`page_id`)"
                       ")CHARSET=UTF8, ENGINE=InnoDB")
        self.db.update("truncate table " + self.crawler_table)

        # holds all the links to be sent back to the manager
        self._result_dict = {}
        # holds all the links received from the manager
        self._buffer = []
        self.result_sender = NetworkHandler(self.left_ip, self.left_port)
        self.links_requester = NetworkHandler(self.left_ip, self.left_port)
        self.focusing = True  # whether or not the crawler should do focused crawling
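Note: the crawler examples on this page construct ConfReader with a file name plus a default_conf mapping and then read every setting through conf.get(key). The real ConfReader implementation is not shown here; the sketch below is only an illustration of that interface, assuming a simple "key = value" configuration format.

class SimpleConfReader(object):
    """Illustrative stand-in for the ConfReader(file, defaults) / get(key) interface."""

    def __init__(self, path, defaults=None):
        self._values = dict(defaults or {})
        try:
            with open(path) as fp:
                for line in fp:
                    line = line.strip()
                    if not line or line.startswith("#"):
                        continue
                    key, _, value = line.partition("=")
                    self._values[key.strip()] = value.strip()
        except IOError:
            pass  # fall back to the defaults when the file cannot be read

    def get(self, key):
        return self._values.get(key)

# usage mirroring Example No. 1:  conf = SimpleConfReader("crawler.conf", default_conf)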
Example No. 2
    def __init__(self, file, destinations):
        self.file = file
        self.destinations = destinations

        self.logger = Logger.Logger("%sPXCopy.log" % (PXPaths.LOG), "INFO", "DDB").getLogger()
        self.manager = PXManager.PXManager()
        self.manager.setLogger(self.logger)
        self.manager.initNames()
        self.drp = DirectRoutingParser.DirectRoutingParser(PXPaths.ROUTING_TABLE, [], self.logger)
        self.drp.printErrors = False
        self.drp.parseAlias()
        self.flowDirsCache = CacheManager.CacheManager(maxEntries=10000, timeout=2 * 3600)
        self.machine = socket.gethostname()

        cr = ConfReader("%spx.conf" % (PXPaths.ETC))
        self.resendLimit = int(cr.getConfigValues("resendLimit")[0])
Example No. 3
    def __init__(self):
        """ Initialization """

        self.conf = ConfReader("crawler.conf", default_conf)
        self.logger = Logger()
        self.db = None

        self.thread_pool_size = self.conf.get("thread_pool_size")
        self.left_ip = self.conf.get("manager_ip")
        self.left_port = self.conf.get("manager_port")
        self._buffer_size_threshold = self.conf.get("buffer_size_threshold")
        # how many links we should return when the caller calls self.get_links()
        self._crawl_NR = self.conf.get("concurrent_crawl_NR")
        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.content_path = self.conf.get("content_path")
        self.crawling_timeout = self.conf.get("crawling_timeout")

        self.DB_url = self.conf.get("DB_url")
        self.DB_user = self.conf.get("DB_user")
        self.DB_passwd = self.conf.get("DB_passwd")
        self.crawler_DB = self.conf.get("crawler_DB")
        self.crawler_table = self.conf.get("crawler_table")
        self.db = DBHandler(self.crawler_DB, self.DB_user, self.DB_passwd, self.DB_url)
        self.db.connect()
        # MySQL columns are case-insensitive for search operations (unlike Oracle);
        # the default behavior can be changed when creating the table by
        # specifying "BINARY" on the column
        self.db.update("CREATE TABLE IF NOT EXISTS `" + self.crawler_table + "` ("
                       " `page_id` int(20) NOT NULL AUTO_INCREMENT,"
                       " `page_url` varchar(200) BINARY NOT NULL,"
                       " `domain_name` varchar(100) BINARY NOT NULL,"
                       " `inner_links` text,"
                       " `outer_links` text,"
                       " `title` varchar(1024),"
                       " `normal_content` text,"
                       " `emphasized_content` text,"
                       " `keywords` varchar(1024),"
                       " `description` varchar(1024),"
                       " `text` longtext,"
                       " `PR_score` double default 0.0,"
                       " `ad_NR` int default 0,"
                       " `tag1` varchar(20) default null,"
                       " `tag2` varchar(20) default null,"
                       " `tag3` varchar(20) default null,"
                       " INDEX (`page_url`),"
                       " PRIMARY KEY (`page_id`)"
                       ")CHARSET=UTF8, ENGINE=InnoDB" )
        self.db.update("truncate table " + self.crawler_table)

        # holds all the links to be sent back to the manager
        self._result_dict = {}
        # holds all the links received from the manager
        self._buffer = []
        self.result_sender = NetworkHandler(self.left_ip, self.left_port)
        self.links_requester = NetworkHandler(self.left_ip, self.left_port)
        self.focusing = True  # whether or not the crawler should do focused crawling
Example No. 4
    def __init__(self, file, destinations):
        self.file = file
        self.destinations = destinations

        self.logger = Logger.Logger("%sPXCopy.log" % (PXPaths.LOG), "INFO",
                                    "DDB").getLogger()
        self.manager = PXManager.PXManager()
        self.manager.setLogger(self.logger)
        self.manager.initNames()
        self.drp = DirectRoutingParser.DirectRoutingParser(
            PXPaths.ROUTING_TABLE, [], self.logger)
        self.drp.printErrors = False
        self.drp.parseAlias()
        self.flowDirsCache = CacheManager.CacheManager(maxEntries=10000,
                                                       timeout=2 * 3600)
        self.machine = socket.gethostname()

        cr = ConfReader("%spx.conf" % (PXPaths.ETC))
        self.resendLimit = int(cr.getConfigValues("resendLimit")[0])
Example No. 5
    def __init__(self):
        # We get the backend hostnames with ConfReader
        cr = ConfReader("%spx.conf" % (PXPaths.ETC))
        self.machines = ",".join(cr.getConfigValues("backend"))

        self.headerRegexes = {}
        self.searchRegex = ""
        self.searchType = ""
        self.names = ['*']
        self.logPath = ""
        self.since = 0
        self.fromdate = "epoch"
        self.todate = "now"
        self.timesort = False
        self.ftp = False

        # These are the default wildcard regexes used throughout the whole program.
        self.alphanumericRegex = "[.[:alnum:]-]"
        self.digitRegex = "[[:digit:]]"
        self.zeroOrMoreRegex = "*"
        self.oneOrMoreRegex = "+"

        self.fillHeaderRegexes()
Example No. 6
    def __init__(self):
        # We get the backend hostnames with ConfReader
        cr = ConfReader("%spx.conf" % (PXPaths.ETC))
        self.machines = ",".join(cr.getConfigValues("backend"))

        self.headerRegexes = {}
        self.searchRegex = ""
        self.searchType = ""
        self.names = ['*']
        self.logPath = ""
        self.since = 0
        self.fromdate = "epoch"
        self.todate = "now"
        self.timesort = False
        self.ftp = False

        # These are the default wildcard regexes used throughout the whole program.
        self.alphanumericRegex = "[.[:alnum:]-]"
        self.digitRegex = "[[:digit:]]"
        self.zeroOrMoreRegex = "*"
        self.oneOrMoreRegex = "+"

        self.fillHeaderRegexes()
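Note: the wildcard atoms above ("[[:alnum:]]", "[[:digit:]]") are POSIX bracket expressions, which Python's re module does not understand; this suggests the assembled patterns are meant for an external tool such as grep/egrep on the backend machines listed in px.conf. A hedged sketch of that kind of use, with a purely hypothetical log path:

import subprocess

alphanumericRegex = "[.[:alnum:]-]"
oneOrMoreRegex = "+"

pattern = alphanumericRegex + oneOrMoreRegex  # one or more header characters
proc = subprocess.Popen(["grep", "-E", pattern, "/apps/px/log/tx_somefeed.log"],
                        stdout=subprocess.PIPE)
matches, _ = proc.communicate()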
Example No. 7
from colormap import colormap

from threading import Thread, Lock, Condition

import libutil

import libclient

from ClientStructures import XlaterHelper, Mode
from ConfReader import ConfReader, read_modes
from getfir import getfir

gtk.gdk.threads_init()

# get configuration
conf = ConfReader(os.getenv("HOME") + "/.kukuruku/gui")

# initialize the graphic stuff
vbox = gtk.VBox(False, 0)
screen = None
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)

wait_for_info = Condition()

window = gtk.Window(gtk.WINDOW_TOPLEVEL)
toolbar = gtk.Toolbar()
toolbar.set_style(gtk.TOOLBAR_ICONS)
tb_record = gtk.ToolButton(gtk.STOCK_MEDIA_RECORD)
tb_record.set_tooltip_text("Record the entire baseband")
tb_dump = gtk.ToolButton(gtk.STOCK_SAVE)
Example No. 8
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

from __future__ import print_function

import os
import sys
import subprocess
import time

from ConfReader import ConfReader, read_modes

conf = ConfReader(os.getenv('HOME') + '/.kukuruku/gui')
modes = read_modes(os.getenv('HOME') + '/.kukuruku/modes')

blacklistfile = os.path.join(os.path.expanduser('~'), ".kukuruku/scanner/")
blacklistfile = os.path.join(blacklistfile, "blacklist.conf")

archdir = "archive"

if not os.path.isdir(archdir):
    os.mkdir(archdir)

if len(sys.argv) > 1:
    filenames = sys.argv[1:]
else:
    filenames = os.listdir(".")

for filename in filenames:

    fn = os.path.basename(filename)
Example No. 9
cgitb.enable()
import sys, os, commands
sys.path.append(sys.path[0] + "/../../lib")
sys.path.append("../../lib")
sys.path.append("/apps/px/lib")

from PDSPath import *
from ColumboPaths import *
from types import *
from myTime import *
import PXPaths
PXPaths.normalPaths()
import template
from ConfReader import ConfReader

cr = ConfReader("%spx.conf" % (PXPaths.ETC))
targets = cr.getConfigValues("backend")
user = cr.getConfigValues("user")[0]

form = cgi.FieldStorage()
machines = form["machines"].value


def getLogNames(type):
    """
    Gets the name of all logs on the target machines
    Arguments:
        type -> 'tx' or 'rx'
    Returns: a list of string
    """
    logNames = []
Example No. 10
sys.path.append(sys.path[0] + "/../../lib")
sys.path.append("../../lib")
sys.path.append("/apps/px/lib")
sys.path.append("/apps/px/lib/search")

import template
from PDSPath import *
from ColumboPaths import *
from types import *
from myTime import *

import searchResendUtils
import PXPaths; PXPaths.normalPaths()
from ConfReader import ConfReader

cr = ConfReader("%spx.conf" % (PXPaths.ETC))
user = cr.getConfigValues("user")[0]

form = cgi.FieldStorage()
item = form["item"].value

def readFromDB(file, host):
    """
    Reads a bulletin file from the database.
    The output is copied to a temporary file on the local machine.
    Arguments:
        host   -> machine that hosts the bulletin file
        dbPath -> path to the bulletin in the database
    Returns: path to the bulletin's copy
    """
    
Example No. 11
from colormap import colormap

from threading  import Thread, Lock, Condition

import libutil

import libclient

from ClientStructures import XlaterHelper, Mode
from ConfReader import ConfReader, read_modes
from getfir import getfir

gtk.gdk.threads_init()

# get configuration
conf = ConfReader(os.getenv('HOME') + '/.kukuruku/gui')

# initialize the graphic stuff
vbox = gtk.VBox(False, 0)
screen = None
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)

wait_for_info = Condition()

window = gtk.Window(gtk.WINDOW_TOPLEVEL)
toolbar = gtk.Toolbar()
toolbar.set_style(gtk.TOOLBAR_ICONS)
tb_record = gtk.ToolButton(gtk.STOCK_MEDIA_RECORD)
tb_record.set_tooltip_text("Record the entire baseband")
tb_dump = gtk.ToolButton(gtk.STOCK_SAVE)
Example No. 12
"""
This script downloads all the articles on Zendesk and saves them in HTML format in a newly created folder named "Zendesk_{date}".
:author: Qrhl
"""

from ConfReader import ConfReader
import datetime
import csv
import os
import requests
import pickle
from shutil import rmtree

logs = []

config = ConfReader('zendesk_conf_test')
path_pref = "{}".format(config.get_value("BACKUP_PATH"))
zendesk = "{}".format(config.get_value("URL"))
locale = "{}".format(config.get_value("LOCALE"))


def save_restore_list(path, restore_list):
    """
    Saves a list of the articles that are backed up in a binary file (Pickle). This list will be reused to restore the articles
    :param path: str
    :param restore_list: list
    :return: /
    """
    path_res = os.path.join(path, 'Restore_List')
    with open(path_res, 'wb') as file:
        pickler = pickle.Pickler(file)
Example No. 13
    def __init__(self, ip, port, buff_size=1024, listen_num=5, thread_num=10):
        """ Initialization """

        self.conf = ConfReader("manager.conf", default_conf)

        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.links_file = self.conf.get("links_file")
        self.how_many_links_file = self.conf.get("how_many_links_file")
        # record all the links we have crawled
        self._links_track = self.my_open(self.how_many_links_file, "w+")
        self._links_track_lock = threading.Lock()   #lock
        # how many links we send to crawler per request
        self._nsent = self.conf.get("links_to_crawler_NR")

        self.ip = ip
        self.port = port
        self.buff_size = buff_size
        self.listen_num = listen_num

        # socket initialization
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # SO_REUSEADDR must be set before bind() for it to take effect
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.sock.bind((self.ip, self.port))
        self.sock.listen(self.listen_num)

        _blmftr_record = self.conf.get("blmftr_record")
        _start_fresh = False if self.conf.get("continuous_crawl") == "yes" else True
        # the bloom_filter holds the links that have already been crawled (possibly unsuccessfully)
        self.bloom_filter = Bloom_filter.Bloom_filter(10000,
                                         0.001,  #error rate
                                         filename=(_blmftr_record, -1),
                                         start_fresh=_start_fresh)
        self.bf_lock = threading.Lock() # lock to access bloom_filter

        self.prio_que = PriQueue(self.links_file)          # manager's priority queue
        self.prio_que.get_links_from_disk() # initially get links from disk
        for l in self.prio_que.links:  # add to bloom filter
            self.bloom_filter.add(l)
        self.prio_ful_threshold = self.conf.get("prio_ful_threshold")

        self.thread_list = []        # list of threads in the manager
        self.thread_num = thread_num # how many threads we should start

        self.auto_speed = True
        self.speed_count = 0  # how many links to crawl from a website. Not in use now
        speed = self.conf.get("speed")
        if speed != "auto":   # the art of dynamic language
            self.speed_count = speed
            self.auto_speed = False

        self.focusing = False if self.conf.get("focus") == "no" else True

        # crawling_width takes effect when self.focusing is False
        self.crawling_width = self.conf.get("crawling_width")

        self.search_engine_weed = self.conf.get("search_engine_weed")

        # constants that indicate whether the crawler wants to send back links
        # or get links from here
        self.SEND = 0
        self.REQUEST = 1
        self.ASKFOCUSING = 2
Example No. 14
class Manager:
    """Core manager """

    def __init__(self, ip, port, buff_size=1024, listen_num=5, thread_num=10):
        """ Initialization """

        self.conf = ConfReader("manager.conf", default_conf)

        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.links_file = self.conf.get("links_file")
        self.how_many_links_file = self.conf.get("how_many_links_file")
        # record all the links we have crawled
        self._links_track = self.my_open(self.how_many_links_file, "w+")
        self._links_track_lock = threading.Lock()   #lock
        # how many links we send to crawler per request
        self._nsent = self.conf.get("links_to_crawler_NR")

        self.ip = ip
        self.port = port
        self.buff_size = buff_size
        self.listen_num = listen_num

        # socket initialization
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # SO_REUSEADDR must be set before bind() for it to take effect
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.sock.bind((self.ip, self.port))
        self.sock.listen(self.listen_num)

        _blmftr_record = self.conf.get("blmftr_record")
        _start_fresh = False if self.conf.get("continuous_crawl") == "yes" else True
        # the bloom_filter holds the links that have already been crawled (possibly unsuccessfully)
        self.bloom_filter = Bloom_filter.Bloom_filter(10000,
                                         0.001,  #error rate
                                         filename=(_blmftr_record, -1),
                                         start_fresh=_start_fresh)
        self.bf_lock = threading.Lock() # lock to access bloom_filter

        self.prio_que = PriQueue(self.links_file)          # manager's priority queue
        self.prio_que.get_links_from_disk() # initially get links from disk
        for l in self.prio_que.links:  # add to bloom filter
            self.bloom_filter.add(l)
        self.prio_ful_threshold = self.conf.get("prio_ful_threshold")

        self.thread_list = []        # list of threads in the manager
        self.thread_num = thread_num # how many threads we should start

        self.auto_speed = True
        self.speed_count = 0  # how many links to crawl from a website. Not in use now
        speed = self.conf.get("speed")
        if speed != "auto":   # the art of dynamic language
            self.speed_count = speed
            self.auto_speed = False

        self.focusing = False if self.conf.get("focus") == "no" else True

        # crawling_width takes effect when self.focusing is False
        self.crawling_width = self.conf.get("crawling_width")

        self.search_engine_weed = self.conf.get("search_engine_weed")

        # constants that indicate whether the crawler wants to send back links
        # or get links from here
        self.SEND = 0
        self.REQUEST = 1
        self.ASKFOCUSING = 2

    def bf_acquire(self):
        self.bf_lock.acquire()

    def bf_release(self):
        self.bf_lock.release()

    def generate_search_engine_random_links(self):
        """ generate random search engine links
            (e.g., https://www.baidu.com/s?wd=xxxx)"""

        with open("conf/american-english", "r") as english_file:
            english_lines = english_file.readlines()
        with open("conf/idf-chinese.txt", "r") as chinese_file:
            chinese_lines = chinese_file.readlines()

        for _ in range(1000): # 1000 links
            i = random.randint(0, len(english_lines) - 1)
            j = random.randint(0, len(chinese_lines) - 1)
            line1 = english_lines[i].strip()
            line2 = chinese_lines[j].strip()
            link = self.search_engine_weed + urllib.parse.quote_plus(line1 + "+" + line2)
            logger.info("Link generated: " + str(link))
            self.prio_que.append_random(link)

    def handle_connection(self, conn, addr):
        """ handle connection with some crawler """

        # set a timeout for this connection, so that the failure of one crawler
        # does not waste the manager's resources
        conn.settimeout(60)

        method = None
        data_buf = []
        try:
            method = self.get_conn_type(conn)
            if (method == self.SEND):  # client wants to send links to the manager
                conn.sendall(b'OK')
                while True:
                    data = conn.recv(self.buff_size)
                    if data:
                        data_buf.append(data)
                    else:
                        break
                #result dict is a dict: { link: {set of links} or 'FAIL'}
                data = b''.join(data_buf)
                _result_dict = pickle.loads(data)
                crawled_links = []
                for key, value in _result_dict.items():
                    # links that were not crawled successfully map to 'FAIL' (we crawl each link only once, whether it succeeds or not)
                    self.bf_acquire()
                    if (value == 'FAIL'):
                        pass
                    else:
                        crawled_links.append(key)
                        # limit links count(it's ok to exceed a little bit)
                        if len(self.prio_que) < self.prio_ful_threshold:
                            for sub_link in value:
                                if sub_link not in self.bloom_filter:
                                    self.bloom_filter.add(sub_link)
                                    self.prio_que.append(sub_link)
                    self.bf_release()

                # remove links of dominant domain so that links
                # from other domains have an opportunity to be
                # crawled
                if self.focusing:
                    self.prio_que.remove_dominant()

                #write all the link to `self._links_track`
                with self._links_track_lock:
                    for link in crawled_links:
                        self._links_track.write(str(link) + "\n")

            elif (method == self.REQUEST):  # crawler requests some links
                conn.sendall(b'OK')
                if (self.prio_que.domains_nr() > self.crawling_width):
                    self.focusing = True  # we have enough domains now, so we switch to focused crawling
                    logger.info("[[Focused-crawling]] crawling_width[%d], domains_nr[%d]\n" % (
                        self.crawling_width, self.prio_que.domains_nr()),
                        Logger.STDOUT)
                #crawler would ask whether or not to be focusing
                if self.get_conn_type(conn) == self.ASKFOCUSING:
                    if self.focusing:
                        conn.sendall(b'OK')
                    else:
                        conn.sendall(b'NO') # can only send back two bytes

                """ 假如prioQueue里面没有了就会返回一个空的lists """
                data = None
                links_buffer = []
                try:
                    for _ in range(self._nsent):  # send self._nsent links at a time
                        links_buffer.append(self.prio_que.get_by_addr(addr[0]))
                except EmptyPriQueue:
                    pass
                except Exception as e:
                    raise Exception("Exception:[%s] when getting links from PriQueue" % str(e))
                # if prio_que has no links left, what we send over is simply an empty list
                #
                # if there are no links left, fetch some random links from the search engines
                if not len(links_buffer):
                    logger.info("Empty priority queue now. Trying to generate random links from search engine...")
                    self.generate_search_engine_random_links()

                data = pickle.dumps(links_buffer)
                try:
                    conn.sendall(data)
                except Exception as e:
                    raise
            else:
                raise Exception("UNKNOWN CONNECTION TYPE")
        except Exception as e:
            # don't propagate the exception further up: this is a multi-threaded model,
            # so exceptions should be handled inside this function
            logger.info("Exception:[%s]" % str(e), Logger.STDERR)
        finally:
            conn.close()

    def get_conn_type(self, conn):
        """ get connection type(SEND or REQUEST) of this connection """
        try:
            data = conn.recv(self.buff_size)
            if (data == b'SEND'):
                return self.SEND
            elif (data == b'REQUEST'):
                return self.REQUEST
            elif (data == b'FOCUSING?'):
                return self.ASKFOCUSING
            else:
                return None
        except Exception:
            raise

    def run(self):
        """ start manager """
        logger.info("manager start running at: [%s]\n" % str(datetime.datetime.now()),
                Logger.STDOUT)
        while(True):
            # we only want a fixed number of threads in this program
            if (len(self.thread_list) > self.thread_num):
                for thread in self.thread_list:
                    thread.join()
                self.thread_list = []
            conn, addr = self.sock.accept()
            logger.info("Connection established: %s\n" % str(addr), Logger.STDOUT)
            t = threading.Thread(target=self.handle_connection, args=(conn, addr))
            #t.daemon = True
            self.thread_list.append(t)
            t.start()
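Note: NetworkHandler itself is not shown on this page, but handle_connection() and get_conn_type() above imply a small wire protocol: the crawler opens a TCP connection, sends b'SEND' or b'REQUEST', waits for b'OK', may then ask b'FOCUSING?', and finally either streams a pickled result dict or reads back a pickled list of links. A minimal sketch of the crawler side, assuming exactly that protocol:

import pickle
import socket

def request_links(manager_ip, manager_port, buff_size=1024):
    """Ask the manager for a batch of links; returns (focusing, links)."""
    sock = socket.create_connection((manager_ip, manager_port))
    try:
        sock.sendall(b'REQUEST')
        if sock.recv(buff_size) != b'OK':
            raise RuntimeError("manager refused the request")
        sock.sendall(b'FOCUSING?')       # manager answers b'OK' or b'NO' (two bytes)
        focusing = sock.recv(2) == b'OK'
        chunks = []
        while True:                      # manager closes the socket after sendall(data)
            data = sock.recv(buff_size)
            if not data:
                break
            chunks.append(data)
        return focusing, pickle.loads(b''.join(chunks))
    finally:
        sock.close()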
Example No. 15
class Crawler(object):
    """ The crawler.
        Multiple threads would be started in method run() """

    def __init__(self):
        """ Initialization """

        self.conf = ConfReader("crawler.conf", default_conf)
        self.logger = Logger()
        self.db = None

        self.thread_pool_size = self.conf.get("thread_pool_size")
        self.left_ip = self.conf.get("manager_ip")
        self.left_port = self.conf.get("manager_port")
        self._buffer_size_threshold = self.conf.get("buffer_size_threshold")
        # how many links we should return when the caller calls self.get_links()
        self._crawl_NR = self.conf.get("concurrent_crawl_NR")
        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.content_path = self.conf.get("content_path")
        self.crawling_timeout = self.conf.get("crawling_timeout")

        self.DB_url = self.conf.get("DB_url")
        self.DB_user = self.conf.get("DB_user")
        self.DB_passwd = self.conf.get("DB_passwd")
        self.crawler_DB = self.conf.get("crawler_DB")
        self.crawler_table = self.conf.get("crawler_table")
        self.db = DBHandler(self.crawler_DB, self.DB_user, self.DB_passwd, self.DB_url)
        self.db.connect()
        # MySQL columns are case-insensitive for search operations (unlike Oracle);
        # the default behavior can be changed when creating the table by
        # specifying "BINARY" on the column
        self.db.update("CREATE TABLE IF NOT EXISTS `" + self.crawler_table + "` ("
                       " `page_id` int(20) NOT NULL AUTO_INCREMENT,"
                       " `page_url` varchar(200) BINARY NOT NULL,"
                       " `domain_name` varchar(100) BINARY NOT NULL,"
                       " `sublinks` text,"
                       " `title` varchar(1024),"
                       " `normal_content` text,"
                       " `emphasized_content` text,"
                       " `keywords` varchar(1024),"
                       " `description` varchar(1024),"
                       " `text` longtext,"
                       " `PR_score` double default 0.0,"
                       " `ad_NR` int default 0,"
                       " `tag` varchar(20) default null,"
                       #" `classify_attribute_1` ...
                       #" `classify_attribute_2` ...
                       " PRIMARY KEY (`page_id`),"
                       " INDEX (`page_url`)"
                       ")CHARSET=UTF8, ENGINE=InnoDB" )
        self.db.update("truncate table " + self.crawler_table)

        # holds all the links to be sent back to the manager
        self._result_dict = {}
        # holds all the links received from the manager
        self._buffer = []
        self.result_sender = NetworkHandler(self.left_ip, self.left_port)
        self.links_requester = NetworkHandler(self.left_ip, self.left_port)
        self.focusing = True  # whether or not the crawler should do focused crawling


    def get_links(self):
        """ used to get urls from manager.
        we use a buffer, so that we can get 50 links from manager,
        and then return 10 links with call to self.get_links() one by one.
        To do this, for example, user can adjust the 'concurrent_crawl_NR'
        setting in 'conf/crawler.conf' to 10 and 'links_to_crawler_NR' to 50.

        Note that we don't have to set any timeount here,
        because, after all, crawler have to get some links from
        manager side before it can continue """

        # if there are not enough links in the buffer
        if len(self._buffer) < self._buffer_size_threshold:
            try:
                # the manager returns links together with a
                # flag (self.focusing), which tells the crawler whether it
                # should still be doing focused crawling or not
                (self.focusing, links) = self.links_requester.request()
                self.logger.info("links_requester succeed request()")
                if not links:
                    #return whatever in self._buffer
                    tmp = self._buffer
                    self._buffer = []
                    return tmp
                else:
                    self._buffer.extend(links)
                    #make sure that we don't exceed the limit
                    nsent = (self._crawl_NR if self._crawl_NR <= len(self._buffer)
                                            else len(self._buffer))
                    tmp = []
                    for _ in range(nsent):
                        tmp.append(self._buffer.pop())
                    return tmp
            except Exception:
                raise
        else:
            #make sure that we don't exceed the limit
            nsent = (self._crawl_NR if self._crawl_NR <= len(self._buffer)
                                    else len(self._buffer))
            #we have enough links, so just return
            tmp = []
            for _ in range(nsent):
                tmp.append(self._buffer.pop())
            return tmp

    @staticmethod
    def req(url, **kwargs):
        page = requests.get(url, **kwargs)
        trytime = 1
        while trytime < _exceeded_try and page.status_code != 200:
            page = requests.get(url, **kwargs)
            time.sleep(_pause_interval)
            trytime = trytime + 1
        return page


    def get_web(self, resolved_url):
        """used to grab a web information and return a Response object."""

        # fake being 'Baidu Spider'. Can also fake GoogleBot or YoudaoBot,
        # but this may be easily detected due to IP mismatch
        # NOTE: According to RFC 7230, HTTP header names are case-INsensitive
        headers={
                'Accept':'text/plain, text/html', #want only text
                #"accept-encoding":"gzip, deflate, sdch",
                #"accept-language":"en-US,en;q=0.8",
                #"Cache-Control":"max-age=0",
                #"Cookie":"timezone=480; I2KBRCK=1; cookiePolicy=accept",
                #"Host":"www.tandfonline.com",
                #"Proxy-Connection":"keep-alive",
                #"Referer":"https://www.tandfonline.com",
                #"Upgrade-Insecure-Requests":"1",
                #"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
                "User-agent":"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
                }

        try:
            response = Crawler.req(resolved_url, headers=headers, timeout=self.crawling_timeout)
            self.logger.info("Get response[%d]: [%s]" % (response.status_code, resolved_url))
            #check whether we get a plain text response
            #note that key in `response.headers` is case insensitive
            if 'content-type' in response.headers:
                if 'text/' not in response.headers['content-type']:
                    return None
            if (response.status_code == requests.codes.ok): #200
                return response
            else:
                return None
        except Exception as e:
            self.logger.info("Fail to fetch page. Exception: %s, url:[%s]" % (str(e), resolved_url))
            return None

    def run(self):
        """ main routine of crawler class
            @urls: used to hold the raw urls got from the left.  """

        while (True):
            try:
                urls = self.get_links()
            except Exception as e:
                self.logger.info("Cannot get urls. crawler sleep for 10 seconds.\n"
                        "\tException:[%s]\n" % str(e))
                time.sleep(10) #wait a little bit to see if thing would get better
                continue
            if not urls:
                self.logger.info("Empty urls from dns_resolver. Crawler will loop")
                time.sleep(10)
                continue

            #####DEBUG
            self.logger.info("GOT urls from manager: [")
            for u in urls:
                self.logger.info("\t" + u)
            self.logger.info("  ]")
            #####END

            # crawl the links
            with ThreadPoolExecutor(self.thread_pool_size) as pool:
                responses = pool.map(self.get_web, urls)

            # start processing the responses: pair the extracted inner links with their source link and send them back
            for index, resp in enumerate(responses):
                origin = urls[index]
                if not resp:
                    self._result_dict[origin] = "FAIL"
                else:
                    try:
                        # Note that resp is already of type 'text/html'
                        # Note that resp.text returns a unicode string
                        outer_links, inner_links = self.extract_link(origin, resp.text)
                    except Exception as e:
                        self.logger.info(("Exception when extract_links:[%s],"
                                "url:[%s]\n") % (str(e), origin))
                        continue
                    self.logger.info("Finished extract_links()")
                    outer_links = set(outer_links)
                    inner_links = set(inner_links)
                    if self.focusing:
                        self.logger.info("crawler is FOCUSING now.\n")
                        self._result_dict[origin] = self.trim_url_suffix(inner_links)
                    else:
                        self._result_dict[origin] = self.trim_url_suffix(outer_links)

                    # resp.content returns a 'bytes' object
                    try:
                        self.dump_content(resp, origin)
                    except Exception as e:
                        self.logger.info(("Exception when dump_content():[%s],"
                                "url:[%s]") % (str(e), origin))
                        traceback.print_exc()
                        continue
                    self.logger.info("Finished dump_content()")

            data = pickle.dumps(self._result_dict)
            try:
                self.result_sender.send(data)
                self.logger.info("successfully sent back to the left\n")
            except Exception as e:
                self.logger.info(("Fail sending to manager:[%s]\n"
                                    "unsent links:[%s]\n") % (str(e), str(self._result_dict)))
            finally:
                self._result_dict = {}

    def extract_link(self, origin_url, html):
        """This function is used for extract all links from the web.
           It would distinct the inner links and outer links.
           For inner links, it should add the header and
           delete the tag#, remove .css and javascript link"""
        html_text = etree.HTML(html)
        links = html_text.xpath('//*/a/@href') #all the links, relative or absolute

        origin_url = origin_url.strip()
        # get the url domain to define the website
        protocal, domain = self.get_protocal_domain(origin_url)

        #useless file pattern (something like xxx.jpg, xxx.mp4, xxx.css, xxx.pdf, etc)
        uf_pattern = re.compile(r'\.jpg$|\.png|\.xml|\.mp4|\.mp3|\.css|\.pdf|\.svg|\.gz|\.zip|\.rar|\.exe|\.tar')
        # unsupported protocol pattern (something like ftp://, sftp://, thunder://, etc)
        up_pattern = re.compile(r'^.{0,10}:')
        # we only support the http/https protocol
        sp_pattern = re.compile(r'http://|https://')

        outer_link_lists = []
        inner_link_lists = []
        for element in links:
            element = element.strip()
            if re.match(sp_pattern, element):  # begin with http/https
                # first check if it matches one of the useless patterns
                if re.findall(uf_pattern, element):
                    continue
                #check whether it's outer link or inner link
                test_protocal, test_domain = self.get_protocal_domain(element)
                if test_domain != domain:
                    outer_link_lists.append(element.strip())
                else:
                    inner_link_lists.append(element.strip())
            elif re.findall(uf_pattern, element):
                continue
            elif re.findall(up_pattern, element):
                continue
            else:
                if element.startswith('/'):
                    link = protocal + '://' + domain + element
                else:
                    link = protocal + '://' + domain + '/' + element
                inner_link_lists.append(link.strip())

        return (outer_link_lists, inner_link_lists)

    def trim_url_suffix(self, urls):
        """
        trim those urls with suffix `#xxxxx' or `?xxxx'
        NOTE that ALL URLS PASSED IN MUST BE VALID!!!
        """
        def _trim_url_suffix(url): #make it reusable
            #tag link pattern
            return url.split('#')[0].split('?')[0]

        return list(map(_trim_url_suffix, urls))

    def get_protocal_domain(self, url):
        """ return protocal and domain """
        protocal, rest = urllib.parse.splittype(url)
        domain, url_suffix = urllib.parse.splithost(rest)
        return (protocal, domain)

    def dump_content(self, resp, origin_url):
        """ requests cannot detect web page encoding automatically(F**K!).
            response.encoding is from the html reponse header. If we want to
            convert all the content we want to utf8, we have to use `get_encodings_from_content; """
        # resp.text is in unicode(type 'str')
        # resp.content is in unicode(type 'bytes')
        text = resp.text
        # requests takes the html page encoding from the HTTP Response header; if the
        # Response header provides no info about the encoding, then requests
        # defaults to 'ISO-8859-1'. But most of the time we can detect the
        # encoding from the html page content
        if(resp.encoding == 'ISO-8859-1' and not 'ISO-8859-1' in resp.headers.get('Content-Type', '')):
            try:
                real_encoding = requests.utils.get_encodings_from_content(resp.text)[0]
                text = resp.content.decode(real_encoding, 'ignore')
            except Exception:
                text = resp.content.decode('utf-8', 'ignore')
        html_tree = etree.HTML(text)
        kws = html_tree.xpath('//*/meta[re:test(@name, "[Kk]eywords?")]/@content', namespaces={'re': "http://exslt.org/regular-expressions"})
        descs = html_tree.xpath('//*/meta[re:test(@name, "[Dd]escription")]/@content', namespaces={'re': "http://exslt.org/regular-expressions"})
        kw = kws[0] if kws else ""
        desc = descs[0] if descs else ""
        kw = kw.encode('utf-8', 'ignore')
        desc = desc.encode('utf-8', 'ignore')

        try:
            real_encoding = requests.utils.get_encodings_from_content(resp.text)[0]
            utf8_text = resp.content.decode(real_encoding, "ignore").encode('utf-8')
        except Exception:
            utf8_text = resp.content

        # requests may follow redirects. For example,
        #       http://bbs.people.com.cn/
        # may be redirected to
        #       http://bbs1.people.com.cn/
        # so if we took resp.url as the crawled url,
        # we would end up with duplicate urls in the database. That is why
        # we use the origin_url passed in (bbs, NOT bbs1)
        #
        #page_url = bytes(resp.url, 'utf-8')
        page_url = origin_url

        _, domain_name = self.get_protocal_domain(resp.url)
        domain_name = bytes(domain_name, 'utf-8')
        titles = re.findall(rb'<title>(.*?)</title>', utf8_text)
        title = titles[0] if titles else b''

        self.db.update("INSERT INTO " + self.crawler_table + "(`page_url`, `domain_name`,"
                "`title`, `text`, `keywords`, `description`) "
                "VALUES (%s, %s, %s, %s, %s, %s);",
                (page_url, domain_name, title, utf8_text, kw, desc))
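Note: the trickiest part of dump_content() above is the encoding handling: requests falls back to ISO-8859-1 when the HTTP headers declare no charset, so the code re-detects the real encoding from the page content. A standalone sketch of that logic, using the same requests.utils helper the example already relies on:

import requests

def response_to_utf8(resp):
    """Best-effort re-decode of a requests Response body as UTF-8 bytes."""
    declared = resp.headers.get('Content-Type', '')
    if resp.encoding == 'ISO-8859-1' and 'ISO-8859-1' not in declared:
        try:
            real_encoding = requests.utils.get_encodings_from_content(resp.text)[0]
            return resp.content.decode(real_encoding, 'ignore').encode('utf-8')
        except (IndexError, LookupError):
            pass
    return resp.content.decode(resp.encoding or 'utf-8', 'ignore').encode('utf-8')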
Example No. 16
import cgitb; cgitb.enable()
import sys, os, pwd, time, re, pickle, commands
sys.path.append(sys.path[0] + "/../../lib");
sys.path.append("../../lib")
sys.path.append("/apps/px/lib")
sys.path.append("/apps/px/lib/importedLibs")

import template
from PDSPath import *
from ColumboPaths import *
from types import *
from myTime import *
import PXPaths; PXPaths.normalPaths()
from ConfReader import ConfReader

cr = ConfReader("%spx.conf" % (PXPaths.ETC))
user = cr.getConfigValues("user")[0]
backends = cr.getConfigValues("backend")

form = cgi.FieldStorage()

def menuContent():
    """
    Creates the menu options dynamically
    Returns: a string
    """
    
    flows = []
    for backend in backends:
        status, output = commands.getstatusoutput("sudo -u %s ssh %s python /apps/px/lib/search/getFlowList.py" % (user, backend))
        if not status:
Example No. 17
sys.path.append("../../lib")
sys.path.append("/apps/px/lib")
sys.path.append("/apps/px/lib/search")

import template
from PDSPath import *
from ColumboPaths import *
from types import *
from myTime import *

import searchResendUtils
import PXPaths
PXPaths.normalPaths()
from ConfReader import ConfReader

cr = ConfReader("%spx.conf" % (PXPaths.ETC))
user = cr.getConfigValues("user")[0]

form = cgi.FieldStorage()
item = form["item"].value


def readFromDB(file, host):
    """
    Reads a bulletin file from the database.
    The output is copied to a temporary file on the local machine.
    Arguments:
        host   -> machine that hosts the bulletin file
        dbPath -> path to the bulletin in the database
    Returns: path to the bulletin's copy
    """
Example No. 18
import cgi
import cgitb; cgitb.enable()
import sys, os, commands
sys.path.append(sys.path[0] + "/../../lib");
sys.path.append("../../lib")
sys.path.append("/apps/px/lib")

from PDSPath import *
from ColumboPaths import *
from types import *
from myTime import *
import PXPaths; PXPaths.normalPaths()
import template
from ConfReader import ConfReader

cr = ConfReader("%spx.conf" % (PXPaths.ETC))
targets = cr.getConfigValues("backend")
user = cr.getConfigValues("user")[0]

form = cgi.FieldStorage()
machines = form["machines"].value

def getLogNames(type):
    """
    Gets the name of all logs on the target machines
    Arguments:
        type -> 'tx' or 'rx'
    Returns: a list of string
    """
    logNames = []
    for target in targets:
Exemplo n.º 19
0
import requests
import os
import csv
import datetime
from ConfReader import ConfReader
from git import Repo
from shutil import rmtree

config = ConfReader("./gh.conf")
path_pref = config.get_value("PATH")
logs = []


def get_dates():
    """
    Counts the folders in GitHub_backup and sorts the dates of the files
    :return: A sorted list of the dates and the number of folders
    """
    dates = []
    count_folders = 0
    for dir in os.listdir(path_pref):
        try:
            if dir.split("_") == "GitHub":
                count_folders += 1
                dates.append(dir.split("_")[1])
        except Exception:
            pass
    dates.sort()
    return dates, count_folders
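Note: Example No. 19 imports rmtree and counts how many GitHub_{date} folders already exist, which suggests the sorted dates are used to rotate old backups. A hedged sketch of such a rotation step, reusing get_dates() and path_pref from the script above (prune_old_backups and max_backups are assumptions, not part of the original):

def prune_old_backups(max_backups=5):
    from shutil import rmtree
    dates, count = get_dates()
    # delete the oldest GitHub_{date} folders until only max_backups remain
    for date in dates[:max(0, count - max_backups)]:
        rmtree(os.path.join(path_pref, "GitHub_" + date))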

Example No. 20
class Crawler(object):
    """ The crawler.
        Multiple threads would be started in method run() """
    def __init__(self):
        """ Initialization """

        self.conf = ConfReader("crawler.conf", default_conf)
        self.logger = Logger()
        self.db = None

        self.thread_pool_size = self.conf.get("thread_pool_size")
        self.left_ip = self.conf.get("manager_ip")
        self.left_port = self.conf.get("manager_port")
        self._buffer_size_threshold = self.conf.get("buffer_size_threshold")
        # how many links we should return when the caller calls self.get_links()
        self._crawl_NR = self.conf.get("concurrent_crawl_NR")
        self.my_open = uopen if self.conf.get(
            "buffer_output") == "no" else open
        self.content_path = self.conf.get("content_path")
        self.crawling_timeout = self.conf.get("crawling_timeout")

        self.DB_url = self.conf.get("DB_url")
        self.DB_user = self.conf.get("DB_user")
        self.DB_passwd = self.conf.get("DB_passwd")
        self.crawler_DB = self.conf.get("crawler_DB")
        self.crawler_table = self.conf.get("crawler_table")
        self.db = DBHandler(self.crawler_DB, self.DB_user, self.DB_passwd,
                            self.DB_url)
        self.db.connect()
        # MySQL columns are case-insensitive for search operations (unlike Oracle);
        # the default behavior can be changed when creating the table by
        # specifying "BINARY" on the column
        self.db.update("CREATE TABLE IF NOT EXISTS `" + self.crawler_table +
                       "` ("
                       " `page_id` int(20) NOT NULL AUTO_INCREMENT,"
                       " `page_url` varchar(200) BINARY NOT NULL,"
                       " `domain_name` varchar(100) BINARY NOT NULL,"
                       " `inner_links` text,"
                       " `outer_links` text,"
                       " `title` varchar(1024),"
                       " `normal_content` text,"
                       " `emphasized_content` text,"
                       " `keywords` varchar(1024),"
                       " `description` varchar(1024),"
                       " `text` longtext,"
                       " `PR_score` double default 0.0,"
                       " `ad_NR` int default 0,"
                       " `tag1` varchar(20) default null,"
                       " `tag2` varchar(20) default null,"
                       " `tag3` varchar(20) default null,"
                       " INDEX (`page_url`),"
                       " PRIMARY KEY (`page_id`)"
                       ")CHARSET=UTF8, ENGINE=InnoDB")
        self.db.update("truncate table " + self.crawler_table)

        # holds all the links to be sent back to the manager
        self._result_dict = {}
        # holds all the links received from the manager
        self._buffer = []
        self.result_sender = NetworkHandler(self.left_ip, self.left_port)
        self.links_requester = NetworkHandler(self.left_ip, self.left_port)
        self.focusing = True  # whether or not the crawler should do focused crawling

    def get_links(self):
        """ used to get urls from manager.
        we use a buffer, so that we can get 50 links from manager,
        and then return 10 links with call to self.get_links() one by one.
        To do this, for example, user can adjust the 'concurrent_crawl_NR'
        setting in 'conf/crawler.conf' to 10 and 'links_to_crawler_NR' to 50.

        Note that we don't have to set any timeount here,
        because, after all, crawler have to get some links from
        manager side before it can continue """

        # if there are not enough links in the buffer
        if len(self._buffer) < self._buffer_size_threshold:
            try:
                # the manager returns links together with a
                # flag (self.focusing), which tells the crawler whether it
                # should still be doing focused crawling or not
                (self.focusing, links) = self.links_requester.request()
                self.logger.info("links_requester succeed request()")
                if not links:
                    # return whatever in self._buffer
                    tmp = self._buffer
                    self._buffer = []
                    return tmp
                else:
                    self._buffer.extend(links)
                    # make sure that we don't exceed the limit
                    nsent = (self._crawl_NR
                             if self._crawl_NR <= len(self._buffer) else len(
                                 self._buffer))
                    tmp = []
                    for _ in range(nsent):
                        tmp.append(self._buffer.pop())
                    return tmp
            except Exception:
                raise
        else:
            # make sure that we don't exceed the limit
            nsent = (self._crawl_NR if self._crawl_NR <= len(self._buffer) else
                     len(self._buffer))
            # we have enough links, so just return
            tmp = []
            for _ in range(nsent):
                tmp.append(self._buffer.pop())
            return tmp

    @staticmethod
    def req(url, **kwargs):
        page = requests.get(url, **kwargs)
        trytime = 1
        while trytime < _exceeded_try and page.status_code != 200:
            page = requests.get(url, **kwargs)
            time.sleep(_pause_interval)
            trytime = trytime + 1
        return page

    def get_web(self, resolved_url):
        """used to grab a web information and return a Response object."""

        # fake being 'Baidu Spider'. Can also fake GoogleBot or YoudaoBot,
        # but this may be easily detected due to IP mismatch
        # NOTE: According to RFC 7230, HTTP header names are case-INsensitive
        headers = {
            'Accept':
            'text/plain, text/html',  #want only text
            # "accept-encoding":"gzip, deflate, sdch",
            # "accept-language":"en-US,en;q=0.8",
            # "Cache-Control":"max-age=0",
            # "Cookie":"timezone=480; I2KBRCK=1; cookiePolicy=accept",
            # "Host":"www.tandfonline.com",
            # "Proxy-Connection":"keep-alive",
            # "Referer":"https://www.tandfonline.com",
            # "Upgrade-Insecure-Requests":"1",
            # "User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
            "User-agent":
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        }

        try:
            response = Crawler.req(resolved_url,
                                   headers=headers,
                                   timeout=self.crawling_timeout)
            self.logger.info("Get response[%d]: [%s]" %
                             (response.status_code, resolved_url))
            # check whether we get a plain text response
            # note that key in `response.headers` is case insensitive
            if 'content-type' in response.headers:
                if 'text/' not in response.headers['content-type']:
                    return None
            if response.status_code == requests.codes.ok:  # 200
                return response
            else:
                return None
        except Exception as e:
            self.logger.info("Fail to fetch page. Exception: %s, url:[%s]" %
                             (str(e), resolved_url))
            return None

    def run(self):
        """ main routine of crawler class
            @urls: used to hold the raw urls got from the left.  """

        while True:
            try:
                urls = self.get_links()
            except Exception as e:
                self.logger.info(
                    "Cannot get urls. Crawler sleep for 10 seconds.\n"
                    "\tException:[%s]\n" % str(e))
                time.sleep(
                    10)  # wait a little bit to see if thing would get better
                continue
            if not urls:
                self.logger.info(
                    "Empty urls from dns_resolver. Crawler will loop")
                time.sleep(10)
                continue

            #####DEBUG
            self.logger.info("GOT urls from manager: [")
            for u in urls:
                self.logger.info("\t" + u)
            self.logger.info("  ]")
            #####END

            # crawl the links
            with ThreadPoolExecutor(self.thread_pool_size) as pool:
                responses = pool.map(self.get_web, urls)

            # start processing the responses: pair the extracted inner links with their source link and send them back
            for index, resp in enumerate(responses):
                origin = urls[index]
                if not resp:
                    self._result_dict[origin] = "FAIL"
                else:
                    try:
                        # Note that resp is already of type 'text/html'
                        # Note that resp.text returns a unicode string
                        outer_links, inner_links = self.extract_link(
                            origin, resp.text)
                    except Exception as e:
                        self.logger.info(("Exception when extract_links:[%s],"
                                          "url:[%s]\n") % (str(e), origin))
                        continue
                    self.logger.info("Finished extract_links()")
                    outer_links = set(outer_links)
                    inner_links = set(inner_links)
                    if self.focusing:
                        self.logger.info("crawler is FOCUSING now.\n")
                        self._result_dict[origin] = self.trim_url_suffix(
                            inner_links)
                    else:
                        self._result_dict[origin] = self.trim_url_suffix(
                            outer_links)

                    # resp.content returns a 'bytes' object
                    try:
                        self.dump_content(resp, origin, inner_links,
                                          outer_links)
                    except Exception as e:
                        self.logger.info(("Exception when dump_content():[%s],"
                                          "url:[%s]") % (str(e), origin))
                        traceback.print_exc()
                        continue
                    self.logger.info("Finished dump_content()")

            data = pickle.dumps(self._result_dict)
            try:
                self.result_sender.send(data)
                self.logger.info("successfully sent back to the left\n")
            except Exception as e:
                self.logger.info(
                    ("Fail sending to manager:[%s]\n"
                     "unsent links:[%s]\n") % (str(e), str(self._result_dict)))
            finally:
                self._result_dict = {}

    def extract_link(self, origin_url, html):
        """This function is used for extract all links from the web.
           It would distinct the inner links and outer links.
           For inner links, it should add the header and
           delete the tag#, remove .css and javascript link"""
        html_text = etree.HTML(html)
        links = html_text.xpath(
            '//*/a/@href')  #all the links, relative or absolute

        origin_url = origin_url.strip()
        # get the url domain to define the website
        protocal, domain = self.get_protocal_domain(origin_url)

        # useless file pattern (something like xxx.jpg, xxx.mp4, xxx.css, xxx.pdf, etc)
        uf_pattern = re.compile(
            r'\.jpg$|\.png|\.xml|\.mp4|\.mp3|\.css|\.pdf|\.svg|\.gz|\.zip|\.rar|\.exe|\.tar'
        )
        # unsupported protocol pattern (something like ftp://, sftp://, thunder://, etc)
        up_pattern = re.compile(r'^.{0,10}:')
        # we only support the http/https protocol
        sp_pattern = re.compile(r'http://|https://')

        outer_link_lists = []
        inner_link_lists = []
        for element in links:
            element = element.strip()
            if re.match(sp_pattern, element):  # begin with http/https
                # first check if it matches one of the useless patterns
                if re.findall(uf_pattern, element):
                    continue
                # check whether it's outer link or inner link
                test_protocal, test_domain = self.get_protocal_domain(element)
                if test_domain != domain:
                    outer_link_lists.append(element.strip())
                else:
                    inner_link_lists.append(element.strip())
            elif re.findall(uf_pattern, element):
                continue
            elif re.findall(up_pattern, element):
                continue
            else:
                if element.startswith('/'):
                    link = protocal + '://' + domain + element
                else:
                    link = protocal + '://' + domain + '/' + element
                inner_link_lists.append(link.strip())

        return outer_link_lists, inner_link_lists

    def trim_url_suffix(self, urls):
        """
        trim those urls with suffix `#xxxxx' or `?xxxx'
        NOTE that ALL URLS PASSED IN MUST BE VALID!!!
        """
        def _trim_url_suffix(url):  # make it reusable
            # tag link pattern
            return url.split('#')[0].split('?')[0]

        return list(map(_trim_url_suffix, urls))

    def get_protocal_domain(self, url):
        """ return protocal and domain """
        protocal, rest = urllib.parse.splittype(url)
        domain, url_suffix = urllib.parse.splithost(rest)
        return protocal, domain

    def dump_content(self, resp, origin_url, inner_links, outer_links):
        """ requests cannot detect web page encoding automatically(F**K!).
            response.encoding is from the html reponse header. If we want to
            convert all the content we want to utf8, we have to use `get_encodings_from_content; """
        # resp.text is in unicode(type 'str')
        # resp.content is in unicode(type 'bytes')
        text = resp.text
        # requests takes the html page encoding from the HTTP Response header; if the
        # Response header provides no info about the encoding, then requests
        # defaults to 'ISO-8859-1'. But most of the time we can detect the
        # encoding from the html page content
        if resp.encoding == 'ISO-8859-1' and not 'ISO-8859-1' in resp.headers.get(
                'Content-Type', ''):
            try:
                real_encoding = requests.utils.get_encodings_from_content(
                    resp.text)[0]
                text = resp.content.decode(real_encoding, 'ignore')
            except Exception:
                text = resp.content.decode('utf-8', 'ignore')
        html_tree = etree.HTML(text)
        kws = html_tree.xpath(
            '//*/meta[re:test(@name, "[Kk]eywords?")]/@content',
            namespaces={'re': "http://exslt.org/regular-expressions"})
        descs = html_tree.xpath(
            '//*/meta[re:test(@name, "[Dd]escription")]/@content',
            namespaces={'re': "http://exslt.org/regular-expressions"})
        kw = kws[0] if kws else ""
        desc = descs[0] if descs else ""
        kw = kw.encode('utf-8', 'ignore')
        desc = desc.encode('utf-8', 'ignore')

        try:
            real_encoding = requests.utils.get_encodings_from_content(
                resp.text)[0]
            utf8_text = resp.content.decode(real_encoding,
                                            "ignore").encode('utf-8')
        except Exception:
            utf8_text = resp.content

        # requests may follow redirects. For example,
        #       http://bbs.people.com.cn/
        # may be redirected to
        #       http://bbs1.people.com.cn/
        # so if we took resp.url as the crawled url,
        # we would end up with duplicate urls in the database. That is why
        # we use the origin_url passed in (bbs, NOT bbs1)
        #
        # page_url = bytes(resp.url, 'utf-8')
        page_url = origin_url

        _, domain_name = self.get_protocal_domain(resp.url)
        domain_name = bytes(domain_name, 'utf-8')
        titles = re.findall(rb'<title>(.*?)</title>', utf8_text)
        title = titles[0] if titles else b''
        inner_links = ";".join(inner_links)
        outer_links = ";".join(outer_links)

        self.db.update(
            "INSERT INTO " + self.crawler_table + "(`page_url`, `domain_name`,"
            "`inner_links`,`outer_links`,`title`, `text`, `keywords`, `description`) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s);",
            (page_url, domain_name, inner_links, outer_links, title, utf8_text,
             kw, desc))
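Note: get_protocal_domain() in the last two crawler examples relies on urllib.parse.splittype()/splithost(), which are deprecated in current Python 3 releases. An equivalent sketch built on urllib.parse.urlparse, keeping the same (protocol, domain) return shape used by extract_link() and dump_content():

import urllib.parse

def get_protocol_domain(url):
    """Return (protocol, domain) for an absolute URL, e.g. ('https', 'example.com')."""
    parts = urllib.parse.urlparse(url)
    return parts.scheme, parts.netloc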