class ProxyManager(object):
    """
    Front end to the proxy store.

    Works on two tables of the shared ``DbClient``: ``raw_proxy`` receives
    the proxies collected by :meth:`refresh`, while ``useful_proxy`` is the
    table read by :meth:`get`, :meth:`getAll` and :meth:`delete`.
    """
    def __init__(self):
        # Imported lazily, as in the original code.
        from db.DbClient import DbClient
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        Run every configured proxy getter and store the results in the raw table.

        Getter names are read from ``options.ProxyGetter_Run`` and resolved as
        attributes of ``GetFreeProxy``. A getter that raises is logged together
        with the exception and skipped, so the remaining getters still run.
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in options.ProxyGetter_Run:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                fetch = getattr(GetFreeProxy, proxyGetter.strip())
                for proxy in fetch():
                    # Proxies are stored directly; no de-duplication is done
                    # here because the hash structure backing the store
                    # already takes care of duplicates.
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                # Keep going with the next getter, but record why this one
                # failed; without the exception the failure is undiagnosable.
                self.log.error(
                    "{func}: fetch proxy fail: {error!r}".format(
                        func=proxyGetter, error=e))

    def get(self):
        """
        Return one randomly chosen key of the useful table.

        :return: a proxy key, or None when the table is empty
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if not item_dict:
            return None
        return random.choice(list(item_dict))

    def delete(self, proxy):
        """
        Remove ``proxy`` from the useful table.

        :param proxy: key to delete
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        Return every key of the useful table.

        :return: list of proxy keys (empty list when the table is empty)
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict) if item_dict else []

    def getNumber(self):
        """
        Count the entries of both tables.

        :return: dict with the keys 'raw_proxy' and 'useful_proxy'
        """
        counts = {}
        # Raw table first, then useful table, so the store is left pointing
        # at the useful table exactly as before.
        self.db.changeTable(self.raw_proxy_queue)
        counts['raw_proxy'] = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        counts['useful_proxy'] = self.db.getNumber()
        return counts
Пример #2
0
class ProxyCheck(ProxyManager, Thread):
    """
    Worker thread that re-validates the proxies handed to it through a queue.

    :param queue: queue of proxy keys to check; several workers may share it
    :param item_dict: mapping of proxy key -> failure counter
    """
    def __init__(self, queue, item_dict):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        # Several threads writing to the same log file at once causes
        # problems, so file logging is switched off for the workers.
        self.log = LogHandler('proxy_check', file=False)
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        """
        Drain the queue, validating each proxy and updating its failure counter.

        A proxy that validates has its counter decremented (never below 0);
        one that fails has it incremented and is deleted once the counter
        reaches FAIL_COUNT.
        """
        try:
            from queue import Empty
        except ImportError:  # Python 2 module name
            from Queue import Empty

        self.db.changeTable(self.useful_proxy_queue)
        while True:
            # A non-blocking get replaces the former `qsize()` check followed
            # by a blocking `get()`: with several workers another thread can
            # take the last item in between, leaving this one blocked forever.
            try:
                proxy = self.queue.get(block=False)
            except Empty:
                break
            try:
                self._check(proxy)
            except Exception as e:
                # One bad entry must not kill the worker and strand the rest
                # of the queue.
                self.log.error(
                    'ProxyCheck: {} check error: {!r}'.format(proxy, e))
            finally:
                # Always acknowledge the item, otherwise queue.join() hangs.
                self.queue.task_done()

    def _check(self, proxy):
        """Validate one proxy and persist its updated failure counter."""
        raw_count = self.item_dict[proxy]
        # A missing/empty counter is treated as "no failures so far".
        fail_count = int(raw_count) if raw_count else 0
        if validUsefulProxy(proxy):
            # Validation passed: decrement the failure counter.
            if fail_count > 0:
                self.db.put(proxy, num=fail_count - 1)
            self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            return

        self.log.info('ProxyCheck: {} validation fail'.format(proxy))
        if fail_count + 1 >= FAIL_COUNT:
            self.log.info(
                'ProxyCheck: {} fail too many, delete!'.format(proxy))
            self.db.delete(proxy)
        else:
            self.db.put(proxy, num=fail_count + 1)
Пример #3
0
class GetDetails(threading.Thread):
    """
    Worker thread that downloads detail pages and queues the parsed data.

    :param link_queue: queue of relative URLs to fetch
    :param data_queue: queue receiving the parsed result of every page
    :param thread_name: name given to the thread (used in log messages)
    :param proxies: passed unchanged to ``get_cookies_session``
    """

    def __init__(self, link_queue, data_queue, thread_name, proxies):
        threading.Thread.__init__(self, name=thread_name)
        self.log = LogHandler('detail_data_crawler')
        self.link_queue = link_queue
        self.data_queue = data_queue
        self.proxies = proxies

    def run(self):
        """
        Fetch links until the link queue is empty or a request fails.

        The thread stops at the first failure; the log message written in
        that case states that the proxy is no longer usable.
        """
        # NOTE(review): the returned cookies are not used below; presumably
        # the session already carries them -- confirm against the helper.
        _cookies, session = get_cookies_session(self.proxies)
        while True:
            try:
                url = self.link_queue.get(block=False)
            except Empty:
                self.log.info(f"{self.name}的线程因队列为空退出")
                break

            try:
                full_url = 'http://www.landchina.com/' + url
                # Use the session created above; the former `self.session`
                # was never assigned, so every request raised AttributeError.
                text = session.get(full_url).text
                data = get_data(text)
                data['链接'] = url
                self.data_queue.put(data)
                self.log.info(f'{self.name}的线程成功获取{url}的数据')
                # Random 5-15 second pause between requests.
                time.sleep(5 + random.random() * 10)
            except Exception as e:
                self.log.info(f'代理失效,{self.name}的线程退出')
                # Record the actual cause; a bare except used to hide it.
                self.log.info(f'{self.name}: {e!r}')
                # Actually leave the loop, as the message above announces,
                # instead of draining the whole queue with failing requests.
                break
Пример #4
0
class ProxyRefreshSchedule(ProxyManager):
    """
    Scheduled proxy refresh.
    """
    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and put the usable ones
        into useful_proxy_queue.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        item = self.db.pop()
        started_at = time.ctime()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' % started_at)
        # Fetch the current proxies up front to cut down on repeated work.
        known = self.getAll()
        while item:
            candidate = item.get('proxy')
            if isinstance(candidate, bytes):
                candidate = candidate.decode('utf8')

            # Validation is only attempted for proxies not in `known`.
            is_new = candidate not in known
            if is_new and validUsefulProxy(candidate):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(candidate)
                outcome = 'ProxyRefreshSchedule: %s validation pass'
            else:
                outcome = 'ProxyRefreshSchedule: %s validation fail'
            self.log.info(outcome % candidate)

            # Switch back to the raw table for the next pop, then refresh
            # the list of current proxies.
            self.db.changeTable(self.raw_proxy_queue)
            item = self.db.pop()
            known = self.getAll()
        finished_at = time.ctime()
        self.log.info(
            'ProxyRefreshSchedule: %s validProxy complete' % finished_at)
Пример #5
0
class DatabaseHandler:
    """
    Convenience wrapper around one sqlite3 connection.

    Queries are assembled by plain string concatenation from the table
    names, column names and values passed in, so values must already be
    valid SQL literals (e.g. ``"'text'"`` for a string).
    NOTE(review): nothing is escaped or parameterized here -- do not pass
    untrusted input to these methods.
    """
    def __init__(self, path, log_level=0):
        """
        :param path: path to the db, including db name
        :param log_level: 0-4, Debug...Critical, see LogHandler class
        """
        self.__path = path
        self.__tables = {}
        self.__keywords = {}
        self.__lh = LogHandler(log_level)
        # Set by open(); None while no connection is open.
        self.__db = None
        self.__dbcursor = None

    def open(self):
        """ open database """
        self.__db = sqlite3.connect(self.__path)
        self.__dbcursor = self.__db.cursor()
        self.__lh.info("DB is opened: " + self.__path)

    def close(self):
        """ close database; only logs a warning when no database is open """
        if self.__db is None:
            self.__lh.warning("DB is not opened: " + self.__path)
            return
        self.__db.close()
        self.__db = None
        self.__dbcursor = None
        self.__lh.info("DB is closed: " + self.__path)

    def get_table_name(self, name):
        """
        get the column names of the selected table
        :param name: table name
        :return: list of column names
        """
        self.__dbcursor.execute("PRAGMA table_info('" + name + "')")
        # Field 1 of every table_info row is the column name.
        columns = [col[1] for col in self.__dbcursor.fetchall()]
        self.__lh.debug("Table columns: " + ", ".join(columns))
        return columns

    def set_used_table(self, name):
        """
        set which table is used from database
        :param name: table name
        """
        self.__tables[name] = self.get_table_name(name)
        self.__lh.info("New table is set to used: " + name)

    def release_used_table(self, name):
        """
        release the given table
        :param name: table name
        :raises KeyError: if the table was not set as used
        """
        del self.__tables[name]
        self.__lh.info("Table is released from used tables: " + name)

    def clear_used_tables(self):
        """ release all used tables """
        self.__tables.clear()
        self.__lh.info("All tables are released from used tables")

    def execute_query(self, query):
        """
        executes the given sqlite query string
        :param query: sqlite query string
        :return: db's response to the query; an empty list if the query
                 failed (the error is passed to the log handler)
        """
        try:
            self.__dbcursor.execute(query)
            self.__lh.info("Executed query: " + query)
            return self.__dbcursor.fetchall()
        except Exception as e:
            self.__lh.exceptionHandling(e)
            # Do not touch the cursor again after a failure.
            return []

    def commit(self):
        """ apply changes done to database """
        self.__db.commit()

    def insert(self, table_name, data, columns=None):
        """
        builds and executes INSERT query
        :param table_name: used table's name
        :param data: list of SQL literals, data of a row to be inserted to the given table
        :param columns: list of columns where the given data should be inserted;
                        when empty the column list is omitted from the query
        """
        columns = list(columns) if columns else []
        if self.is_registered(table_name, data, columns):
            self.__lh.warning("Record already inserted")
            return
        try:
            parts = ["INSERT INTO", table_name]
            if columns:
                # An empty "()" column list is not valid SQL, so only emit
                # it when columns were actually given.
                parts.append("(" + ",".join(columns) + ")")
            parts.append("VALUES")
            parts.append("(" + ",".join(str(value) for value in data) + ")")
            self.execute_query(" ".join(parts) + ";")
        except Exception as e:
            self.__lh.exceptionHandling(e)

    def select(self, table_name, cols=None, condition=None, distinct=False):
        """
        builds and executes SELECT query
        :param table_name: used table's name
        :param cols: list of columns should be selected, ["*"] for all columns
        :param condition: list of (column, relation operator, value) tuples, combined with AND
        :param distinct: set True if SELECT DISTINCT query is required
        :return: one list per row, each a list of (column name, value) tuples;
                 empty list on error or when no columns are given
        """
        zipped = []
        try:
            if not cols:
                raise Exception("Empty selected columns list")

            wildcard = list(cols) == ["*"]
            keyword = "SELECT DISTINCT" if distinct else "SELECT"
            args = "*" if wildcard else ", ".join(cols)
            table = "FROM " + table_name
            where = self.where_clause(condition) if condition else ""
            query = " ".join([keyword, args, table, where]) + ";"
            rows = self.execute_query(query)

            names = cols
            if wildcard and self.__dbcursor.description:
                # "*" has no column names of its own; take the real ones
                # from the cursor so every value gets labelled.
                names = [field[0] for field in self.__dbcursor.description]

            for row in rows:
                self.__lh.debug("Select raw return values: " + (str(row)))
                pairs = list(zip(names, row))
                zipped.append(pairs)
                line = ", ".join(
                    str(name) + " = " + str(value) for name, value in pairs)
                self.__lh.debug("Returned line: " + line)
            self.__lh.debug(zipped)
        except Exception as e:
            self.__lh.exceptionHandling(e)
        return zipped

    def update(self, table_name, data=None, condition=None):
        """
        builds and executes UPDATE query
        :param table_name: used table's name
        :param data: list of (column, new value) tuples; the list is not modified
        :param condition: list of (column, relation operator, value) tuples, see where_clause
        """
        keyword = "UPDATE " + table_name + " SET"
        # Build a new list instead of rewriting the caller's list in place.
        assignments = [(item[0], "=", item[1]) for item in (data or [])]
        d_args = self.tuple_list_to_statement(assignments)
        c_args = self.where_clause(condition)
        query = " ".join([keyword, d_args, c_args]) + ";"
        self.execute_query(query)

    def delete(self, table_name, condition=None):
        """
        builds and executes DELETE query
        :param table_name: used table's name
        :param condition: list of (column, relation operator, value) tuples, see where_clause
        """
        keyword = "DELETE FROM " + table_name
        c_args = self.where_clause(condition)
        query = " ".join([keyword, c_args]) + ";"
        self.execute_query(query)

    def where_clause(self, condition=None):
        """
        builds WHERE clause, combining all conditions with AND
        :param condition: list of (column, relation operator, value) tuples
        :return: WHERE clause string

        An empty condition still yields a bare "WHERE ", which makes the
        resulting query fail instead of silently applying an UPDATE/DELETE
        to every row.
        """
        clause = "WHERE " + self.tuple_list_to_statement(condition, " AND ")
        self.__lh.debug("WHERE clause: " + clause)
        return clause

    def and_clause(self):
        # TODO
        pass

    def or_clause(self):
        # TODO
        pass

    def like_clause(self):
        # TODO
        pass

    def glob_clause(self):
        # TODO
        pass

    def limit_clause(self):
        # TODO
        pass

    def order_by_clause(self):
        # TODO
        pass

    def group_by_clause(self):
        # TODO
        pass

    def having_clause(self):
        # TODO
        pass

    def tuple_list_to_statement(self, tuple_list=None, separator=","):
        """
        builds one string from a list of 3-tuples
        :param tuple_list: tuple list, it should look like this when building conditions: [(column, relation operator, value), (c,ro,v),...]
        :param separator: text placed between the rendered tuples ("," suits a SET list)
        :return: the rendered tuples joined by separator, "" for an empty list
        """
        rendered = [
            str(t[0]) + " " + str(t[1]) + " " + str(t[2])
            for t in (tuple_list or [])
        ]
        return separator.join(rendered)

    def is_registered(self, table_name, data_list=None, col_list=None):
        """
        tells whether the given record is already stored
        :param table_name: used table's name
        :param data_list: values of the record
        :param col_list: columns belonging to the values
        :return: always False for now -- the lookup is not implemented
        """
        self.__lh.debug(data_list)
        self.__lh.debug(col_list)
        # TODO: look the record up with select(); until then nothing is
        # reported as a duplicate, whatever the arguments are.
        return False