class ProxyManager(object):
    """
    Access layer over the proxy store.

    Newly fetched candidates are written to the ``raw_proxy`` table and the
    read/delete helpers operate on the ``useful_proxy`` table.
    """

    def __init__(self):
        # Kept as a function-scope import, exactly as in the original code.
        from db.DbClient import DbClient
        self.db = DbClient()
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        Run every getter named in ``options.ProxyGetter_Run`` and store the
        well-formed results in the raw proxy table.

        A getter that raises is logged (including the reason) and skipped so
        that the remaining getters still run.
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in options.ProxyGetter_Run:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # Store the proxy directly; per the original note the
                    # store's hash structure already de-duplicates entries.
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                # Best effort: keep going with the next getter, but record why
                # this one failed instead of discarding the exception.
                self.log.error(
                    "{func}: fetch proxy fail, {err!r}".format(
                        func=proxyGetter, err=e))

    def get(self):
        """
        Return one randomly chosen proxy from the useful table.

        :return: a proxy key, or None when the table is empty
        """
        proxies = self.getAll()
        return random.choice(proxies) if proxies else None

    def delete(self, proxy):
        """
        Remove ``proxy`` from the useful table.

        :param proxy: key of the proxy to delete
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        Return the keys of every entry in the useful table.

        :return: list of proxy keys, empty when nothing is stored
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        return list(item_dict.keys()) if item_dict else list()

    def getNumber(self):
        """
        Count the entries of both tables.

        :return: dict with the keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class ProxyCheck(ProxyManager, Thread):
    """
    Worker thread that re-validates proxies taken from a shared queue.

    Each proxy carries a failure counter (looked up in ``item_dict``): a
    successful check decrements it, a failed check increments it, and a proxy
    whose counter would reach ``FAIL_COUNT`` is deleted.
    """

    def __init__(self, queue, item_dict):
        """
        :param queue: queue of proxies to check, shared between workers
        :param item_dict: mapping of proxy -> current failure count
        """
        ProxyManager.__init__(self)
        Thread.__init__(self)
        # Several threads writing the same log file causes problems, so file
        # output is switched off for this logger.
        self.log = LogHandler('proxy_check', file=False)
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        """
        Drain the queue, validating each proxy and updating its counter.

        The queue is read without blocking: testing ``qsize()`` and then
        calling a blocking ``get()`` lets another worker take the last item
        in between, which would leave this thread waiting forever.
        """
        from queue import Empty

        self.db.changeTable(self.useful_proxy_queue)
        while True:
            try:
                proxy = self.queue.get(block=False)
            except Empty:
                break
            try:
                count = self.item_dict[proxy]
                # A missing/empty counter is treated as zero failures.
                fail_count = int(count) if count else 0
                if validUsefulProxy(proxy):
                    # Validation passed: decrement the failure counter.
                    if fail_count > 0:
                        self.db.put(proxy, num=fail_count - 1)
                    self.log.info(
                        'ProxyCheck: {} validation pass'.format(proxy))
                else:
                    self.log.info(
                        'ProxyCheck: {} validation fail'.format(proxy))
                    # As before, a proxy without recorded failures is never
                    # deleted on its first failure.
                    if count and fail_count + 1 >= FAIL_COUNT:
                        self.log.info(
                            'ProxyCheck: {} fail too many, delete!'.format(
                                proxy))
                        self.db.delete(proxy)
                    else:
                        self.db.put(proxy, num=fail_count + 1)
            finally:
                # Always acknowledge the item so queue.join() cannot hang when
                # a check raises.
                self.queue.task_done()
class GetDetails(threading.Thread):
    """
    Worker thread that downloads detail pages for queued links and pushes the
    parsed records onto a data queue.
    """

    def __init__(self, link_queue, data_queue, thread_name, proxies):
        """
        :param link_queue: queue of relative URLs still to be fetched
        :param data_queue: queue receiving one parsed record per URL
        :param thread_name: name given to the thread (used in log messages)
        :param proxies: proxy settings handed to ``get_cookies_session``
        """
        threading.Thread.__init__(self, name=thread_name)
        self.log = LogHandler('detail_data_crawler')
        self.link_queue = link_queue
        self.data_queue = data_queue
        self.proxies = proxies

    def run(self):
        """
        Fetch links until the queue is empty or a request/parse step fails.

        The session returned by ``get_cookies_session`` is used for every
        request. On failure the thread logs and stops, as its log message
        states; the URL that failed is not re-queued.
        """
        # Only the session is needed here; the cookies value is unused.
        _cookies, session = get_cookies_session(self.proxies)
        while True:
            try:
                url = self.link_queue.get(block=False)
            except Empty:
                self.log.info(f"{self.name}的线程因队列为空退出")
                break
            try:
                full_url = 'http://www.landchina.com/' + url
                # Use the local session: the instance never had a ``session``
                # attribute, so ``self.session`` always raised AttributeError.
                text = session.get(full_url).text
                data = get_data(text)
                data['链接'] = url
                self.data_queue.put(data)
                self.log.info(f'{self.name}的线程成功获取{url}的数据')
                # Random 5-15 second pause between requests.
                time.sleep(5 + random.random() * 10)
            except Exception:
                self.log.info(f'代理失效,{self.name}的线程退出')
                # Actually leave the loop, matching the message above, rather
                # than consuming the rest of the queue after a failure.
                break
class ProxyRefreshSchedule(ProxyManager):
    """
    Scheduled proxy refresh.
    """

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and move the usable ones into
        useful_proxy_queue.

        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        item = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
        # Proxies already stored as useful; checked first to avoid validating
        # the same proxy again.
        known = self.getAll()
        while item:
            candidate = item.get('proxy')
            if isinstance(candidate, bytes):
                candidate = candidate.decode('utf8')

            # Validation only runs for proxies that are not stored yet.
            if candidate in known or not validUsefulProxy(candidate):
                self.log.info('ProxyRefreshSchedule: %s validation fail' % candidate)
            else:
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(candidate)
                self.log.info('ProxyRefreshSchedule: %s validation pass' % candidate)

            # Take the next raw entry, then re-read the useful table.
            self.db.changeTable(self.raw_proxy_queue)
            item = self.db.pop()
            known = self.getAll()
        self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
class DatabaseHandler:
    """
    database handler class

    Builds simple SQL statements by string concatenation and runs them on an
    SQLite connection. Table names, column names and values are inserted into
    the SQL text verbatim: callers must pass string values already quoted
    (e.g. "'abc'") and must not pass untrusted input.
    """

    def __init__(self, path, log_level=0):
        """
        :param path: path to the db, including db name
        :param log_level: 0-4, Debug...Critical, see LogHandler class
        """
        self.__path = path
        self.__tables = {}
        self.__keywords = {}
        self.__lh = LogHandler(log_level)

    def open(self):
        """
        open database
        """
        self.__db = sqlite3.connect(self.__path)
        self.__dbcursor = self.__db.cursor()
        self.__lh.info("DB is opened: " + self.__path)

    def close(self):
        """
        close database
        """
        self.__db.close()
        self.__lh.info("DB is closed: " + self.__path)

    def get_table_name(self, name):
        """
        get columns of the selected table (despite the method's name)
        :param name: table name
        :return: list of column names
        """
        self.__dbcursor.execute("PRAGMA table_info('" + name + "')")
        # Index 1 of each table_info row is the column name.
        columns = [col[1] for col in self.__dbcursor.fetchall()]
        self.__lh.debug("Table columns: " + ", ".join(columns))
        return columns

    def set_used_table(self, name):
        """
        set which table is used from database
        :param name: table name
        """
        self.__tables[name] = self.get_table_name(name)
        self.__lh.info("New table is set to used: " + name)

    def release_used_table(self, name):
        """
        release the given table
        :param name: table name
        :raises KeyError: if the table was not set as used
        """
        del self.__tables[name]
        self.__lh.info("Table is released from used tables: " + name)

    def clear_used_tables(self):
        """
        release all used tables
        """
        self.__tables.clear()
        self.__lh.info("All tables are released from used tables")

    def execute_query(self, query):
        """
        executes the given sqlite query string
        :param query: sqlite query string
        :return: db's response to the query, an empty list if the query failed
        """
        try:
            self.__dbcursor.execute(query)
            self.__lh.info("Executed query: " + query)
        except Exception as e:
            self.__lh.exceptionHandling(e)
            # Nothing was executed, so there is no result to fetch.
            return []
        return self.__dbcursor.fetchall()

    def commit(self):
        """
        apply changes done to database
        """
        self.__db.commit()

    @staticmethod
    def _build_query(*parts):
        """
        join the non-empty SQL fragments with spaces and terminate with ';'
        """
        return " ".join(part for part in parts if part) + ";"

    def insert(self, table_name, data, columns=None):
        """
        builds and executes INSERT query
        :param table_name: used table's name
        :param data: list of strings, data of a row to be inserted to the given
                     table (string values must already be quoted)
        :param columns: list of columns where the given data should be
                        inserted; when omitted no column list is emitted
        """
        columns = [] if columns is None else columns
        if self.is_registered(table_name, data, columns):
            self.__lh.warning("Record already inserted")
            return
        try:
            # An empty "()" column list is invalid SQL, so leave it out.
            cols = "(" + ",".join(columns) + ")" if columns else ""
            args = "(" + ",".join(data) + ")"
            query = self._build_query("INSERT INTO", table_name, cols,
                                      "VALUES", args)
            self.execute_query(query)
        except Exception as e:
            self.__lh.exceptionHandling(e)

    def select(self, table_name, cols=None, condition=None, distinct=False):
        """
        builds and executes SELECT query
        :param table_name: used table's name
        :param cols: list of columns should be selected, ["*"] for all columns
        :param condition: list of (column, relation operator, value) tuples
                          combined with AND, or a ready-made condition string
        :param distinct: set True if SELECT DISTINCT query is required
        :return: list of tuple list, tuple list contains the selected data of
                 one line as (column, value) pairs; empty list on any error
        """
        zipped = []
        try:
            if not cols:
                raise ValueError("Empty selected columns list")
            select_all = list(cols) == ["*"]
            keyword = "SELECT DISTINCT" if distinct else "SELECT"
            args = "*" if select_all else ", ".join(cols)
            query = self._build_query(keyword, args, "FROM " + table_name,
                                      self.where_clause(condition))
            rows = self.execute_query(query)
            if select_all:
                # Pair each value with its real column name instead of "*".
                description = self.__dbcursor.description or ()
                names = [entry[0] for entry in description]
            else:
                names = list(cols)
            for row in rows:
                self.__lh.debug("Select raw return values: " + str(row))
                pairs = list(zip(names, row))
                zipped.append(pairs)
                line = ", ".join(
                    str(name) + " = " + str(value) for name, value in pairs)
                self.__lh.debug("Returned line: " + line)
            self.__lh.debug(zipped)
        except Exception as e:
            self.__lh.exceptionHandling(e)
        return zipped

    def update(self, table_name, data=None, condition=None):
        """
        builds and executes UPDATE query
        :param table_name: used table's name
        :param data: list of (column, new value) tuples; not modified
        :param condition: list of tuple, see where_clause; when empty every
                          row is updated
        """
        if not data:
            self.__lh.warning("Update skipped: no column values given")
            return
        # Build a new list so the caller's data is left untouched.
        assignments = [(item[0], "=", item[1]) for item in data]
        query = self._build_query("UPDATE " + table_name + " SET",
                                  self.tuple_list_to_statement(assignments),
                                  self.where_clause(condition))
        self.execute_query(query)

    def delete(self, table_name, condition=None):
        """
        builds and executes DELETE query
        :param table_name: used table's name
        :param condition: list of tuple, see where_clause; when empty every
                          row of the table is deleted
        """
        query = self._build_query("DELETE FROM " + table_name,
                                  self.where_clause(condition))
        self.execute_query(query)

    def where_clause(self, condition=None):
        """
        builds WHERE clause
        :param condition: list of (column, relation operator, value) tuples
                          combined with AND, or a ready-made condition string
        :return: WHERE clause string, empty string when there is no condition
        """
        if not condition:
            return ""
        if isinstance(condition, str):
            statement = condition
        else:
            # Conditions must be joined with AND; a comma is not valid here.
            statement = self.tuple_list_to_statement(condition, " AND ")
        clause = "WHERE " + statement
        self.__lh.debug("WHERE clause: " + clause)
        return clause

    def and_clause(self):
        # TODO
        pass

    def or_clause(self):
        # TODO
        pass

    def like_clause(self):
        # TODO
        pass

    def glob_clause(self):
        # TODO
        pass

    def limit_clause(self):
        # TODO
        pass

    def order_by_clause(self):
        # TODO
        pass

    def group_by_clause(self):
        # TODO
        pass

    def having_clause(self):
        # TODO
        pass

    def tuple_list_to_statement(self, tuple_list=None, separator=","):
        """
        builds string from tuple
        :param tuple_list: tuple list, it should look like this when building
                           conditions:
                           [(column, relation operator, value), (c,ro,v),...]
        :param separator: text placed between the rendered tuples
        :return: the rendered tuples joined into one string
        """
        if not tuple_list:
            return ""
        return separator.join(
            str(t[0]) + " " + str(t[1]) + " " + str(t[2]) for t in tuple_list)

    def is_registered(self, table_name, data_list=None, col_list=None):
        """
        report whether the given record is already stored
        :param table_name: used table's name
        :param data_list: values of the record
        :param col_list: columns the values belong to
        :return: always False for now, the lookup is not implemented
        """
        self.__lh.debug([] if data_list is None else data_list)
        self.__lh.debug([] if col_list is None else col_list)
        # TODO: look the record up with select() once duplicate detection is
        # wanted; until then every record counts as new.
        return False