def _get_job_rule(pool: happybase.ConnectionPool, job_name) -> crawler.CrawlJobCore:
    """Fetch the crawl_job_core (crawl rule) stored in HBase for a job.

    Reads column ``rule_col`` of row ``rule_row_key`` from the table named
    ``job_name``, decodes the cell as UTF-8, logs it through
    ``common.print_info`` and parses it with ``crawler.CrawlJobCore.loads``.

    :param pool: connection pool used to check out a connection.
    :param job_name: name of the table holding the job.
    :return: the parsed rule, or ``None`` if any exception is raised while
        reading or parsing (the exception is reported through
        ``common.print_exception``).
    """
    with pool.connection() as connection:
        try:
            connection: happybase.Connection
            cells = connection.table(job_name).row(
                rule_row_key,
                columns=[rule_col],
            )
            # Row cells are looked up by the UTF-8 encoded column name.
            rule_text = cells[bytes(rule_col, encoding="utf-8")].decode("utf-8")
            common.print_info(f"get crawl rule: {rule_text}")
            # TODO: the key used here is somewhat problematic
            return crawler.CrawlJobCore.loads(rule_text)
        except Exception as err:
            common.print_exception(err)
            return None
        finally:
            # Close the connection.
            # NOTE(review): this closes a connection that was checked out
            # from the pool -- confirm that is intentional.
            connection.close()
def _get_job_result(pool: happybase.ConnectionPool, crawl_job_name) -> list:
    """Fetch the crawl results stored in the table of a job.

    Scans every row of the table named ``crawl_job_name`` with timestamps
    included and converts each row into a dict of the form
    ``{'url': <row key>, 'result': {<column>: (<value>, <timestamp>)}}``,
    where row keys, column names and values are decoded as UTF-8.

    :param pool: connection pool used to check out a connection.
    :param crawl_job_name: name of the table to scan.
    :return: list of row dicts, or ``None`` if any exception is raised (the
        exception is reported through ``common.print_exception``).
    """
    with pool.connection() as connection:
        try:
            connection: happybase.Connection
            scanned = connection.table(crawl_job_name).scan(include_timestamp=True)
            return [
                {
                    'url': row_key.decode("utf-8"),
                    # Each cell is indexed as (value, timestamp); only the
                    # value is decoded.
                    'result': {
                        column.decode("utf-8"): (cell[0].decode("utf-8"), cell[1])
                        for column, cell in columns.items()
                    },
                }
                for row_key, columns in scanned
            ]
        except Exception as err:
            common.print_exception(err)
            return None
        finally:
            # Close the connection.
            # NOTE(review): this closes a connection that was checked out
            # from the pool -- confirm that is intentional.
            connection.close()
def _save_results(pool: happybase.ConnectionPool, crawl_job_core, url, result_list) -> bool:
    """Save the crawl results of one URL into HBase.

    The results are written to the table named ``crawl_job_core.name`` under
    row key ``url``; the i-th element of ``result_list`` is stored in the
    column named ``results_col_pattern(i)``.

    :param pool: connection pool used to check out a connection.
    :param crawl_job_core: job description; only its ``name`` is read here.
    :param url: row key for the stored results.
    :param result_list: results to store; nothing is written when it is empty.
    :return: ``True`` once the row has been written; ``False`` when
        ``result_list`` is empty or any exception is raised (the exception is
        reported through ``common.print_exception``).
    """
    # Nothing to store: skip checking out a connection altogether.
    if not result_list:
        return False

    with pool.connection() as connection:
        try:
            connection: happybase.Connection
            target = connection.table(crawl_job_core.name)
            cells = {
                results_col_pattern(index): item
                for index, item in enumerate(result_list)
            }
            target.put(url, cells)
            return True
        except Exception as err:
            common.print_exception(err)
            return False
        finally:
            # Close the connection.
            # NOTE(review): this closes a connection that was checked out
            # from the pool -- confirm that is intentional.
            connection.close()
def test_pool_exhaustion():
    """Check that an exhausted pool raises ``NoConnectionsAvailable``.

    A pool of size 1 is created and its only connection is held by the
    calling thread. A second thread then requests a connection with a 0.1
    second timeout, which is expected to raise ``NoConnectionsAvailable``.

    Exceptions raised inside a thread target do not propagate through
    ``Thread.join()``, so the worker records any failure and it is re-raised
    in the calling thread; otherwise a failed expectation could not fail the
    test.
    """
    pool = ConnectionPool(size=1, **connection_kwargs)
    worker_errors = []

    def run():
        try:
            with assert_raises(NoConnectionsAvailable):
                with pool.connection(timeout=.1) as connection:
                    connection.tables()
        except Exception as exc:
            # Hand the failure over to the main thread.
            worker_errors.append(exc)

    with pool.connection():
        # At this point the only connection is assigned to this thread,
        # so another thread cannot obtain a connection at this point.
        t = threading.Thread(target=run)
        t.start()
        t.join()

    if worker_errors:
        raise worker_errors[0]
def _get_job_list(pool: happybase.ConnectionPool) -> list:
    """Fetch the list of job names stored in HBase.

    The value returned is whatever ``conn.tables()`` reports for the
    connection checked out from ``pool``.

    :param pool: connection pool used to check out a connection.
    :return: the table list, or ``None`` if any exception is raised (the
        exception is reported through ``common.print_exception``).
    """
    with pool.connection() as connection:
        try:
            connection: happybase.Connection
            return connection.tables()
        except Exception as err:
            common.print_exception(err)
            return None
        finally:
            # Close the connection.
            # NOTE(review): this closes a connection that was checked out
            # from the pool -- confirm that is intentional.
            connection.close()
def _set_job_rule(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    """Change the crawl rule stored for a job.

    Writes ``crawl_job_core.dumps()`` into column ``rule_col`` of row
    ``rule_row_key`` in the table named ``crawl_job_core.name``.

    :param pool: connection pool used to check out a connection.
    :param crawl_job_core: job description providing ``name`` and ``dumps()``.
    :return: ``True`` once the rule has been written; ``False`` if any
        exception is raised (the exception is reported through
        ``common.print_exception``).
    """
    with pool.connection() as connection:
        try:
            connection: happybase.Connection
            target = connection.table(crawl_job_core.name)
            payload = {rule_col: crawl_job_core.dumps()}
            target.put(rule_row_key, payload)
            return True
        except Exception as err:
            common.print_exception(err)
            return False
        finally:
            # Close the connection.
            # NOTE(review): this closes a connection that was checked out
            # from the pool -- confirm that is intentional.
            connection.close()
def _save_job(pool: happybase.ConnectionPool, crawl_job_core)-> bool:
    """Store a crawl_job_core (crawl rule) in HBase.

    Creates a table named ``crawl_job_core.name`` with two column families
    (``rule_col`` keeping ``rule_max_version`` versions and ``results_family``
    keeping ``results_max_version`` versions), then writes
    ``crawl_job_core.dumps()`` into column ``rule_col`` of row
    ``rule_row_key``.

    :param pool: connection pool used to check out a connection.
    :param crawl_job_core: job description providing ``name`` and ``dumps()``.
    :return: ``True`` once the table is created and the rule written;
        ``False`` if any exception is raised (the exception is reported
        through ``common.print_exception``).
    """
    with pool.connection() as connection:
        try:
            connection: happybase.Connection
            table_name = crawl_job_core.name
            families = {
                rule_col: dict(max_versions=rule_max_version),
                results_family: dict(max_versions=results_max_version),
            }
            connection.create_table(name=table_name, families=families)

            # Seed the freshly created table with the rule itself.
            connection.table(crawl_job_core.name).put(
                rule_row_key,
                {rule_col: crawl_job_core.dumps()},
            )
            return True
        except Exception as err:
            common.print_exception(err)
            return False
        finally:
            # Close the connection.
            # NOTE(review): this closes a connection that was checked out
            # from the pool -- confirm that is intentional.
            connection.close()