def _get_job_rule(pool: happybase.ConnectionPool, job_name: str) -> crawler.CrawlJobCore:
    ''' Fetch the crawl_job_core (crawl rule) stored in HBase. '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(job_name)
            # The rule lives in a single well-known row/column of the job's table.
            row = table.row(rule_row_key, columns=[rule_col])
            rule = row[bytes(rule_col, encoding="utf-8")].decode("utf-8")
            common.print_info("get crawl rule: {}".format(rule))
            crawl_job_core = crawler.CrawlJobCore.loads(rule)  # TODO: the key handling is still a bit off
            return crawl_job_core
        except Exception as e:
            common.print_exception(e)
            return None
        finally:
            conn.close()  # close the connection

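# NOTE: these helpers refer to module-level names defined elsewhere in this module
# (imports and column constants). A minimal sketch of what they are assumed to look
# like; the names are real, the concrete values below are only placeholders:
#
#   import happybase
#   import common
#   import crawler
#
#   rule_row_key = "rule"             # row that stores the crawl rule
#   rule_col = "rule:json"            # column holding the serialized CrawlJobCore
#   rule_max_version = 1
#   results_family = "results"        # column family for crawled results
#   results_max_version = 1
#
#   def results_col_pattern(i):
#       return "results:{}".format(i)  # one column per extracted field
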
def _save_results(pool: happybase.ConnectionPool, crawl_job_core, url, result_list) -> bool:
    ''' Save crawl results to HBase. If result_list is empty, do nothing. '''
    if not result_list:
        return False
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(core.name)
            # One row per crawled url; each extracted field gets its own column.
            row_key = url
            table.put(row_key, {
                results_col_pattern(i): ele
                for i, ele in enumerate(result_list)
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection

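# For illustration (hypothetical values): a result_list of ["t1", "t2"] for the url
# "http://example.com/a" is written as a single row keyed by that url:
#
#   table.put("http://example.com/a", {
#       results_col_pattern(0): "t1",   # e.g. "results:0" -> "t1"
#       results_col_pattern(1): "t2",   # e.g. "results:1" -> "t2"
#   })
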
def _get_job_result(pool: happybase.ConnectionPool, crawl_job_name) -> list:
    ''' Fetch the crawl results of a job. '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(crawl_job_name)
            result_list = []
            for key, value in table.scan(include_timestamp=True):
                tmp = {}
                tmp['url'] = key.decode("utf-8")
                # With include_timestamp=True every cell is a (value, timestamp) pair.
                tmp['result'] = {
                    col.decode("utf-8"): (cell[0].decode("utf-8"), cell[1])
                    for col, cell in value.items()
                }
                result_list.append(tmp)
            return result_list
        except Exception as e:
            common.print_exception(e)
            return None
        finally:
            conn.close()  # close the connection

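# The returned structure therefore looks roughly like this (values hypothetical):
#
#   [
#       {
#           'url': 'http://example.com/a',
#           'result': {'results:0': ('t1', 1584000000000),
#                      'results:1': ('t2', 1584000000000)},
#       },
#       ...
#   ]
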
def run_task_fetcher(self):
    ''' Fetch crawl tasks from redis. '''
    while not self.end_flag:
        try:
            obj_tuple = QUEUE.get_wait(timeout=self.timeout)
            if obj_tuple is None:
                # Got nothing back, which means the blocking pop timed out; try again.
                continue
            _, content = obj_tuple
            task_info = CrawlTaskJson.from_json_str(content)
            # A job listed in close_set has been closed manually, so skip its tasks.
            if CLOSE_SET.is_member(task_info.job_name):
                common.print_info(
                    "this crawl_job has been closed: {}".format(task_info.job_name))
                continue
            # Make sure every url in the task is valid.
            for url in task_info.urls:
                assert common.urltools.check_url(url)
            # Sleep briefly so other nodes also get a chance to grab tasks.
            time.sleep(0.5)
            self.add_urls(task_info.job_name, task_info.layer, task_info.urls)
        except Exception as e:
            common.print_exception(e)

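# The fields read from task_info (job_name, layer, urls) suggest the redis payload is
# a JSON object along these lines; this is an assumption, not a confirmed schema:
#
#   {"job_name": "news_job", "layer": 1,
#    "urls": ["http://example.com/a", "http://example.com/b"]}
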
def work(self, tasks: common.LockedIterator):
    ''' Receive crawl_tasks and execute them. '''
    while not self.end_flag:
        try:
            self.activate()
            for task in tasks:
                task(self.driver)
        except Exception as e:
            common.print_exception(e)
            continue
        finally:
            self.close()

def process(self):
    while not self.quit:
        try:
            self.polling()
            found = self.queue.get(True, self.default_polling_interval)
            found.process(self)
        except queue.Empty:
            # Nothing queued within the polling interval; loop and poll again.
            found = None
        except KeyboardInterrupt:
            found = None
            self.quit = True
        except Exception as e:
            print_exception(e)

def _get_job_list(pool: happybase.ConnectionPool) -> list:
    ''' Get the list of job names (table names) stored in HBase. '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table_list = conn.tables()
            return table_list
        except Exception as e:
            common.print_exception(e)
            return None
        finally:
            conn.close()  # close the connection

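# Note: happybase's Connection.tables() returns the table names as bytes, so callers
# that need strings have to decode them, e.g.:
#
#   names = [name.decode("utf-8") for name in _get_job_list(pool)]
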
def _remove_job(pool, crawl_job_name) -> bool:
    ''' Remove a job by dropping the table named after it.
    An HBase table must be disabled before it can be deleted. '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            conn.delete_table(crawl_job_name, disable=True)
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection

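# delete_table(name, disable=True) disables the table first and then drops it, matching
# HBase's rule that only disabled tables can be deleted. Hypothetical usage:
#
#   _remove_job(pool, "news_job")   # drops the whole table: rule row and all results
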
def export_csv(service, docid, filename_template='%(title)s - %(sheet)s.csv'):
    error_count = common.error_count  # note the original error count
    # Accept either a bare document id or a full spreadsheet URL.
    if docid and re.match("^https://docs.google.com/spreadsheets/d/", docid, re.IGNORECASE):
        docid = docid.split("/")[5]
    try:
        for (doc, sheet), rows in itersheets(service, docid):
            file_name = filename_template % {'title': doc, 'sheet': sheet}
            file_path = os.path.join(common.args[TITLE_DIR], common.args[TITLE_PREFIX], file_name)
            common.print_notice("Saving \"%s\" sheet to file: %s" % (
                common.colour_text(sheet),
                common.colour_text(file_name, common.COLOUR_GREEN)))
            # Write to the computed path rather than the bare file name.
            with open(file_path, 'wb') as fd:
                write_csv(service, fd, rows)
    except Exception as e:
        common.print_exception(e)
    return error_count == common.error_count

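# filename_template uses printf-style mapping keys, so with the default template a
# document titled "Budget" with a sheet "Q1" (hypothetical names) produces:
#
#   '%(title)s - %(sheet)s.csv' % {'title': 'Budget', 'sheet': 'Q1'}
#   # -> 'Budget - Q1.csv'
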
def _set_job_rule(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    ''' Update the crawl rule of an existing job. '''
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(core.name)
            table.put(rule_row_key, {
                rule_col: core.dumps(),
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection

def _save_job(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    ''' Save a crawl_job_core (crawl rule) to HBase: create the job's table and write the rule row. '''
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            # One table per job, with a family for the rule and a family for results.
            conn.create_table(
                name=core.name,
                families={
                    rule_col: dict(max_versions=rule_max_version),
                    results_family: dict(max_versions=results_max_version),
                })
            table = conn.table(core.name)
            table.put(rule_row_key, {
                rule_col: core.dumps(),
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection
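
# A rough end-to-end sketch of how these helpers fit together. The connection settings
# and the CrawlJobCore constructor arguments are assumptions, not the real API:
#
#   pool = happybase.ConnectionPool(size=3, host="127.0.0.1", port=9090)
#   core = crawler.CrawlJobCore(...)             # build a job with its crawl rule
#   _save_job(pool, core)                        # create the table and store the rule
#   _set_job_rule(pool, core)                    # later: overwrite the rule in place
#   rule = _get_job_rule(pool, core.name)        # read the rule back
#   results = _get_job_result(pool, core.name)   # collect everything crawled so far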