def _update_state(self, state, data):
    """Record a new crawl state on ``data`` and queue it as an "update".

    When ``self._test`` is truthy the method does nothing and ``data`` is
    left untouched.  Otherwise ``data`` is modified in place (its
    ``"crawl_state"`` entry is set to ``state``) and then passed to
    ``sql.data2redis`` together with the name of ``CONFIG.G_TABLE_LINK``.

    Always returns ``None``.
    """
    if not self._test:
        # The caller's dict is mutated on purpose; it is the payload sent on.
        data["crawl_state"] = state
        link_table = CONFIG.G_TABLE_LINK["name"]
        # NOTE(review): "md5" is presumably the key column used to locate
        # the row - confirm against sql.data2redis.
        sql.data2redis(
            self._redis, CONFIG.G_SQL_QUEUE, link_table, "update", data, "md5"
        )
def _save_html(self, md5, html, d_config):
    """Queue the page source for storage in the HTML table.

    Nothing is done when ``CONFIG.G_IFSAVE_HTML`` compares equal to
    ``False``.  Otherwise ``html`` is run through ``spider.html2utf8`` with
    the ``default_code`` found under ``d_config["config"]`` and handed to
    ``sql.data2redis`` as ``{"md5": ..., "html": ...}``.  The operation is
    ``"insert"`` when ``self._db_had`` reports nothing for this md5 and
    ``"update"`` otherwise.

    Always returns ``None``.
    """
    # Kept as an equality test: only values equal to False disable saving.
    if CONFIG.G_IFSAVE_HTML == False:
        return

    default_code = d_config["config"]["default_code"]
    record = {"md5": md5, "html": spider.html2utf8(html, default_code)}

    table_name = CONFIG.G_TABLE_HTML["name"]
    table_division = CONFIG.G_TABLE_HTML["division"]

    already_stored = self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5})
    operation = "update" if already_stored else "insert"
    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE, table_name,
        operation, record, "md5", table_division
    )
def _save_html(self, md5, html, d_config):
    """Convert ``html`` and queue it as an insert or update of the HTML table.

    Returns immediately when ``CONFIG.G_IFSAVE_HTML`` compares equal to
    ``False``.  The text is converted by ``spider.html2utf8`` using
    ``d_config["config"]["default_code"]``.  Whether the row is queued as
    ``"insert"`` or ``"update"`` depends on the truthiness of
    ``self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5})``.

    Always returns ``None``.
    """
    # Equality comparison retained so that e.g. None does not disable saving.
    saving_disabled = CONFIG.G_IFSAVE_HTML == False
    if saving_disabled:
        return

    source_code = d_config["config"]["default_code"]
    converted = spider.html2utf8(html, source_code)
    payload = {"md5": md5, "html": converted}

    html_table = CONFIG.G_TABLE_HTML["name"]
    html_division = CONFIG.G_TABLE_HTML["division"]

    # Pick the verb first so the queueing call is written only once.
    if self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}):
        verb = "update"
    else:
        verb = "insert"

    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE, html_table,
        verb, payload, "md5", html_division
    )
def _save_html(self, md5, html):
    """Queue the page source of the current site task for storage.

    The method returns without doing anything when any of these holds:

    * ``self._test`` is truthy;
    * ``CONFIG.G_IFSAVE_HTML`` compares equal to ``False``;
    * ``CONFIG.G_IFSAVE_PASS`` compares equal to ``False`` and the task type
      in ``self._site`` differs from the "detail" page type configured in
      ``CONFIG.G_SITE_COMMON.G_PAGETYPE``.

    Otherwise ``html`` is converted by ``spider.html2utf8`` using the site's
    ``default_code`` and passed to ``sql.data2redis`` as an ``"insert"`` (no
    hit from ``self._db_had``) or an ``"update"``.

    Always returns ``None``.
    """
    # Short-circuit order matches the guards: test mode first, then the flag.
    if self._test or CONFIG.G_IFSAVE_HTML == False:
        return

    if CONFIG.G_IFSAVE_PASS == False:
        task_type = self._site["task"]["type"]
        detail_type = CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]
        if task_type != detail_type:
            # Only "detail" pages are kept while G_IFSAVE_PASS is off.
            return

    default_code = self._site["config"]["default_code"]
    record = {"md5": md5, "html": spider.html2utf8(html, default_code)}

    table_name = CONFIG.G_TABLE_HTML["name"]
    table_division = CONFIG.G_TABLE_HTML["division"]

    already_stored = self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5})
    operation = "update" if already_stored else "insert"
    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE, table_name,
        operation, record, "md5", table_division
    )
def _insert2sql(self, links, check_list): if self._test: return 0 db_had = self._db_had(CONFIG.G_TABLE_LINK, check_list) last_time = time.time() table = CONFIG.G_TABLE_LINK["name"] division = CONFIG.G_TABLE_LINK["division"] print "GET LINK: ", len(links) count = 0 for item in links: if item["md5"] in db_had: continue item["depth"] = self._site["new_depth"] item["domain"] = self._site["domain"] item["last_time"] = last_time sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table,\ "insert", item, "md5", division) count += 1 return count
def _save_html(self, md5, html):
    """Store the fetched page source via the SQL queue, unless disabled.

    Skipped entirely (returns ``None``) in test mode, when
    ``CONFIG.G_IFSAVE_HTML`` compares equal to ``False``, or when
    ``CONFIG.G_IFSAVE_PASS`` compares equal to ``False`` and the current
    task type is not the configured "detail" page type.

    Otherwise the text is converted by ``spider.html2utf8`` using
    ``self._site["config"]["default_code"]`` and passed to
    ``sql.data2redis``: ``"insert"`` if ``self._db_had`` finds no row for
    ``md5``, ``"update"`` if it does.

    Always returns ``None``.
    """
    if self._test:
        return
    # Equality comparisons are kept so that only False-equal values switch
    # these features off.
    if CONFIG.G_IFSAVE_HTML == False:
        return

    pass_pages_dropped = CONFIG.G_IFSAVE_PASS == False
    if pass_pages_dropped and (
        self._site["task"]["type"]
        != CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]
    ):
        return

    source_code = self._site["config"]["default_code"]
    converted = spider.html2utf8(html, source_code)
    payload = {"md5": md5, "html": converted}

    html_table = CONFIG.G_TABLE_HTML["name"]
    html_division = CONFIG.G_TABLE_HTML["division"]

    # Decide the verb up front so the queueing call appears only once.
    if self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}):
        verb = "update"
    else:
        verb = "insert"

    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE, html_table,
        verb, payload, "md5", html_division
    )
def _data2redis_sql(self, sqldata, table_cfg, op_type):
    """Forward ``sqldata`` to ``sql.data2redis`` for the given table config.

    ``table_cfg`` must provide ``"name"`` and ``"division"`` entries; both
    are read before the call.  ``op_type`` is passed through unchanged as
    the operation, and ``"md5"`` is always passed as the argument that
    follows the data.

    Always returns ``None``.
    """
    table_name, table_division = table_cfg["name"], table_cfg["division"]
    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE,
        table_name, op_type, sqldata, "md5", table_division
    )