def get(url, heads=None, encode=False, timeout=30, use_proxy=0, d_config=None):
    """Fetch ``url`` through ``_get``, optionally converting the body to UTF-8.

    The third "/"-separated piece of ``url`` (the host position in a
    ``scheme://host/...`` URL) is handed to ``_get`` as the domain.

    Args:
        url: URL string to fetch. It must contain at least two "/" so that
            a domain piece can be extracted.
        heads: Forwarded unchanged to ``_get``.
        encode: When equal to ``True`` and ``d_config`` is truthy, the second
            element of the ``_get`` result is passed through
            ``spider.html2utf8`` with ``d_config["default_code"]``.
        timeout: Forwarded unchanged to ``_get``.
        use_proxy: Forwarded unchanged to ``_get``.
        d_config: Optional configuration mapping. When truthy it is forwarded
            to ``_get`` as an extra argument; when falsy no conversion is
            performed even if ``encode`` is set.

    Returns:
        ``(-2, "URL error :" + url)`` when no domain piece can be extracted
        from ``url``; otherwise whatever ``_get`` returns (with the second
        element converted when encoding was requested and applicable).
    """
    pieces = url.split("/")
    # Checking only for the presence of "/" is not enough: "a/b" contains a
    # slash but has no third piece, which previously raised IndexError
    # instead of producing the URL-error result.
    if len(pieces) < 3:
        return (-2, "URL error :" + url)
    domain = pieces[2]

    if not d_config:
        return _get(url, domain, heads, timeout, use_proxy)

    result = _get(url, domain, heads, timeout, use_proxy, d_config)
    # Exact equality is kept on purpose: switching to plain truthiness would
    # start converting for callers that pass other truthy values.
    if encode == True:  # noqa: E712
        # assumes _get returns a 2-tuple (info, html) - TODO confirm
        info, html = result
        result = (info, spider.html2utf8(html, d_config["default_code"]))
    return result
def _save_html(self, md5, html, d_config):
    """Hand a page's HTML to ``sql.data2redis`` as an insert or an update.

    Does nothing when ``CONFIG.G_IFSAVE_HTML`` equals ``False``. Otherwise
    the HTML is converted with ``spider.html2utf8`` using the default code
    from ``d_config["config"]["default_code"]`` and passed on, keyed by
    ``md5``. The action is "insert" when ``self._db_had`` reports no
    existing row for that md5, and "update" otherwise.

    Args:
        md5: Key identifying the page in the HTML table.
        html: Page content to convert and store.
        d_config: Mapping whose ``["config"]["default_code"]`` names the
            source encoding.
    """
    # Exact comparison with False is kept: a None setting must not disable saving.
    if CONFIG.G_IFSAVE_HTML == False:  # noqa: E712
        return

    source_code = d_config["config"]["default_code"]
    record = {"md5": md5, "html": spider.html2utf8(html, source_code)}

    html_table = CONFIG.G_TABLE_HTML
    table_name = html_table["name"]
    table_division = html_table["division"]

    action = "update" if self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}) else "insert"
    sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table_name,
                   action, record, "md5", table_division)
def _save_html(self, md5, html, d_config):
    """Convert a page's HTML and pass it to ``sql.data2redis``.

    Returns immediately when ``CONFIG.G_IFSAVE_HTML`` equals ``False``.
    The HTML is run through ``spider.html2utf8`` with the default code taken
    from ``d_config["config"]["default_code"]``. ``self._db_had`` decides
    whether the record goes out as an "insert" (md5 not present) or an
    "update" (md5 already present).

    Args:
        md5: Key of the page record.
        html: Raw page content.
        d_config: Mapping providing ``["config"]["default_code"]``.
    """
    # Only a value equal to False switches saving off (None does not).
    if CONFIG.G_IFSAVE_HTML == False:  # noqa: E712
        return

    converted = spider.html2utf8(html, d_config["config"]["default_code"])
    payload = {"md5": md5, "html": converted}
    name = CONFIG.G_TABLE_HTML["name"]
    division = CONFIG.G_TABLE_HTML["division"]

    if self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}):
        operation = "update"
    else:
        operation = "insert"

    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE, name,
        operation, payload, "md5", division,
    )
def _save_html(self, md5, html):
    """Hand the current site's page HTML to ``sql.data2redis``.

    Nothing is saved when any of these hold:
      * ``self._test`` is truthy;
      * ``CONFIG.G_IFSAVE_HTML`` equals ``False``;
      * ``CONFIG.G_IFSAVE_PASS`` equals ``False`` and the task type in
        ``self._site["task"]["type"]`` differs from the "detail" page type
        in ``CONFIG.G_SITE_COMMON.G_PAGETYPE``.

    Otherwise the HTML is converted with ``spider.html2utf8`` using
    ``self._site["config"]["default_code"]`` and sent as an "insert" when
    ``self._db_had`` finds no row for ``md5``, or as an "update" when it does.

    Args:
        md5: Key of the page record.
        html: Raw page content.
    """
    if self._test:
        return
    # Exact comparison with False is kept: a None setting must not disable saving.
    if CONFIG.G_IFSAVE_HTML == False:  # noqa: E712
        return
    if CONFIG.G_IFSAVE_PASS == False:  # noqa: E712
        # The page types are only looked up when the flag above is off.
        task_type = self._site["task"]["type"]
        detail_type = CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]
        if task_type != detail_type:
            return

    source_code = self._site["config"]["default_code"]
    record = {"md5": md5, "html": spider.html2utf8(html, source_code)}
    table_name = CONFIG.G_TABLE_HTML["name"]
    table_division = CONFIG.G_TABLE_HTML["division"]

    already_stored = self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5})
    action = "update" if already_stored else "insert"
    sql.data2redis(self._redis, CONFIG.G_SQL_QUEUE, table_name,
                   action, record, "md5", table_division)
def _save_html(self, md5, html):
    """Convert the page HTML and pass it to ``sql.data2redis``.

    Skipped when ``self._test`` is truthy, when ``CONFIG.G_IFSAVE_HTML``
    equals ``False``, or when ``CONFIG.G_IFSAVE_PASS`` equals ``False``
    while the site's task type is not the "detail" page type.

    The HTML is converted via ``spider.html2utf8`` with the site's
    ``default_code``. The record is sent as "insert" if ``self._db_had``
    reports no row with this ``md5``, and as "update" otherwise.

    Args:
        md5: Key of the page record.
        html: Raw page content.
    """
    # Short-circuit order matches the original: test mode is checked first.
    if self._test or CONFIG.G_IFSAVE_HTML == False:  # noqa: E712
        return

    if CONFIG.G_IFSAVE_PASS == False:  # noqa: E712
        current_type = self._site["task"]["type"]
        if current_type != CONFIG.G_SITE_COMMON.G_PAGETYPE["detail"]["type"]:
            return

    converted = spider.html2utf8(html, self._site["config"]["default_code"])
    payload = {"md5": md5, "html": converted}
    name = CONFIG.G_TABLE_HTML["name"]
    division = CONFIG.G_TABLE_HTML["division"]

    if self._db_had(CONFIG.G_TABLE_HTML, {"md5": md5}):
        operation = "update"
    else:
        operation = "insert"

    sql.data2redis(
        self._redis, CONFIG.G_SQL_QUEUE, name,
        operation, payload, "md5", division,
    )