Exemplo n.º 1
0
class Base(object):
    def __init__(self):
        self.local = LOCAL
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)

    def _get_proxy(self):
        if self.local:
            return requests.get(LOCAL_PROXY_URL).text.strip()
        else:
            random_num = random.randint(0, 10)
            if random_num % 2:
                time.sleep(1)
                return requests.get(PROXY_URL).text.strip()
            else:
                return requests.get(LOCAL_PROXY_URL).text.strip()

    def get(self, url):
        count = 0
        while True:
            count += 1
            if count > 10:
                return None
            try:
                proxy = {"proxy": self._get_proxy()}
                print("proxy is >> {}".format(proxy))
                resp = requests.get(url, headers=self.headers, proxies=proxy)
            except:
                traceback.print_exc()
                time.sleep(0.5)
            else:
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 404:
                    return None
                else:
                    print("status_code: >> {}".format(resp.status_code))
                    time.sleep(1)
                    pass

    def convert_dt(self, time_stamp):
        d = str(datetime.datetime.fromtimestamp(time_stamp))
        return d

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        ks = sorted(ks)
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}` '''.format(
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql

    def _filter_char(self, test_str):
        # 处理特殊的空白字符
        # '\u200b' 是 \xe2\x80\x8b
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # 把 \xa0 替换成普通的空格
        return test_str

    def _process_content(self, vs):
        # 去除 4 字节的 utf-8 字符,否则插入mysql时会出错
        try:
            # python UCS-4 build的处理方式
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # python UCS-2 build的处理方式
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # 对插入数据进行一些处理
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        content = "".join(params).strip()
        return content

    def _get_values(self, item: dict):
        # self.fields: []  插入所需字段列表 同时与上文的 ks = sorted(ks) 对应
        value = tuple(item.get(field) for field in sorted(self.fields))
        return value

    def _save(self, item):
        insert_sql = self._contract_sql(item)
        value = self._get_values(item)
        try:
            ret = self.sql_pool.insert(insert_sql, value)
        except pymysql.err.IntegrityError:
            print("重复数据 ")
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _save_many(self, items):
        values = [self._get_values(item) for item in items]  # list of tuple
        insert_many_sql = self._contract_sql(items[0])
        try:
            ret = self.sql_pool.insert_many(insert_many_sql, values)
        except pymysql.err.IntegrityError:
            print("批量中有重复数据")
        except:
            traceback.print_exc()
        else:
            return ret
        finally:
            self.sql_pool.end()

    def save_one(self, item):
        self._save(item)
        self.sql_pool.end()

    def save(self, items):
        ret = self._save_many(items)
        if not ret:
            print("批量保存失败 开始单独保存 .. ")
            count = 0
            for item in items:
                print(item)
                self._save(item)
                count += 1
                if count > 9:
                    self.sql_pool.end()
                    count = 0
            # self.sql_pool.dispose()
            self.sql_pool.end()
        else:
            print("批量成功..")
            print(items)
            print(len(items))

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        try:
            self._init_pool()
            self._start()
        except:
            traceback.print_exc()
Exemplo n.º 2
0
class STCN_Base(object):
    def __init__(self):
        self.table = "stcn_info"
        self.local = LOCAL
        self.check_dt = datetime.datetime.today() - datetime.timedelta(days=2)
        self.dt_fmt = '%Y-%m-%d'

        # if self.local:
        #     conf = {
        #         "host": LOCAL_MYSQL_HOST,
        #         "port": LOCAL_MYSQL_PORT,
        #         "user": LOCAL_MYSQL_USER,
        #         "password": LOCAL_MYSQL_PASSWORD,
        #         "db": LOCAL_MYSQL_DB,
        #     }
        # else:
        #     conf = {
        #         "host": MYSQL_HOST,
        #         "port": MYSQL_PORT,
        #         "user": MYSQL_USER,
        #         "password": MYSQL_PASSWORD,
        #         "db": MYSQL_DB,
        #     }
        # self.sql_pool = PyMysqlPoolBase(**conf)

        # 默认是不需要翻页的
        self.pages = False
        self.extractor = GeneralNewsExtractor()

    def _init_pool(self):
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
        self.sql_pool = PyMysqlPoolBase(**conf)

    def _get(self, url):
        resp = requests.get(url)
        if resp.status_code == 200:
            return resp.text

    def _extract_content(self, body):
        result = self.extractor.extract(body)
        content = result.get("content")
        return content

    def _parse_detail(self, body):
        try:
            doc = html.fromstring(body)
            node = doc.xpath("//div[@class='txt_con']")[0]
            content = node.text_content()
        except:
            content = None
        else:
            return content
        if not content:
            content = self._extract_content(body)
            return content

    def _filter_char(self, test_str):
        # 处理特殊的空白字符
        # '\u200b' 是 \xe2\x80\x8b
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # 把 \xa0 替换成普通的空格
        return test_str

    def _process_content(self, vs):
        # 去除 4 字节的 utf-8 字符,否则插入mysql时会出错
        try:
            # python UCS-4 build的处理方式
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # python UCS-2 build的处理方式
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # 对插入数据进行一些处理
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        content = "".join(params).strip()
        return content

    def _contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        ks = sorted(ks)  # article,link,pub_date,title
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}` '''.format(
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        # return base_sql, tuple(vs)
        return base_sql

    def _save(self, item):
        insert_sql = self._contract_sql(item)
        # print(insert_sql)
        value = (item.get("article"), item.get("link"), item.get("pub_date"),
                 item.get("title"))
        # print(value)
        try:
            ret = self.sql_pool.insert(insert_sql, value)
        except pymysql.err.IntegrityError:
            # print("重复数据 ")
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _save_many(self, items):
        values = [(item.get("article"), item.get("link"), item.get("pub_date"),
                   item.get("title")) for item in items]
        insert_many_sql = self._contract_sql(items[0])
        try:
            ret = self.sql_pool.insert_many(insert_many_sql, values)
        except pymysql.err.IntegrityError:
            print("批量中有重复数据")
        except:
            traceback.print_exc()
        else:
            return ret
        finally:
            self.sql_pool.end()

    def _add_article(self, item: dict):
        link = item.get("link")
        if link:
            detail_page = self._get(link)
            if detail_page:
                article = self._parse_detail(detail_page)
                if article:
                    item['article'] = article
                    return True
        return False

    def _check_dt(self, pub_dt):
        if not pub_dt:
            return False

        try:
            pub_dt = datetime.datetime.strptime(pub_dt[:10], self.dt_fmt)
        except:
            print("截取增量时间点失败.. 重新爬取.. ")
            # traceback.print_exc()
            return False

        if pub_dt < self.check_dt:
            print("当前天: ", pub_dt)
            print("检查时刻: ", self.check_dt)
            print("增量结束 .. ")
            return True
        else:
            return False

    def _start(self):
        self._init_pool()
        if not self.pages:
            list_body = self._get(self.list_url)
            if list_body:
                items = self._parse_list_body(list_body)
                count = 0
                for item in items:
                    if self._check_dt(item.get("pub_date")):
                        self.sql_pool.end()
                        return
                    ret = self._save(item)
                    if ret:
                        count += 1
                        # print("保存成功: {}".format(item))
                    else:
                        # print("保存失败: {}".format(item))
                        pass
                    if count > 9:
                        self.sql_pool.end()
                        print("提交 .. ")
                        count = 0
                self.sql_pool.dispose()
        else:
            count = 0
            for page in range(1, self.page_num + 1):
                print("\nThe page is {}".format(page))
                list_url = self.format_url.format(page)
                print(list_url)
                list_body = self._get(list_url)
                if list_body:
                    items = self._parse_list_body(list_body)
                    for item in items:
                        if self._check_dt(item.get("pub_date")):
                            self.sql_pool.end()
                            return
                        ret = self._save(item)
                        if ret:
                            count += 1
                            # print("保存成功: {}".format(item))
                        else:
                            # print("保存失败: {}".format(item))
                            pass
                        if count > 9:
                            self.sql_pool.end()
                            print("提交 .. ")
                            count = 0

    def __del__(self):
        try:
            self.sql_pool.dispose()
        except:
            pass

    def start(self):
        print("{} 开始爬取".format(self.name))
        try:
            self._start()
        except:
            traceback.print_exc()
            print("{} 爬取失败".format(self.name))