Example #1
    def __init__(self):
        self.Db = Db("china_regions")
        china = pd.read_csv('news/china_city_list.csv', encoding='gbk')
        self.province = list(china.groupby(by=['Province']).count().axes[0])
        self.city = list(china.groupby(by=['City']).count().axes[0])
        self.filelists = ['google_steal.txt', 'google_posion.txt', 'bjh', 'bjh_detail', 'bjh_detail_poison',
                          'news_steal.txt', 'news_poison.txt']
        self.city_province = {}
        self.province_map = {}

        self.pre_data()
        for index, row in china.iterrows():
            self.city_province[row['City']] = row['Province']
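
A minimal sketch of the groupby trick above, using a hypothetical two-column frame in place of news/china_city_list.csv; .groupby(...).count().axes[0] simply yields the unique group keys, so Series.unique() gives the same content:

import pandas as pd

china = pd.DataFrame({'Province': ['Beijing', 'Guangdong', 'Guangdong'],
                      'City': ['Beijing', 'Guangzhou', 'Shenzhen']})
province = list(china.groupby(by=['Province']).count().axes[0])  # ['Beijing', 'Guangdong']
province_alt = sorted(china['Province'].unique())                # same content, more direct
city_province = dict(zip(china['City'], china['Province']))      # the mapping the iterrows loop builds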
Example #2
 def __init__(self):
     self.Db = Db("proxy")
     self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
     self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
     self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s '''
     self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s'''
     self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
     self.canuseip = {}
     self.waitjudge = []
     self.cannotuseip = {}
     self.failuredtime = {}
     self.canuse_proxies = []
     self.initproxy()
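
A small sketch of how these %s templates get filled elsewhere in the project (see insert_proxy / insertproxy in the later examples): the values are interpolated as plain Python strings, with str(list_of_tuples)[1:-1] stripping the outer brackets, not as parameterized queries:

insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
rows = [('1.2.3.4:8080', 0), ('5.6.7.8:3128', 1)]
sql = insert_sql % str(rows)[1:-1]
# -> INSERT INTO ip_proxy( `address`, `http_type`) VALUES ('1.2.3.4:8080', 0), ('5.6.7.8:3128', 1)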
Example #3
 def __init__(self):
     self.Db = Db("netease")
     self.classifylist = {}
     self.playlists = []
     self.failuredmap = {}
     self.songmap = {}
     self.songlist = []
     self.finishlist = []
     self.get_classify()
     self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
     self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
     self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
     self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
     self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
     self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
     self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''
Example #4
 def __init__(self):
     self.Db = Db("proxy")
     self.insert_sql = """INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s """
     self.select_list = (
         """SELECT address, http_type from ip_proxy WHERE `is_failured` = 0"""
     )
     self.select_sql = """SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s """
     self.select_all = """SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s"""
     self.random_select = """SELECT `address`, `http_type` FROM ip_proxy WHERE `is_failured` >= 5 and (`id` >= ((SELECT MAX(`id`) FROM ip_proxy)-(SELECT MIN(`id`) FROM ip_proxy)) * RAND() + (SELECT MIN(`id`) FROM ip_proxy)) and http_type in %s LIMIT 6000"""
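      # random_select: re-sample up to 6000 proxies that have already been marked failed (is_failured >= 5),
      # starting from a random id offset, so that they can be judged again later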
     self.replace_ip = """REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s"""
     self.can_use_ip = {}
     self.waitjudge = []
     self.cannot_use_ip = {}
     self.failured_time = {}
     self.canuse_proxies = []
     self.init_proxy()
Example #5
 def __init__(self):
     self.Db = Db("blog")
     self.local_views = {}
     self.title_map = {}
     self.title2slug = {}
     self.zhihu_views = {}
     self.zhihu_id = {}
     self.jianshu_views = {}
     self.jianshu_id = {}
     self.csdn_views = {}
     self.csdn_id = {}
     self.exist_data = {}
     self.getTitleMap()
     self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
     self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
     self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''
Example #6
 def __init__(self):
     super(ActivateArticle, self).__init__()
     self.Db = Db("tbk")
     self.Db.create_table(os.path.join(root_dir, "tpwd.sql"))
     self.Db.create_table(os.path.join(root_dir, "article.sql"))
     self.tpwd_map = {}
     self.tpwd_db_map = {}
     self.tpwds = {}
     self.cookies = {}
     self.share2article = {}
     self.article_list = {}
     self.list_recent = {}
     self.idx = []
     self.empty_content = ""
     self.tpwd_exec = ThreadPoolExecutor(max_workers=20)
     self.need_del = {}
     self.get_share_list()
Example #7
class Get_playlist_song():
    """
    1. get playlist id from classify;
    2. get song from play list;
    use url:
    """
    def __init__(self):
        self.Db = Db("netease")
        self.classifylist = {}
        self.playlists = []
        self.failuredmap = {}
        self.songmap = {}
        self.songlist = []
        self.finishlist = []
        self.get_classify()
        self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
        self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
        self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
        self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
        self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
        self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
        self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''

    def get_classify(self):
        """
        get classify from /discover/playlist
        """

        version = begin_time()
        self.classifylist = {}
        host = 'https://music.163.com/discover/playlist'
        html = proxy_req(host, 0)

        if not html:
            print('Empty')
            if can_retry(host):
                self.get_classify()
            return []

        alist = html.find_all('a', class_='s-fc1')
        if not len(alist):
            if can_retry(host):
                self.get_classify()
            print(html)
        for index in alist:
            self.classifylist[index.text] = index['href']
        end_time(version)

    def get_playlist_id(self, classify, offset):
        """
        get playlist id from classify
        """

        host = 'https://music.163.com'
        allclassify = classify == '全部风格'
        url = host + self.classifylist[classify] + (
            '?' if allclassify else
            '&') + 'order=hot&limit=35&offset=' + str(offset)
        html = basic_req(url, 0)

        if not html:
            if can_retry(url):
                self.get_playlist_id(classify, offset)
            return []
        alist = html.find_all('a', class_='icon-play')
        if not len(alist):
            if can_retry(url):
                self.get_playlist_id(classify, offset)
        for index in alist:
            self.playlists.append(index['data-res-id'])

    def get_playlist_id_thread(self):
        """
        get play list id in threading
        """

        version = begin_time()
        if not len(self.classifylist):
            self.get_classify()

        for index in self.classifylist:
            threadings = []
            for offset in range(41):
                work = threading.Thread(target=self.get_playlist_id,
                                        args=(
                                            index,
                                            offset * 35,
                                        ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            print(len(self.playlists))
            self.test_queue(index)
            self.playlists = []
            print(index + " Over")
        end_time(version)

    def test_queue(self, classify):
        """
        check whether playlist ids are already in playlist_queue
        """
        if len(self.playlists) == 1:
            waitlist = '(' + str(self.playlists[0]) + ')'
        else:
            waitlist = tuple(self.playlists)
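        # str(tuple([pid])) would give '(pid,)', and the trailing comma breaks the SQL IN clause,
        # hence the manual '(...)' wrapping when there is only one playlist id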
        results = self.Db.select_db(self.select_one %
                                    (str(waitlist), classify))
        if not results:
            return []
        hadexist = []
        for index in results:
            hadexist.append(index[0])
        insertlist = []
        for index in self.playlists:
            if index not in hadexist:
                # file_d.write(str([index, classify])[1:-1] + '\n')
                insertlist.append((index, classify))
        print('Insert ' + str(len(insertlist)) + ' ' + classify)
        self.insert_queue(insertlist)

    def insert_queue(self, ids):
        """
        insert data to playlist_queue
        """

        if not len(ids):
            return []
        results = self.Db.insert_db(self.insert_sql % str(ids)[1:-1])
        if results:
            if len(ids):
                print('Insert ' + ids[0][1] + ' ' + str(len(ids)) +
                      ' Success!')
        else:
            pass

    def get_list_ids(self, classify):
        """
        get list ids from db
        """
        results = self.Db.select_db(self.select_ids % classify)
        ids = []
        if results:
            for index in results:
                ids.append([index[0], index[1]])
        return ids

    def get_song_detail_thread(self):
        """
        get song detail threadings
        """

        version = begin_time()
        for classify in self.classifylist:
            ids = self.get_list_ids(classify)
            threadings = []
            for oneid in ids:
                work = threading.Thread(target=self.get_song_detail,
                                        args=(oneid[1], ))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.clean_data()
            self.test_song(classify, ids)
            self.songlist = []
            self.songmap = {}
            self.finishlist = []
            self.successtime = 0
            print(classify + ' Over!')
        end_time(version)

    def clean_data(self):
        """
        aggregate the crawled song data
        """
        for song in self.songlist:
            [songid, songname, playcount] = song
            if songid not in self.songmap:
                self.songmap[songid] = [1, playcount, songname]
            else:
                orgin = self.songmap[songid]
                self.songmap[songid] = [
                    orgin[0] + 1, orgin[1] + playcount, songname
                ]

    def get_song_detail(self, id):
        """
        get song detail from playlist
        """

        host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
        json = proxy_req(host, 1)
        if json == 0:
            if can_retry(host):
                self.get_song_detail(id)
            return []
        result = json['result']
        tracks = result['tracks']

        if len(tracks) <= 1:
            if can_retry(host):
                self.get_song_detail(id)
            return []
        else:
            playcount = result['playCount']
            for track in tracks:
                songid = track['id']
                songname = track['name']
                self.songlist.append([songid, songname, playcount])
            self.finishlist.append(id)

    def test_song(self, classify, ids):
        """
        check whether songs are already in db
        """
        songs = []
        for song in self.songmap:
            songs.append(song)
        if not len(songs):
            return []
        elif len(songs) == 1:
            waitlist = '(' + str(songs[0]) + ')'
        else:
            waitlist = tuple(songs)
        results = self.Db.select_db(self.select_song %
                                    (str(waitlist), classify))
        resultmap = {}
        for detail in results:
            resultmap[detail[1]] = [detail[0], detail[2], detail[3]]

        replacelist = []
        insertlist = []
        replacequeue = []
        file_d = codecs.open("song_detail", 'a', encoding='utf-8')
        file_d.seek(0)
        file_d.truncate()
        idsmap = {}
        for indexid in ids:
            idsmap[indexid[1]] = indexid[0]
        for song in self.songmap:
            songdetail = self.songmap[song]
            if song in resultmap:
                dbdetail = resultmap[song]
                replacelist.append(
                    (dbdetail[0], song, classify, songdetail[2],
                     songdetail[0] + dbdetail[1], songdetail[1] + dbdetail[2]))
            else:
                file_d.write(u'' + str([
                    song, u'' + str(u'' + songdetail[2].replace(',', ' '))
                    [0:20], classify, songdetail[0], songdetail[1]
                ])[1:-1] + '\n')
                insertlist.append((song, songdetail[2], classify,
                                   songdetail[0], songdetail[1]))
        for playlist in self.finishlist:
            replacequeue.append((idsmap[playlist], playlist, classify, 1))
        file_d.close()
        if len(insertlist):
            self.db_song_detail(insertlist, 'Insert', replacequeue)
        if len(replacelist):
            self.db_song_detail(replacelist, 'Update', [])

    def db_song_detail(self, waitlist, types, replacequeue):
        """
        batch insert/update song detail
        """

        if types == 'Update':
            results = self.Db.update_db(self.replace_song %
                                        str(waitlist)[1:-1])
        else:
            results = self.Db.update_db(self.insert_song)
        if results:
            if len(waitlist):
                print(types + ' song detail for ' + waitlist[0][2] + ' ' +
                      str(len(waitlist)) + ' Success!')
            if types == 'Insert':
                self.replace_queue_db(replacequeue)

    def replace_queue_db(self, replacequeue):
        """
        mark finished playlist ids in db
        """

        results = self.Db.update_db(self.replace_queue %
                                    str(replacequeue)[1:-1])
        if results:
            if len(replacequeue):
                print('Update queue finish for ' + str(len(replacequeue)) +
                      ' item!')
        else:
            pass
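
A minimal usage sketch for this class (a sketch only, assuming the original project's helpers such as proxy_req, basic_req, begin_time and the Db wrapper are importable):

if __name__ == '__main__':
    spider = Get_playlist_song()
    spider.get_playlist_id_thread()    # fill playlist_queue for every classify
    spider.get_song_detail_thread()    # crawl song details and write them back to the db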
Example #8
class GetFreeProxy:
    """ proxy pool """
    def __init__(self):
        self.Db = Db("proxy")
        self.insert_sql = """INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s """
        self.select_list = (
            """SELECT address, http_type from ip_proxy WHERE `is_failured` = 0"""
        )
        self.select_sql = """SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s """
        self.select_all = """SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s"""
        self.random_select = """SELECT `address`, `http_type` FROM ip_proxy WHERE `is_failured` >= 5 and (`id` >= ((SELECT MAX(`id`) FROM ip_proxy)-(SELECT MIN(`id`) FROM ip_proxy)) * RAND() + (SELECT MIN(`id`) FROM ip_proxy)) and http_type in %s LIMIT 6000"""
        self.replace_ip = """REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s"""
        self.can_use_ip = {}
        self.waitjudge = []
        self.cannot_use_ip = {}
        self.failured_time = {}
        self.canuse_proxies = []
        self.init_proxy()

    def proxy_req(
        self,
        url: str,
        types: int,
        data=None,
        header=None,
        test_func=None,
        need_cookie: bool = False,
        config: dict = {},
        proxies: dict = {},
    ):
        """
        use a proxy to send requests, and record proxies that can't be used
        @types S0XY: X=0.->get;   =1.->post;
                     Y=0.->html;  =1.->json; =2.->basic
                     S=0.->basic ;=1.->ss

        supports retry on failure && automatic failure recording
        """

        httptype = url[4] == "s"
        ss_type = types // 1000
        types %= 1000
        if ss_type:
            proxylist = self.proxylists_ss if httptype else self.proxylist_ss
        else:
            proxylist = self.proxylists if httptype else self.proxylist

        if proxies != {}:
            proxies = proxies
        elif not len(proxylist):
            if self.Db.db:
                echo(
                    "0|critical",
                    "Proxy pool empty!!! Please check the db conn & db dataset!!!",
                )
            proxies = {}
        else:
            index = random.randint(0, len(proxylist) - 1)
            proxies_url = proxylist[index]
            proxies = {type_map[httptype]: proxies_url}

        try:
            result = basic_req(
                url,
                types=types,
                proxies=proxies,
                data=data,
                header=header,
                need_cookie=need_cookie,
                config=config,
            )
            if test_func is not None:
                if not test_func(result):
                    if self.check_retry(url):
                        return self.proxy_req(
                            url,
                            types=types + 1000 * ss_type,
                            data=data,
                            header=header,
                            test_func=test_func,
                            need_cookie=need_cookie,
                            config=config,
                            proxies=proxies,
                        )
                    else:
                        self.failured_time[url] = 0
                        return
                return result
            return result

        except:
            self.cannot_use_ip[random.randint(0, MAXN)] = proxies_url

            if proxies_url in proxylist:
                proxylist.remove(proxies_url)

            if not len(self.cannot_use_ip.keys()) % 10:
                self.clean_cannot_use()

            if self.check_retry(url):
                return self.proxy_req(
                    url,
                    types=types + 1000 * ss_type,
                    data=data,
                    test_func=test_func,
                    header=header,
                    need_cookie=need_cookie,
                    config=config,
                    proxies=proxies,
                )

    def check_retry(self, url: str) -> bool:
        """ check try time """
        if url not in self.failured_time:
            self.failured_time[url] = 0
            return True
        elif self.failured_time[url] < 3:
            self.failured_time[url] += 1
            return True
        else:
            self.log_write(url)
            self.failured_time[url] = 0
            return False

    def log_write(self, url: str):
        """ failure log """
        echo("0|warning", "url {} retry max time".format(url))

    def insert_proxy(self, insert_list: list):
        """ insert data to db """
        results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1])
        if results:
            echo("2|info",
                 "Insert " + str(len(insert_list)) + " items Success!")

    def update_proxy(self, update_list: list, types: int):
        """ update data to db"""
        results = self.Db.update_db(self.replace_ip % str(update_list)[1:-1])
        typemap = {0: "can use ", 1: "can not use "}
        if results:
            echo(
                "2|info",
                "Update",
                typemap[types],
                str(len(update_list)),
                " items Success!",
            )

    def select_proxy(self, target_list: list) -> list:
        """ select ip proxy by ids """
        if not len(target_list):
            return []
        elif len(target_list) == 1:
            waitlist = "('" + target_list[0] + "')"
        else:
            waitlist = tuple(target_list)
        return self.Db.select_db(self.select_sql % str(waitlist))

    def db_can_use_proxy(self):
        """ test db have or not this data """

        results = self.select_proxy([ii[0] for ii in self.can_use_ip.values()])
        ss_len = len([1 for ii in self.can_use_ip.values() if ii[1] > 1])
        echo("2|info", "SS proxies", ss_len)

        insert_list = []
        update_list = []
        ip_map = {}
        if results != False:
            for ip_info in results:
                ip_map[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.can_use_ip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ip_map:
                    if ip_map[ip_now][1]:
                        update_list.append(
                            (ip_map[ip_now][0], ip_now, http_type, 0))
                else:
                    insert_list.append((ip_now, http_type))
            if len(insert_list):
                self.insert_proxy(insert_list)
            if len(update_list):
                self.update_proxy(update_list, 0)
        else:
            pass
        self.can_use_ip = {}

    def clean_cannot_use(self):
        """ update db proxy cannot use """
        results = self.select_proxy(self.cannot_use_ip.values())
        update_list = []
        ip_map = {}
        if results:
            for ip_info in results:
                ip_map[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.cannot_use_ip.values():
                http_type = ip_now[4] == "s"
                if ip_now in ip_map:
                    update_list.append((ip_map[ip_now][0], ip_now, http_type,
                                        ip_map[ip_now][1] + 1))

            if len(update_list):
                self.update_proxy(update_list, 1)
        self.cannot_use_ip = {}

    def init_proxy(self):
        """ init proxy list """

        results = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if not results:
            echo("0|error",
                 "Please check db configure!!! The proxy pool cant use!!!>>>")
            return
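        # http_type column: 0 -> http, 1 -> https, 2 -> http (also ss-capable), 3 -> https (also ss-capable)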
        for index in results:
            if index[1] == 1:
                self.proxylists.append(index[0])
            elif index[1] == 2:
                self.proxylist.append(index[0])
                self.proxylist_ss.append(index[0])
            elif index[1] == 3:
                self.proxylists.append(index[0])
                self.proxylists_ss.append(index[0])
            else:
                self.proxylist.append(index[0])
        echo("2|info", len(self.proxylist), " http proxy can use.")
        echo("2|info", len(self.proxylists), " https proxy can use.")
        echo("2|info", len(self.proxylist_ss), " ss http proxy can use.")
        echo("2|info", len(self.proxylists_ss), " ss https proxy can use.")

    def judge_url(self,
                  urls: str,
                  index: int,
                  times: int,
                  ss_test: bool = False):
        """
        use /api/playlist to judge http; use /discover/playlist to judge https
        1. the request must not time out (timeout = 5)
        2. response.result.tracks.size() != 1
        """

        http_type = urls[4] == "s"
        proxies = {type_map[http_type]: urls}

        test_url = (type_map[http_type] +
                    "://music.163.com/api/playlist/detail?id=432853362")
        ss_url = "https://www.google.com/?gws_rd=ssl"
        try:
            data = basic_req(test_url, 1, proxies)
            result = data["result"]
            tracks = result["tracks"]
            if len(tracks) == 10:
                if times < 0:
                    self.judge_url(urls, index, times + 1)
                else:
                    echo("1|debug", urls, proxies, "Proxies can use.")
                    self.canuse_proxies.append(urls)
                    self.can_use_ip[index] = [urls, int(http_type)]
                    if ss_test:
                        data = basic_req(ss_url, 0)
                        if len(str(data)) > 5000:
                            self.can_use_ip[index] = [urls, int(http_type) + 2]
            else:
                echo("0|debug", urls, proxies, "Tracks len error ^--<^>--^ ")
                self.cannot_use_ip[index] = urls
        except:
            echo("0|debug", urls, proxies, "return error [][][][][][]")
            if not index in self.can_use_ip:
                self.cannot_use_ip[index] = urls

    def thread_judge(self, batch_size: int = 500):
        """ threading to judge proxy """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)

        proxy_exec = ThreadPoolExecutor(max_workers=batch_size // 2)
        text = self.waitjudge
        num = len(text)
        for block in range(num // batch_size + 1):
            proxy_th = [
                proxy_exec.submit(self.judge_url, jj, ii, 0)
                for ii, jj in enumerate(text[block * batch_size:batch_size *
                                             (block + 1)])
            ]
            list(as_completed(proxy_th))
            self.db_can_use_proxy()
            self.clean_cannot_use()
        self.waitjudge = []

    def test_db(self, types: int):
        """ test proxy in db can use """

        version = begin_time()
        typestr = ""
        if types == 2:
            typestr = "(0,1,2,3)"
        elif types == 1:
            typestr = "(1,3)"
        else:
            typestr = "(0,2)"
        results = self.Db.select_db(self.select_all % typestr)
        random_select = self.Db.select_db(self.random_select % typestr)
        if not results:
            results = []
        if not random_select:
            random_select = []
        for index in results + random_select:
            self.waitjudge.append(index[0])
        self.thread_judge()
        self.init_proxy()
        end_time(version, 2)

    def xici_proxy(self, page: int):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy site I used, but most of its proxies no longer work.
        """

        if not str(page).isdigit():
            echo("0|warning", "Please input num!")
            return []

        version = begin_time()
        url = "http://www.xicidaili.com/nn/%d"
        for index in range(1, page + 1):
            html = basic_req(url % index, 0)
            tem = html.find_all("tr")
            for index in range(1, len(tem)):
                tds = tem[index].find_all("td")
                ip = tds[5].text.lower()
                self.waitjudge.append("{}://{}:{}".format(
                    ip, tds[1].text, tds[2].text))
        self.thread_judge()
        end_time(version, 2)

    def gatherproxy(self, types: int):
        """
        :100: very nice website
        first of all you should download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        """
        if not os.path.exists("{}gatherproxy".format(data_dir)):
            echo("0|warning", "Gather file not exist!!!")
            return
        file_d = read_file("{}gatherproxy".format(data_dir))
        waitjudge_http = ["http://" + ii for ii in file_d]
        waitjudge_https = ["https://" + ii for ii in file_d]
        if not types:
            self.waitjudge += waitjudge_http
        elif types == 1:
            self.waitjudge += waitjudge_https
        elif types == 2:
            self.waitjudge += waitjudge_http + waitjudge_https
        else:
            self.waitjudge += file_d
        echo("2|warning", "load gather over!")

    def goubanjia(self):
        """
        :-1: html tag mixed with invalid data
        :100: And most importantly, the port is written in the 'class' attribute rather than in the text.
        The website is difficult to spider, but the proxies are very good.
        goubanjia proxy http://www.goubanjia.com
        """

        version = begin_time()
        host = "http://www.goubanjia.com"
        html = self.proxy_req(host, 0)

        if not html:
            return []
        trs = html.find_all("tr", class_=["warning", "success"])
        for tr in trs:
            tds = tr.find_all("td")
            ip = tds[2].find_all("a")[0].text + "://"
            iplist = tds[0].find_all(["div", "span", not "p"],
                                     class_=not "port")
            for index in iplist:
                ip += index.text
            encode = tds[0].find_all(["div", "span", "p"],
                                     class_="port")[0]["class"][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord("A")),
                map(lambda x: x, encode), 0)
            self.waitjudge.append(ip + ":" + str(int(uncode / 8)))
        self.thread_judge()
        end_time(version, 2)

    def schedulegou(self):
        sched = BlockingScheduler()
        sched.add_job(self.goubanjia, "interval", seconds=100)
        sched.start()

    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        none of its proxies can be used any more
        """

        version = begin_time()
        url_list = ["", "free/gngn/index.shtml", "free/gwgn/index.shtml"]
        host = "http://www.data5u.com/"
        for uri in url_list:
            html = self.proxy_req(host + uri, 0)
            if not html:
                continue
            table = html.find_all("ul", class_="l2")
            for index in table:
                tds = index.find_all("li")
                ip = tds[3].text
                self.waitjudge.append("{}://{}:{}".format(
                    ip, tds[1].text, tds[2].text))
        self.thread_judge()
        end_time(version, 2)

    def sixsixip(self, area: int, page: int):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo("2|debug", "{} {}".format(index, pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.thread_judge()
        end_time(version, 2)

    def sixsixthread(self, index: int, pageindex: int):
        host = """http://www.66ip.cn/areaindex_%d/%d.html"""
        html = self.proxy_req(host % (index, pageindex), 0)
        if not html:
            return []
        trs = html.find_all("table")[2].find_all("tr")
        for test in range(1, len(trs) - 1):
            tds = trs[test].find_all("td")
            self.waitjudge.append("http://{}:{}".format(
                tds[0].text, tds[1].text))
            self.waitjudge.append("https://{}:{}".format(
                tds[0].text, tds[1].text))

    def kuaidaili(self, page: int):
        """
        kuaidaili https://www.kuaidaili.com/free/
        """

        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread,
                                    args=(index, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.thread_judge()
        end_time(version, 2)

    def kuaidailithread(self, index: int):
        host = """https://www.kuaidaili.com/free/inha/%d/"""
        html = self.proxy_req(host % index, 0)
        if not html:
            return []
        trs = html.find_all("tr")
        for index in range(1, len(trs)):
            tds = trs[index].find_all("td")
            ip = tds[3].text.lower() + "://" + tds[0].text + ":" + tds[1].text
            self.waitjudge.append(ip)

    def get_cookie(self):
        """
        make cookie login
        PS: Although the cookie expiry time is more than 1 year,
            it becomes invalid once the connection closes,
            so you need to reactivate the cookie with this function.
        """
        headers = {
            "Cookie":
            "_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57",
            "Accept": get_accept("html") + ";q=0.9",
        }
        login_url = "http://www.gatherproxy.com/subscribe/login"

        cookie_html = basic_req(login_url, 3, header=headers)
        try:
            verify_text = re.findall('<span class="blue">(.*?)</span>',
                                     cookie_html)[0]
        except:
            return
        verify_list = verify_text.replace("= ", "").strip().split()
        num_map = {
            "Zero": 0,
            "One": 1,
            "Two": 2,
            "Three": 3,
            "Four": 4,
            "Five": 5,
            "Six": 6,
            "Seven": 7,
            "Eight": 8,
            "Nine": 9,
            "Ten": 10,
        }
        verify_num = [verify_list[0], verify_list[2]]
        for index, num in enumerate(verify_num):
            if num.isdigit():
                verify_num[index] = int(num)
            elif num in num_map:
                verify_num[index] = num_map[num]
            else:
                echo("0|error", "Error", num)
                # return False
        verify_code = 0
        error = True

        operation = verify_list[1]
        if (operation == "+" or operation == "plus" or operation == "add"
                or operation == "multiplied"):
            verify_code = verify_num[0] + verify_num[1]
            error = False
        if operation == "-" or operation == "minus":
            verify_code = verify_num[0] - verify_num[1]
            error = False
        if operation == "X" or operation == "multiplication":
            verify_code = verify_num[0] * verify_num[1]
            error = False
        if error:
            echo("0|error", "Error", operation)
        if not os.path.exists("%spassage" % data_dir):
            echo("0|warning", "gather passage not exist!!!")
            return
        with codecs.open("%spassage" % data_dir, "r", encoding="utf-8") as f:
            passage = [index[:-1] for index in f.readlines()]
        data = {
            "Username": passage[0],
            "Password": passage[1],
            "Captcha": str(verify_code),
        }
        time.sleep(2.163)
        r = requests.session()
        r.cookies = cj.LWPCookieJar()
        login_req = r.post(login_url, headers=headers, data=data, verify=False)

    def load_gather(self):
        """
        load gather proxy pool text
        If it fails, reactivate the cookie first.
        """
        headers = {
            "Cookie":
            "_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57",
            "Accept": get_accept("html") + ";q=0.9",
        }
        url = "http://www.gatherproxy.com/subscribe/infos"
        try:
            sid_url_req = requests.get(url,
                                       headers=headers,
                                       verify=False,
                                       timeout=10)
        except:
            return
        sid_url_html = BeautifulSoup(sid_url_req.text, "html.parser")
        sid_url = sid_url_html.find_all(
            "div", class_="wrapper")[1].find_all("a")[0]["href"]
        if len(sid_url.split("sid=")) < 2:
            echo("0|warning", "cookie error")
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split("sid=")[1]
        sid_url = "http://www.gatherproxy.com" + sid_url

        data = {"ID": sid, "C": "", "P": "", "T": "", "U": "0"}
        gatherproxy = requests.post(sid_url,
                                    headers=headers,
                                    data=data,
                                    verify=False)
        with codecs.open(data_dir + "gatherproxy", "w", encoding="utf-8") as f:
            f.write(gatherproxy.text)

    def load_proxies_list(self, types: int = 2):
        """ load proxies """
        SITES = [
            "http://www.proxyserverlist24.top/", "http://www.live-socks.net/"
        ]
        spider_pool = []
        self.waitjudge = []
        for site in SITES:
            self.get_other_proxies(site)
        self.gatherproxy(3)
        waitjudge = list(set(self.waitjudge))
        waitjudge_http = ["http://" + ii for ii in waitjudge]
        waitjudge_https = ["https://" + ii for ii in waitjudge]
        if not types:
            self.waitjudge = waitjudge_http
        elif types == 1:
            self.waitjudge = waitjudge_https
        else:
            self.waitjudge = waitjudge_http + waitjudge_https
        echo(
            "1|info",
            "-_-_-_-_-_-_-",
            len(waitjudge),
            "Proxies wait to judge -_-_-_-_-_-_-",
        )

    def request_text(self, url: str) -> str:
        """ requests text """
        req = basic_req(url, 2)
        if req is None:
            echo("0|debug", url)
            if can_retry(url):
                return self.request_text(url)
            else:
                return ""
        echo("1|debug", url)
        text = req.text
        if type(text) == str:
            return text
        elif type(text) == bytes:
            return text.decode()
        else:
            return ""

    def get_free_proxy(self, url: str):
        req = basic_req(url, 2)
        if req is None:
            return []
        tt = req.text
        t_list = re.findall(r"<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>",
                            tt)
        echo(1, "Get Free proxy List", url, len(t_list))
        return ["{}:{}".format(ii, jj) for ii, jj in t_list]

    def get_proxy_free(self):
        urls = [
            "https://www.sslproxies.org",
            "https://free-proxy-list.net",
            "https://www.us-proxy.org",
            "https://free-proxy-list.net/uk-proxy.html",
            "https://free-proxy-list.net/anonymous-proxy.html",
            "http://www.google-proxy.net",
        ]
        t_list = []
        for url in urls:
            t_list.extend(self.get_free_proxy(url))
        t_list.extend(self.get_api())
        for ii in ["http", "https"]:
            t_list.extend(self.get_download(ii))
        t_list = list(set(t_list))
        with open(data_dir + "gatherproxy", "w") as f:
            f.write("\n".join(t_list))

    def ip_decoder(self, data: str):
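        # '+' -> space and '%XX' -> chr(0xXX), roughly what urllib.parse.unquote_plus does,
        # then pull the link text out of every <a> tag in the decoded markup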
        data = re.sub("\+", "\x20", data)
        data = re.sub(
            "%([a-fA-F0-9][a-fA-F0-9])",
            lambda i: chr(int("0x" + i.group()[1:], 16)),
            data,
        )
        return re.findall(">(.*?)</a", data)

    def get_api(self):
        API_KEY = "xxx"
        url = "http://api.scraperapi.com/?api_key={}&url=http://httpbin.org/ip".format(
            API_KEY)
        t_list = []
        for ii in range(38):
            tt = basic_req(url, 1)
            if tt is None:
                continue
            t_list.append(tt["origin"])
        echo(1, "Get scraperapi", len(t_list))
        return t_list

    def get_download(self, types: str):
        url = "https://www.proxy-list.download/api/v0/get?l=en&t=" + types
        tt = basic_req(url, 1)
        if tt is None:
            return []
        tt_list = tt[0]["LISTA"]
        echo(1, "Get download", types, len(tt_list))
        return ["{}:{}".format(ii["IP"], ii["PORT"]) for ii in tt_list]

    def get_other_proxies(self, url: str):
        """ get other proxies """
        pages = re.findall(r"<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>",
                           self.request_text(url))
        if not len(pages):
            echo("0|warning",
                 "Please do not frequently request {}!!!".format(url))
        else:
            proxies = [
                re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}",
                           self.request_text(ii)) for ii in pages
            ]
            self.waitjudge = [*self.waitjudge, *sum(proxies, [])]

    def load_proxies_test(self):
        """ load mode & test proxies """
        version = begin_time()
        self.load_proxies_list()
        proxies_len = len(self.waitjudge)
        self.thread_judge()
        canuse_len = len(self.canuse_proxies)
        echo(
            "1|info",
            "\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {}\n".
            format(proxies_len, canuse_len, end_time(version)),
        )
        with open("{}canuse_proxies.txt".format(data_dir), "w") as f:
            f.write("\n".join(self.canuse_proxies))
Example #9
class GetFreeProxy:
    ''' proxy pool '''
    def __init__(self):
        self.Db = Db("proxy")
        self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
        self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
        self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s '''
        self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s'''
        self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
        self.canuseip = {}
        self.waitjudge = []
        self.cannotuseip = {}
        self.failuredtime = {}
        self.canuse_proxies = []
        self.initproxy()

    def proxy_req(self,
                  url: str,
                  types: int,
                  data=None,
                  test_func=None,
                  header=None,
                  need_cookie: bool = False):
        """
        use a proxy to send requests, and record proxies that can't be used
        @types S0XY: X=0.->get;   =1.->post;
                     Y=0.->html;  =1.->json; =2.->basic
                     S=0.->basic ;=1.->ss

        supports retry on failure && automatic failure recording
        """

        httptype = url[4] == 's'
        ss_type = types // 1000
        types %= 1000
        if ss_type:
            proxylist = self.proxylists_ss if httptype else self.proxylist_ss
        else:
            proxylist = self.proxylists if httptype else self.proxylist

        if not len(proxylist):
            if self.Db.db:
                echo(
                    0,
                    'Proxy pool empty!!! Please check the db conn & db dataset!!!'
                )
            proxies = {}
        else:
            index = random.randint(0, len(proxylist) - 1)
            proxies_url = proxylist[index]
            proxies = {type_map[httptype]: proxies_url}

        try:
            result = basic_req(url, types, proxies, data, header, need_cookie)
            if test_func is not None:
                if not test_func(result):
                    if self.check_retry(url):
                        self.proxy_req(url, types + 1000 * ss_type, data,
                                       test_func, header, need_cookie)
                    else:
                        self.failuredtime[url] = 0
                        return
                else:
                    return result
            else:
                return result

        except:
            self.cannotuseip[random.randint(0, MAXN)] = proxies_url

            if proxies_url in proxylist:
                proxylist.remove(proxies_url)

            if not len(self.cannotuseip.keys()) % 10:
                self.cleancannotuse()

            if self.check_retry(url):
                self.proxy_req(url, types + 1000 * ss_type, data, test_func,
                               header, need_cookie)
            else:
                return

    def check_retry(self, url):
        """
        check whether another retry is allowed
        """
        if url not in self.failuredtime:
            self.failuredtime[url] = 0
            return True
        elif self.failuredtime[url] < 3:
            self.failuredtime[url] += 1
            return True
        else:
            self.log_write(url)
            self.failuredtime[url] = 0
            return False

    def log_write(self, url):
        """
        failure log
        """
        with codecs.open("proxy.log", 'a', encoding='utf-8') as f:
            f.write(time_str() + url + '\n')

    def insertproxy(self, insertlist):
        """
        insert data to db
        """
        results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1])
        if results:
            echo(2, 'Insert ' + str(len(insertlist)) + ' items Success!')
        else:
            pass

    def updateproxy(self, updatelist, types):
        """
        update data to db
        """

        results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1])
        typemap = {0: 'can use ', 1: 'can not use '}
        if results:
            echo(2, 'Update', typemap[types], str(len(updatelist)),
                 ' items Success!')
        else:
            pass

    def selectproxy(self, targetlist):
        """
        select ip proxy by ids
        """
        if not len(targetlist):
            return []
        elif len(targetlist) == 1:
            waitlist = '(\'' + targetlist[0] + '\')'
        else:
            waitlist = tuple(targetlist)
        return self.Db.select_db(self.select_sql % str(waitlist))

    def dbcanuseproxy(self):
        """
        check whether these proxies already exist in db
        """

        results = self.selectproxy([ii[0] for ii in self.canuseip.values()])
        ss_len = len([1 for ii in self.canuseip.values() if ii[1] > 1])
        echo(2, "SS proxies %d" % ss_len)

        insertlist = []
        updatelist = []
        ipmap = {}
        if results != False:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.canuseip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ipmap:
                    if ipmap[ip_now][1]:
                        updatelist.append(
                            (ipmap[ip_now][0], ip_now, http_type, 0))
                else:
                    insertlist.append((ip_now, http_type))
            if len(insertlist):
                self.insertproxy(insertlist)
            if len(updatelist):
                self.updateproxy(updatelist, 0)
        else:
            pass
        self.canuseip = {}

    def cleancannotuse(self):
        """
        mark db proxies that can't be used
        """
        results = self.selectproxy(self.cannotuseip.values())
        updatelist = []
        ipmap = {}
        if results:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.cannotuseip.values():
                http_type = ip_now[4] == 's'
                if ip_now in ipmap:
                    updatelist.append((ipmap[ip_now][0], ip_now, http_type,
                                       ipmap[ip_now][1] + 1))

            if len(updatelist):
                self.updateproxy(updatelist, 1)
        else:
            pass
        self.cannotuseip = {}

    def initproxy(self):
        """
        init proxy list
        """

        results = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if not results:
            echo(0,
                 'Please check db configure!!! The proxy pool cant use!!!>>>')
            return
        for index in results:
            if index[1] == 1:
                self.proxylists.append(index[0])
            elif index[1] == 2:
                self.proxylist.append(index[0])
                self.proxylist_ss.append(index[0])
            elif index[1] == 3:
                self.proxylists.append(index[0])
                self.proxylists_ss.append(index[0])
            else:
                self.proxylist.append(index[0])
        echo(2, len(self.proxylist), ' http proxy can use.')
        echo(2, len(self.proxylists), ' https proxy can use.')
        echo(2, len(self.proxylist_ss), ' ss http proxy can use.')
        echo(2, len(self.proxylists_ss), ' ss https proxy can use.')

    def judgeurl(self, urls, index, times, ss_test=False):
        """
        use /api/playlist to judge http; use /discover/playlist to judge https
        1. the request must not time out (timeout = 5)
        2. response.result.tracks.size() != 1
        """

        http_type = urls[4] == 's'
        proxies = {type_map[http_type]: urls}

        test_url = type_map[
            http_type] + '://music.163.com/api/playlist/detail?id=432853362'
        ss_url = 'https://www.google.com/?gws_rd=ssl'
        try:
            data = basic_req(test_url, 1, proxies)
            result = data['result']
            tracks = result['tracks']
            if len(tracks) == 56:
                if times < 0:
                    self.judgeurl(urls, index, times + 1)
                else:
                    echo(1, urls, proxies, 'Proxies can use.')
                    self.canuse_proxies.append(urls)
                    self.canuseip[index] = [urls, int(http_type)]
                    if ss_test:
                        data = basic_req(ss_url, 0)
                        if len(str(data)) > 5000:
                            self.canuseip[index] = [urls, int(http_type) + 2]
            else:
                echo(0, urls, proxies, 'Tracks len error ^--<^>--^ ')
                self.cannotuseip[index] = urls
        except:
            echo(0, urls, proxies, 'return error [][][][][][]')
            if not index in self.canuseip:
                self.cannotuseip[index] = urls
            pass

    def threadjude(self, batch_size=500):
        """
        threading to judge proxy
        """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)

        text = self.waitjudge
        num = len(text)
        for block in range(num // batch_size + 1):
            blockthreads = []
            for index in range(block * batch_size,
                               min(num, batch_size * (block + 1))):
                work = threading.Thread(target=self.judgeurl,
                                        args=(
                                            text[index],
                                            index,
                                            0,
                                        ))
                blockthreads.append(work)
            for work in blockthreads:
                work.start()
            for work in blockthreads:
                work.join()
            self.dbcanuseproxy()
            self.cleancannotuse()
        self.waitjudge = []

    def testdb(self, types):
        '''
        test whether proxies in db can be used
        '''

        version = begin_time()
        typestr = ''
        if types == 2:
            typestr = '(0,1,2,3)'
        elif types == 1:
            typestr = '(1,3)'
        else:
            typestr = '(0,2)'
        results = self.Db.select_db(self.select_all % typestr)
        if results != 0:
            for index in results:
                self.waitjudge.append(index[0])
            self.threadjude()
        else:
            pass
        self.initproxy()
        end_time(version)

    def xiciproxy(self, page):
        """
        xici proxy http://www.xicidaili.com/nn/{page}
        The first proxy site I used, but most of its proxies no longer work.
        """

        if not str(page).isdigit():
            echo(0, "Please input num!")
            return []

        version = begin_time()
        url = 'http://www.xicidaili.com/nn/%d'
        for index in range(1, page + 1):
            html = basic_req(url % (index), 0)
            tem = html.find_all('tr')
            for index in range(1, len(tem)):
                tds = tem[index].find_all('td')
                ip = tds[5].text.lower()
                self.waitjudge.append(ip + '://' + tds[1].text + ':' +
                                      tds[2].text)
        self.threadjude()
        end_time(version)

    def gatherproxy(self, types):
        """
        :100: very nice website
        first of all you should download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        """
        if not os.path.exists('{}gatherproxy'.format(data_dir)):
            echo(0, 'Gather file not exist!!!')
            return
        with codecs.open('{}gatherproxy'.format(data_dir),
                         'r',
                         encoding='utf-8') as f:
            file_d = [ii.strip() for ii in f.readlines()]
        waitjudge_http = ['http://' + ii for ii in file_d]
        waitjudge_https = ['https://' + ii for ii in file_d]
        if not types:
            self.waitjudge += waitjudge_http
        elif types == 1:
            self.waitjudge += waitjudge_https
        elif types == 2:
            self.waitjudge += (waitjudge_http + waitjudge_https)
        else:
            self.waitjudge += file_d
        echo(2, 'load gather over!')

    def goubanjia(self):
        """
        :-1: html tag mixed with invalid data
        :100: And most importantly, the port is written in the 'class' attribute rather than in the text.
        The website is difficult to spider, but the proxies are very good.
        goubanjia proxy http://www.goubanjia.com
        """

        version = begin_time()
        host = 'http://www.goubanjia.com'
        html = self.proxy_req(host, 0)

        if not html:
            return []
        trs = html.find_all('tr', class_=['warning', 'success'])
        for tr in trs:
            tds = tr.find_all('td')
            ip = tds[2].find_all('a')[0].text + '://'
            iplist = tds[0].find_all(['div', 'span', not 'p'],
                                     class_=not 'port')
            for index in iplist:
                ip += index.text
            encode = tds[0].find_all(['div', 'span', 'p'],
                                     class_='port')[0]['class'][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord('A')),
                map(lambda x: x, encode), 0)
            self.waitjudge.append(ip + ':' + str(int(uncode / 8)))
        self.threadjude()
        end_time(version)

    def schedulegou(self):
        sched = BlockingScheduler()
        sched.add_job(self.goubanjia, 'interval', seconds=100)
        sched.start()

    def data5u(self):
        """
        data5u proxy http://www.data5u.com/
        none of its proxies can be used any more
        """

        version = begin_time()
        url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
        host = 'http://www.data5u.com/'
        for uri in url_list:
            html = self.proxy_req(host + uri, 0)
            if not html:
                continue
            table = html.find_all('ul', class_='l2')
            for index in table:
                tds = index.find_all('li')
                ip = tds[3].text
                self.waitjudge.append(ip + '://' + tds[0].text + ':' +
                                      tds[1].text)
        self.threadjude()
        end_time(version)

    def sixsixip(self, area, page):
        """
        66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html
        """

        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo(2, str(index) + ' ' + str(pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)

    def sixsixthread(self, index, pageindex):
        host = '''http://www.66ip.cn/areaindex_%d/%d.html'''
        html = self.proxy_req(host % (index, pageindex), 0)
        if not html:
            return []
        trs = html.find_all('table')[2].find_all('tr')
        for test in range(1, len(trs) - 1):
            tds = trs[test].find_all('td')
            self.waitjudge.append('http://' + tds[0].text + ':' + tds[1].text)
            self.waitjudge.append('https://' + tds[0].text + ':' + tds[1].text)

    def kuaidaili(self, page):
        """
        kuaidaili https://www.kuaidaili.com/free/
        """

        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread,
                                    args=(index, ))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)

    def kuaidailithread(self, index):
        host = '''https://www.kuaidaili.com/free/inha/%d/'''
        html = self.proxy_req(host % index, 0)
        if not html:
            return []
        trs = html.find_all('tr')
        for index in range(1, len(trs)):
            tds = trs[index].find_all('td')
            ip = tds[3].text.lower() + "://" + tds[0].text + ':' + tds[1].text
            self.waitjudge.append(ip)

    def get_cookie(self):
        """
        refresh the login cookie
        PS: Although the cookie's expiration time is more than 1 year,
            it becomes invalid once the connection is closed,
            so you need to reactivate the cookie with this function.
        """
        headers = {
            'pragma':
            'no-cache',
            'cache-control':
            'no-cache',
            'Host':
            'www.gatherproxy.com',
            'Origin':
            'http://www.gatherproxy.com',
            'Referer':
            'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie':
            '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type':
            'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding":
            "",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        login_url = 'http://www.gatherproxy.com/subscribe/login'

        cookie_html = basic_req(login_url, 0, header=headers)
        verify_text = cookie_html.find_all('div', class_='label')[2].span.text
        verify_list = verify_text.replace('= ', '').strip().split()
        num_map = {
            'Zero': 0,
            'One': 1,
            'Two': 2,
            'Three': 3,
            'Four': 4,
            'Five': 5,
            'Six': 6,
            'Seven': 7,
            'Eight': 8,
            'Nine': 9,
            'Ten': 10
        }
        verify_num = [verify_list[0], verify_list[2]]
        for index, num in enumerate(verify_num):
            if num.isdigit():
                verify_num[index] = int(num)
            elif num in num_map:
                verify_num[index] = num_map[num]
            else:
                echo(0, 'Error', index)
                # return False
        verify_code = 0
        error = True

        operation = verify_list[1]
        if operation == '+' or operation == 'plus' or operation == 'add':
            verify_code = verify_num[0] + verify_num[1]
            error = False
        if operation == '-' or operation == 'minus':
            verify_code = verify_num[0] - verify_num[1]
            error = False
        if operation == 'X' or operation == 'multiplied' or operation == 'multiplication':
            verify_code = verify_num[0] * verify_num[1]
            error = False
        if error:
            echo(0, 'Error', operation)
        if not os.path.exists('%spassage' % data_dir):
            echo(0, 'gather passage file does not exist!!!')
            return
        with codecs.open('%spassage' % data_dir, 'r', encoding='utf-8') as f:
            passage = [index[:-1] for index in f.readlines()]
        data = {
            'Username': passage[0],
            'Password': passage[1],
            'Captcha': str(verify_code)
        }
        time.sleep(2.163)
        r = requests.session()
        r.cookies = cj.LWPCookieJar()
        login_req = r.post(login_url, headers=headers, data=data, verify=False)
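
    @staticmethod
    def _solve_word_captcha(verify_text: str) -> int:
        ''' Illustrative sketch (added, not in the original source): the
            gatherproxy login captcha is a spelled-out arithmetic question
            such as "Three X Four = "; this mirrors the num_map / operator
            handling in get_cookie() in a compact form. '''
        num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4,
                   'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9,
                   'Ten': 10}
        left, op, right = verify_text.replace('= ', '').strip().split()[:3]

        def to_int(word: str) -> int:
            return int(word) if word.isdigit() else num_map[word]

        first, second = to_int(left), to_int(right)
        if op in ('+', 'plus', 'add'):
            return first + second
        if op in ('-', 'minus'):
            return first - second
        return first * second  # 'X', 'multiplied', 'multiplication'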

    def load_gather(self):
        """
        load the gatherproxy proxy pool text
        If it fails, you should reactivate the cookie.
        """
        headers = {
            'pragma':
            'no-cache',
            'cache-control':
            'no-cache',
            'Host':
            'www.gatherproxy.com',
            'Origin':
            'http://www.gatherproxy.com',
            'Referer':
            'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie':
            '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type':
            'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding":
            "",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        url = 'http://www.gatherproxy.com/subscribe/infos'
        sid_url_req = requests.get(url, headers=headers, verify=False)
        sid_url_html = BeautifulSoup(sid_url_req.text, 'html.parser')
        sid_url = sid_url_html.find_all(
            'div', class_='wrapper')[1].find_all('a')[0]['href']
        if len(sid_url.split('sid=')) < 2:
            echo(0, 'cookie error')
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split('sid=')[1]
        sid_url = 'http://www.gatherproxy.com' + sid_url

        data = {'ID': sid, 'C': '', 'P': '', 'T': '', 'U': '0'}
        gatherproxy = requests.post(sid_url,
                                    headers=headers,
                                    data=data,
                                    verify=False)
        with codecs.open(data_dir + 'gatherproxy', 'w', encoding='utf-8') as f:
            f.write(gatherproxy.text)

    def load_proxies_list(self, types=2):
        ''' load proxies '''
        SITES = [
            'http://www.proxyserverlist24.top/', 'http://www.live-socks.net/'
        ]
        spider_pool = []
        self.waitjudge = []
        for site in SITES:
            self.get_other_proxies(site)
        if os.path.exists('{}gatherproxy'.format(data_dir)):
            self.gatherproxy(3)
        waitjudge = list(set(self.waitjudge))
        waitjudge_http = ['http://' + ii for ii in waitjudge]
        waitjudge_https = ['https://' + ii for ii in waitjudge]
        if not types:
            self.waitjudge = waitjudge_http
        elif types == 1:
            self.waitjudge = waitjudge_https
        else:
            self.waitjudge = (waitjudge_http + waitjudge_https)
        echo(1, '-_-_-_-_-_-_-', len(waitjudge),
             'Proxies wait to judge -_-_-_-_-_-_-')

    def request_text(self, url):
        ''' requests text '''
        req = basic_req(url, 2)
        if req is None:
            echo(0, url)
            if can_retry(url):
                return self.request_text(url)
            else:
                return ''
        else:
            echo(1, url)
            return req.text

    def get_other_proxies(self, url):
        ''' get other proxies '''
        text = self.request_text(url)
        pages = re.findall(r'<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>',
                           '' if text is None else text)
        if not len(pages):
            echo(0, 'Please do not request {} too frequently!!!'.format(url))
        else:
            proxies = [
                re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}',
                           self.request_text(ii)) for ii in pages
            ]
            self.waitjudge = [*self.waitjudge, *sum(proxies, [])]
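
    @staticmethod
    def _extract_ip_ports(text: str) -> list:
        ''' Illustrative sketch (added, not in the original source): the same
            ip:port regex used in get_other_proxies(), shown standalone.
            e.g. _extract_ip_ports('proxy 1.2.3.4:8080 ok') -> ['1.2.3.4:8080'] '''
        return re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', text)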

    def load_proxies_test(self):
        ''' load mode & test proxies '''
        start = time.time()
        self.load_proxies_list()
        proxies_len = len(self.waitjudge)
        self.threadjude()
        canuse_len = len(self.canuse_proxies)
        echo(
            1,
            '\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {:.2f}s\n'.
            format(proxies_len, canuse_len,
                   time.time() - start))
        with open('{}canuse_proxies.txt'.format(data_dir), 'w') as f:
            f.write('\n'.join(self.canuse_proxies))
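
# Illustrative usage (added; the class and variable names here are hypothetical):
#     proxy_pool = GetFreeProxy()       # the proxy-pool class shown above
#     proxy_pool.load_proxies_test()    # fetch, judge, and write canuse_proxies.txt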
Example #10
0
class find_location(object):
    """
    find location
    """

    def __init__(self):
        self.Db = Db("china_regions")
        china = pd.read_csv('news/china_city_list.csv', encoding='gbk')
        self.province = list(china.groupby(by=['Province']).count().axes[0])
        self.city = list(china.groupby(by=['City']).count().axes[0])
        self.filelists = ['google_steal.txt', 'google_posion.txt', 'bjh', 'bjh_detail', 'bjh_detail_poison',
                          'news_steal.txt', 'news_poison.txt']
        self.city_province = {}
        self.province_map = {}

        self.pre_data()
        for index, row in china.iterrows():
            self.city_province[row['City']] = row['Province']

    def search_location(self):
        word = ''
        count = 0
        for file in self.filelists:
            temp_word_list = codecs.open(
                file, 'r', encoding='utf-8').readlines()
            count += len(temp_word_list)
            word += " ".join(temp_word_list)
        # return word
        print(count)
        word_province = {}
        word_city = {}
        word_city_pro = {}
        for index in self.province:
            temp_num = word.count(index)
            if temp_num:
                word_province[index] = temp_num
        for index in self.city:
            temp_num = word.count(index)
            if temp_num:
                word_city[index] = temp_num
        for index in word_city:
            province = self.city_province[index]
            if province in word_city_pro:
                word_city_pro[province] += word_city[index]
            else:
                word_city_pro[province] = word_city[index]
        print(sum(word_province.values()), sum(
            word_city.values()), sum(word_city_pro.values()))
        return word_province, word_city, word_city_pro
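
    def _aggregate_city_counts(self, word_city: dict) -> dict:
        ''' Illustrative sketch (added, not in the original source): the same
            city -> province roll-up done at the end of search_location(),
            expressed with collections.Counter. '''
        from collections import Counter
        word_city_pro = Counter()
        for city, num in word_city.items():
            word_city_pro[self.city_province[city]] += num
        return dict(word_city_pro)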

    def participles_word(self):
        """
        segment words with pkuseg
        """
        version = begin_time()

        for file in self.filelists:
            pkuseg.test(file, file[:-4] + '_pkuseg.txt',
                        model_name='../Model_retrieval/pkuseg', nthread=20)
        end_time(version)

    def pre_data(self):
        """
        load city key-value from mysql
        """
        province = self.Db.select_db(
            'select * from china_regions where level=1')
        self.province_map = {int(index[2]): index[3][:3] if len(index[3]) == 4 or len(
            index[3]) == 6 else index[3][:2] for index in province}

        city = self.Db.select_db(
            'select * from china_regions where level=2')
        city_state = [index for index in city if index[3][-1:] == '州']
        seg = pkuseg.pkuseg()
        city_state = {seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else seg.cut(
            index[3])[0] + seg.cut(index[3])[1]: int(index[1]) for index in city if index[3][-1:] == '州'}
        seg = pkuseg.pkuseg(model_name='../Model_retrieval/pkuseg')
        city_state1 = {seg.cut(index)[0] if len(seg.cut(index)[0]) > 1 else seg.cut(
            index)[0] + seg.cut(index)[1]: city_state[index] for index in city_state}
        city_area = {index[3][:-2]: int(index[1])
                     for index in city if '地区' in index[3]}
        city_other = {index[3][:-1]: int(index[1])
                      for index in city if index[3][-1:] == '市' or index[3][-1:] == '盟'}
        self.city_province = {**city_state1, **city_area, **city_other}
        self.city_province = {
            index: self.province_map[self.city_province[index]] for index in self.city_province}
        county = self.Db.select_db(
            'select * from china_regions where level=3')
        county_area_pre = {index for index in county if index[3][-1] == '区'}
        county_area_two = {index[3][:-2]: int(index[1][:2]) for index in county_area_pre if len(
            index[3]) > 3 and (index[3][-2] == '矿' or index[3][-2] == '林')}
        # print('芒' in county_area_two, 'two')
        county_area_state = {seg.cut(index[3][:-2])[0]: int(index[1][:2])
                             for index in county_area_pre if len(index[3]) > 2 and index[3][-2] == '族'}
        # print('芒' in county_area_state, 'state')
        county_area_other = {index[3][:-1]: int(index[1][:2]) for index in county_area_pre if len(
            index[3]) > 2 and index[3][-2] != '族' and index[3][-2] != '林' and index[3][-2] != '矿'}
        # print('芒' in county_area_other, 'other')
        county_county_pre = {index for index in county if index[3][-1] == '县'}
        county_county_two = {index[3]: int(
            index[1][:2]) for index in county_county_pre if len(index[3]) == 2}
        # print('芒' in county_county_two, 'two')
        seg = pkuseg.pkuseg()
        county_county_state = {seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else seg.cut(index[3])[0] + seg.cut(
            index[3])[1]: int(index[1][:2]) for index in county_county_pre if len(index[3]) > 2 and index[3][-3:-1] == '自治'}
        county_county_state = {
            index[:-2] if '族' in index and len(index) > 3 else index: county_county_state[index] for index in county_county_state}
        # print('芒' in county_county_state, 'state')
        county_county_other = {
            index[3][:-1]: int(index[1][:2]) for index in county_county_pre if index[3][-3:-1] != '自治' and len(index[3]) > 2}
        # print('芒' in county_county_other, 'other')
        county_city = {index[3][:-1] if len(index[3]) > 2 else index[3]: int(index[1][:2])
                       for index in county if index[3][-1] == '市'}
        # print('芒' in county_city, 'city')
        county_domain = {index[3][:4]: int(
            index[1][:2]) for index in county if index[3][-1] == '域'}
        # print('芒' in county_domain, 'domain')
        county_other = {index[3]: int(
            index[1][:2]) for index in county if index[3][-1] == '盟' or index[3][-1] == '岛'}
        # print('芒' in county_other, 'other')
        county_province = {**county_area_two, **county_area_state, **county_area_other, **county_county_two,
                           **county_county_state, **county_county_other, **county_city, **county_domain, **county_other}
        county_province = {
            index: self.province_map[county_province[index]] for index in county_province}
        self.city_province = {**self.city_province, **county_province}
        print({index for index in self.city_province if len(index) == 1})
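
    @staticmethod
    def _short_province(name: str) -> str:
        ''' Illustrative sketch (added, not in the original source): how
            province_map in pre_data() shortens official province names:
            4- or 6-character names keep the first three characters
            (e.g. 黑龙江省 -> 黑龙江, 内蒙古自治区 -> 内蒙古), all others keep
            the first two (e.g. 北京市 -> 北京, 广西壮族自治区 -> 广西). '''
        return name[:3] if len(name) in (4, 6) else name[:2]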

    def test_province(self, maps, words):
        word_city = {}
        for index in maps:
            temp_num = words.count(index)
            province = maps[index]
            if temp_num:
                if province in word_city:
                    word_city[province] += temp_num
                else:
                    word_city[province] = temp_num
        print(sum(word_city.values()))
        return word_city
Example #11
0
class TitleViews(object):
    """
    update title views
    """

    def __init__(self):
        self.Db = Db("blog")
        self.local_views = {}
        self.title_map = {}
        self.title2slug = {}
        self.failured_map = {}
        self.zhihu_views = {}
        self.zhihu_id = {}
        self.jianshu_views = {}
        self.jianshu_id = {}
        self.csdn_views = {}
        self.csdn_id = {}
        self.exist_data = {}
        self.getTitleMap()
        self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
        self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
        self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''

    def loadLocalView(self):
        """
        load local view
        """
        if not os.path.exists("%sgoogle" % data_dir):
            return
        with codecs.open("%sgoogle" % data_dir, 'r', encoding='utf-8') as f:
            test = f.readlines()
        test = test[7:]
        for index in test:
            arr = index.split(',')
            slug = self.matchSlug(arr[0])
            if slug is None or slug not in self.title_map:
                continue
            print(slug + ' ' + str(arr[1]) + ' ' + arr[0])
            if slug in self.local_views:
                self.local_views[slug] += int(arr[1])
            else:
                self.local_views[slug] = int(arr[1])

    def getTitleMap(self):
        """
        get title map
        """
        if os.path.exists('%sslug' % data_dir):
            with codecs.open('%sslug' % data_dir, 'r', encoding='utf-8') as f:
                slug = f.readlines()
        else:
            slug = []
        if os.path.exists('%stitle' % data_dir):
            with codecs.open('%stitle' % data_dir, 'r', encoding='utf-8') as f:
                title = f.readlines()
        else:
            title = []
        self.title_map = {tempslug.split(
            '"')[1]: title[num].split('"')[1] for num, tempslug in enumerate(slug)}
        title2slug = {
            self.title_map[index]: index for index in self.title_map.keys()}
        noemoji_title = {self.filter_emoji(
            self.title_map[index]).replace('\u200d', ''): index for index in self.title_map.keys()}
        self.title2slug = {**noemoji_title, **title2slug}

    def matchSlug(self, pattern):
        """
        match slug
        """
        arr = re.search(r'\/([^\/]+).html', pattern)
        return None if arr is None else arr.group(1)
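        # e.g. matchSlug('/2019/03/some-post.html') -> 'some-post'
        #      (the path above is a made-up example for illustration)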

    def getZhihuView(self):
        if os.path.exists('%scookie' % data_dir):
            with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
                cookie = f.readline()
        else:
            cookie = ' '
        changeCookie(cookie[:-1])
        url_basic = [
            'https://www.zhihu.com/api/v4/creator/content_statistics/',
            'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
            datetime.datetime.now().strftime("%Y-%m-%d"),
            '&page_no='
        ]
        url = "".join(url_basic)
        json = self.get_request(url + '1', 1)
        if not json:
            return
        if not 'data' in json:
            if 'code' in json:
                print(json)
            return
        for index in json['data']:
            zhihu_title = index['title']
            zhihu_id = int(index['url_token'])
            zhihu_count = int(index['read_count'])

            if zhihu_title in self.title2slug:
                temp_slug = self.title2slug[zhihu_title]
                self.zhihu_id[temp_slug] = zhihu_id
                self.zhihu_views[temp_slug] = zhihu_count
            elif zhihu_id in self.zhihu_id_map:
                temp_slug = self.zhihu_id_map[zhihu_id]
                self.zhihu_id[temp_slug] = zhihu_id
                self.zhihu_views[temp_slug] = zhihu_count
            else:
                print(index['title'])

        for index in range(json['count'] // 10):
            print('zhihu', index)
            json = self.get_request(url + str(index + 2), 1)
            if not json:
                continue
            for index in json['data']:
                zhihu_title = index['title']
                zhihu_id = int(index['url_token'])
                zhihu_count = int(index['read_count'])

                if zhihu_title in self.title2slug:
                    temp_slug = self.title2slug[zhihu_title]
                    self.zhihu_id[temp_slug] = zhihu_id
                    self.zhihu_views[temp_slug] = zhihu_count
                elif zhihu_id in self.zhihu_id_map:
                    temp_slug = self.zhihu_id_map[zhihu_id]
                    self.zhihu_id[temp_slug] = zhihu_id
                    self.zhihu_views[temp_slug] = zhihu_count
                else:
                    print(index['title'])

    def get_request(self, url, types):

        result = basic_req(url, 1)

        if not result:
            if can_retry(url):
                return self.get_request(url, types)
            return
        return result

    def get_request_v2(self, url, types, header):

        result = proxy_req(url, 0, header=header)

        if not result or not len(result.find_all('div', class_='content')):
            if can_retry(url):
                return self.get_request_v2(url, types, header)
            return
        return result

    def get_request_v3(self, url, types):

        result = basic_req(url, 0)

        if result is None or not result or not len(result.find_all('p', class_='content')):
            if can_retry(url):
                return self.get_request_v3(url, types)
            return
        return result

    def getJianshuViews(self):
        """
        get jianshu views
        """
        header = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'sec-ch-ua': 'Google Chrome 75',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'cross-site',
            'sec-fetch-user': '******',
            'sec-origin-policy': '0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3736.0 Safari/537.36'
        }

        basic_url = 'https://www.jianshu.com/u/2e0f69e4a4f0'

        for rounds in range(1, 4):
            url = basic_url if rounds == 1 else basic_url + \
                '?order_by=shared_at&page=' + str(rounds)
            print(url)
            html = self.get_request_v2(url, 0, header)
            if html is None:
                print('None')
                return
            for index in html.find_all('li', class_=["", 'have-img']):
                if len(index.find_all('i')) < 3:
                    continue
                title = index.find_all('a', class_='title')[
                    0].text.replace('`', '')
                jianshu_id = int(index['data-note-id'])
                jianshu_count = int(index.find_all('a')[-2].text)
                if title in self.title2slug:
                    temp_slug = self.title2slug[title]
                    self.jianshu_id[temp_slug] = jianshu_id
                    self.jianshu_views[temp_slug] = jianshu_count
                elif jianshu_id in self.jianshu_id_map:
                    temp_slug = self.jianshu_id_map[jianshu_id]
                    self.jianshu_id[temp_slug] = jianshu_id
                    self.jianshu_views[temp_slug] = jianshu_count
                else:
                    print(title)

    def getCsdnViews(self):
        """
        get csdn views
        """

        basic_url = "https://blog.csdn.net/iofu728"

        for index in range(1, 3):
            url = basic_url if index == 1 else basic_url + \
                '/article/list/' + str(index) + '?'

            html = self.get_request_v3(url, 0)
            if html is None:
                print('None')
                return
            for div_lists in html.find_all('div', class_='article-item-box csdn-tracking-statistics'):
                if 'style' in div_lists.attrs:
                    continue
                csdn_id = int(div_lists['data-articleid'])
                title = div_lists.a.contents[2].replace(
                    '\n', '').strip().replace('`', '')
                csdn_count = int(div_lists.find_all(
                    'span', class_='read-num')[0].span.text)
                if title in self.title2slug:
                    temp_slug = self.title2slug[title]
                    self.csdn_id[temp_slug] = csdn_id
                    self.csdn_views[temp_slug] = csdn_count
                elif csdn_id in self.csdn_id_map:
                    temp_slug = self.csdn_id_map[csdn_id]
                    self.csdn_id[temp_slug] = csdn_id
                    self.csdn_views[temp_slug] = csdn_count
                else:
                    print(title)

    def filter_emoji(self, desstr, restr=''):
        '''
        filter emoji
        '''
        desstr = str(desstr)
        try:
            co = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return co.sub(restr, desstr)
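        # Note (added comment): the \U00010000-\U0010ffff pattern strips
        # astral-plane characters (most emoji) on wide Unicode builds; the
        # surrogate-pair fallback covers builds where re rejects the first
        # pattern.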

    def init_db(self):
        self.loadLocalView()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        insert_list = []
        for index in self.title_map.keys():
            insert_list.append((index, self.local_views[index] if index in self.local_views else 0, self.zhihu_views[index] if index in self.zhihu_views else 0, self.csdn_views[index] if index in self.csdn_views else 0, self.jianshu_views[index]
                                if index in self.jianshu_views else 0, self.zhihu_id[index] if index in self.zhihu_id else 0, self.csdn_id[index] if index in self.csdn_id else 0, self.jianshu_id[index] if index in self.jianshu_id else 0))
        # return insert_list
        results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1])
        if results:
            if len(insert_list):
                print('Insert ' + str(len(insert_list)) + ' Success!')
        else:
            pass

    def select_all(self):
        result = self.Db.select_db(
            "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0")
        if result == False:
            print("SELECT Error!")
        else:
            self.exist_data = {index[1]: list(index) for index in result}
            self.zhihu_id_map = {index[6]: index[1]
                                 for index in result if index[6]}
            self.csdn_id_map = {index[7]: index[1]
                                for index in result if index[7]}
            self.jianshu_id_map = {index[8]: index[1]
                                   for index in result if index[8]}
            for index in self.exist_data:
                self.exist_data[index][-1] = self.exist_data[index][-1].strftime(
                    '%Y-%m-%d %H:%M:%S')

    def update_view(self):
        changeHtmlTimeout(10)
        wait_map = {}
        self.select_all()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        for index in self.zhihu_views.keys():
            if self.zhihu_views[index] == self.exist_data[index][3] and self.zhihu_id[index] == self.exist_data[index][6]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][3] = self.zhihu_views[index]
            wait_map[index][6] = self.zhihu_id[index]
        for index in self.csdn_views.keys():
            if self.csdn_views[index] == self.exist_data[index][4] and self.csdn_id[index] == self.exist_data[index][7]:
                continue
            if index not in wait_map:
                wait_map[index] = self.exist_data[index]
            wait_map[index][4] = self.csdn_views[index]
            wait_map[index][7] = self.csdn_id[index]
        for index in self.jianshu_views.keys():
            if self.jianshu_views[index] == self.exist_data[index][5] and self.jianshu_id[index] == self.exist_data[index][8]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][5] = self.jianshu_views[index]
            wait_map[index][8] = self.jianshu_id[index]
        update_list = [tuple(index) for index in wait_map.values()]
        # return update_list:q
        if not len(update_list):
            return
        results = self.Db.update_db(self.update_sql % str(update_list)[1:-1])
        if results:
            if len(update_list):
                print('Update ' + str(len(update_list)) + ' Success!')
        else:
            pass

    def new_day(self):
        day_data = self.Db.select_db(
            "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1")
        if not os.path.exists('../blog/log/basic'):
            print('File does not exist!!!')
            return
        with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f:
            existed_spider = int(f.readlines()[1])
        today_date = datetime.datetime.now().strftime('%Y-%m-%d')
        new_day_list = [(today_date, day_data[0][0] +
                         day_data[0][1], existed_spider)]
        results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1])
        if results:
            if len(new_day_list):
                print('New day update ' + str(len(new_day_list)) + ' Success!')
        else:
            pass
Example #12
0
class TitleViews(object):
    ''' script to load my blog data -> analysis '''
    CSDN_URL = 'https://blog.csdn.net/iofu728'
    JIANSHU_URL = 'https://www.jianshu.com/u/2e0f69e4a4f0'
    ZHIHU_URL = 'https://www.zhihu.com/api/v4/creator/content_statistics/'

    def __init__(self):
        self.Db = Db("blog")
        self.local_views = {}
        self.title_map = {}
        self.title2slug = {}
        self.zhihu_views = {}
        self.zhihu_id = {}
        self.jianshu_views = {}
        self.jianshu_id = {}
        self.csdn_views = {}
        self.csdn_id = {}
        self.exist_data = {}
        self.getTitleMap()
        self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
        self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
        self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''

    def loadLocalView(self):
        '''  load local view '''
        test = read_file('{}google'.format(data_dir))[7:]
        for index in test:
            arr = index.split(',')
            slug = self.matchSlug(arr[0])
            if slug is None or slug not in self.title_map:
                continue
            print(slug + ' ' + str(arr[1]) + ' ' + arr[0])
            if slug in self.local_views:
                self.local_views[slug] += int(arr[1])
            else:
                self.local_views[slug] = int(arr[1])

    def getTitleMap(self):
        ''' get title map '''
        slug = read_file('{}slug'.format(data_dir))
        title = read_file('{}title'.format(data_dir))
        self.title_map = {
            tempslug.split('"')[1]: title[num].split('"')[1]
            for num, tempslug in enumerate(slug)
        }
        title2slug = {
            self.title_map[index]: index
            for index in self.title_map.keys()
        }
        noemoji_title = {
            self.filter_emoji(self.title_map[index]).replace('\u200d', ''):
            index
            for index in self.title_map.keys()
        }
        self.title2slug = {**noemoji_title, **title2slug}

    def matchSlug(self, pattern: str):
        ''' match slug '''
        arr = re.search(r'\/([^\/]+).html', pattern)
        return None if arr is None else arr.group(1)

    def getZhihuView(self):
        cookie = ''.join(read_file('{}cookie'.format(data_dir)))
        changeCookie(cookie)
        url_basic = [
            self.ZHIHU_URL,
            'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
            datetime.datetime.now().strftime("%Y-%m-%d"), '&page_no='
        ]
        url = ''.join(url_basic)

        json = self.get_request('{}{}'.format(url, 1), 1, lambda i: not i)
        if not json:
            return
        if not 'data' in json:
            if 'code' in json:
                echo('0|warning', json)
            return
        echo(3, 'zhihu', json)
        for index in json['data']:
            zhihu_title = index['title']
            zhihu_id = int(index['url_token'])
            zhihu_count = int(index['read_count'])

            if zhihu_title in self.title2slug:
                temp_slug = self.title2slug[zhihu_title]
                self.zhihu_id[temp_slug] = zhihu_id
                self.zhihu_views[temp_slug] = zhihu_count
            elif zhihu_id in self.zhihu_id_map:
                temp_slug = self.zhihu_id_map[zhihu_id]
                self.zhihu_id[temp_slug] = zhihu_id
                self.zhihu_views[temp_slug] = zhihu_count
            else:
                echo('0|debug', index['title'])

        for index in range(1, json['count'] // 10):
            echo(1, 'zhihu', index)
            json = self.get_request('{}{}'.format(url, 1 + index), 1,
                                    lambda i: not i)
            echo(2, 'zhihu', json)
            if not json:
                continue
            for index in json['data']:
                zhihu_title = index['title']
                zhihu_id = int(index['url_token'])
                zhihu_count = int(index['read_count'])

                if zhihu_title in self.title2slug:
                    temp_slug = self.title2slug[zhihu_title]
                    self.zhihu_id[temp_slug] = zhihu_id
                    self.zhihu_views[temp_slug] = zhihu_count
                elif zhihu_id in self.zhihu_id_map:
                    temp_slug = self.zhihu_id_map[zhihu_id]
                    self.zhihu_id[temp_slug] = zhihu_id
                    self.zhihu_views[temp_slug] = zhihu_count
                else:
                    echo('0|debug', index['title'])

    def get_request(self, url: str, types: int, functs, header: dict = {}):
        if len(header):
            req = basic_req(url, types, header=header)
        else:
            req = basic_req(url, types)

        if functs(req):
            if can_retry(url):
                return self.get_request(url, types, functs, header)
            return
        return req
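        # Usage note (added comment): `functs` is a retry predicate; callers in
        # this class pass e.g. `lambda i: not i` for JSON requests, or
        # `lambda i: i is None or not len(i.find_all('p', class_='content'))`
        # for HTML pages, and the request is retried (subject to can_retry)
        # whenever the predicate returns True.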

    def getJianshuViews(self):
        ''' get jianshu views '''
        header = {'accept': get_accept('html')}

        for rounds in range(1, 4):
            url = self.JIANSHU_URL
            if rounds > 1:
                url += '?order_by=shared_at&page={}'.format(rounds)
            echo('1|debug', 'jianshu req url:', url)
            html = self.get_request(
                url, 0, lambda i: not i or not len(
                    i.find_all('div', class_='content')), header)
            if html is None:
                echo(0, 'None')
                return
            for index in html.find_all('li', class_=["", 'have-img']):
                if len(index.find_all('i')) < 3:
                    continue
                title = index.find_all('a', class_='title')[0].text.replace(
                    '`', '')
                jianshu_id = int(index['data-note-id'])
                jianshu_count = int(index.find_all('a')[-2].text)
                if title in self.title2slug:
                    temp_slug = self.title2slug[title]
                    self.jianshu_id[temp_slug] = jianshu_id
                    self.jianshu_views[temp_slug] = jianshu_count
                elif jianshu_id in self.jianshu_id_map:
                    temp_slug = self.jianshu_id_map[jianshu_id]
                    self.jianshu_id[temp_slug] = jianshu_id
                    self.jianshu_views[temp_slug] = jianshu_count
                else:
                    echo(1, title)

    def getCsdnViews(self):
        ''' get csdn views '''

        for index in range(1, 3):
            url = self.CSDN_URL
            if index > 1:
                url += '/article/list/{}?'.format(index)
            echo(1, 'csdn url', url)

            html = self.get_request(
                url, 0, lambda i: i is None or not i or not len(
                    i.find_all('p', class_='content')))
            if html is None:
                echo(0, 'None')
                return
            for div_lists in html.find_all(
                    'div', class_='article-item-box csdn-tracking-statistics'):
                if 'style' in div_lists.attrs:
                    continue
                csdn_id = int(div_lists['data-articleid'])
                title = div_lists.a.contents[2].replace('\n',
                                                        '').strip().replace(
                                                            '`', '')
                csdn_count = int(
                    div_lists.find_all('span', class_='read-num')[0].span.text)
                if title in self.title2slug:
                    temp_slug = self.title2slug[title]
                    self.csdn_id[temp_slug] = csdn_id
                    self.csdn_views[temp_slug] = csdn_count
                elif csdn_id in self.csdn_id_map:
                    temp_slug = self.csdn_id_map[csdn_id]
                    self.csdn_id[temp_slug] = csdn_id
                    self.csdn_views[temp_slug] = csdn_count
                else:
                    echo(1, title)

    def filter_emoji(self, desstr, restr=''):
        ''' filter emoji '''
        desstr = str(desstr)
        try:
            co = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return co.sub(restr, desstr)

    def init_db(self):
        self.loadLocalView()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        insert_list = []
        for index in self.title_map.keys():
            insert_list.append(
                (index,
                 self.local_views[index] if index in self.local_views else 0,
                 self.zhihu_views[index] if index in self.zhihu_views else 0,
                 self.csdn_views[index] if index in self.csdn_views else 0,
                 self.jianshu_views[index] if index in self.jianshu_views else
                 0, self.zhihu_id[index] if index in self.zhihu_id else 0,
                 self.csdn_id[index] if index in self.csdn_id else 0,
                 self.jianshu_id[index] if index in self.jianshu_id else 0))
        # return insert_list
        results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1])
        if results:
            if len(insert_list):
                print('Insert ' + str(len(insert_list)) + ' Success!')
        else:
            pass

    def select_all(self):
        result = self.Db.select_db(
            "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0"
        )
        if result == False:
            print("SELECT Error!")
        else:
            self.exist_data = {index[1]: list(index) for index in result}
            self.zhihu_id_map = {
                index[6]: index[1]
                for index in result if index[6]
            }
            self.csdn_id_map = {
                index[7]: index[1]
                for index in result if index[7]
            }
            self.jianshu_id_map = {
                index[8]: index[1]
                for index in result if index[8]
            }
            for index in self.exist_data:
                self.exist_data[index][-1] = self.exist_data[index][
                    -1].strftime('%Y-%m-%d %H:%M:%S')

    def update_view(self):
        changeHtmlTimeout(10)
        wait_map = {}
        self.select_all()
        self.getZhihuView()
        self.getJianshuViews()
        self.getCsdnViews()
        for index in self.zhihu_views.keys():
            if self.zhihu_views[index] == self.exist_data[index][
                    3] and self.zhihu_id[index] == self.exist_data[index][6]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][3] = self.zhihu_views[index]
            wait_map[index][6] = self.zhihu_id[index]
        for index in self.csdn_views.keys():
            if self.csdn_views[index] == self.exist_data[index][
                    4] and self.csdn_id[index] == self.exist_data[index][7]:
                continue
            if index not in wait_map:
                wait_map[index] = self.exist_data[index]
            wait_map[index][4] = self.csdn_views[index]
            wait_map[index][7] = self.csdn_id[index]
        for index in self.jianshu_views.keys():
            if self.jianshu_views[index] == self.exist_data[index][
                    5] and self.jianshu_id[index] == self.exist_data[index][8]:
                continue
            wait_map[index] = self.exist_data[index]
            wait_map[index][5] = self.jianshu_views[index]
            wait_map[index][8] = self.jianshu_id[index]
        update_list = [tuple(index) for index in wait_map.values()]
        # return update_list:q
        if not len(update_list):
            return
        results = self.Db.update_db(self.update_sql % str(update_list)[1:-1])
        if results:
            if len(update_list):
                print('Update ' + str(len(update_list)) + ' Success!')
        else:
            pass

    def new_day(self):
        day_data = self.Db.select_db(
            "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1"
        )
        if not os.path.exists('../blog/log/basic'):
            print('File does not exist!!!')
            return
        with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f:
            existed_spider = int(f.readlines()[1])
        today_date = datetime.datetime.now().strftime('%Y-%m-%d')
        new_day_list = [(today_date, day_data[0][0] + day_data[0][1],
                         existed_spider)]
        results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1])
        if results:
            if len(new_day_list):
                print('New day update ' + str(len(new_day_list)) + ' Success!')
        else:
            pass

    def load_csdn_img(self):
        ''' load csdn img '''
        mkdir(data_dir)
        urls = ['/article/list/2?', '']
        article_ids = []
        for url in urls:
            req = basic_req('{}{}'.format(self.CSDN_URL, url), 3)
            article_ids.extend(re.findall('data-articleid="(\w*?)"', req))
        echo(0, article_ids)
        article_thread = [
            threading.Thread(target=self.load_csdn_img_batch, args=(ii, ))
            for ii in article_ids
        ]
        for work in article_thread:
            work.start()
        for work in article_thread:
            work.join()

    def load_csdn_img_batch(self, article_id: int):
        url = '{}/article/details/{}'.format(self.CSDN_URL, article_id)
        req = proxy_req(url, 3)
        if not 'iofu728' in req:
            if can_retry(url):
                self.load_csdn_img_batch(article_id)
            return
        img_lists = re.findall('"(https://cdn.nlark.com.*)" alt', req)
        img_thread = [
            threading.Thread(target=self.load_csdn_img_load,
                             args=(jj, article_id, ii))
            for ii, jj in enumerate(img_lists)
        ]
        echo(1, 'Article Need Load {} Img...'.format(len(img_lists)))
        for work in img_thread:
            work.start()
        for work in img_thread:
            work.join()

    def load_csdn_img_load(self, img_url: str, article_id: int, idx: int):
        img_dir = '{}{}/'.format(data_dir, article_id)
        img_path = '{}{}.png'.format(img_dir, idx)
        if os.path.exists(img_path):
            return
        req = proxy_req(img_url, 2)
        if type(req) == bool or req is None:
            if can_retry(img_url):
                self.load_csdn_img_load(img_url, article_id, idx)
            return
        mkdir(img_dir)
        with open(img_path, 'wb') as f:
            f.write(req.content)
Example #13
0
class ActivateArticle(TBK):
    """ activate article in youdao Cloud"""

    Y_URL = "https://note.youdao.com/"
    WEB_URL = f"{Y_URL}web/"
    API_P_URL = f"{Y_URL}yws/api/personal/"
    SYNC_URL = f"{API_P_URL}sync?method=%s&keyfrom=web&cstk=%s"
    NOTE_URL = f"{Y_URL}yws/public/note/%s?editorType=0"
    SHARE_URL = f"{Y_URL}ynoteshare1/index.html?id=%s&type=note"
    GET_SHARE_URL = f"{API_P_URL}share?method=get&shareKey=%s"
    LISTRECENT_URL = (
        f"{API_P_URL}file?method=listRecent&offset=%d&limit=30&keyfrom=web&cstk=%s"
    )
    MYSHARE_URL = (
        f"{API_P_URL}myshare?method=get&checkBan=true&entryId=%s&keyfrom=web&cstk=%s"
    )
    DECODER_TPWD_URL = "https://api.taokouling.com/tkl/tkljm?apikey=%s&tkl=¥%s¥"
    Y_DOC_JS_URL = "https://shared-https.ydstatic.com/ynote/ydoc/index-6f5231c139.js"
    MTOP_URL = "https://h5api.m.taobao.com/h5/%s/%d.0/"
    ITEM_URL = "https://item.taobao.com/item.htm?id=%d"
    DETAIL_URL = 'https://detail.m.tmall.com/item.htm?id=%d'
    S_LIST_SQL = "SELECT `id`, article_id, title, q, created_at from article;"
    I_LIST_SQL = "INSERT INTO article (article_id, title, q) VALUES %s;"
    R_LIST_SQL = "REPLACE INTO article (`id`, article_id, title, q, is_deleted, created_at) VALUES %s;"
    S_ARTICLE_SQL = 'SELECT `id`, article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at, created_at from article_tpwd WHERE `article_id` = "%s";'
    I_ARTICLE_SQL = "INSERT INTO article_tpwd (article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at) VALUES %s;"
    R_ARTICLE_SQL = "REPLACE INTO article_tpwd (`id`, article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at, created_at, is_deleted) VALUES %s;"
    END_TEXT = "</text><inline-styles/><styles/></para></body></note>"
    TPWD_REG = "\p{Sc}(\w{8,12}?)\p{Sc}"
    TPWD_REG2 = "(\p{Sc}\w{8,12}\p{Sc})"
    JSON_KEYS = ["p", "ct", "su", "pr",
        "au","pv","mt","sz","domain",
        "tl","content",
    ]
    URL_DOMAIN = {
        0: "s.click.taobao.com",
        1: "item.taobao.com",
        2: "detail.tmall.com",
        5: "uland.taobao.com",
        10: "taoquan.taobao.com",
        11: "a.m.taobao.com",
        15: "empty",
        16: "failure",
    }
    NEED_KEY = ["content", "url", "validDate", "picUrl"]
    ONE_HOURS = 3600
    ONE_DAY = 24
    M = "_m_h5_tk"
    ZERO_STAMP = "0天0小时0分0秒"
    T_FORMAT = "%m-%d %H:%M"
    BASIC_STAMP = (
        time_stamp(time_format="%d天%H小时%M分%S秒", time_str="1天0小时0分0秒")
        - ONE_DAY * ONE_HOURS
    )
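
    @staticmethod
    def _find_tpwds(text: str) -> list:
        ''' Illustrative sketch (added, not in the original source): TPWD_REG
            uses \p{Sc} (any currency symbol, e.g. ¥), which requires the
            third-party `regex` module rather than the builtin `re`.
            e.g. _find_tpwds('复制这段 ¥AbCd1234Ef¥ 打开淘宝') -> ['AbCd1234Ef'] '''
        return regex.findall(ActivateArticle.TPWD_REG, text)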

    def __init__(self):
        super(ActivateArticle, self).__init__()
        self.Db = Db("tbk")
        self.Db.create_table(os.path.join(root_dir, "tpwd.sql"))
        self.Db.create_table(os.path.join(root_dir, "article.sql"))
        self.tpwd_map = {}
        self.tpwd_db_map = {}
        self.tpwds = {}
        self.cookies = {}
        self.share2article = {}
        self.article_list = {}
        self.list_recent = {}
        self.idx = []
        self.empty_content = ""
        self.tpwd_exec = ThreadPoolExecutor(max_workers=20)
        self.need_del = {}
        self.get_share_list()

    def load_process(self):
        self.load_ids()
        if len(self.idx) < 30:
            time.sleep(np.random.rand() * 30 + 6)
            self.load_ids()
        self.load_article_list()
        # self.update_tpwd()
        self.get_m_h5_tk()
        self.get_ynote_file()
        self.get_ynote_file(1)

    def load_ids(self):
        changeJsonTimeout(5)
        req = self.basic_youdao(self.home_id)
        if req == "":
            echo("0|error", "Get The Home Page Info Error!!! Please retry->->->")
            return
        self.idx = regex.findall("id=(\w*?)<", req)
        if len(self.idx) < 30:
            echo("0|error", "The Num of id is error!! Please check it.")
        else:
            echo(1, "Load Article List {} items.".format(len(self.idx)))

    def get_share_info(self, share_id: str):
        changeJsonTimeout(4)
        url = self.GET_SHARE_URL % share_id
        headers = self.get_tb_headers(self.Y_URL)
        req = basic_req(url, 1, header=headers)
        if req is None:
            return
        info = req["entry"]
        self.share2article[share_id] = (info["name"].replace('.note', ''), info["id"], info["lastUpdateTime"])
        return req

    def basic_youdao(self, idx: str, use_proxy: bool = True):
        url = self.NOTE_URL % idx
        refer_url = self.SHARE_URL % idx
        headers = {
            "Accept": "*/*",
            "Referer": refer_url,
            "X-Requested-With": "XMLHttpRequest",
        }
        req_req = proxy_req if use_proxy else basic_req
        req = req_req(url, 1, header=headers, config={'timeout': 8})
        if req is None or list(req.keys()) != self.JSON_KEYS:
            if can_retry(url):
                echo(2, "retry")
                return self.basic_youdao(idx)
            else:
                echo(1, "retry upper time")
                return ""
        return req["content"]

    def load_article_pipeline(self, mode: int = 0):
        article_exec = ThreadPoolExecutor(max_workers=5)
        a_list = [article_exec.submit(self.load_article, ii, mode) for ii in self.idx]
        list(as_completed(a_list))
        self.load_list2db()

    def load_article(self, article_id: str, mode: int = 0, is_load2db: bool = True):
        if mode:
            self.get_share_info(article_id)
            self.load_list2db()
            return
        if article_id not in self.tpwds:
            article = self.basic_youdao(article_id)
            tpwds = list({ii: 0 for ii in regex.findall(self.TPWD_REG, article)})
            self.tpwds[article_id] = tpwds
        else:
            tpwds = self.tpwds[article_id]
        if article_id not in self.tpwd_map:
            self.tpwd_map[article_id] = {}
        time = 0
        au_list = []
        no_type = [
            ii
            for ii, jj in self.tpwd_map[article_id].items()
            if "type" not in jj or jj["item_id"] is None
        ]
        while (
            len(self.tpwd_map[article_id]) < len(tpwds) or (len(no_type) and not time)
        ) and time < 5:
            thread_list = [ii for ii in tpwds if not ii in self.tpwd_map[article_id]]
            echo(1, article_id, "tpwds len:", len(tpwds), "need load", len(thread_list))
            thread_list = [
                self.tpwd_exec.submit(self.decoder_tpwd_once, article_id, ii)
                for ii in thread_list
            ]
            list(as_completed(thread_list))
            no_type = [
                ii
                for ii, jj in self.tpwd_map[article_id].items()
                if "type" not in jj or jj["item_id"] is None
            ]
            au_list.extend(
                [
                    self.tpwd_exec.submit(self.decoder_tpwd_url, article_id, ii)
                    for ii in no_type
                ]
            )
            time += 1
        list(as_completed(au_list))
        no_title = [
            ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
        ]
        time = 0
        while len(no_title) and time < 5:
            title_list = [
                self.tpwd_exec.submit(self.get_item_title, article_id, ii)
                for ii in no_title
            ]
            echo(1, article_id, "need get title:", len(title_list))
            list(as_completed(title_list))
            time += 1
            no_title = [
                ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
            ]
        if is_load2db:
            self.load_article2db(article_id)

    def update_title(self, article_id: str):
        self.tpwd_map[article_id] = {
            ii[3]: {"content": ii[1], "item_id": ii[0]}
            for ii in self.article_list[article_id].values()
        }
        no_title = [
            ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
        ]
        time = 0
        while len(no_title) and time < 5:
            title_list = [
                self.tpwd_exec.submit(self.get_item_title, article_id, ii)
                for ii in no_title
            ]
            echo(1, article_id, "need get title:", len(title_list))
            list(as_completed(title_list))
            time += 1
            no_title = [
                ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
            ]
        update_num = len(
            [
                1
                for ii, jj in self.tpwd_map[article_id].items()
                if "title" in jj and jj["content"] != jj["title"]
            ]
        )
        echo(2, "Update", article_id, update_num, "Title Success!!!")
        self.update_article2db(article_id)

    def load_list2db(self):
        t_share_map = self.share2article.copy()
        share_map = self.get_share_list()
        insert_list, update_list = [], []
        for ii, jj in t_share_map.items():
            if ii in share_map:
                t = share_map[ii]
                update_list.append((t[0], ii, jj[0], jj[1], 0, t[-1]))
            else:
                insert_list.append((ii, jj[0], jj[1]))
        self.update_db(insert_list, "Insert Article List", 1)
        self.update_db(update_list, "Update Article List", 1)

    def get_share_list(self):
        share_list = self.Db.select_db(self.S_LIST_SQL)
        share_map = {}
        for ii, jj in enumerate(share_list):
            t = jj[-1].strftime("%Y-%m-%d %H:%M:%S")
            share_map[jj[1]] = (*jj[:-1], t)
        self.share2article = share_map
        return share_map

    def load_article2db(self, article_id: str):
        m = self.tpwd_map[article_id]
        m = {ii: jj for ii, jj in m.items() if jj["url"]}
        tpwds = list(set(self.tpwds[article_id]))
        data = [
            (
                article_id,
                ii,
                m[jj]["item_id"],
                jj,
                m[jj]["type"],
                m[jj]["content"],
                m[jj]["url"],
                0,
                "",
                m[jj]["validDate"],
            )
            for ii, jj in enumerate(tpwds)
            if jj in m and "item_id" in m[jj] and m[jj]["type"] != 15
        ]
        data_map = {ii[3]: ii for ii in data}
        update_list, insert_list = [], []
        for ii in data:
            if ii[3] in self.tpwd_db_map[article_id]:
                t = self.tpwd_db_map[article_id][ii[3]]
                update_list.append((t[0], *ii, t[-1], 0))
            else:
                insert_list.append(ii)
        for ii, jj in self.tpwd_db_map[article_id].items():
            if ii not in data_map:
                update_list.append((*jj, 1))
        self.update_db(insert_list, f"article_id {article_id} Insert")
        self.update_db(update_list, f"article_id {article_id} Update")

    def update_tpwd(self, mode: int = 0, is_renew: bool = True, a_id: str = None):
        update_num = 0
        for article_id, jj in self.article_list.items():
            if a_id is not None and article_id != a_id:
                continue
            for o_tpwd, (num_iid, title, domain, tpwd, _, _, url) in jj.items():
                c = jj[o_tpwd]
                if (
                    is_renew
                    and self.URL_DOMAIN[1] not in url
                    and self.URL_DOMAIN[2] not in url
                    and self.URL_DOMAIN[10] not in url
                ):
                    renew_type = 2 if url in self.URL_DOMAIN[5] else 1
                    origin_tpwd = self.convert2tpwd(url, title)
                    if origin_tpwd is None:
                        origin_tpwd = tpwd
                else:
                    renew_type = 0
                    origin_tpwd = tpwd
                if num_iid == "" or domain == 16:
                    c = (
                        *c[:2],
                        16,
                        origin_tpwd,
                        1 if renew_type == 0 else 2,
                        *c[-2:],
                    )
                else:
                    c = self.generate_tpwd(
                        title, int(num_iid), origin_tpwd, renew_type, c, mode
                    )
                self.article_list[article_id][o_tpwd] = c
                update_num += int(c[2] < 15 or (renew_type and not mode))
        echo(2, "Update {} Tpwd Info Success!!".format(update_num))

    def generate_tpwd(
        self, title: str, num_iid: int, renew_tpwd: str, renew_type: int, c: tuple, mode: int
    ):
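        """
        Look the goods up via get_dg_material, prefer the ysyl/coupon share urls, and
        convert the result into a fresh tpwd; domain 17 marks "goods not found" and 18
        a failed tpwd generation, so replace_tpwd can report them.
        """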
        goods = self.get_dg_material(title, num_iid)
        if goods is None or not len(goods):
            echo(0, "goods get", 'error' if goods is None else 'empty', ':', title, num_iid)
            return (*c[:2], 17, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:])
        goods = goods[0]
        if "ysyl_click_url" in goods and len(goods["ysyl_click_url"]):
            url = goods["ysyl_click_url"]
        elif "coupon_share_url" in goods and len(goods["coupon_share_url"]):
            url = goods["coupon_share_url"]
        else:
            url = goods["url"]
        url = "https:{}".format(url)
        commission_rate = int(goods["commission_rate"])
        commission_type = goods["commission_type"]
        tpwd = self.convert2tpwd(url, title)
        if tpwd is None:
            echo(0, "tpwd error:", tpwd)
            return (*c[:2], 18, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:])
        if mode:
            return (*c[:3], tpwd, commission_rate, commission_type, c[-1])
        if renew_type == 1:
            return (*c[:3], tpwd, 2, commission_type, c[-1])
        return (*c[:3], tpwd, commission_rate, commission_type, c[-1])

    def load_article_list(self):
        """
        tpwd: [goods_id, goods_name, domain, tpwd, commission_rate, commission_type, url]
        """
        for article_id in self.idx:
            article_list = self.get_article_db(article_id)
            self.article_list[article_id] = {
                ii[4]: [ii[3], ii[6], ii[5], ii[4], ii[8], ii[9], ii[7]]
                for ii in article_list
            }
            self.tpwd_db_map[article_id] = {ii[4]: ii for ii in article_list}
            have_id = [ii[0] for ii in self.tpwd_db_map[article_id].values()]
            need_del_id = [ii[0] for ii in article_list if ii[0] not in have_id]
            self.need_del[article_id] = need_del_id
        item_num = sum([len(ii) for ii in self.article_list.values()])
        echo(1, "Load {} article list from db.".format(item_num))

    def get_article_db(self, article_id: str):
        article_list = list(self.Db.select_db(self.S_ARTICLE_SQL % article_id)) 
        for ii, jj in enumerate(article_list):
            t = jj[-1].strftime("%Y-%m-%d %H:%M:%S")
            y = jj[-2].strftime("%Y-%m-%d %H:%M:%S")
            article_list[ii] = [*jj[:-2], y, t]
        return article_list

    def update_db(self, data: list, types: str, mode: int = 0):
        if not len(data):
            return
        if "insert" in types.lower():
            basic_sql = self.I_LIST_SQL if mode else self.I_ARTICLE_SQL
        else:
            basic_sql = self.R_LIST_SQL if mode else self.R_ARTICLE_SQL

        i_sql = basic_sql % str(data)[1:-1]
        insert_re = self.Db.insert_db(i_sql)
        if insert_re:
            echo(3, "{} {} info Success".format(types, len(data)))
        else:
            echo(0, "{} failed".format(types))

    def decoder_tpwd_once(self, article_id: str, tpwd: str, mode: int = 0):
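        """
        Decode one tpwd: keep the NEED_KEY fields, turn the relative validDate
        ("x天x小时...") into an absolute time string, and, unless mode is set,
        resolve the landing url into an item id via decoder_tpwd_url.
        """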
        req = self.decoder_tpwd(tpwd)
        if req is None or not len(req):
            return
        temp_map = {ii: req[ii] for ii in self.NEED_KEY}
        if temp_map["validDate"] == self.ZERO_STAMP or "-" in temp_map["validDate"]:
            temp_map["validDate"] = 1500000000
        else:
            temp_map["validDate"] = (
                time_stamp(time_format="%d天%H小时%M分%S秒", time_str=req["validDate"])
                - self.BASIC_STAMP
                + time_stamp()
            )
        temp_map["validDate"] = time_str(temp_map["validDate"])
        temp_map["url"] = temp_map["url"].strip()
        if article_id not in self.tpwd_map:
            self.tpwd_map[article_id] = {}
        self.tpwd_map[article_id][tpwd] = temp_map
        if not mode:
            self.decoder_tpwd_url(article_id, tpwd)

    def decoder_tpwd_url(self, article_id: str, tpwd: str):
        temp_map = self.tpwd_map[article_id][tpwd]
        tpwd_type, item_id = self.analysis_tpwd_url(temp_map["url"])
        if item_id is None:
            return
        temp_map["type"] = tpwd_type
        temp_map["item_id"] = item_id
        if tpwd_type < 20:
            echo(2, "Domain:", self.URL_DOMAIN[tpwd_type], "item id:", item_id)
        self.tpwd_map[article_id][tpwd] = temp_map

    def analysis_tpwd_url(self, url: str):
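        """ Map a landing url to (domain code, item id); unknown domains are logged and returned as code 20. """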
        if self.URL_DOMAIN[5] in url:
            return 5, self.get_uland_url(url)
        elif self.URL_DOMAIN[11] in url:
            return 11, self.get_a_m_url(url)
        elif self.URL_DOMAIN[0] in url:
            return 0, self.get_s_click_url(url)
        elif self.URL_DOMAIN[10] in url:
            return 10, 0
        elif self.URL_DOMAIN[1] in url:
            good_id = self.get_item_detail(url)
            if good_id != "":
                return 1, good_id
            return 16, 0
        elif url == "":
            return 15, 0
        echo("0|warning", "New Domain:", regex.findall("https://(.*?)/", url), url)
        return 20, 0

    def decoder_tpwd(self, tpwd: str):
        """ decoder the tpwd from taokouling """
        url = self.DECODER_TPWD_URL % (self.api_key, tpwd)
        req = basic_req(url, 1)
        if req is None or isinstance(req, str) or 'ret' not in req:
            return {}
        return req

    def get_s_click_url(self, s_click_url: str):
        """ decoder s.click real jump url @validation time: 2019.10.23"""
        time.sleep(np.random.randint(0, 10))
        item_url = self.get_s_click_location(s_click_url)
        if item_url is None:
            echo(3, "s_click_url location Error..")
            return
        return self.get_item_detail(item_url)

    def get_s_click_url_v1(self, s_click_url: str):
        """ decoder s.click real jump url @validation time: 2019.08.31"""
        if "tu=" not in s_click_url:
            tu_url = self.get_s_click_tu(s_click_url)
        else:
            tu_url = s_click_url
        if tu_url is None or "tu=" not in tu_url:
            echo(3, "s_click_url tu url ENd Retry..", tu_url)
            return
        qso = decoder_url(tu_url)
        if "tu" not in qso:
            if "alisec" in tu_url:
                echo("0|debug", "Request Too Fast")
                time.sleep(np.random.randint(10) * np.random.rand())
            else:
                echo(0, s_click_url, tu_url)
            return
        redirect_url = urllib.parse.unquote(qso["tu"])
        return self.get_s_click_detail(redirect_url, tu_url)

    def get_tb_headers(self, url: str = "", refer_url: str = "") -> dict:
        headers = {"Accept": get_accept("html"), "User-Agent": get_use_agent()}
        if url != "":
            headers["Host"] = url.split("/")[2]
        if refer_url != "":
            headers["referer"] = refer_url
        return headers

    def get_s_click_basic(
        self,
        s_click_url: str,
        retry_func=(lambda x: False),
        referer: str = "",
        allow_redirects: bool = True,
        is_direct: bool = False,
    ):
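        """ Shared s.click fetch: proxy_req (or basic_req when is_direct), retried via can_retry while retry_func rejects the response. """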
        headers = self.get_tb_headers(refer_url=referer)
        req_func = basic_req if is_direct else proxy_req
        req = req_func(
            s_click_url, 2, header=headers, config={"allow_redirects": allow_redirects}
        )
        if is_direct:
            return req
        if req is None or retry_func(req):
            if can_retry(s_click_url):
                return self.get_s_click_basic(
                    s_click_url, retry_func, referer, allow_redirects, is_direct
                )
            else:
                return
        return req

    def get_s_click_tu(self, s_click_url: str):
        req = self.get_s_click_basic(s_click_url, lambda i: "tu=" not in i.url)
        if req is None:
            return
        return req.url

    def get_s_click_location(self, s_click_url: str):
        req = self.get_s_click_basic(s_click_url)
        if req is None:
            echo("0|warning", "s_click_url first click error.")
            return
        echo("1", "real_jump_address get")
        rj = regex.findall("real_jump_address = '(.*?)'", req.text)
        if not len(rj):
            echo("0|warning", "real_jump_address get error.")
            return
        rj = rj[0].replace("&amp;", "&")
        req_rj = self.get_s_click_basic(
            rj, lambda i: "Location" not in i.headers, referer=rj, allow_redirects=False
        )
        if req_rj is None:
            return
        return req_rj.headers["Location"]

    def get_s_click_detail(self, redirect_url: str, tu_url: str):
        headers = self.get_tb_headers(refer_url=tu_url)
        req = proxy_req(redirect_url, 2, header=headers)
        if req is None or "id=" not in req.url:
            if can_retry(redirect_url):
                return self.get_s_click_detail(redirect_url, tu_url)
            else:
                return
        return self.get_item_detail(req.url)

    def get_item_detail(self, item_url: str) -> str:
        item = decoder_url(item_url)
        if not "id" in item:
            echo(0, "id not found:", item_url)
            return ""
        return item["id"]

    def get_item_title_once(self, item_id: int) -> str:
        item = self.get_tb_getdetail(item_id)
        if item is None:
            return ''
        return item['title']

    def get_item_title(self, article_id: str, tpwd: str):
        temp_map = self.tpwd_map[article_id][tpwd]
        if (
            "item_id" not in temp_map
            or temp_map["item_id"] == ""
            or temp_map["item_id"] == "0"
        ):
            return
        item_id = int(temp_map["item_id"])
        title = self.get_item_title_once(item_id)
        if title != "":
            self.tpwd_map[article_id][tpwd]["title"] = title

    def get_item_title_once_v1(self, item_id: int) -> str:
        req = self.get_item_basic(item_id)
        if req is None:
            return ""
        req_text = req.text
        req_title = regex.findall('data-title="(.*?)">', req_text)
        if len(req_title):
            return req_title[0]
        req_title = regex.findall('<meta name="keywords" content="(.*?)"', req_text)
        if len(req_title):
            return req_title[0]
        return ""

    def get_item_basic(self, item_id: int, url: str = ""):
        url = self.ITEM_URL % item_id if url == "" else url
        headers = {"Accept": get_accept("html")}
        req = proxy_req(url, 2, header=headers, config={"allow_redirects": False})
        if req is None:
            if can_retry(url):
                return self.get_item_basic(item_id, url)
            return
        if req.status_code != 200:
            return self.get_item_basic(item_id, req.headers["Location"])
        return req

    def get_uland_url(self, uland_url: str):
        if (
            'uland' not in self.cookies
            # or not self.M in self.cookies['uland']
            or time_stamp() - self.m_time > self.ONE_HOURS / 2
        ):
            self.get_m_h5_tk()
        s_req = self.get_uland_url_once(uland_url, self.cookies['uland'])
        if s_req is None:
            return
        req_text = s_req.text
        re_json = json.loads(req_text[req_text.find("{") : -1])
        return re_json["data"]["resultList"][0]["itemId"]

    def get_a_m_url(self, a_m_url: str):
        req = self.get_a_m_basic(a_m_url)
        if req is None:
            return
        item_url = req.headers["location"]
        return self.get_item_detail(item_url)

    def get_a_m_basic(self, a_m_url: str):
        headers = self.get_tb_headers(a_m_url)
        req = proxy_req(a_m_url, 2, header=headers, config={"allow_redirects": False})
        if req is None or "location" not in req.headers:
            if can_retry(a_m_url):
                return self.get_a_m_basic(a_m_url)
            return
        return req

    def get_m_h5_tk(self):
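        """ Warm up the h5 api cookies (self.M, presumably the _m_h5_tk token) and record the fetch time; callers refresh after ONE_HOURS / 2. """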
        self.m_time = time_stamp()
        def get_cookie_once(key, func, *param):
            req = func(*param)
            if req is not None: 
                self.cookies[key] = req.cookies.get_dict()
                echo(1, "get {} cookie:".format(key), self.cookies[key])

        get_cookie_once('uland', self.get_uland_url_once, self.uland_url)
        # the finger / baichuan cookie warm-up below is currently disabled
        if False:
            get_cookie_once('finger', self.get_finger_once, self.test_item_id)
            get_cookie_once('baichuan', self.get_baichuan_once, self.test_item_id, self.test_finger_id)

    def get_baichuan(self, item_id: int):
        if (
            'baichuan' not in self.cookies
            or self.M not in self.cookies['baichuan']
            or time_stamp() - self.m_time > self.ONE_HOURS / 2
        ):
            self.get_m_h5_tk()
        finger_id = self.get_finger(item_id)
        if finger_id is None:
            return
        echo(4, 'finger id:', finger_id) 
        req = self.get_baichuan_once(item_id, finger_id, self.cookies['baichuan'])
        if req is not None:
            return req.json()['data']

    def get_tb_getdetail(self, item_id: int):
        if (
            'uland' not in self.cookies
            or time_stamp() - self.m_time > self.ONE_HOURS / 2
        ):
            self.get_m_h5_tk()
        req = self.get_tb_getdetail_once(item_id, self.cookies['uland'])
        if req is not None:
            req_text = req.text
            re_json = json.loads(req_text[req_text.find("{") : -1])
            return re_json["data"]["item"]


    def get_tb_getdetail_once(self, item_id: int, cookies: dict = {}):
        refer_url = self.DETAIL_URL % item_id
        data = {"itemNumId": str(item_id)}
        jsv = '2.4.8'
        api = 'mtop.taobao.detail.getdetail'
        j_data_t = {'v': 6.0,
        'ttid': '2017@taobao_h5_6.6.0',
        'AntiCreep': True,
        'callback': 'mtopjsonp1'
        }
        return self.get_tb_h5_api(api, jsv, refer_url, data, j_data_t, cookies)


    def get_baichuan_once(self, item_id: int, finger_id: str, cookies: dict = {}):
        refer_url = self.DETAIL_URL % item_id
        data = {
            'pageCode': 'mallDetail',
            'ua': get_use_agent('mobile'),
            'params': json_str({
                "url": refer_url,
                "referrer": "",
                "oneId": None,
                "isTBInstalled": "null",
                "fid": finger_id,
            }),
        }
        data_str = r'{"pageCode":"mallDetail","ua":"%s","params":"{\"url\":\"%s\",\"referrer\":\"\",\"oneId\":null,\"isTBInstalled\":\"null\",\"fid\":\"%s\"}"}' % (get_use_agent('mobile'), refer_url, finger_id)
        api = 'mtop.taobao.baichuan.smb.get'
        jsv = '2.4.8'
        return self.get_tb_h5_api(api, jsv, refer_url, data, cookies=cookies, mode=1, data_str=data_str)

    def get_tb_h5_api(self, api: str, jsv: str, refer_url: str, data: dict, j_data_t: dict = {}, cookies: dict = {}, mode: int = 0, data_str: str = None):
        """ tb h5 api @2019.11.6 ✔️Tested"""
        step = self.M in cookies
        if data_str is None:
            data_str = json_str(data)
        
        headers = {
            "Accept": 'application/json',
            "referer": refer_url,
            "Agent": get_use_agent('mobile')
        }
        if step:
            headers["Cookie"] = encoder_cookie(cookies)
        appkey = "12574478"

        token = cookies[self.M].split("_")[0] if step else ""
        t = int(time_stamp() * 1000)
        
        j_data = {
            "jsv": jsv,
            "appKey": appkey,
            "t": t,
            "sign": self.get_tb_h5_token(token, t, appkey, data_str),
            "api": api,
            "v": 1.0,
            "timeout": 20000,
            "AntiCreep": True,
            "AntiFlood": True,
            "type": "originaljson",
            "dataType": "jsonp",
            **j_data_t
        }
        if mode == 0:
            j_data['data'] = data_str
        mtop_url = encoder_url(j_data, self.MTOP_URL % (api, int(j_data['v'])))
        if mode == 0:
            req = proxy_req(mtop_url, 2, header=headers)
        else:
            req = proxy_req(mtop_url, 12, data=data, header=headers)
        # echo(4, 'request once.')
        if req is None:
            if can_retry(self.MTOP_URL % (api, int(j_data['v']))):
                return self.get_tb_h5_api(api, jsv, refer_url, data, j_data_t, cookies, mode, data_str)
            else:
                return
        return req

    def get_uland_url_once(self, uland_url: str, cookies: dict = {}):
        """ tb h5 api @2019.11.9 ✔️Tested"""
        step = self.M in cookies
        uland_params = decoder_url(uland_url)
        tt = {
            "floorId": "13193" if step else "13052",
            "variableMap": json_str(
                {
                    "taoAppEnv": "0",
                    "e": uland_params["e"],
                    "scm": uland_params["scm"],
                }
            ),
        }
        api = "mtop.alimama.union.xt.en.api.entry"
        jsv = '2.4.0'
        j_data = {'type': 'jsonp', "callback": "mtopjsonp{}".format(int(step) + 1)}
        return self.get_tb_h5_api(api, jsv, uland_url, tt, j_data, cookies)
    
    def get_finger(self, item_id: int):
        if (
            'finger' not in self.cookies
            or self.M not in self.cookies['finger']
            or time_stamp() - self.m_time > self.ONE_HOURS / 2
        ):
            self.get_m_h5_tk()
        s_req = self.get_finger_once(item_id, self.cookies['finger'])
        if s_req is None:
            return
        try:
            return s_req.json()['data']['fingerId']
        except Exception:
            return

    def get_finger_once(self, item_id: int, cookies: dict = {}):
        step = self.M in cookies
        api = 'mtop.taobao.hacker.finger.create'
        refer_url = self.ITEM_URL % item_id
        jsv = '2.4.11'
        j_data = {'type': 'jsonp', "callback": "mtopjsonp{}".format(int(step) + 1)}
        return self.get_tb_h5_api(api, jsv, refer_url, {}, j_data, cookies=cookies)

    def get_tb_h5_token(self, *data) -> str:
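        """ mtop h5 sign: md5 over the "&"-joined args, called above as get_tb_h5_token(token, t, appkey, data_str). """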
        md5 = hashlib.md5()
        wait_enc = "&".join([str(ii) for ii in data])
        md5.update(wait_enc.encode())
        return md5.hexdigest()

    def get_ynote_file(self, offset: int = 0):
        url = self.LISTRECENT_URL % (offset, self.cstk)
        data = {"cstk": self.cstk}
        req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
        if req is None or not isinstance(req, list):
            return None
        list_recent = {ii["fileEntry"]["id"]: ii["fileEntry"] for ii in req}
        self.list_recent = {**self.list_recent, **list_recent}
        echo(1, "Load ynote file {} items.".format(len(self.list_recent)))
        return req

    def get_ynote_web_header(self, mode: int = 0):
        headers = {
            "Content-Type": get_content_type(),
            "Cookie": self.cookie,
            "Host": self.Y_URL.split("/")[2],
            "Origin": self.Y_URL,
            "Referer": self.WEB_URL,
        }
        if mode:
            headers["Accept"] = get_accept("xhr")
        else:
            headers["Accept"] = get_accept("html")
        return headers

    def get_empty_content(self):
        headers = {"Referer": self.WEB_URL}
        req = proxy_req(self.Y_DOC_JS_URL, 3, header=headers)
        if req is None or len(req) < 1000:
            if can_retry(self.Y_DOC_JS_URL):
                return self.get_empty_content()
            else:
                return
        empty_content = regex.findall("t.EMPTY_NOTE_CONTENT='(.*?)'", req)[0]
        empty_content = empty_content.split(self.END_TEXT)[0]
        self.empty_content = empty_content
        echo(1, "Load empty content", empty_content)
        return empty_content

    def get_web_content(self):
        req = proxy_req(self.WEB_URL, 3, header=self.get_ynote_web_header())
        if req is None or len(req) < 1000:
            if can_retry(self.WEB_URL):
                return self.get_web_content()
            else:
                return
        return req

    def update_article_pipeline(self, article_id: str):
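        """
        Full per-article update: fetch the note xml, swap in refreshed tpwds, push the
        note back, then mail the change log, refresh validDate info, sync the DB and
        re-share the note.
        """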
        xml = self.get_xml(article_id)
        if xml is None:
            echo("0|warning", "get xml error")
            return
        xml, r_log, r_num = self.replace_tpwd(article_id, xml)
        if not r_num:
            echo("0|warning", "r_num == 0")
            return
        flag = self.update_article(article_id, xml)
        if flag:
            self.email_update_result(article_id, r_log, r_num)
            self.update_valid(article_id)
            self.update_article2db(article_id, True)
            self.share_article(article_id)

    def email_update_result(self, article_id: str, r_log: list, r_num: int):
        p = self.share2article[article_id][-2].split("/")[-1]
        article_info = self.list_recent[p]
        name = article_info["name"].replace(".note", "")
        subject = "更新({}){}/{}条[{}]".format(
            time_str(time_format=self.T_FORMAT), r_num, len(r_log), article_info["name"]
        )
        content = "\n".join(
            [
                "Title: {}".format(article_info["name"]),
                "Time: {}".format(time_str()),
                "Update Num: {}/{}条".format(r_num, len(r_log)),
                "",
                *r_log,
            ]
        )
        send_email(content, subject, assign_rec=self.assign_rec)

    def update_valid(self, article_id: str):
        if article_id not in self.tpwd_map:
            self.tpwd_map[article_id] = {}
        wait_list = [
            ii
            for ii in self.article_list[article_id].keys()
            if ii not in self.tpwd_map[article_id]
        ]
        update_time = 0
        while len(wait_list) and update_time < 5:
            echo(2, "Begin Update No.{} times Tpwd validDate".format(update_time + 1))
            update_v = [
                self.tpwd_exec.submit(self.decoder_tpwd_once, article_id, ii, 1)
                for ii in wait_list
            ]
            list(as_completed(update_v))
            wait_list = [
                ii
                for ii in self.article_list[article_id].keys()
                if ii not in self.tpwd_map[article_id]
            ]
            update_time += 1

    def update_article2db(self, article_id: str, is_tpwd_update: bool = False):
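        """
        Write the current article_list state back to DB, preferring freshly decoded
        title/url/validDate from tpwd_map over the stored row when they are non-empty.
        """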
        def valid_t(types: str, maps: dict):
            return types in maps and maps[types] != ''

        m = {ii[4]: ii for ii in self.get_article_db(article_id)}
        data = []
        for (
            o_tpwd,
            (num_iid, title, domain, tpwd, commission_rate, commission_type, ur),
        ) in self.article_list[article_id].items():
            """
            `id`, article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at, created_at, is_deleted
            """
            n = m[o_tpwd]
            if o_tpwd in self.tpwd_map[article_id]:
                t = self.tpwd_map[article_id][o_tpwd]
                content = (
                    t["title"]
                    if valid_t('title', t)
                    else (t['content'] if valid_t('content', t) else n[6])
                )
                url = t["url"] if valid_t('url', t) else n[7]
                validDate = t["validDate"] if valid_t('validDate', t) else n[-2]
                data.append(
                    (
                        *n[:4],
                        tpwd if is_tpwd_update else o_tpwd,
                        domain,
                        content,
                        url,
                        commission_rate,
                        commission_type,
                        validDate,
                        n[-1],
                        0,
                    )
                )
            else:
                data.append(
                    (
                        *n[:4],
                        tpwd if is_tpwd_update else o_tpwd,
                        domain,
                        n[6],
                        n[7],
                        commission_rate,
                        commission_type,
                        n[-2],
                        n[-1],
                        0,
                    )
                )
        self.update_db(data, "Update Article {} TPWD".format(article_id))

    def replace_tpwd(self, article_id: str, xml: str):
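        """
        Replace every tpwd found in the note xml with its refreshed version and build a
        per-link log; commission_rate 1 means the link could not be refreshed, 2 that
        the original link was kept.
        """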
        tpwds = regex.findall(self.TPWD_REG2, xml)
        m = self.article_list[article_id]
        r_log, r_num = [], 0
        EXIST = "PASSWORD_NOT_EXIST::口令不存在"
        DECODER_EXC = "DECODER_EXCEPTION::商品已下架"
        NO_GOODS = "GOODS_NOT_FOUND::未参加淘客"
        TPWD_ERROR = "TPWD_ERROR::淘口令生成异常"
        for ii, jj in enumerate(tpwds):
            pure_jj = jj[1:-1]
            no_t = "No.{} tpwd: {}, ".format(ii + 1, jj)
            if pure_jj not in m:
                r_log.append("{}{}".format(no_t, EXIST))
                continue
            num_iid, title, domain, tpwd, commission_rate, commission_type, ur = m[pure_jj]
            if domain >= 15:
                if domain == 15:
                    applied = "{},{}".format(EXIST, title)
                elif domain == 16:
                    applied = "{},{}".format(DECODER_EXC, title)
                elif domain == 17:
                    applied = "{},{}".format(NO_GOODS, title)
                elif domain == 18:
                    applied = "{},{}".format(TPWD_ERROR, title)
                else:
                    applied = title
            else:
                applied = title
            xml = xml.replace(jj, "¥{}¥".format(tpwd))
            if commission_rate == 2:
                COMMISSION = "->¥{}¥ SUCCESS, 保持原链接, {}".format(tpwd, applied)
            elif commission_rate == 1:
                COMMISSION = "未能更新淘口令, {}".format(applied)
            else:
                COMMISSION = "->¥{}¥ SUCCESS, 佣金: {}, 类型: {}, {}".format(
                    tpwd, commission_rate, commission_type, applied
                )
            r_log.append("{}{}".format(no_t, COMMISSION))
            r_num += int(commission_rate != 1)
        return xml, r_log, r_num

    def get_xml(self, article_id: str):
        url = self.SYNC_URL % ("download", self.cstk)
        data = {
            "fileId": self.share2article[article_id][-2].split("/")[-1],
            "version": -1,
            "convert": True,
            "editorType": 1,
            "cstk": self.cstk,
        }
        req = proxy_req(url, 12, data=data, header=self.get_ynote_web_header(1))
        if req is None or len(req.text) < 100:
            if can_retry(url):
                return self.get_xml(article_id)
            else:
                return
        return req.text

    def update_article(self, article_id: str, article_body: str):
        p = self.share2article[article_id][-2].split("/")[-1]
        article_info = self.list_recent[p]
        data = {
            "fileId": p,
            "parentId": article_info["parentId"],
            "domain": article_info["domain"],
            "rootVersion": -1,
            "sessionId": "",
            "modifyTime": int(time_stamp()),
            "bodyString": article_body,
            "transactionId": p,
            "transactionTime": int(time_stamp()),
            "orgEditorType": article_info["orgEditorType"],
            "tags": article_info["tags"],
            "cstk": self.cstk,
        }
        url = self.SYNC_URL % ("push", self.cstk)
        req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
        if req is None or list(req.keys()) != [
            "entry",
            "meta",
            "effectedShareEntries",
            "forcePullVersion",
            "effected",
        ]:
            echo(
                "0|error",
                "Update atricle_id {} Error".format(article_id),
                req.json() if req is not None else "",
            )
            return False
        echo("1|warning", "Update atricle_id {} Success!!!".format(article_id))
        return True

    def share_article(self, article_id: str):
        p = self.share2article[article_id][-2].split("/")[-1]
        url = self.MYSHARE_URL % (p, self.cstk)
        req = proxy_req(url, 1, header=self.get_ynote_web_header(1))
        if req is None or list(req.keys()) != ["entry", "meta"]:
            if can_retry(url):
                return self.share_article(article_id)
            return False
        echo("2", "Share article {} Success!!!".format(article_id))
        return True

    def load_article_local(self, file_path: str):
        if file_path not in self.tpwds:
            tt = '||||'.join(read_file(file_path))
            tpwds = regex.findall(self.TPWD_REG, tt)
            self.tpwds[file_path] = tpwds
        else:
            tpwds = self.tpwds[file_path]
        if file_path not in self.tpwd_map:
            self.tpwd_map[file_path] = {}
        retry_time = 0
        while (len(self.tpwd_map[file_path]) < len(tpwds)) and retry_time < 5:
            thread_list = [ii for ii in tpwds if ii not in self.tpwd_map[file_path]]
            echo(1, file_path, "tpwds len:", len(tpwds), "need load", len(thread_list))
            thread_list = [
                self.tpwd_exec.submit(self.decoder_tpwd_once, file_path, ii, 1)
                for ii in thread_list
            ]
            list(as_completed(thread_list))
            retry_time += 1

    def load_picture(self, url: str, idx: int):
        td = basic_req(url, 2)
        if td is None:
            return
        picture_path = 'picture/{}.jpg'.format(idx)
        with open(picture_path, 'wb') as f:
            f.write(td.content)

    def load_picture_pipeline(self, file_path: str):
        mkdir('picture')
        tpk_list = self.tpwds[file_path]
        picture_url = [
            (self.tpwd_map[file_path][tpk]['picUrl'], idx)
            for idx, tpk in enumerate(tpk_list) if tpk in self.tpwd_map[file_path]
        ]
        picture_url = [
            (ii, idx) for ii, idx in picture_url
            if not os.path.exists('picture/{}.jpg'.format(idx))
        ]
        echo(1, 'Load {} picture Begin'.format(len(picture_url)))
        pp = [self.tpwd_exec.submit(self.load_picture, ii, jj) for ii, jj in picture_url]
        return pp
    
    def check_overdue(self):
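        """ Mail a per-article summary of links whose expire time falls within the next ONE_HOURS * ONE_DAY seconds. """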
        def check_overdue_once(data: list) -> bool:
            dif_time = time_stamp(data[-2]) - time_stamp()
            return dif_time > 0 and dif_time <= self.ONE_HOURS * self.ONE_DAY

        overdue_article = [
            (article_id, article_list[4])
            for article_id, ii in self.tpwd_db_map.items()
            for article_list in ii.values()
            if check_overdue_once(article_list)
        ]
        overdue_id = set([article_id for article_id, _ in overdue_article])
        overdue_list = [
            (article_id, len([1 for a_id, tpwd in overdue_article if article_id == a_id]))
            for article_id in overdue_id
        ]
        if not len(overdue_list):
            return
        title = '链接需要更新#{}#篇'.format(len(overdue_list))
        content = title + '\n \n'
        for article_id, num in overdue_list:
            content += '{}, 需要更新{}个链接,{}\n'.format(
                self.share2article[article_id][2], num, self.NOTE_URL % article_id
            )
        content += '\n\nPlease update within 6 hours, Thx!'
        echo('2|debug', title, content)
        send_email(content, title)

    def load_share_total(self):
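        """ Periodic refresh: check expiring links, reload share info for every article, sync the list to DB, then re-init and rerun load_process. """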
        self.check_overdue()
        for article_id in self.idx:
            self.get_share_info(article_id)
        self.load_list2db()
        self.__init__()
        self.load_process()
    
    def load_article_new(self):
        for article_id in self.idx:
            self.load_article(article_id)

    def load_click(self, num=1000000):
        ''' schedule click '''

        for index in range(num):
            threading_list = []
            if index % 12 == 1:
                threading_list.append(threading.Thread(target=self.load_share_total, args=()))
            else:
                threading_list.append(threading.Thread(target=self.load_article_new, args=()))
            for work in threading_list:
                work.start()
            time.sleep(self.ONE_HOURS / 2)