def __init__(self):
    super(ActivateArticle, self).__init__()
    self.Db = Db("tbk")
    self.Db.create_table(os.path.join(root_dir, "tpwd.sql"))
    self.Db.create_table(os.path.join(root_dir, "article.sql"))
    self.tpwd_map = {}
    self.tpwd_db_map = {}
    self.tpwds = {}
    self.cookies = {}
    self.share2article = {}
    self.article_list = {}
    self.list_recent = {}
    self.idx = []
    self.empty_content = ""
    self.tpwd_exec = ThreadPoolExecutor(max_workers=20)
    self.need_del = {}
    self.get_share_list()
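
# The constructor above wires together a Db handle, two table bootstraps and a
# ThreadPoolExecutor. A minimal sketch of the executor pattern it sets up is
# shown below; `decode_tpwd` and the sample inputs are hypothetical stand-ins,
# not methods of ActivateArticle.
from concurrent.futures import ThreadPoolExecutor, as_completed


def decode_tpwd(tpwd: str) -> str:
    # placeholder worker: the real spider would resolve a tpwd here
    return tpwd[::-1]


def run_executor_sketch():
    tpwds = ["demo1", "demo2", "demo3"]
    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = {pool.submit(decode_tpwd, t): t for t in tpwds}
        for fu in as_completed(futures):
            print(futures[fu], "->", fu.result())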
class Get_playlist_song():
    """
    1. get playlist id from classify;
    2. get song from play list;
    use url:
    """

    def __init__(self):
        self.Db = Db("netease")
        self.classifylist = {}
        self.playlists = []
        self.failuredmap = {}
        self.songmap = {}
        self.songlist = []
        self.finishlist = []
        self.get_classify()
        self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
        self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
        self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
        self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
        self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
        self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
        self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''

    def get_classify(self):
        """ get classify from /discover/playlist """
        version = begin_time()
        self.classifylist = {}
        host = 'https://music.163.com/discover/playlist'
        html = proxy_req(host, 0)
        if not html:
            print('Empty')
            if can_retry(host):
                self.get_classify()
            return []
        alist = html.find_all('a', class_='s-fc1')
        if not len(alist):
            if can_retry(host):
                self.get_classify()
            print(html)
        for index in alist:
            self.classifylist[index.text] = index['href']
        end_time(version)

    def get_playlist_id(self, classify, offset):
        """ get playlist id from classify """
        host = 'https://music.163.com'
        allclassify = classify == '全部风格'
        url = host + self.classifylist[classify] + (
            '?' if allclassify else '&') + 'order=hot&limit=35&offset=' + str(offset)
        html = basic_req(url, 0)
        if not html:
            if can_retry(url):
                self.get_playlist_id(classify, offset)
            return []
        alist = html.find_all('a', class_='icon-play')
        if not len(alist):
            if can_retry(url):
                self.get_playlist_id(classify, offset)
        for index in alist:
            self.playlists.append(index['data-res-id'])

    def get_playlist_id_thread(self):
        """ get play list id in threading """
        version = begin_time()
        if not len(self.classifylist):
            self.get_classify()
        for index in self.classifylist:
            threadings = []
            for offset in range(41):
                work = threading.Thread(
                    target=self.get_playlist_id, args=(index, offset * 35,))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            print(len(self.playlists))
            self.test_queue(index)
            self.playlists = []
            print(index + " Over")
        end_time(version)

    def test_queue(self, classify):
        """ test whether the data is already in playlist_queue """
        if len(self.playlists) == 1:
            waitlist = '(' + str(self.playlists[0]) + ')'
        else:
            waitlist = tuple(self.playlists)
        results = self.Db.select_db(self.select_one % (str(waitlist), classify))
        if not results:
            return []
        hadexist = []
        for index in results:
            hadexist.append(index[0])
        insertlist = []
        for index in self.playlists:
            if index not in hadexist:
                # file_d.write(str([index, classify])[1:-1] + '\n')
                insertlist.append((index, classify))
        print('Insert ' + str(len(insertlist)) + ' ' + classify)
        self.insert_queue(insertlist)

    def insert_queue(self, ids):
        """ insert data to playlist_queue """
        if not len(ids):
            return []
        results = self.Db.insert_db(self.insert_sql % str(ids)[1:-1])
        if results:
            if len(ids):
                print('Insert ' + ids[0][1] + ' ' + str(len(ids)) + ' Success!')
        else:
            pass

    def get_list_ids(self, classify):
        """ get list ids from db """
        results = self.Db.select_db(self.select_ids % classify)
        ids = []
        if results:
            for index in results:
                ids.append([index[0], index[1]])
        return ids

    def get_song_detail_thread(self):
        """ get song detail in threading """
        version = begin_time()
        for classify in self.classifylist:
            ids = self.get_list_ids(classify)
            threadings = []
            for oneid in ids:
                work = threading.Thread(
                    target=self.get_song_detail, args=(oneid[1],))
                threadings.append(work)
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()
            self.clean_data()
            self.test_song(classify, ids)
            self.songlist = []
            self.songmap = {}
            self.finishlist = []
            self.successtime = 0
            print(classify + ' Over!')
        end_time(version)

    def clean_data(self):
        """ aggregate data """
        for song in self.songlist:
            [songid, songname, playcount] = song
            if songid not in self.songmap:
                self.songmap[songid] = [1, playcount, songname]
            else:
                orgin = self.songmap[songid]
                self.songmap[songid] = [
                    orgin[0] + 1, orgin[1] + playcount, songname]

    def get_song_detail(self, id):
        """ get song detail from playlist """
        host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
        json = proxy_req(host, 1)
        if json == 0:
            if can_retry(host):
                self.get_song_detail(id)
            return []
        result = json['result']
        tracks = result['tracks']
        if len(tracks) <= 1:
            if can_retry(host):
                self.get_song_detail(id)
            return []
        else:
            playcount = result['playCount']
            for track in tracks:
                songid = track['id']
                songname = track['name']
                self.songlist.append([songid, songname, playcount])
            self.finishlist.append(id)

    def test_song(self, classify, ids):
        """ test whether the songs are already in db """
        songs = []
        for song in self.songmap:
            songs.append(song)
        if not len(songs):
            return []
        elif len(songs) == 1:
            waitlist = '(' + str(songs[0]) + ')'
        else:
            waitlist = tuple(songs)
        results = self.Db.select_db(self.select_song % (str(waitlist), classify))
        resultmap = {}
        for detail in results:
            resultmap[detail[1]] = [detail[0], detail[2], detail[3]]
        replacelist = []
        insertlist = []
        replacequeue = []
        file_d = codecs.open("song_detail", 'a', encoding='utf-8')
        file_d.seek(0)
        file_d.truncate()
        idsmap = {}
        for indexid in ids:
            idsmap[indexid[1]] = indexid[0]
        for song in self.songmap:
            songdetail = self.songmap[song]
            if song in resultmap:
                dbdetail = resultmap[song]
                replacelist.append(
                    (dbdetail[0], song, classify, songdetail[2],
                     songdetail[0] + dbdetail[1], songdetail[1] + dbdetail[2]))
            else:
                file_d.write(u'' + str([
                    song,
                    u'' + str(u'' + songdetail[2].replace(',', ' '))[0:20],
                    classify, songdetail[0], songdetail[1]
                ])[1:-1] + '\n')
                insertlist.append(
                    (song, songdetail[2], classify, songdetail[0], songdetail[1]))
        for playlist in self.finishlist:
            replacequeue.append((idsmap[playlist], playlist, classify, 1))
        file_d.close()
        if len(insertlist):
            self.db_song_detail(insertlist, 'Insert', replacequeue)
        if len(replacelist):
            self.db_song_detail(replacelist, 'Update', [])

    def db_song_detail(self, waitlist, types, replacequeue):
        """ batch insert/update song detail """
        if types == 'Update':
            results = self.Db.update_db(self.replace_song % str(waitlist)[1:-1])
        else:
            results = self.Db.update_db(self.insert_song)
        if results:
            if len(waitlist):
                print(types + ' song detail for ' + waitlist[0][2] + ' ' +
                      str(len(waitlist)) + ' Success!')
            if types == 'Insert':
                self.replace_queue_db(replacequeue)

    def replace_queue_db(self, replacequeue):
        """ replace db for finished playlist id """
        results = self.Db.update_db(self.replace_queue % str(replacequeue)[1:-1])
        if results:
            if len(replacequeue):
                print('Update queue finish for ' + str(len(replacequeue)) + ' item!')
        else:
            pass
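
# A minimal driver for the two-step pipeline described in the class docstring:
# first fill playlist_queue per classify, then fetch song details for the
# queued playlists. This is a sketch only -- it assumes the `netease` MySQL
# database and the playlist_queue / playlist_detail tables already exist.
def run_netease_pipeline_sketch():
    spider = Get_playlist_song()
    spider.get_playlist_id_thread()    # step 1: collect playlist ids per classify
    spider.get_song_detail_thread()    # step 2: fetch & aggregate song play counts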
class GetFreeProxy:
    """ proxy pool """

    def __init__(self):
        self.Db = Db("proxy")
        self.insert_sql = """INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s """
        self.select_list = (
            """SELECT address, http_type from ip_proxy WHERE `is_failured` = 0"""
        )
        self.select_sql = """SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s """
        self.select_all = """SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s"""
        self.random_select = """SELECT `address`, `http_type` FROM ip_proxy WHERE `is_failured` >= 5 and (`id` >= ((SELECT MAX(`id`) FROM ip_proxy)-(SELECT MIN(`id`) FROM ip_proxy)) * RAND() + (SELECT MIN(`id`) FROM ip_proxy)) and http_type in %s LIMIT 6000"""
        self.replace_ip = """REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s"""
        self.can_use_ip = {}
        self.waitjudge = []
        self.cannot_use_ip = {}
        self.failured_time = {}
        self.canuse_proxies = []
        self.init_proxy()

    def proxy_req(
        self,
        url: str,
        types: int,
        data=None,
        header=None,
        test_func=None,
        need_cookie: bool = False,
        config: dict = {},
        proxies: dict = {},
    ):
        """
        use a proxy to send requests, and record the proxies that can't be used
        @types S0XY: X=0.->get; =1.->post; Y=0.->html; =1.->json; =2.->basic
                     S=0.->basic; =1.->ss
        supports failure retry && automatic failure recording
        """
        httptype = url[4] == "s"
        ss_type = types // 1000
        types %= 1000
        if ss_type:
            proxylist = self.proxylists_ss if httptype else self.proxylist_ss
        else:
            proxylist = self.proxylists if httptype else self.proxylist
        if proxies != {}:
            proxies = proxies  # keep the caller-supplied proxies
        elif not len(proxylist):
            if self.Db.db:
                echo(
                    "0|critical",
                    "Proxy pool empty!!! Please check the db conn & db dataset!!!",
                )
            proxies = {}
        else:
            index = random.randint(0, len(proxylist) - 1)
            proxies_url = proxylist[index]
            proxies = {type_map[httptype]: proxies_url}
        try:
            result = basic_req(
                url,
                types=types,
                proxies=proxies,
                data=data,
                header=header,
                need_cookie=need_cookie,
                config=config,
            )
            if test_func is not None:
                if not test_func(result):
                    if self.check_retry(url):
                        return self.proxy_req(
                            url,
                            types=types + 1000 * ss_type,
                            data=data,
                            header=header,
                            test_func=test_func,
                            need_cookie=need_cookie,
                            config=config,
                            proxies=proxies,
                        )
                    else:
                        self.failured_time[url] = 0
                        return
                return result
            return result
        except:
            self.cannot_use_ip[random.randint(0, MAXN)] = proxies_url
            if proxies_url in proxylist:
                proxylist.remove(proxies_url)
            if not len(self.cannot_use_ip.keys()) % 10:
                self.clean_cannot_use()
            if self.check_retry(url):
                return self.proxy_req(
                    url,
                    types=types + 1000 * ss_type,
                    data=data,
                    test_func=test_func,
                    header=header,
                    need_cookie=need_cookie,
                    config=config,
                    proxies=proxies,
                )

    def check_retry(self, url: str) -> bool:
        """ check retry times """
        if url not in self.failured_time:
            self.failured_time[url] = 0
            return True
        elif self.failured_time[url] < 3:
            self.failured_time[url] += 1
            return True
        else:
            self.log_write(url)
            self.failured_time[url] = 0
            return False

    def log_write(self, url: str):
        """ failure log """
        echo("0|warning", "url {} retry max time".format(url))

    def insert_proxy(self, insert_list: list):
        """ insert data to db """
        results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1])
        if results:
            echo("2|info", "Insert " + str(len(insert_list)) + " items Success!")

    def update_proxy(self, update_list: list, types: int):
        """ update data to db """
        results = self.Db.update_db(self.replace_ip % str(update_list)[1:-1])
        typemap = {0: "can use ", 1: "can not use "}
        if results:
            echo(
                "2|info",
                "Update",
                typemap[types],
                str(len(update_list)),
                " items Success!",
            )

    def select_proxy(self, target_list: list) -> list:
        """ select ip proxy by ids """
        if not len(target_list):
            return []
        elif len(target_list) == 1:
            waitlist = "('" + target_list[0] + "')"
        else:
            waitlist = tuple(target_list)
        return self.Db.select_db(self.select_sql % str(waitlist))

    def db_can_use_proxy(self):
        """ test whether the db already has this data """
        results = self.select_proxy([ii[0] for ii in self.can_use_ip.values()])
        ss_len = len([1 for ii in self.can_use_ip.values() if ii[1] > 1])
        echo("2|info", "SS proxies", ss_len)
        insert_list = []
        update_list = []
        ip_map = {}
        if results != False:
            for ip_info in results:
                ip_map[ip_info[1]] = [ip_info[0], ip_info[2]]
            for ip_now in self.can_use_ip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ip_map:
                    if ip_map[ip_now][1]:
                        update_list.append((ip_map[ip_now][0], ip_now, http_type, 0))
                else:
                    insert_list.append((ip_now, http_type))
            if len(insert_list):
                self.insert_proxy(insert_list)
            if len(update_list):
                self.update_proxy(update_list, 0)
        else:
            pass
        self.can_use_ip = {}

    def clean_cannot_use(self):
        """ update db for proxies that cannot be used """
        results = self.select_proxy(list(self.cannot_use_ip.values()))
        update_list = []
        ip_map = {}
        if results:
            for ip_info in results:
                ip_map[ip_info[1]] = [ip_info[0], ip_info[2]]
            for ip_now in self.cannot_use_ip.values():
                http_type = ip_now[4] == "s"
                if ip_now in ip_map:
                    update_list.append(
                        (ip_map[ip_now][0], ip_now, http_type, ip_map[ip_now][1] + 1))
            if len(update_list):
                self.update_proxy(update_list, 1)
        self.cannot_use_ip = {}

    def init_proxy(self):
        """ init proxy list """
        results = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if not results:
            echo("0|error", "Please check db configure!!! The proxy pool cant use!!!>>>")
            return
        for index in results:
            if index[1] == 1:
                self.proxylists.append(index[0])
            elif index[1] == 2:
                self.proxylist.append(index[0])
                self.proxylist_ss.append(index[0])
            elif index[1] == 3:
                self.proxylists.append(index[0])
                self.proxylists_ss.append(index[0])
            else:
                self.proxylist.append(index[0])
        echo("2|info", len(self.proxylist), " http proxy can use.")
        echo("2|info", len(self.proxylists), " https proxy can use.")
        echo("2|info", len(self.proxylist_ss), " ss http proxy can use.")
        echo("2|info", len(self.proxylists_ss), " ss https proxy can use.")

    def judge_url(self, urls: str, index: int, times: int, ss_test: bool = False):
        """
        use /api/playlist to judge http; use /discover/playlist to judge https
        1. don't timeout = 5
        2. response.result.tracks.size() != 1
        """
        http_type = urls[4] == "s"
        proxies = {type_map[http_type]: urls}
        test_url = (type_map[http_type] +
                    "://music.163.com/api/playlist/detail?id=432853362")
        ss_url = "https://www.google.com/?gws_rd=ssl"
        try:
            data = basic_req(test_url, 1, proxies)
            result = data["result"]
            tracks = result["tracks"]
            if len(tracks) == 10:
                if times < 0:
                    self.judge_url(urls, index, times + 1)
                else:
                    echo("1|debug", urls, proxies, "Proxies can use.")
                    self.canuse_proxies.append(urls)
                    self.can_use_ip[index] = [urls, int(http_type)]
                    if ss_test:
                        data = basic_req(ss_url, 0)
                        if len(str(data)) > 5000:
                            self.can_use_ip[index] = [urls, int(http_type) + 2]
            else:
                echo("0|debug", urls, proxies, "Tracks len error ^--<^>--^ ")
                self.cannot_use_ip[index] = urls
        except:
            echo("0|debug", urls, proxies, "return error [][][][][][]")
            if not index in self.can_use_ip:
                self.cannot_use_ip[index] = urls

    def thread_judge(self, batch_size: int = 500):
        """ judge proxies in threading """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)
        proxy_exec = ThreadPoolExecutor(max_workers=batch_size // 2)
        text = self.waitjudge
        num = len(text)
        for block in range(num // batch_size + 1):
            proxy_th = [
                proxy_exec.submit(self.judge_url, jj, ii, 0)
                for ii, jj in enumerate(text[block * batch_size:batch_size * (block + 1)])
            ]
            list(as_completed(proxy_th))
        self.db_can_use_proxy()
        self.clean_cannot_use()
        self.waitjudge = []

    def test_db(self, types: int):
        """ test whether the proxies in db can be used """
        version = begin_time()
        typestr = ""
        if types == 2:
            typestr = "(0,1,2,3)"
        elif types == 1:
            typestr = "(1,3)"
        else:
            typestr = "(0,2)"
        results = self.Db.select_db(self.select_all % typestr)
        random_select = self.Db.select_db(self.random_select % typestr)
        if not results:
            results = []
        if not random_select:
            random_select = []
        for index in results + random_select:
            self.waitjudge.append(index[0])
        self.thread_judge()
        self.init_proxy()
        end_time(version, 2)

    def xici_proxy(self, page: int):
        """
        xici proxy: http://www.xicidaili.com/nn/{page}
        The first proxy source I used, but now most of them can not be used.
        """
        if not str(page).isdigit():
            echo("0|warning", "Please input num!")
            return []
        version = begin_time()
        url = "http://www.xicidaili.com/nn/%d"
        for index in range(1, page + 1):
            html = basic_req(url % index, 0)
            tem = html.find_all("tr")
            for index in range(1, len(tem)):
                tds = tem[index].find_all("td")
                ip = tds[5].text.lower()
                self.waitjudge.append("{}://{}:{}".format(ip, tds[1].text, tds[2].text))
        self.thread_judge()
        end_time(version, 2)

    def gatherproxy(self, types: int):
        """
        :100: very nice website
        First of all you should download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        """
        if not os.path.exists("{}gatherproxy".format(data_dir)):
            echo("0|warning", "Gather file not exist!!!")
            return
        file_d = read_file("{}gatherproxy".format(data_dir))
        waitjudge_http = ["http://" + ii for ii in file_d]
        waitjudge_https = ["https://" + ii for ii in file_d]
        if not types:
            self.waitjudge += waitjudge_http
        elif types == 1:
            self.waitjudge += waitjudge_https
        elif types == 2:
            self.waitjudge += waitjudge_http + waitjudge_https
        else:
            self.waitjudge += file_d
        echo("2|warning", "load gather over!")

    def goubanjia(self):
        """
        :-1: html tags mixed with invalid data
        :100: And the most important thing is that the port is written in 'class' rather than in text.
        The website is difficult to spider, but the proxies are very good.
        goubanjia proxy: http://www.goubanjia.com
        """
        version = begin_time()
        host = "http://www.goubanjia.com"
        html = self.proxy_req(host, 0)
        if not html:
            return []
        trs = html.find_all("tr", class_=["warning", "success"])
        for tr in trs:
            tds = tr.find_all("td")
            ip = tds[2].find_all("a")[0].text + "://"
            iplist = tds[0].find_all(["div", "span", not "p"], class_=not "port")
            for index in iplist:
                ip += index.text
            encode = tds[0].find_all(["div", "span", "p"], class_="port")[0]["class"][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord("A")),
                map(lambda x: x, encode), 0)
            self.waitjudge.append(ip + ":" + str(int(uncode / 8)))
        self.thread_judge()
        end_time(version, 2)

    def schedulegou(self):
        sched = BlockingScheduler()
        sched.add_job(self.goubanjia, "interval", seconds=100)
        sched.start()

    def data5u(self):
        """
        data5u proxy: http://www.data5u.com/
        none of them can be used
        """
        version = begin_time()
        url_list = ["", "free/gngn/index.shtml", "free/gwgn/index.shtml"]
        host = "http://www.data5u.com/"
        for uri in url_list:
            html = self.proxy_req(host + uri, 0)
            if not html:
                continue
            table = html.find_all("ul", class_="l2")
            for index in table:
                tds = index.find_all("li")
                ip = tds[3].text
                self.waitjudge.append("{}://{}:{}".format(ip, tds[1].text, tds[2].text))
        self.thread_judge()
        end_time(version, 2)

    def sixsixip(self, area: int, page: int):
        """ 66ip proxy: http://www.66ip.cn/areaindex_{area}/{page}.html """
        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo("2|debug", "{} {}".format(index, pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.thread_judge()
        end_time(version, 2)

    def sixsixthread(self, index: int, pageindex: int):
        host = """http://www.66ip.cn/areaindex_%d/%d.html"""
        html = self.proxy_req(host % (index, pageindex), 0)
        if not html:
            return []
        trs = html.find_all("table")[2].find_all("tr")
        for test in range(1, len(trs) - 1):
            tds = trs[test].find_all("td")
            self.waitjudge.append("http://{}:{}".format(tds[0].text, tds[1].text))
            self.waitjudge.append("https://{}:{}".format(tds[0].text, tds[1].text))

    def kuaidaili(self, page: int):
        """ kuaidaili: https://www.kuaidaili.com/free/ """
        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread, args=(index,))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.thread_judge()
        end_time(version, 2)

    def kuaidailithread(self, index: int):
        host = """https://www.kuaidaili.com/free/inha/%d/"""
        html = self.proxy_req(host % index, 0)
        if not html:
            return []
        trs = html.find_all("tr")
        for index in range(1, len(trs)):
            tds = trs[index].find_all("td")
            ip = tds[3].text.lower() + "://" + tds[0].text + ":" + tds[1].text
            self.waitjudge.append(ip)

    def get_cookie(self):
        """
        make cookie login
        PS: Though the cookie expiry time is more than 1 year,
        it will be broken when the connection closes,
        so you need to reactivate the cookie with this function.
        """
        headers = {
            "Cookie": "_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57",
            "Accept": get_accept("html") + ";q=0.9",
        }
        login_url = "http://www.gatherproxy.com/subscribe/login"
        cookie_html = basic_req(login_url, 3, header=headers)
        try:
            verify_text = re.findall('<span class="blue">(.*?)</span>', cookie_html)[0]
        except:
            return
        verify_list = verify_text.replace("= ", "").strip().split()
        num_map = {
            "Zero": 0, "One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5,
            "Six": 6, "Seven": 7, "Eight": 8, "Nine": 9, "Ten": 10,
        }
        verify_num = [verify_list[0], verify_list[2]]
        for index, num in enumerate(verify_num):
            if num.isdigit():
                verify_num[index] = int(num)
            elif num in num_map:
                verify_num[index] = num_map[num]
            else:
                echo("0|error", "Error", num)
                # return False
        verify_code = 0
        error = True
        operation = verify_list[1]
        if (operation == "+" or operation == "plus" or operation == "add"
                or operation == "multiplied"):
            verify_code = verify_num[0] + verify_num[1]
            error = False
        if operation == "-" or operation == "minus":
            verify_code = verify_num[0] - verify_num[1]
            error = False
        if operation == "X" or operation == "multiplication":
            verify_code = verify_num[0] * verify_num[1]
            error = False
        if error:
            echo("0|error", "Error", operation)
        if not os.path.exists("%spassage" % data_dir):
            echo("0|warning", "gather passage not exist!!!")
            return
        with codecs.open("%spassage" % data_dir, "r", encoding="utf-8") as f:
            passage = [index[:-1] for index in f.readlines()]
        data = {
            "Username": passage[0],
            "Password": passage[1],
            "Captcha": str(verify_code),
        }
        time.sleep(2.163)
        r = requests.session()
        r.cookies = cj.LWPCookieJar()
        login_req = r.post(login_url, headers=headers, data=data, verify=False)

    def load_gather(self):
        """
        load the gather proxy pool text
        If it failed, you should reactivate the cookie.
        """
        headers = {
            "Cookie": "_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57",
            "Accept": get_accept("html") + ";q=0.9",
        }
        url = "http://www.gatherproxy.com/subscribe/infos"
        try:
            sid_url_req = requests.get(url, headers=headers, verify=False, timeout=10)
        except:
            return
        sid_url_html = BeautifulSoup(sid_url_req.text, "html.parser")
        sid_url = sid_url_html.find_all("div", class_="wrapper")[1].find_all("a")[0]["href"]
        if len(sid_url.split("sid=")) < 2:
            echo("0|warning", "cookie error")
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split("sid=")[1]
        sid_url = "http://www.gatherproxy.com" + sid_url
        data = {"ID": sid, "C": "", "P": "", "T": "", "U": "0"}
        gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False)
        with codecs.open(data_dir + "gatherproxy", "w", encoding="utf-8") as f:
            f.write(gatherproxy.text)

    def load_proxies_list(self, types: int = 2):
        """ load proxies """
        SITES = ["http://www.proxyserverlist24.top/", "http://www.live-socks.net/"]
        spider_pool = []
        self.waitjudge = []
        for site in SITES:
            self.get_other_proxies(site)
        self.gatherproxy(3)
        waitjudge = list(set(self.waitjudge))
        waitjudge_http = ["http://" + ii for ii in waitjudge]
        waitjudge_https = ["https://" + ii for ii in waitjudge]
        if not types:
            self.waitjudge = waitjudge_http
        elif types == 1:
            self.waitjudge = waitjudge_https
        else:
            self.waitjudge = waitjudge_http + waitjudge_https
        echo(
            "1|info",
            "-_-_-_-_-_-_-",
            len(waitjudge),
            "Proxies wait to judge -_-_-_-_-_-_-",
        )

    def request_text(self, url: str) -> str:
        """ request text """
        req = basic_req(url, 2)
        if req is None:
            echo("0|debug", url)
            if can_retry(url):
                return self.request_text(url)
            else:
                return ""
        echo("1|debug", url)
        text = req.text
        if type(text) == str:
            return text
        elif type(text) == bytes:
            return text.decode()
        else:
            return ""

    def get_free_proxy(self, url: str):
        req = basic_req(url, 2)
        if req is None:
            return []
        tt = req.text
        t_list = re.findall("<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>", tt)
        echo(1, "Get Free proxy List", url, len(t_list))
        return ["{}:{}".format(ii, jj) for ii, jj in t_list]

    def get_proxy_free(self):
        urls = [
            "https://www.sslproxies.org",
            "https://free-proxy-list.net",
            "https://www.us-proxy.org",
            "https://free-proxy-list.net/uk-proxy.html",
            "https://free-proxy-list.net/anonymous-proxy.html",
            "http://www.google-proxy.net",
        ]
        t_list = []
        for url in urls:
            t_list.extend(self.get_free_proxy(url))
        t_list.extend(self.get_api())
        for ii in ["http", "https"]:
            t_list.extend(self.get_download(ii))
        t_list = list(set(t_list))
        with open(data_dir + "gatherproxy", "w") as f:
            f.write("\n".join(t_list))

    def ip_decoder(self, data: str):
        data = re.sub("\+", "\x20", data)
        data = re.sub(
            "%([a-fA-F0-9][a-fA-F0-9])",
            lambda i: chr(int("0x" + i.group()[1:], 16)),
            data,
        )
        return re.findall(">(.*?)</a", data)

    def get_api(self):
        API_KEY = "xxx"
        url = "http://api.scraperapi.com/?api_key={}&url=http://httpbin.org/ip".format(API_KEY)
        t_list = []
        for ii in range(38):
            tt = basic_req(url, 1)
            if tt is None:
                continue
            t_list.append(tt["origin"])
        echo(1, "Get scraperapi", len(t_list))
        return t_list

    def get_download(self, types: str):
        url = "https://www.proxy-list.download/api/v0/get?l=en&t=" + types
        tt = basic_req(url, 1)
        if tt is None:
            return []
        tt_list = tt[0]["LISTA"]
        echo(1, "Get download", types, len(tt_list))
        return ["{}:{}".format(ii["IP"], ii["PORT"]) for ii in tt_list]

    def get_other_proxies(self, url: str):
        """ get other proxies """
        pages = re.findall(r"<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>", self.request_text(url))
        if not len(pages):
            echo("0|warning", "Please do not frequently request {}!!!".format(url))
        else:
            proxies = [
                re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}",
                           self.request_text(ii))
                for ii in pages
            ]
            self.waitjudge = [*self.waitjudge, *sum(proxies, [])]

    def load_proxies_test(self):
        """ load mode & test proxies """
        version = begin_time()
        self.load_proxies_list()
        proxies_len = len(self.waitjudge)
        self.thread_judge()
        canuse_len = len(self.canuse_proxies)
        echo(
            "1|info",
            "\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {}\n".format(
                proxies_len, canuse_len, end_time(version)),
        )
        with open("{}canuse_proxies.txt".format(data_dir), "w") as f:
            f.write("\n".join(self.canuse_proxies))
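
# Usage sketch for the pool above, assuming the `proxy` MySQL database is
# populated. Per the proxy_req docstring, `types` is encoded as S0XY: the
# thousands digit S selects the ss-capable sub-pool, X chooses get/post and
# Y chooses html/json/basic. The calls below are illustrative, not a test suite.
def proxy_pool_sketch():
    pool = GetFreeProxy()
    html = pool.proxy_req("https://music.163.com/discover/playlist", 0)   # GET, parsed html
    data = pool.proxy_req("http://music.163.com/api/playlist/detail?id=432853362", 1)  # GET, json
    ss_html = pool.proxy_req("https://www.google.com/?gws_rd=ssl", 1000)  # GET html via the ss pool
    pool.load_proxies_test()  # crawl free lists, judge them, persist the usable ones


# Worked example of the goubanjia port trick described in goubanjia()'s docstring:
# the port is hidden in a class attribute; each letter encodes a digit
# (ord(ch) - ord('A')) and the concatenated number divided by 8 is the real port.
# The sample value "DFAE" is made up for illustration.
import functools


def decode_goubanjia_port(encode: str) -> int:
    return functools.reduce(lambda x, y: x * 10 + (ord(y) - ord("A")), encode, 0) // 8

# decode_goubanjia_port("DFAE")  ->  digits 3,5,0,4  ->  3504 // 8  ->  438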
class GetFreeProxy:
    ''' proxy pool '''

    def __init__(self):
        self.Db = Db("proxy")
        self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s '''
        self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
        self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s '''
        self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s'''
        self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
        self.canuseip = {}
        self.waitjudge = []
        self.cannotuseip = {}
        self.failuredtime = {}
        self.canuse_proxies = []
        self.initproxy()

    def proxy_req(self, url: str, types: int, data=None, test_func=None,
                  header=None, need_cookie: bool = False):
        """
        use a proxy to send requests, and record the proxies that can't be used
        @types S0XY: X=0.->get; =1.->post; Y=0.->html; =1.->json; =2.->basic
                     S=0.->basic; =1.->ss
        supports failure retry && automatic failure recording
        """
        httptype = url[4] == 's'
        ss_type = types // 1000
        types %= 1000
        if ss_type:
            proxylist = self.proxylists_ss if httptype else self.proxylist_ss
        else:
            proxylist = self.proxylists if httptype else self.proxylist
        if not len(proxylist):
            if self.Db.db:
                echo(0, 'Proxy pool empty!!! Please check the db conn & db dataset!!!')
            proxies = {}
        else:
            index = random.randint(0, len(proxylist) - 1)
            proxies_url = proxylist[index]
            proxies = {type_map[httptype]: proxies_url}
        try:
            result = basic_req(url, types, proxies, data, header, need_cookie)
            if not test_func is None:
                if not test_func(result):
                    if self.check_retry(url):
                        self.proxy_req(url, types + 1000 * ss_type, data,
                                       test_func, header, need_cookie)
                    else:
                        self.failuredtime[url] = 0
                        return
                else:
                    return result
            else:
                return result
        except:
            self.cannotuseip[random.randint(0, MAXN)] = proxies_url
            if proxies_url in proxylist:
                proxylist.remove(proxies_url)
            if not len(self.cannotuseip.keys()) % 10:
                self.cleancannotuse()
            if self.check_retry(url):
                self.proxy_req(url, types + 1000 * ss_type, data,
                               test_func, header, need_cookie)
            else:
                return

    def check_retry(self, url):
        """ check whether the url can still be retried """
        if url not in self.failuredtime:
            self.failuredtime[url] = 0
            return True
        elif self.failuredtime[url] < 3:
            self.failuredtime[url] += 1
            return True
        else:
            self.log_write(url)
            self.failuredtime[url] = 0
            return False

    def log_write(self, url):
        """ failure log """
        with codecs.open("proxy.log", 'a', encoding='utf-8') as f:
            f.write(time_str() + url + '\n')

    def insertproxy(self, insertlist):
        """ insert data to db """
        results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1])
        if results:
            echo(2, 'Insert ' + str(len(insertlist)) + ' items Success!')
        else:
            pass

    def updateproxy(self, updatelist, types):
        """ update data to db """
        results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1])
        typemap = {0: 'can use ', 1: 'can not use '}
        if results:
            echo(2, 'Update', typemap[types], str(len(updatelist)), ' items Success!')
        else:
            pass

    def selectproxy(self, targetlist):
        """ select ip proxy by ids """
        if not len(targetlist):
            return []
        elif len(targetlist) == 1:
            waitlist = '(\'' + targetlist[0] + '\')'
        else:
            waitlist = tuple(targetlist)
        return self.Db.select_db(self.select_sql % str(waitlist))

    def dbcanuseproxy(self):
        """ test whether the db already has this data """
        results = self.selectproxy([ii[0] for ii in self.canuseip.values()])
        ss_len = len([1 for ii in self.canuseip.values() if ii[1] > 1])
        echo(2, "SS proxies %d" % ss_len)
        insertlist = []
        updatelist = []
        ipmap = {}
        if results != False:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]
            for ip_now in self.canuseip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ipmap:
                    if ipmap[ip_now][1]:
                        updatelist.append((ipmap[ip_now][0], ip_now, http_type, 0))
                else:
                    insertlist.append((ip_now, http_type))
            if len(insertlist):
                self.insertproxy(insertlist)
            if len(updatelist):
                self.updateproxy(updatelist, 0)
        else:
            pass
        self.canuseip = {}

    def cleancannotuse(self):
        """ update db for proxies that can't be used """
        results = self.selectproxy(list(self.cannotuseip.values()))
        updatelist = []
        ipmap = {}
        if results:
            for ip_info in results:
                ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]
            for ip_now in self.cannotuseip.values():
                http_type = ip_now[4] == 's'
                if ip_now in ipmap:
                    updatelist.append(
                        (ipmap[ip_now][0], ip_now, http_type, ipmap[ip_now][1] + 1))
            if len(updatelist):
                self.updateproxy(updatelist, 1)
        else:
            pass
        self.cannotuseip = {}

    def initproxy(self):
        """ init proxy list """
        results = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if not results:
            echo(0, 'Please check db configure!!! The proxy pool cant use!!!>>>')
            return
        for index in results:
            if index[1] == 1:
                self.proxylists.append(index[0])
            elif index[1] == 2:
                self.proxylist.append(index[0])
                self.proxylist_ss.append(index[0])
            elif index[1] == 3:
                self.proxylists.append(index[0])
                self.proxylists_ss.append(index[0])
            else:
                self.proxylist.append(index[0])
        echo(2, len(self.proxylist), ' http proxy can use.')
        echo(2, len(self.proxylists), ' https proxy can use.')
        echo(2, len(self.proxylist_ss), ' ss http proxy can use.')
        echo(2, len(self.proxylists_ss), ' ss https proxy can use.')

    def judgeurl(self, urls, index, times, ss_test=False):
        """
        use /api/playlist to judge http; use /discover/playlist to judge https
        1. don't timeout = 5
        2. response.result.tracks.size() != 1
        """
        http_type = urls[4] == 's'
        proxies = {type_map[http_type]: urls}
        test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
        ss_url = 'https://www.google.com/?gws_rd=ssl'
        try:
            data = basic_req(test_url, 1, proxies)
            result = data['result']
            tracks = result['tracks']
            if len(tracks) == 56:
                if times < 0:
                    self.judgeurl(urls, index, times + 1)
                else:
                    echo(1, urls, proxies, 'Proxies can use.')
                    self.canuse_proxies.append(urls)
                    self.canuseip[index] = [urls, int(http_type)]
                    if ss_test:
                        data = basic_req(ss_url, 0)
                        if len(str(data)) > 5000:
                            self.canuseip[index] = [urls, int(http_type) + 2]
            else:
                echo(0, urls, proxies, 'Tracks len error ^--<^>--^ ')
                self.cannotuseip[index] = urls
        except:
            echo(0, urls, proxies, 'return error [][][][][][]')
            if not index in self.canuseip:
                self.cannotuseip[index] = urls
            pass

    def threadjude(self, batch_size=500):
        """ judge proxies in threading """
        changeJsonTimeout(2)
        changeHtmlTimeout(3)
        text = self.waitjudge
        num = len(text)
        for block in range(num // batch_size + 1):
            blockthreads = []
            for index in range(block * batch_size, min(num, batch_size * (block + 1))):
                work = threading.Thread(target=self.judgeurl,
                                        args=(text[index], index, 0,))
                blockthreads.append(work)
            for work in blockthreads:
                work.start()
            for work in blockthreads:
                work.join()
        self.dbcanuseproxy()
        self.cleancannotuse()
        self.waitjudge = []

    def testdb(self, types):
        ''' test whether the proxies in db can be used '''
        version = begin_time()
        typestr = ''
        if types == 2:
            typestr = '(0,1,2,3)'
        elif types == 1:
            typestr = '(1,3)'
        else:
            typestr = '(0,2)'
        results = self.Db.select_db(self.select_all % typestr)
        if results != 0:
            for index in results:
                self.waitjudge.append(index[0])
            self.threadjude()
        else:
            pass
        self.initproxy()
        end_time(version)

    def xiciproxy(self, page):
        """
        xici proxy: http://www.xicidaili.com/nn/{page}
        The first proxy source I used, but now most of them can not be used.
        """
        if not str(page).isdigit():
            echo(0, "Please input num!")
            return []
        version = begin_time()
        url = 'http://www.xicidaili.com/nn/%d'
        for index in range(1, page + 1):
            html = basic_req(url % (index), 0)
            tem = html.find_all('tr')
            for index in range(1, len(tem)):
                tds = tem[index].find_all('td')
                ip = tds[5].text.lower()
                self.waitjudge.append(ip + '://' + tds[1].text + ':' + tds[2].text)
        self.threadjude()
        end_time(version)

    def gatherproxy(self, types):
        """
        :100: very nice website
        First of all you should download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        """
        if not os.path.exists('{}gatherproxy'.format(data_dir)):
            echo(0, 'Gather file not exist!!!')
            return
        with codecs.open('{}gatherproxy'.format(data_dir), 'r', encoding='utf-8') as f:
            file_d = [ii.strip() for ii in f.readlines()]
        waitjudge_http = ['http://' + ii for ii in file_d]
        waitjudge_https = ['https://' + ii for ii in file_d]
        if not types:
            self.waitjudge += waitjudge_http
        elif types == 1:
            self.waitjudge += waitjudge_https
        elif types == 2:
            self.waitjudge += (waitjudge_http + waitjudge_https)
        else:
            self.waitjudge += file_d
        echo(2, 'load gather over!')

    def goubanjia(self):
        """
        :-1: html tags mixed with invalid data
        :100: And the most important thing is that the port is written in 'class' rather than in text.
        The website is difficult to spider, but the proxies are very good.
        goubanjia proxy: http://www.goubanjia.com
        """
        version = begin_time()
        host = 'http://www.goubanjia.com'
        html = self.proxy_req(host, 0)
        if not html:
            return []
        trs = html.find_all('tr', class_=['warning', 'success'])
        for tr in trs:
            tds = tr.find_all('td')
            ip = tds[2].find_all('a')[0].text + '://'
            iplist = tds[0].find_all(['div', 'span', not 'p'], class_=not 'port')
            for index in iplist:
                ip += index.text
            encode = tds[0].find_all(['div', 'span', 'p'], class_='port')[0]['class'][1]
            uncode = functools.reduce(
                lambda x, y: x * 10 + (ord(y) - ord('A')),
                map(lambda x: x, encode), 0)
            self.waitjudge.append(ip + ':' + str(int(uncode / 8)))
        self.threadjude()
        end_time(version)

    def schedulegou(self):
        sched = BlockingScheduler()
        sched.add_job(self.goubanjia, 'interval', seconds=100)
        sched.start()

    def data5u(self):
        """
        data5u proxy: http://www.data5u.com/
        none of them can be used
        """
        version = begin_time()
        url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
        host = 'http://www.data5u.com/'
        for uri in url_list:
            html = self.proxy_req(host + uri, 0)
            if not html:
                continue
            table = html.find_all('ul', class_='l2')
            for index in table:
                tds = index.find_all('li')
                ip = tds[3].text
                self.waitjudge.append(ip + '://' + tds[0].text + ':' + tds[1].text)
        self.threadjude()
        end_time(version)

    def sixsixip(self, area, page):
        """ 66ip proxy: http://www.66ip.cn/areaindex_{area}/{page}.html """
        version = begin_time()
        threadings = []
        for index in range(1, area + 1):
            for pageindex in range(1, page + 1):
                echo(2, str(index) + ' ' + str(pageindex))
                work = threading.Thread(target=self.sixsixthread,
                                        args=(index, pageindex))
                threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)

    def sixsixthread(self, index, pageindex):
        host = '''http://www.66ip.cn/areaindex_%d/%d.html'''
        html = self.proxy_req(host % (index, pageindex), 0)
        if not html:
            return []
        trs = html.find_all('table')[2].find_all('tr')
        for test in range(1, len(trs) - 1):
            tds = trs[test].find_all('td')
            self.waitjudge.append('http://' + tds[0].text + ':' + tds[1].text)
            self.waitjudge.append('https://' + tds[0].text + ':' + tds[1].text)

    def kuaidaili(self, page):
        """ kuaidaili: https://www.kuaidaili.com/free/ """
        version = begin_time()
        threadings = []
        for index in range(1, page + 1):
            work = threading.Thread(target=self.kuaidailithread, args=(index,))
            threadings.append(work)
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()
        self.threadjude()
        end_time(version)

    def kuaidailithread(self, index):
        host = '''https://www.kuaidaili.com/free/inha/%d/'''
        html = self.proxy_req(host % index, 0)
        if not html:
            return []
        trs = html.find_all('tr')
        for index in range(1, len(trs)):
            tds = trs[index].find_all('td')
            ip = tds[3].text.lower() + "://" + tds[0].text + ':' + tds[1].text
            self.waitjudge.append(ip)

    def get_cookie(self):
        """
        make cookie login
        PS: Though the cookie expiry time is more than 1 year,
        it will be broken when the connection closes,
        so you need to reactivate the cookie with this function.
        """
        headers = {
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'Host': 'www.gatherproxy.com',
            'Origin': 'http://www.gatherproxy.com',
            'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding": "",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        login_url = 'http://www.gatherproxy.com/subscribe/login'
        cookie_html = basic_req(login_url, 0, header=headers)
        verify_text = cookie_html.find_all('div', class_='label')[2].span.text
        verify_list = verify_text.replace('= ', '').strip().split()
        num_map = {
            'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5,
            'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10
        }
        verify_num = [verify_list[0], verify_list[2]]
        for index, num in enumerate(verify_num):
            if num.isdigit():
                verify_num[index] = int(num)
            elif num in num_map:
                verify_num[index] = num_map[num]
            else:
                echo(0, 'Error', num)
                # return False
        verify_code = 0
        error = True
        operation = verify_list[1]
        if operation == '+' or operation == 'plus' or operation == 'add' or operation == 'multiplied':
            verify_code = verify_num[0] + verify_num[1]
            error = False
        if operation == '-' or operation == 'minus':
            verify_code = verify_num[0] - verify_num[1]
            error = False
        if operation == 'X' or operation == 'multiplication':
            verify_code = verify_num[0] * verify_num[1]
            error = False
        if error:
            echo(0, 'Error', operation)
        if not os.path.exists('%spassage' % data_dir):
            echo(0, 'gather passage not exist!!!')
            return
        with codecs.open('%spassage' % data_dir, 'r', encoding='utf-8') as f:
            passage = [index[:-1] for index in f.readlines()]
        data = {
            'Username': passage[0],
            'Password': passage[1],
            'Captcha': str(verify_code)
        }
        time.sleep(2.163)
        r = requests.session()
        r.cookies = cj.LWPCookieJar()
        login_req = r.post(login_url, headers=headers, data=data, verify=False)

    def load_gather(self):
        """
        load the gather proxy pool text
        If it failed, you should reactivate the cookie.
        """
        headers = {
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'Host': 'www.gatherproxy.com',
            'Origin': 'http://www.gatherproxy.com',
            'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
            'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            "Accept-Encoding": "",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        }
        url = 'http://www.gatherproxy.com/subscribe/infos'
        sid_url_req = requests.get(url, headers=headers, verify=False)
        sid_url_html = BeautifulSoup(sid_url_req.text, 'html.parser')
        sid_url = sid_url_html.find_all('div', class_='wrapper')[1].find_all('a')[0]['href']
        if len(sid_url.split('sid=')) < 2:
            echo(0, 'cookie error')
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split('sid=')[1]
        sid_url = 'http://www.gatherproxy.com' + sid_url
        data = {'ID': sid, 'C': '', 'P': '', 'T': '', 'U': '0'}
        gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False)
        with codecs.open(data_dir + 'gatherproxy', 'w', encoding='utf-8') as f:
            f.write(gatherproxy.text)

    def load_proxies_list(self, types=2):
        ''' load proxies '''
        SITES = ['http://www.proxyserverlist24.top/', 'http://www.live-socks.net/']
        spider_pool = []
        self.waitjudge = []
        for site in SITES:
            self.get_other_proxies(site)
        if os.path.exists('{}gatherproxy'.format(data_dir)):
            self.gatherproxy(3)
        waitjudge = list(set(self.waitjudge))
        waitjudge_http = ['http://' + ii for ii in waitjudge]
        waitjudge_https = ['https://' + ii for ii in waitjudge]
        if not types:
            self.waitjudge = waitjudge_http
        elif types == 1:
            self.waitjudge = waitjudge_https
        else:
            self.waitjudge = (waitjudge_http + waitjudge_https)
        echo(1, '-_-_-_-_-_-_-', len(waitjudge), 'Proxies wait to judge -_-_-_-_-_-_-')

    def request_text(self, url):
        ''' request text '''
        req = basic_req(url, 2)
        if req is None:
            echo(0, url)
            if can_retry(url):
                return self.request_text(url)
            else:
                return ''
        else:
            echo(1, url)
            return req.text

    def get_other_proxies(self, url):
        ''' get other proxies '''
        text = self.request_text(url)
        pages = re.findall(r'<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>',
                           '' if text is None else text)
        if not len(pages):
            echo(0, 'Please do not frequently request {}!!!'.format(url))
        else:
            proxies = [
                re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}',
                           self.request_text(ii))
                for ii in pages
            ]
            self.waitjudge = [*self.waitjudge, *sum(proxies, [])]

    def load_proxies_test(self):
        ''' load mode & test proxies '''
        start = time.time()
        self.load_proxies_list()
        proxies_len = len(self.waitjudge)
        self.threadjude()
        canuse_len = len(self.canuse_proxies)
        echo(1, '\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {:.2f}s\n'.format(
            proxies_len, canuse_len, time.time() - start))
        with open('{}canuse_proxies.txt'.format(data_dir), 'w') as f:
            f.write('\n'.join(self.canuse_proxies))
class find_location(object):
    """ find location """

    def __init__(self):
        self.Db = Db("china_regions")
        china = pd.read_csv('news/china_city_list.csv', encoding='gbk')
        self.province = list(china.groupby(by=['Province']).count().axes[0])
        self.city = list(china.groupby(by=['City']).count().axes[0])
        self.filelists = ['google_steal.txt', 'google_posion.txt', 'bjh',
                          'bjh_detail', 'bjh_detail_poison',
                          'news_steal.txt', 'news_poison.txt']
        self.city_province = {}
        self.province_map = {}
        self.pre_data()
        for index, row in china.iterrows():
            self.city_province[row['City']] = row['Province']

    def search_location(self):
        word = ''
        count = 0
        for file in self.filelists:
            temp_word_list = codecs.open(file, 'r', encoding='utf-8').readlines()
            count += len(temp_word_list)
            word += " ".join(temp_word_list)
        # return word
        print(count)
        word_province = {}
        word_city = {}
        word_city_pro = {}
        for index in self.province:
            temp_num = word.count(index)
            if temp_num:
                word_province[index] = temp_num
        for index in self.city:
            temp_num = word.count(index)
            if temp_num:
                word_city[index] = temp_num
        for index in word_city:
            province = self.city_province[index]
            if province in word_city_pro:
                word_city_pro[province] += word_city[index]
            else:
                word_city_pro[province] = word_city[index]
        print(sum(word_province.values()), sum(word_city.values()),
              sum(word_city_pro.values()))
        return word_province, word_city, word_city_pro

    def participles_word(self):
        """ participles word """
        version = begin_time()
        for file in self.filelists:
            pkuseg.test(file, file[:-4] + '_pkuseg.txt',
                        model_name='../Model_retrieval/pkuseg', nthread=20)
        end_time(version)

    def pre_data(self):
        """ load city key-value from mysql """
        province = self.Db.select_db('select * from china_regions where level=1')
        self.province_map = {
            int(index[2]): index[3][:3] if len(index[3]) == 4 or len(index[3]) == 6 else index[3][:2]
            for index in province}
        city = self.Db.select_db('select * from china_regions where level=2')
        city_state = [index for index in city if index[3][-1:] == '州']
        seg = pkuseg.pkuseg()
        city_state = {
            seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1
            else seg.cut(index[3])[0] + seg.cut(index[3])[1]: int(index[1])
            for index in city if index[3][-1:] == '州'}
        seg = pkuseg.pkuseg(model_name='../Model_retrieval/pkuseg')
        city_state1 = {
            seg.cut(index)[0] if len(seg.cut(index)[0]) > 1
            else seg.cut(index)[0] + seg.cut(index)[1]: city_state[index]
            for index in city_state}
        city_area = {index[3][:-2]: int(index[1])
                     for index in city if '地区' in index[3]}
        city_other = {index[3][:-1]: int(index[1])
                      for index in city if index[3][-1:] == '市' or index[3][-1:] == '盟'}
        self.city_province = {**city_state1, **city_area, **city_other}
        self.city_province = {
            index: self.province_map[self.city_province[index]]
            for index in self.city_province}
        county = self.Db.select_db('select * from china_regions where level=3')
        county_area_pre = {index for index in county if index[3][-1] == '区'}
        county_area_two = {
            index[3][:-2]: int(index[1][:2]) for index in county_area_pre
            if len(index[3]) > 3 and (index[3][-2] == '矿' or index[3][-2] == '林')}
        # print('芒' in county_area_two, 'two')
        county_area_state = {
            seg.cut(index[3][:-2])[0]: int(index[1][:2])
            for index in county_area_pre if len(index[3]) > 2 and index[3][-2] == '族'}
        # print('芒' in county_area_state, 'state')
        county_area_other = {
            index[3][:-1]: int(index[1][:2]) for index in county_area_pre
            if len(index[3]) > 2 and index[3][-2] != '族' and index[3][-2] != '林' and index[3][-2] != '矿'}
        # print('芒' in county_area_other, 'other')
        county_county_pre = {index for index in county if index[3][-1] == '县'}
        county_county_two = {
            index[3]: int(index[1][:2])
            for index in county_county_pre if len(index[3]) == 2}
        # print('芒' in county_county_two, 'two')
        seg = pkuseg.pkuseg()
        county_county_state = {
            seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1
            else seg.cut(index[3])[0] + seg.cut(index[3])[1]: int(index[1][:2])
            for index in county_county_pre
            if len(index[3]) > 2 and index[3][-3:-1] == '自治'}
        county_county_state = {
            index[:-2] if '族' in index and len(index) > 3 else index: county_county_state[index]
            for index in county_county_state}
        # print('芒' in county_county_state, 'state')
        county_county_other = {
            index[3][:-1]: int(index[1][:2])
            for index in county_county_pre
            if index[3][-3:-1] != '自治' and len(index[3]) > 2}
        # print('芒' in county_county_other, 'other')
        county_city = {
            index[3][:-1] if len(index[3]) > 2 else index[3]: int(index[1][:2])
            for index in county if index[3][-1] == '市'}
        # print('芒' in county_city, 'city')
        county_domain = {index[3][:4]: int(index[1][:2])
                         for index in county if index[3][-1] == '域'}
        # print('芒' in county_domain, 'domain')
        county_other = {index[3]: int(index[1][:2])
                        for index in county if index[3][-1] == '盟' or index[3][-1] == '岛'}
        # print('芒' in county_other, 'other')
        county_province = {**county_area_two, **county_area_state, **county_area_other,
                           **county_county_two, **county_county_state, **county_county_other,
                           **county_city, **county_domain, **county_other}
        county_province = {
            index: self.province_map[county_province[index]]
            for index in county_province}
        self.city_province = {**self.city_province, **county_province}
        print({index for index in self.city_province if len(index) == 1})

    def test_province(self, maps, words):
        word_city = {}
        for index in maps:
            temp_num = words.count(index)
            province = maps[index]
            if temp_num:
                if province in word_city:
                    word_city[province] += temp_num
                else:
                    word_city[province] = temp_num
        print(sum(word_city.values()))
        return word_city
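
# Small illustration of how the city -> province map built by pre_data() is
# meant to be consumed by test_province(): count how often each mapped key
# appears in a blob of text and aggregate the hits per province. The toy map
# and text below are invented for the example; real keys come from the
# china_regions table.
def count_by_province(maps: dict, words: str) -> dict:
    hits = {}
    for name, province in maps.items():
        num = words.count(name)
        if num:
            hits[province] = hits.get(province, 0) + num
    return hits

# count_by_province({'杭州': '浙江', '苏州': '江苏'}, '杭州和苏州都在长三角, 杭州靠西湖')
# -> {'浙江': 2, '江苏': 1}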
class TitleViews(object): """ update title views """ def __init__(self): self.Db = Db("blog") self.local_views = {} self.title_map = {} self.title2slug = {} self.failured_map = {} self.zhihu_views = {} self.zhihu_id = {} self.jianshu_views = {} self.jianshu_id = {} self.csdn_views = {} self.csdn_id = {} self.exist_data = {} self.getTitleMap() self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s''' self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s''' self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s''' def loadLocalView(self): """ load local view """ if not os.path.exists("%sgoogle" % data_dir): return with codecs.open("%sgoogle" % data_dir, 'r', encoding='utf-8') as f: test = f.readlines() test = test[7:] for index in test: arr = index.split(',') slug = self.matchSlug(arr[0]) if slug is None or slug not in self.title_map: continue print(slug + ' ' + str(arr[1]) + ' ' + arr[0]) if slug in self.local_views: self.local_views[slug] += int(arr[1]) else: self.local_views[slug] = int(arr[1]) def getTitleMap(self): """ get title map """ if os.path.exists('%sslug' % data_dir): with codecs.open('%sslug' % data_dir, 'r', encoding='utf-8') as f: slug = f.readlines() else: slug = [] if os.path.exists('%stitle' % data_dir): with codecs.open('%stitle' % data_dir, 'r', encoding='utf-8') as f: title = f.readlines() else: title = [] self.title_map = {tempslug.split( '"')[1]: title[num].split('"')[1] for num, tempslug in enumerate(slug)} title2slug = { self.title_map[index]: index for index in self.title_map.keys()} noemoji_title = {self.filter_emoji( self.title_map[index]).replace('\u200d', ''): index for index in self.title_map.keys()} self.title2slug = {**noemoji_title, **title2slug} def matchSlug(self, pattern): """ match slug """ arr = re.search(r'\/([^\/]+).html', pattern) return None if arr is None else arr.group(1) def getZhihuView(self): if os.path.exists('%scookie' % data_dir): with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f: cookie = f.readline() else: cookie = ' ' changeCookie(cookie[:-1]) url_basic = [ 'https://www.zhihu.com/api/v4/creator/content_statistics/', 'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=', datetime.datetime.now().strftime("%Y-%m-%d"), '&page_no=' ] url = "".join(url_basic) json = self.get_request(url + '1', 1) if not json: return if not 'data' in json: if 'code' in json: print(json) return for index in json['data']: zhihu_title = index['title'] zhihu_id = int(index['url_token']) zhihu_count = int(index['read_count']) if zhihu_title in self.title2slug: temp_slug = self.title2slug[zhihu_title] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count elif zhihu_id in self.zhihu_id_map: temp_slug = self.zhihu_id_map[zhihu_id] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count else: print(index['title']) for index in range(json['count'] // 10): print('zhihu', index) json = self.get_request(url + str(index + 2), 1) if not json: continue for index in json['data']: zhihu_title = index['title'] zhihu_id = int(index['url_token']) zhihu_count = int(index['read_count']) if zhihu_title in self.title2slug: temp_slug = self.title2slug[zhihu_title] self.zhihu_id[temp_slug] = zhihu_id 
self.zhihu_views[temp_slug] = zhihu_count elif zhihu_id in self.zhihu_id_map: temp_slug = self.zhihu_id_map[zhihu_id] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count else: print(index['title']) def get_request(self, url, types): result = basic_req(url, 1) if not result: if can_retry(url): self.get_request(url, types) return return result def get_request_v2(self, url, types, header): result = proxy_req(url, 0, header=header) if not result or not len(result.find_all('div', class_='content')): if can_retry(url): self.get_request_v2(url, types, header) return return result def get_request_v3(self, url, types): result = basic_req(url, 0) if result is None or not result or not len(result.find_all('p', class_='content')): if can_retry(url): self.get_request_v3(url, types) return return result def getJianshuViews(self): """ get jianshu views """ header = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'no-cache', 'pragma': 'no-cache', 'sec-ch-ua': 'Google Chrome 75', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'cross-site', 'sec-fetch-user': '******', 'sec-origin-policy': '0', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3736.0 Safari/537.36' } basic_url = 'https://www.jianshu.com/u/2e0f69e4a4f0' for rounds in range(1, 4): url = basic_url if rounds == 1 else basic_url + \ '?order_by=shared_at&page=' + str(rounds) print(url) html = self.get_request_v2(url, 0, header) if html is None: print('None') return for index in html.find_all('li', class_=["", 'have-img']): if len(index.find_all('i')) < 3: continue title = index.find_all('a', class_='title')[ 0].text.replace('`', '') jianshu_id = int(index['data-note-id']) jianshu_count = int(index.find_all('a')[-2].text) if title in self.title2slug: temp_slug = self.title2slug[title] self.jianshu_id[temp_slug] = jianshu_id self.jianshu_views[temp_slug] = jianshu_count elif jianshu_id in self.jianshu_id_map: temp_slug = self.jianshu_id_map[jianshu_id] self.jianshu_id[temp_slug] = jianshu_id self.jianshu_views[temp_slug] = jianshu_count else: print(title) def getCsdnViews(self): """ get csdn views """ basic_url = "https://blog.csdn.net/iofu728" for index in range(1, 3): url = basic_url if index == 1 else basic_url + \ '/article/list/' + str(index) + '?' 
html = self.get_request_v3(url, 0) if html is None: print('None') return for div_lists in html.find_all('div', class_='article-item-box csdn-tracking-statistics'): if 'style' in div_lists.attrs: continue csdn_id = int(div_lists['data-articleid']) title = div_lists.a.contents[2].replace( '\n', '').strip().replace('`', '') csdn_count = int(div_lists.find_all( 'span', class_='read-num')[0].span.text) if title in self.title2slug: temp_slug = self.title2slug[title] self.csdn_id[temp_slug] = csdn_id self.csdn_views[temp_slug] = csdn_count elif csdn_id in self.csdn_id_map: temp_slug = self.csdn_id_map[csdn_id] self.csdn_id[temp_slug] = csdn_id self.csdn_views[temp_slug] = csdn_count else: print(title) def filter_emoji(self, desstr, restr=''): ''' filter emoji ''' desstr = str(desstr) try: co = re.compile(u'[\U00010000-\U0010ffff]') except re.error: co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') return co.sub(restr, desstr) def init_db(self): self.loadLocalView() self.getZhihuView() self.getJianshuViews() self.getCsdnViews() insert_list = [] for index in self.title_map.keys(): insert_list.append((index, self.local_views[index] if index in self.local_views else 0, self.zhihu_views[index] if index in self.zhihu_views else 0, self.csdn_views[index] if index in self.csdn_views else 0, self.jianshu_views[index] if index in self.jianshu_views else 0, self.zhihu_id[index] if index in self.zhihu_id else 0, self.csdn_id[index] if index in self.csdn_id else 0, self.jianshu_id[index] if index in self.jianshu_id else 0)) # return insert_list results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1]) if results: if len(insert_list): print('Insert ' + str(len(insert_list)) + ' Success!') else: pass def select_all(self): result = self.Db.select_db( "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0") if result == False: print("SELECT Error!") else: self.exist_data = {index[1]: list(index) for index in result} self.zhihu_id_map = {index[6]: index[1] for index in result if index[6]} self.csdn_id_map = {index[7]: index[1] for index in result if index[7]} self.jianshu_id_map = {index[8]: index[1] for index in result if index[8]} for index in self.exist_data: self.exist_data[index][-1] = self.exist_data[index][-1].strftime( '%Y-%m-%d %H:%M:%S') def update_view(self): changeHtmlTimeout(10) wait_map = {} self.select_all() self.getZhihuView() self.getJianshuViews() self.getCsdnViews() for index in self.zhihu_views.keys(): if self.zhihu_views[index] == self.exist_data[index][3] and self.zhihu_id[index] == self.exist_data[index][6]: continue wait_map[index] = self.exist_data[index] wait_map[index][3] = self.zhihu_views[index] wait_map[index][6] = self.zhihu_id[index] for index in self.csdn_views.keys(): if self.csdn_views[index] == self.exist_data[index][4] and self.csdn_id[index] == self.exist_data[index][7]: continue if index not in wait_map: wait_map[index] = self.exist_data[index] wait_map[index][4] = self.csdn_views[index] wait_map[index][7] = self.csdn_id[index] for index in self.jianshu_views.keys(): if self.jianshu_views[index] == self.exist_data[index][5] and self.jianshu_id[index] == self.exist_data[index][8]: continue wait_map[index] = self.exist_data[index] wait_map[index][5] = self.jianshu_views[index] wait_map[index][8] = self.jianshu_id[index] update_list = [tuple(index) for index in wait_map.values()] # return update_list:q if not len(update_list): return results = 
self.Db.update_db(self.update_sql % str(update_list)[1:-1])
        if results:
            if len(update_list):
                print('Update ' + str(len(update_list)) + ' Success!')
        else:
            pass

    def new_day(self):
        day_data = self.Db.select_db(
            "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1")
        if not os.path.exists('../blog/log/basic'):
            print('File not exist!!!')
            return
        with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f:
            existed_spider = int(f.readlines()[1])
        today_date = datetime.datetime.now().strftime('%Y-%m-%d')
        new_day_list = [(today_date, day_data[0][0] + day_data[0][1], existed_spider)]
        results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1])
        if results:
            if len(new_day_list):
                print('New day update ' + str(len(new_day_list)) + ' Success!')
        else:
            pass
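The title-to-slug mapping above hinges on two small helpers: a regex that strips emoji (plus the zero-width joiner) so titles copied from different platforms compare equal, and a slug extractor for the analytics paths. A minimal standalone sketch of that normalization; the sample title and slug are made up for illustration:

import re

def filter_emoji(text: str, repl: str = '') -> str:
    ''' drop astral-plane characters (emoji) so titles match across platforms '''
    try:
        pattern = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:  # narrow-build fallback, as in the class above
        pattern = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return pattern.sub(repl, text)

def match_slug(path: str):
    ''' pull the slug out of a ".../<slug>.html" analytics path '''
    arr = re.search(r'\/([^\/]+).html', path)
    return None if arr is None else arr.group(1)

# hypothetical sample data, not from the real blog
title_map = {'spider-notes': 'Spider Notes \U0001F40D'}
title2slug = {filter_emoji(t).replace('\u200d', ''): s for s, t in title_map.items()}
print(match_slug('/2019/03/spider-notes.html'))   # -> 'spider-notes'
print(title2slug.get('Spider Notes '))            # -> 'spider-notes'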
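Both versions of the class build their INSERT/REPLACE statements by formatting a Python list of tuples straight into the `VALUES %s` placeholder with `str(rows)[1:-1]`. A small sketch of what that produces, with hypothetical rows and no database required; for untrusted input a parameterized query would be the safer choice:

insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`,
    `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''

# hypothetical rows in the same column order as insert_sql
rows = [('slug-a', 10, 3, 0, 0, 123, 0, 0),
        ('slug-b', 42, 0, 7, 1, 0, 456, 789)]

# str(rows) -> "[('slug-a', 10, ...), ('slug-b', ...)]"; [1:-1] drops the brackets,
# leaving a comma-separated tuple list that MySQL accepts as VALUES (...), (...)
print(insert_sql % str(rows)[1:-1])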
class TitleViews(object): ''' script of load my blog data -> analysis ''' CSDN_URL = 'https://blog.csdn.net/iofu728' JIANSHU_URL = 'https://www.jianshu.com/u/2e0f69e4a4f0' ZHIHU_URL = 'https://www.zhihu.com/api/v4/creator/content_statistics/' def __init__(self): self.Db = Db("blog") self.local_views = {} self.title_map = {} self.title2slug = {} self.zhihu_views = {} self.zhihu_id = {} self.jianshu_views = {} self.jianshu_id = {} self.csdn_views = {} self.csdn_id = {} self.exist_data = {} self.getTitleMap() self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s''' self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s''' self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s''' def loadLocalView(self): ''' load local view ''' test = read_file('{}google'.format(data_dir))[7:] for index in test: arr = index.split(',') slug = self.matchSlug(arr[0]) if slug is None or slug not in self.title_map: continue print(slug + ' ' + str(arr[1]) + ' ' + arr[0]) if slug in self.local_views: self.local_views[slug] += int(arr[1]) else: self.local_views[slug] = int(arr[1]) def getTitleMap(self): ''' get title map ''' slug = read_file('{}slug'.format(data_dir)) title = read_file('{}title'.format(data_dir)) self.title_map = { tempslug.split('"')[1]: title[num].split('"')[1] for num, tempslug in enumerate(slug) } title2slug = { self.title_map[index]: index for index in self.title_map.keys() } noemoji_title = { self.filter_emoji(self.title_map[index]).replace('\u200d', ''): index for index in self.title_map.keys() } self.title2slug = {**noemoji_title, **title2slug} def matchSlug(self, pattern: str): ''' match slug ''' arr = re.search(r'\/([^\/]+).html', pattern) return None if arr is None else arr.group(1) def getZhihuView(self): cookie = ''.join(read_file('{}cookie'.format(data_dir))) changeCookie(cookie) url_basic = [ self.ZHIHU_URL, 'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=', datetime.datetime.now().strftime("%Y-%m-%d"), '&page_no=' ] url = ''.join(url_basic) json = self.get_request('{}{}'.format(url, 1), 1, lambda i: not i) if not json: return if not 'data' in json: if 'code' in json: echo('0|warning', json) return echo(3, 'zhihu', json) for index in json['data']: zhihu_title = index['title'] zhihu_id = int(index['url_token']) zhihu_count = int(index['read_count']) if zhihu_title in self.title2slug: temp_slug = self.title2slug[zhihu_title] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count elif zhihu_id in self.zhihu_id_map: temp_slug = self.zhihu_id_map[zhihu_id] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count else: echo('0|debug', index['title']) for index in range(1, json['count'] // 10): echo(1, 'zhihu', index) json = self.get_request('{}{}'.format(url, 1 + index), 1, lambda i: not i) echo(2, 'zhihu', json) if not json: continue for index in json['data']: zhihu_title = index['title'] zhihu_id = int(index['url_token']) zhihu_count = int(index['read_count']) if zhihu_title in self.title2slug: temp_slug = self.title2slug[zhihu_title] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count elif zhihu_id in self.zhihu_id_map: temp_slug = self.zhihu_id_map[zhihu_id] self.zhihu_id[temp_slug] = zhihu_id 
self.zhihu_views[temp_slug] = zhihu_count else: echo('0|debug', index['title']) def get_request(self, url: str, types: int, functs, header: dict = {}): if len(header): req = basic_req(url, types, header=header) else: req = basic_req(url, types) if functs(req): if can_retry(url): self.get_request(url, types, functs, header) return return req def getJianshuViews(self): ''' get jianshu views ''' header = {'accept': get_accept('html')} for rounds in range(1, 4): url = self.JIANSHU_URL if rounds > 1: url += '?order_by=shared_at&page={}'.format(rounds) echo('1|debug', 'jianshu req url:', url) html = self.get_request( url, 0, lambda i: not i or not len( i.find_all('div', class_='content')), header) if html is None: echo(0, 'None') return for index in html.find_all('li', class_=["", 'have-img']): if len(index.find_all('i')) < 3: continue title = index.find_all('a', class_='title')[0].text.replace( '`', '') jianshu_id = int(index['data-note-id']) jianshu_count = int(index.find_all('a')[-2].text) if title in self.title2slug: temp_slug = self.title2slug[title] self.jianshu_id[temp_slug] = jianshu_id self.jianshu_views[temp_slug] = jianshu_count elif jianshu_id in self.jianshu_id_map: temp_slug = self.jianshu_id_map[jianshu_id] self.jianshu_id[temp_slug] = jianshu_id self.jianshu_views[temp_slug] = jianshu_count else: echo(1, title) def getCsdnViews(self): ''' get csdn views ''' for index in range(1, 3): url = self.CSDN_URL if index > 1: url += '/article/list/{}?'.format(index) echo(1, 'csdn url', url) html = self.get_request( url, 0, lambda i: i is None or not i or not len( i.find_all('p', class_='content'))) if html is None: echo(0, 'None') return for div_lists in html.find_all( 'div', class_='article-item-box csdn-tracking-statistics'): if 'style' in div_lists.attrs: continue csdn_id = int(div_lists['data-articleid']) title = div_lists.a.contents[2].replace('\n', '').strip().replace( '`', '') csdn_count = int( div_lists.find_all('span', class_='read-num')[0].span.text) if title in self.title2slug: temp_slug = self.title2slug[title] self.csdn_id[temp_slug] = csdn_id self.csdn_views[temp_slug] = csdn_count elif csdn_id in self.csdn_id_map: temp_slug = self.csdn_id_map[csdn_id] self.csdn_id[temp_slug] = csdn_id self.csdn_views[temp_slug] = csdn_count else: echo(1, title) def filter_emoji(self, desstr, restr=''): ''' filter emoji ''' desstr = str(desstr) try: co = re.compile(u'[\U00010000-\U0010ffff]') except re.error: co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') return co.sub(restr, desstr) def init_db(self): self.loadLocalView() self.getZhihuView() self.getJianshuViews() self.getCsdnViews() insert_list = [] for index in self.title_map.keys(): insert_list.append( (index, self.local_views[index] if index in self.local_views else 0, self.zhihu_views[index] if index in self.zhihu_views else 0, self.csdn_views[index] if index in self.csdn_views else 0, self.jianshu_views[index] if index in self.jianshu_views else 0, self.zhihu_id[index] if index in self.zhihu_id else 0, self.csdn_id[index] if index in self.csdn_id else 0, self.jianshu_id[index] if index in self.jianshu_id else 0)) # return insert_list results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1]) if results: if len(insert_list): print('Insert ' + str(len(insert_list)) + ' Success!') else: pass def select_all(self): result = self.Db.select_db( "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0" ) if 
result == False: print("SELECT Error!") else: self.exist_data = {index[1]: list(index) for index in result} self.zhihu_id_map = { index[6]: index[1] for index in result if index[6] } self.csdn_id_map = { index[7]: index[1] for index in result if index[7] } self.jianshu_id_map = { index[8]: index[1] for index in result if index[8] } for index in self.exist_data: self.exist_data[index][-1] = self.exist_data[index][ -1].strftime('%Y-%m-%d %H:%M:%S') def update_view(self): changeHtmlTimeout(10) wait_map = {} self.select_all() self.getZhihuView() self.getJianshuViews() self.getCsdnViews() for index in self.zhihu_views.keys(): if self.zhihu_views[index] == self.exist_data[index][ 3] and self.zhihu_id[index] == self.exist_data[index][6]: continue wait_map[index] = self.exist_data[index] wait_map[index][3] = self.zhihu_views[index] wait_map[index][6] = self.zhihu_id[index] for index in self.csdn_views.keys(): if self.csdn_views[index] == self.exist_data[index][ 4] and self.csdn_id[index] == self.exist_data[index][7]: continue if index not in wait_map: wait_map[index] = self.exist_data[index] wait_map[index][4] = self.csdn_views[index] wait_map[index][7] = self.csdn_id[index] for index in self.jianshu_views.keys(): if self.jianshu_views[index] == self.exist_data[index][ 5] and self.jianshu_id[index] == self.exist_data[index][8]: continue wait_map[index] = self.exist_data[index] wait_map[index][5] = self.jianshu_views[index] wait_map[index][8] = self.jianshu_id[index] update_list = [tuple(index) for index in wait_map.values()] # return update_list:q if not len(update_list): return results = self.Db.update_db(self.update_sql % str(update_list)[1:-1]) if results: if len(update_list): print('Update ' + str(len(update_list)) + ' Success!') else: pass def new_day(self): day_data = self.Db.select_db( "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1" ) if not os.path.exists('../blog/log/basic'): print('File not exist!!!') return with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f: existed_spider = int(f.readlines()[1]) today_date = datetime.datetime.now().strftime('%Y-%m-%d') new_day_list = [(today_date, day_data[0][0] + day_data[0][1], existed_spider)] results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1]) if results: if len(new_day_list): print('New day update' + str(len(new_day_list)) + ' Success!') else: pass def load_csdn_img(self): ''' load csdn img ''' mkdir(data_dir) urls = ['/article/list/2?', ''] article_ids = [] for url in urls: req = basic_req('{}{}'.format(self.CSDN_URL, url), 3) article_ids.extend(re.findall('data-articleid="(\w*?)"', req)) echo(0, article_ids) article_thread = [ threading.Thread(target=self.load_csdn_img_batch, args=(ii, )) for ii in article_ids ] for work in article_thread: work.start() for work in article_thread: work.join() def load_csdn_img_batch(self, article_id: int): url = '{}/article/details/{}'.format(self.CSDN_URL, article_id) req = proxy_req(url, 3) if not 'iofu728' in req: if can_retry(url): self.load_csdn_img_batch(article_id) return img_lists = re.findall('"(https://cdn.nlark.com.*)" alt', req) img_thread = [ threading.Thread(target=self.load_csdn_img_load, args=(jj, article_id, ii)) for ii, jj in enumerate(img_lists) ] echo(1, 'Article Need Load {} Img...'.format(len(img_lists))) for work in img_thread: work.start() for work in img_thread: work.join() def load_csdn_img_load(self, img_url: str, article_id: int, idx: int): img_dir = '{}{}/'.format(data_dir, article_id) img_path = 
'{}{}.png'.format(img_dir, idx)
        if os.path.exists(img_path):
            return
        req = proxy_req(img_url, 2)
        if type(req) == bool or req is None:
            if can_retry(img_url):
                self.load_csdn_img_load(img_url, article_id, idx)
            return
        mkdir(img_dir)
        with open(img_path, 'wb') as f:
            f.write(req.content)
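get_request in both versions of the class follows the same shape: issue the request through a basic/proxy wrapper, test the response with a validity predicate, and retry while a per-URL retry budget (can_retry) allows it. A self-contained sketch of that shape using requests and a simple counter in place of the project's basic_req/can_retry helpers; all names here are illustrative. Note the sketch returns the recursive call's value, whereas the versions above discard it and fall through to return None after retrying:

import requests
from collections import defaultdict

_fail_count = defaultdict(int)   # stand-in for the project's can_retry bookkeeping
MAX_RETRY = 3

def can_retry(url: str) -> bool:
    _fail_count[url] += 1
    return _fail_count[url] <= MAX_RETRY

def get_request(url: str, is_invalid=lambda r: r is None):
    ''' fetch url, retry while the validity predicate rejects the response '''
    try:
        resp = requests.get(url, timeout=8)
    except requests.RequestException:
        resp = None
    if is_invalid(resp):
        if can_retry(url):
            return get_request(url, is_invalid)
        return None
    return resp

# e.g. treat anything but HTTP 200 as invalid
resp = get_request('https://httpbin.org/status/200',
                   lambda r: r is None or r.status_code != 200)
print(resp is not None and resp.status_code)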
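load_csdn_img fans out one threading.Thread per article and each article fans out one thread per image, with a start/join barrier at each level. A reduced sketch of the inner fan-out with requests in place of proxy_req and the retry logic omitted; URLs and the directory layout are placeholders:

import os
import threading
import requests

def load_img(img_url: str, img_dir: str, idx: int):
    ''' download one image to <img_dir><idx>.png, skipping files that already exist '''
    img_path = '{}{}.png'.format(img_dir, idx)
    if os.path.exists(img_path):
        return
    try:
        req = requests.get(img_url, timeout=10)
    except requests.RequestException:
        return
    if req.status_code != 200:
        return
    os.makedirs(img_dir, exist_ok=True)
    with open(img_path, 'wb') as f:
        f.write(req.content)

def load_imgs(img_urls: list, article_id: str):
    img_dir = 'data/{}/'.format(article_id)      # placeholder directory layout
    workers = [threading.Thread(target=load_img, args=(url, img_dir, ii))
               for ii, url in enumerate(img_urls)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

load_imgs(['https://example.com/a.png', 'https://example.com/b.png'], 'demo-article')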
class ActivateArticle(TBK): """ activate article in youdao Cloud""" Y_URL = "https://note.youdao.com/" WEB_URL = f"{Y_URL}web/" API_P_URL = f"{Y_URL}yws/api/personal/" SYNC_URL = f"{API_P_URL}sync?method=%s&keyfrom=web&cstk=%s" NOTE_URL = f"{Y_URL}yws/public/note/%s?editorType=0" SHARE_URL = f"{Y_URL}ynoteshare1/index.html?id=%s&type=note" GET_SHARE_URL = f"{API_P_URL}share?method=get&shareKey=%s" LISTRECENT_URL = ( f"{API_P_URL}file?method=listRecent&offset=%d&limit=30&keyfrom=web&cstk=%s" ) MYSHARE_URL = ( f"{API_P_URL}myshare?method=get&checkBan=true&entryId=%s&keyfrom=web&cstk=%s" ) DECODER_TPWD_URL = "https://api.taokouling.com/tkl/tkljm?apikey=%s&tkl=¥%s¥" Y_DOC_JS_URL = "https://shared-https.ydstatic.com/ynote/ydoc/index-6f5231c139.js" MTOP_URL = "https://h5api.m.taobao.com/h5/%s/%d.0/" ITEM_URL = "https://item.taobao.com/item.htm?id=%d" DETAIL_URL = 'https://detail.m.tmall.com/item.htm?id=%d' S_LIST_SQL = "SELECT `id`, article_id, title, q, created_at from article;" I_LIST_SQL = "INSERT INTO article (article_id, title, q) VALUES %s;" R_LIST_SQL = "REPLACE INTO article (`id`, article_id, title, q, is_deleted, created_at) VALUES %s;" S_ARTICLE_SQL = 'SELECT `id`, article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at, created_at from article_tpwd WHERE `article_id` = "%s";' I_ARTICLE_SQL = "INSERT INTO article_tpwd (article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at) VALUES %s;" R_ARTICLE_SQL = "REPLACE INTO article_tpwd (`id`, article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at, created_at, is_deleted) VALUES %s;" END_TEXT = "</text><inline-styles/><styles/></para></body></note>" TPWD_REG = "\p{Sc}(\w{8,12}?)\p{Sc}" TPWD_REG2 = "(\p{Sc}\w{8,12}\p{Sc})" JSON_KEYS = ["p", "ct", "su", "pr", "au","pv","mt","sz","domain", "tl","content", ] URL_DOMAIN = { 0: "s.click.taobao.com", 1: "item.taobao.com", 2: "detail.tmall.com", 5: "uland.taobao.com", 10: "taoquan.taobao.com", 11: "a.m.taobao.com", 15: "empty", 16: "failure", } NEED_KEY = ["content", "url", "validDate", "picUrl"] ONE_HOURS = 3600 ONE_DAY = 24 M = "_m_h5_tk" ZERO_STAMP = "0天0小时0分0秒" T_FORMAT = "%m-%d %H:%M" BASIC_STAMP = ( time_stamp(time_format="%d天%H小时%M分%S秒", time_str="1天0小时0分0秒") - ONE_DAY * ONE_HOURS ) def __init__(self): super(ActivateArticle, self).__init__() self.Db = Db("tbk") self.Db.create_table(os.path.join(root_dir, "tpwd.sql")) self.Db.create_table(os.path.join(root_dir, "article.sql")) self.tpwd_map = {} self.tpwd_db_map = {} self.tpwds = {} self.cookies = {} self.share2article = {} self.article_list = {} self.list_recent = {} self.idx = [] self.empty_content = "" self.tpwd_exec = ThreadPoolExecutor(max_workers=20) self.need_del = {} self.get_share_list() def load_process(self): self.load_ids() if len(self.idx) < 30: time.sleep(np.random.rand() * 30 + 6) self.load_ids() self.load_article_list() # self.update_tpwd() self.get_m_h5_tk() self.get_ynote_file() self.get_ynote_file(1) def load_ids(self): changeJsonTimeout(5) req = self.basic_youdao(self.home_id) if req == "": echo("0|error", "Get The Home Page Info Error!!! Please retry->->->") return self.idx = regex.findall("id=(\w*?)<", req) if len(self.idx) < 30: echo("0|error", "The Num of id is error!! 
Please check it.") else: echo(1, "Load Article List {} items.".format(len(self.idx))) def get_share_info(self, share_id: str): changeJsonTimeout(4) url = self.GET_SHARE_URL % share_id headers = self.get_tb_headers(self.Y_URL) req = basic_req(url, 1, header=headers) if req is None: return info = req["entry"] self.share2article[share_id] = (info["name"].replace('.note', ''), info["id"], info["lastUpdateTime"]) return req def basic_youdao(self, idx: str, use_proxy: bool = True): url = self.NOTE_URL % idx refer_url = self.SHARE_URL % idx headers = { "Accept": "*/*", "Referer": refer_url, "X-Requested-With": "XMLHttpRequest", } req_req = proxy_req if use_proxy else basic_req req = req_req(url, 1, header=headers, config={'timeout': 8}) if req is None or list(req.keys()) != self.JSON_KEYS: if can_retry(url): echo(2, "retry") return self.basic_youdao(idx) else: echo(1, "retry upper time") return "" return req["content"] def load_article_pipeline(self, mode: int = 0): article_exec = ThreadPoolExecutor(max_workers=5) a_list = [article_exec.submit(self.load_article, ii, mode) for ii in self.idx] list(as_completed(a_list)) self.load_list2db() def load_article(self, article_id: str, mode: int = 0, is_load2db: bool = True): if mode: self.get_share_info(article_id) self.load_list2db() return if article_id not in self.tpwds: article = self.basic_youdao(article_id) tpwds = list({ii: 0 for ii in regex.findall(self.TPWD_REG, article)}) self.tpwds[article_id] = tpwds else: tpwds = self.tpwds[article_id] if article_id not in self.tpwd_map: self.tpwd_map[article_id] = {} time = 0 au_list = [] no_type = [ ii for ii, jj in self.tpwd_map[article_id].items() if "type" not in jj or jj["item_id"] is None ] while ( len(self.tpwd_map[article_id]) < len(tpwds) or (len(no_type) and not time) ) and time < 5: thread_list = [ii for ii in tpwds if not ii in self.tpwd_map[article_id]] echo(1, article_id, "tpwds len:", len(tpwds), "need load", len(thread_list)) thread_list = [ self.tpwd_exec.submit(self.decoder_tpwd_once, article_id, ii) for ii in thread_list ] list(as_completed(thread_list)) no_type = [ ii for ii, jj in self.tpwd_map[article_id].items() if "type" not in jj or jj["item_id"] is None ] au_list.extend( [ self.tpwd_exec.submit(self.decoder_tpwd_url, article_id, ii) for ii in no_type ] ) time += 1 list(as_completed(au_list)) no_title = [ ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj ] time = 0 while len(no_title) and time < 5: title_list = [ self.tpwd_exec.submit(self.get_item_title, article_id, ii) for ii in no_title ] echo(1, article_id, "need get title:", len(title_list)) list(as_completed(title_list)) time += 1 no_title = [ ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj ] if is_load2db: self.load_article2db(article_id) def update_title(self, article_id: str): self.tpwd_map[article_id] = { ii[3]: {"content": ii[1], "item_id": ii[0]} for ii in self.article_list[article_id].values() } no_title = [ ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj ] time = 0 while len(no_title) and time < 5: title_list = [ self.tpwd_exec.submit(self.get_item_title, article_id, ii) for ii in no_title ] echo(1, article_id, "need get title:", len(title_list)) list(as_completed(title_list)) time += 1 no_title = [ ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj ] update_num = len( [ 1 for ii, jj in self.tpwd_map[article_id].items() if "title" in jj and jj["content"] != jj["title"] ] ) echo(2, "Update", article_id, update_num, "Title 
Success!!!") self.update_article2db(article_id) def load_list2db(self): t_share_map = self.share2article.copy() share_map = self.get_share_list() insert_list, update_list = [], [] for ii, jj in t_share_map.items(): if ii in share_map: t = share_map[ii] update_list.append((t[0], ii, jj[0], jj[1], 0, t[-1])) else: insert_list.append((ii, jj[0], jj[1])) self.update_db(insert_list, "Insert Article List", 1) self.update_db(update_list, "Update Article List", 1) def get_share_list(self): share_list = self.Db.select_db(self.S_LIST_SQL) share_map = {} for ii, jj in enumerate(share_list): t = jj[-1].strftime("%Y-%m-%d %H:%M:%S") share_map[jj[1]] = (*jj[:-1], t) self.share2article = share_map return share_map def load_article2db(self, article_id: str): m = self.tpwd_map[article_id] m = {ii: jj for ii, jj in m.items() if jj["url"]} tpwds = list(set(self.tpwds[article_id])) data = [ ( article_id, ii, m[jj]["item_id"], jj, m[jj]["type"], m[jj]["content"], m[jj]["url"], 0, "", m[jj]["validDate"], ) for ii, jj in enumerate(tpwds) if jj in m and "item_id" in m[jj] and m[jj]["type"] != 15 ] data_map = {ii[3]: ii for ii in data} update_list, insert_list = [], [] for ii in data: if ii[3] in self.tpwd_db_map[article_id]: t = self.tpwd_db_map[article_id][ii[3]] update_list.append((t[0], *ii, t[-1], 0)) else: insert_list.append(ii) for ii, jj in self.tpwd_db_map[article_id].items(): if ii not in data_map: update_list.append((*jj, 1)) self.update_db(insert_list, f"article_id {article_id} Insert") self.update_db(update_list, f"article_id {article_id} Update") def update_tpwd(self, mode: int = 0, is_renew: bool = True, a_id: str = None): update_num = 0 for article_id, jj in self.article_list.items(): if a_id is not None and article_id != a_id: continue for o_tpwd, (num_iid, title, domain, tpwd, _, _, url) in jj.items(): c = jj[o_tpwd] if ( is_renew and self.URL_DOMAIN[1] not in url and self.URL_DOMAIN[2] not in url and self.URL_DOMAIN[10] not in url ): renew_type = 2 if url in self.URL_DOMAIN[5] else 1 origin_tpwd = self.convert2tpwd(url, title) if origin_tpwd is None: origin_tpwd = tpwd else: renew_type = 0 origin_tpwd = tpwd if num_iid == "" or domain == 16: c = ( *c[:2], 16, origin_tpwd, 1 if renew_type == 0 else 2, *c[-2:], ) else: c = self.generate_tpwd( title, int(num_iid), origin_tpwd, renew_type, c, mode ) self.article_list[article_id][o_tpwd] = c update_num += int(c[2] < 15 or (renew_type and not mode)) echo(2, "Update {} Tpwd Info Success!!".format(update_num)) def generate_tpwd( self, title: str, num_iid: int, renew_tpwd: str, renew_type: int, c: dict, mode: int ): goods = self.get_dg_material(title, num_iid) if goods is None or not len(goods): echo(0, "goods get", 'error' if goods is None else 'empty', ':', title, num_iid) return (*c[:2], 17, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:]) goods = goods[0] if "ysyl_click_url" in goods and len(goods["ysyl_click_url"]): url = goods["ysyl_click_url"] elif "coupon_share_url" in goods and len(goods["coupon_share_url"]): url = goods["coupon_share_url"] else: url = goods["url"] url = "https:{}".format(url) commission_rate = int(goods["commission_rate"]) commission_type = goods["commission_type"] tpwd = self.convert2tpwd(url, title) if tpwd is None: echo(0, "tpwd error:", tpwd) return (*c[:2], 18, renew_tpwd, 1 if renew_type == 0 else 2 * c[-2:]) if mode: return (*c[:3], tpwd, commission_rate, commission_type, c[-1]) if renew_type == 1: return (*c[:3], tpwd, 2, commission_type, c[-1]) return (*c[:3], tpwd, commission_rate, commission_type, c[-1]) def 
load_article_list(self): """ tpwd: [goods_id, goods_name, domain, tpwd, commission_rate, commission_type, url] """ for article_id in self.idx: article_list = self.get_article_db(article_id) self.article_list[article_id] = { ii[4]: [ii[3], ii[6], ii[5], ii[4], ii[8], ii[9], ii[7]] for ii in article_list } self.tpwd_db_map[article_id] = {ii[4]: ii for ii in article_list} have_id = [ii[0] for ii in self.tpwd_db_map[article_id].values()] need_del_id = [ii[0] for ii in article_list if ii[0] not in have_id] self.need_del[article_id] = need_del_id item_num = sum([len(ii) for ii in self.article_list.values()]) echo(1, "Load {} article list from db.".format(item_num)) def get_article_db(self, article_id: str): article_list = list(self.Db.select_db(self.S_ARTICLE_SQL % article_id)) for ii, jj in enumerate(article_list): t = jj[-1].strftime("%Y-%m-%d %H:%M:%S") y = jj[-2].strftime("%Y-%m-%d %H:%M:%S") article_list[ii] = [*jj[:-2], y, t] return article_list def update_db(self, data: list, types: str, mode: int = 0): if not len(data): return if "insert" in types.lower(): basic_sql = self.I_LIST_SQL if mode else self.I_ARTICLE_SQL else: basic_sql = self.R_LIST_SQL if mode else self.R_ARTICLE_SQL i_sql = basic_sql % str(data)[1:-1] insert_re = self.Db.insert_db(i_sql) if insert_re: echo(3, "{} {} info Success".format(types, len(data))) else: echo(0, "{} failed".format(types)) def decoder_tpwd_once(self, article_id: str, tpwd: str, mode: int = 0): req = self.decoder_tpwd(tpwd) if req is None or not len(req): return temp_map = {ii: req[ii] for ii in self.NEED_KEY} if temp_map["validDate"] == self.ZERO_STAMP or "-" in temp_map["validDate"]: temp_map["validDate"] = 1500000000 else: temp_map["validDate"] = ( time_stamp(time_format="%d天%H小时%M分%S秒", time_str=req["validDate"]) - self.BASIC_STAMP + time_stamp() ) temp_map["validDate"] = time_str(temp_map["validDate"]) temp_map["url"] = temp_map["url"].strip() if article_id not in self.tpwd_map: self.tpwd_map[article_id] = {} self.tpwd_map[article_id][tpwd] = temp_map if not mode: self.decoder_tpwd_url(article_id, tpwd) def decoder_tpwd_url(self, article_id: str, tpwd: str): temp_map = self.tpwd_map[article_id][tpwd] tpwd_type, item_id = self.analysis_tpwd_url(temp_map["url"]) if item_id is None: return temp_map["type"] = tpwd_type temp_map["item_id"] = item_id if tpwd_type < 20: echo(2, "Domain:", self.URL_DOMAIN[tpwd_type], "item id:", item_id) self.tpwd_map[article_id][tpwd] = temp_map def analysis_tpwd_url(self, url: str): if self.URL_DOMAIN[5] in url: return 5, self.get_uland_url(url) elif self.URL_DOMAIN[11] in url: return 11, self.get_a_m_url(url) elif self.URL_DOMAIN[0] in url: return 0, self.get_s_click_url(url) elif self.URL_DOMAIN[10] in url: return 10, 0 elif self.URL_DOMAIN[1] in url: good_id = self.get_item_detail(url) if good_id != "": return 1, good_id return 16, 0 elif url == "": return 15, 0 echo("0|warning", "New Domain:", regex.findall("https://(.*?)/", url), url) return 20, 0 def decoder_tpwd(self, tpwd: str): """ decoder the tpwd from taokouling """ url = self.DECODER_TPWD_URL % (self.api_key, tpwd) req = basic_req(url, 1) if ( req is None or isinstance(req, str) or 'ret' not in list(req.keys()) ): return {} return req def get_s_click_url(self, s_click_url: str): """ decoder s.click real jump url @validation time: 2019.10.23""" time.sleep(np.random.randint(0, 10)) item_url = self.get_s_click_location(s_click_url) if item_url is None: echo(3, "s_click_url location Error..") return return self.get_item_detail(item_url) def 
get_s_click_url_v1(self, s_click_url: str): """ decoder s.click real jump url @validation time: 2019.08.31""" if "tu=" not in s_click_url: tu_url = self.get_s_click_tu(s_click_url) else: tu_url = s_click_url if tu_url is None or "tu=" not in tu_url: echo(3, "s_click_url tu url ENd Retry..", tu_url) return qso = decoder_url(tu_url) if "tu" not in qso: if "alisec" in tu_url: echo("0|debug", "Request Too Fast") time.sleep(np.random.randint(10) * np.random.rand()) else: echo(0, s_click_url, tu_url) return redirect_url = urllib.parse.unquote(qso["tu"]) return self.get_s_click_detail(redirect_url, tu_url) def get_tb_headers(self, url: str = "", refer_url: str = "") -> dict: headers = {"Accept": get_accept("html"), "User-Agent": get_use_agent()} if url != "": headers["Host"] = url.split("/")[2] if refer_url != "": headers["referer"] = refer_url return headers def get_s_click_basic( self, s_click_url: str, retry_func=(lambda x: False), referer: str = "", allow_redirects: bool = True, is_direct: bool = False, ): headers = self.get_tb_headers(refer_url=referer) req_func = basic_req if is_direct else proxy_req req = req_func( s_click_url, 2, header=headers, config={"allow_redirects": allow_redirects} ) if is_direct: return req if req is None or retry_func(req): if can_retry(s_click_url): return self.get_s_click_basic( s_click_url, retry_func, referer, allow_redirects, is_direct ) else: return return req def get_s_click_tu(self, s_click_url: str): req = self.get_s_click_basic(s_click_url, lambda i: "tu=" not in i.url) if req is None: return return req.url def get_s_click_location(self, s_click_url: str): req = self.get_s_click_basic(s_click_url) if req is None: echo("0|warning", "s_click_url first click error.") return echo("1", "real_jump_address get") rj = regex.findall("real_jump_address = '(.*?)'", req.text) if not len(rj): echo("0|warning", "real_jump_address get error.") return rj = rj[0].replace("&", "&") req_rj = self.get_s_click_basic( rj, lambda i: "Location" not in i.headers, referer=rj, allow_redirects=False ) if req_rj is None: return return req_rj.headers["Location"] def get_s_click_detail(self, redirect_url: str, tu_url: str): headers = self.get_tb_headers(refer_url=tu_url) req = proxy_req(redirect_url, 2, header=headers) if req is None or "id=" not in req.url: if can_retry(redirect_url): return self.get_s_click_detail(redirect_url, tu_url) else: return return self.get_item_detail(req.url) def get_item_detail(self, item_url: str) -> str: item = decoder_url(item_url) if not "id" in item: echo(0, "id not found:", item_url) return "" return item["id"] def get_item_title_once(self, item_id: int) -> str: item = self.get_tb_getdetail(item_id) if item is None: return '' return item['title'] def get_item_title(self, article_id: str, tpwd: str): temp_map = self.tpwd_map[article_id][tpwd] if ( "item_id" not in temp_map or temp_map["item_id"] == "" or temp_map["item_id"] == "0" ): return item_id = int(temp_map["item_id"]) title = self.get_item_title_once(item_id) if title != "": self.tpwd_map[article_id][tpwd]["title"] = title def get_item_title_once_v1(self, item_id: int) -> str: req = self.get_item_basic(item_id) if req is None: return "" req_text = req.text req_title = regex.findall('data-title="(.*?)">', req_text) if len(req_title): return req_title[0] req_title = regex.findall('<meta name="keywords" content="(.*?)"', req_text) if len(req_title): return req_title[0] return "" def get_item_basic(self, item_id: int, url: str = ""): url = self.ITEM_URL % item_id if url == "" else url headers = 
{"Accept": get_accept("html")} req = proxy_req(url, 2, header=headers, config={"allow_redirects": False}) if req is None: if can_retry(url): return self.get_item_basic(item_id, url) return if req.status_code != 200: return self.get_item_basic(item_id, req.headers["Location"]) return req def get_uland_url(self, uland_url: str): if ( not 'uland' in self.cookies # or not self.M in self.cookies['uland'] or time_stamp() - self.m_time > self.ONE_HOURS / 2 ): self.get_m_h5_tk() s_req = self.get_uland_url_once(uland_url, self.cookies['uland']) req_text = s_req.text re_json = json.loads(req_text[req_text.find("{") : -1]) return re_json["data"]["resultList"][0]["itemId"] def get_a_m_url(self, a_m_url: str): req = self.get_a_m_basic(a_m_url) if req is None: return item_url = req.headers["location"] return self.get_item_detail(item_url) def get_a_m_basic(self, a_m_url: str): headers = self.get_tb_headers(a_m_url) req = proxy_req(a_m_url, 2, header=headers, config={"allow_redirects": False}) if req is None or "location" not in req.headers: if can_retry(a_m_url): return self.get_a_m_basic(a_m_url) return return req def get_m_h5_tk(self): self.m_time = time_stamp() def get_cookie_once(key, func, *param): req = func(*param) if req is not None: self.cookies[key] = req.cookies.get_dict() echo(1, "get {} cookie:".format(key), self.cookies[key]) get_cookie_once('uland', self.get_uland_url_once, self.uland_url) if False: get_cookie_once('finger', self.get_finger_once, self.test_item_id) get_cookie_once('baichuan', self.get_baichuan_once, self.test_item_id, self.test_finger_id) def get_baichuan(self, item_id: int): if ( not 'baichuan' in self.cookies or not self.M in self.cookies['baichuan'] or time_stamp() - self.m_time > self.ONE_HOURS / 2 ): self.get_m_h5_tk() finger_id = self.get_finger(item_id) if finger_id is None: return echo(4, 'finger id:', finger_id) req = self.get_baichuan_once(item_id, finger_id, self.cookies['baichuan']) if req is not None: return req.json()['data'] def get_tb_getdetail(self, item_id: int): if ( not 'uland' in self.cookies or time_stamp() - self.m_time > self.ONE_HOURS / 2 ): self.get_m_h5_tk() req = self.get_tb_getdetail_once(item_id, self.cookies['uland']) if req is not None: req_text = req.text re_json = json.loads(req_text[req_text.find("{") : -1]) return re_json["data"]["item"] def get_tb_getdetail_once(self, item_id: int, cookies: dict = {}): refer_url = self.DETAIL_URL % item_id data = {"itemNumId": str(item_id)} jsv = '2.4.8' api = 'mtop.taobao.detail.getdetail' j_data_t = {'v': 6.0, 'ttid': '2017@taobao_h5_6.6.0', 'AntiCreep': True, 'callback': 'mtopjsonp1' } return self.get_tb_h5_api(api, jsv, refer_url, data, j_data_t, cookies) def get_baichuan_once(self, item_id: int, finger_id: str, cookies: dict = {}): refer_url = self.DETAIL_URL % item_id data = { 'pageCode': 'mallDetail', 'ua': get_use_agent('mobile'), 'params': json_str({ "url": refer_url, "referrer": "", "oneId": None, "isTBInstalled": "null", "fid": finger_id }) } data_str = r'{"pageCode":"mallDetail","ua":"%s","params":"{\"url\":\"%s\",\"referrer\":\"\",\"oneId\":null,\"isTBInstalled\":\"null\",\"fid\":\"%s\"}"}' % (get_use_agent('mobile'), refer_url, finger_id) print(data) api = 'mtop.taobao.baichuan.smb.get' jsv = '2.4.8' return self.get_tb_h5_api(api, jsv, refer_url, data, cookies=cookies, mode=1, data_str=data_str) def get_tb_h5_api(self, api: str, jsv: str, refer_url: str, data: dict, j_data_t: dict = {}, cookies: dict = {}, mode: int = 0, data_str: str = None): """ tb h5 api @2019.11.6 ✔️Tested""" step = 
self.M in cookies if data_str is None: data_str = json_str(data) headers = { "Accept": 'application/json', "referer": refer_url, "Agent": get_use_agent('mobile') } if step: headers["Cookie"] = encoder_cookie(cookies) appkey = "12574478" token = cookies[self.M].split("_")[0] if step else "" t = int(time_stamp() * 1000) j_data = { "jsv": jsv, "appKey": appkey, "t": t, "sign": self.get_tb_h5_token(token, t, appkey, data_str), "api": api, "v": 1.0, "timeout": 20000, "AntiCreep": True, "AntiFlood": True, "type": "originaljson", "dataType": "jsonp", **j_data_t } if mode == 0: j_data['data'] = data_str mtop_url = encoder_url(j_data, self.MTOP_URL % (api, int(j_data['v']))) if mode == 0: req = proxy_req(mtop_url, 2, header=headers) else: req = proxy_req(mtop_url, 12, data=data, header=headers) # echo(4, 'request once.') if req is None: if can_retry(self.MTOP_URL % (api, int(j_data['v']))): return self.get_tb_h5_api(api, jsv, refer_url, data, j_data_t, cookies, mode) else: return return req def get_uland_url_once(self, uland_url: str, cookies: dict = {}): """ tb h5 api @2019.11.9 ✔️Tested""" step = self.M in cookies uland_params = decoder_url(uland_url) tt = { "floorId": "13193" if step else "13052", "variableMap": json_str( { "taoAppEnv": "0", "e": uland_params["e"], "scm": uland_params["scm"], } ), } api = "mtop.alimama.union.xt.en.api.entry" jsv = '2.4.0' j_data = {'type': 'jsonp', "callback": "mtopjsonp{}".format(int(step) + 1)} return self.get_tb_h5_api(api, jsv, uland_url, tt, j_data, cookies) def get_finger(self, item_id: int): if ( not 'finger' in self.cookies or not self.M in self.cookies['finger'] or time_stamp() - self.m_time > self.ONE_HOURS / 2 ): self.get_m_h5_tk() s_req = self.get_finger_once(item_id, self.cookies['finger']) if s_req is None: return try: return s_req.json()['data']['fingerId'] except Exception as e: return def get_finger_once(self, item_id: int, cookies: dict = {}): step = self.M in cookies api = 'mtop.taobao.hacker.finger.create' refer_url = self.ITEM_URL % item_id jsv = '2.4.11' j_data = {'type': 'jsonp', "callback": "mtopjsonp{}".format(int(step) + 1),} return self.get_tb_h5_api(api, jsv, refer_url, {}, cookies=cookies) def get_tb_h5_token(self, *data: list): md5 = hashlib.md5() wait_enc = "&".join([str(ii) for ii in data]) md5.update(wait_enc.encode()) return md5.hexdigest() def get_ynote_file(self, offset: int = 0): url = self.LISTRECENT_URL % (offset, self.cstk) data = {"cstk": self.cstk} req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1)) if req is None or type(req) != list: return None list_recent = {ii["fileEntry"]["id"]: ii["fileEntry"] for ii in req} self.list_recent = {**self.list_recent, **list_recent} echo(1, "Load ynote file {} items.".format(len(self.list_recent))) return req def get_ynote_web_header(self, mode: int = 0): headers = { "Content-Type": get_content_type(), "Cookie": self.cookie, "Host": self.Y_URL.split("/")[2], "Origin": self.Y_URL, "Referer": self.WEB_URL, } if mode: headers["Accept"] = get_accept("xhr") else: headers["Accept"] = get_accept("html") return headers def get_empty_content(self): headers = {"Referer": self.WEB_URL} req = proxy_req(self.Y_DOC_JS_URL, 3, header=headers) if len(req) < 1000: if can_retry(self.Y_DOC_JS_URL): return self.get_empty_content() else: return empty_content = regex.findall("t.EMPTY_NOTE_CONTENT='(.*?)'", req)[0] empty_content = empty_content.split(self.END_TEXT)[0] self.empty_content = empty_content echo(1, "Load empty content", empty_content) return empty_content def 
get_web_content(self): req = proxy_req(self.WEB_URL, 3, header=self.get_ynote_web_header()) if len(req) < 1000: if can_retry(self.WEB_URL): return self.get_web_content() else: return return req def update_article_pipeline(self, article_id: str): xml = self.get_xml(article_id) if xml is None: echo("0|warning", "get xml error") return xml, r_log, r_num = self.replace_tpwd(article_id, xml) if not r_num: echo("0|warning", "r_num == 0") return flag = self.update_article(article_id, xml) if flag: self.email_update_result(article_id, r_log, r_num) self.update_valid(article_id) self.update_article2db(article_id, True) self.share_article(article_id) def email_update_result(self, article_id: str, r_log: list, r_num: int): p = self.share2article[article_id][-2].split("/")[-1] article_info = self.list_recent[p] name = article_info["name"].replace(".note", "") subject = "更新({}){}/{}条[{}]".format( time_str(time_format=self.T_FORMAT), r_num, len(r_log), article_info["name"] ) content = "\n".join( [ "Title: {}".format(article_info["name"]), "Time: {}".format(time_str()), "Update Num: {}/{}条".format(r_num, len(r_log)), "", *r_log, ] ) send_email(content, subject, assign_rec=self.assign_rec) def update_valid(self, article_id: str): if article_id not in self.tpwd_map: self.tpwd_map[article_id] = {} wait_list = [ ii for ii in self.article_list[article_id].keys() if ii not in self.tpwd_map[article_id] ] update_time = 0 while len(wait_list) and update_time < 5: echo(2, "Begin Update No.{} times Tpwd validDate".format(update_time + 1)) update_v = [ self.tpwd_exec.submit(self.decoder_tpwd_once, article_id, ii, 1) for ii in wait_list ] list(as_completed(update_v)) wait_list = [ ii for ii in self.article_list[article_id].keys() if ii not in self.tpwd_map[article_id] ] update_time += 1 def update_article2db(self, article_id: str, is_tpwd_update: bool = False): def valid_t(types: str, maps: dict): return types in maps and maps[types] != '' m = {ii[4]: ii for ii in self.get_article_db(article_id)} data = [] for ( o_tpwd, (num_iid, title, domain, tpwd, commission_rate, commission_type, ur), ) in self.article_list[article_id].items(): """ `id`, article_id, tpwd_id, item_id, tpwd, domain, content, url, commission_rate, commission_type, expire_at, created_at, is_deleted """ n = m[o_tpwd] if o_tpwd in self.tpwd_map[article_id]: t = self.tpwd_map[article_id][o_tpwd] content = ( t["title"] if valid_t('title', t) else (t['content'] if valid_t('content', t) else n[6]) ) url = t["url"] if valid_t('url', t) else n[7] validDate = t["validDate"] if valid_t('validDate', t) else n[-2] data.append( ( *n[:4], tpwd if is_tpwd_update else o_tpwd, domain, content, url, commission_rate, commission_type, validDate, n[-1], 0, ) ) else: data.append( ( *n[:4], tpwd if is_tpwd_update else o_tpwd, domain, n[6], n[7], commission_rate, commission_type, n[-2], n[-1], 0, ) ) self.update_db(data, "Update Article {} TPWD".format(article_id)) def replace_tpwd(self, article_id: str, xml: str): tpwds = regex.findall(self.TPWD_REG2, xml) m = self.article_list[article_id] r_log, r_num = [], 0 EXIST = "PASSWORD_NOT_EXIST::口令不存在" DECODER_EXC = "DECODER_EXCEPTION::商品已下架" NO_GOODS = "GOODS_NOT_FOUND::未参加淘客" TPWD_ERROR = "TPWD_ERROR::淘口令生成异常" for ii, jj in enumerate(tpwds): pure_jj = jj[1:-1] no_t = "No.{} tpwd: {}, ".format(ii + 1, jj) if pure_jj not in m: r_log.append("{}{}".format(no_t, EXIST)) continue # tpwd = 'NOTNOTEXIST' num_iid, title, domain, tpwd, commission_rate, commission_type, ur = m[pure_jj] if domain >= 15: if domain == 15: applied = 
"{},{}".format(EXIST, title) elif domain == 16: applied = "{},{}".format(DECODER_EXC, title) elif domain == 17: applied = "{},{}".format(NO_GOODS, title) elif domain == 18: applied = "{},{}".format(TPWD_ERROR, title) else: applied = title xml = xml.replace(jj, "¥{}¥".format(tpwd)) if commission_rate == 2: COMMISSION = "->¥{}¥ SUCCESS, 保持原链接, {}".format(tpwd, applied) elif commission_rate == 1: COMMISSION = "未能更新淘口令, {}".format(applied) else: COMMISSION = "->¥{}¥ SUCCESS, 佣金: {}, 类型: {}, {}".format( tpwd, commission_rate, commission_type, applied ) r_log.append("{}{}".format(no_t, COMMISSION)) r_num += int(commission_rate != 1) return xml, r_log, r_num def get_xml(self, article_id: str): url = self.SYNC_URL % ("download", self.cstk) data = { "fileId": self.share2article[article_id][-2].split("/")[-1], "version": -1, "convert": True, "editorType": 1, "cstk": self.cstk, } req = proxy_req(url, 12, data=data, header=self.get_ynote_web_header(1)) if req is None or len(req.text) < 100: if can_retry(url): return self.get_xml(article_id) else: return return req.text def update_article(self, article_id: str, article_body: str): p = self.share2article[article_id][-2].split("/")[-1] article_info = self.list_recent[p] data = { "fileId": p, "parentId": article_info["parentId"], "domain": article_info["domain"], "rootVersion": -1, "sessionId": "", "modifyTime": int(time_stamp()), "bodyString": article_body, "transactionId": p, "transactionTime": int(time_stamp()), "orgEditorType": article_info["orgEditorType"], "tags": article_info["tags"], "cstk": self.cstk, } url = self.SYNC_URL % ("push", self.cstk) req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1)) if req is None or list(req.keys()) != [ "entry", "meta", "effectedShareEntries", "forcePullVersion", "effected", ]: echo( "0|error", "Update atricle_id {} Error".format(article_id), req.json() if req is not None else "", ) return False echo("1|warning", "Update atricle_id {} Success!!!".format(article_id)) return True def share_article(self, article_id: str): p = self.share2article[article_id][-2].split("/")[-1] url = self.MYSHARE_URL % (p, self.cstk) req = proxy_req(url, 1, header=self.get_ynote_web_header(1)) if req is None or list(req.keys()) != ["entry", "meta"]: if can_retry(url): return self.share_article(article_id) return False echo("2", "Share article {} Success!!!".format(article_id)) return True def load_article_local(self, file_path: str): if file_path not in self.tpwds: tt = '||||'.join(read_file(file_path)) tpwds = regex.findall(self.TPWD_REG, tt) self.tpwds[file_path] = tpwds else: tpwds = self.tpwds[file_path] if file_path not in self.tpwd_map: self.tpwd_map[file_path] = {} time = 0 while (len(self.tpwd_map[file_path]) < len(tpwds)) and time < 5: thread_list = [ii for ii in tpwds if not ii in self.tpwd_map[file_path]] echo(1, file_path, "tpwds len:", len(tpwds), "need load", len(thread_list)) thread_list = [ self.tpwd_exec.submit(self.decoder_tpwd_once, file_path, ii, 1) for ii in thread_list ] list(as_completed(thread_list)) time += 1 def load_picture(self, url: str, idx: int): td = basic_req(url, 2) picture_path = 'picture/{}.jpg'.format(idx) with open(picture_path, 'wb') as f: f.write(td.content) def load_picture_pipeline(self, file_path: str): mkdir('picture') tpk_list = self.tpwds[file_path] picture_url = [(self.tpwd_map[file_path][tpk]['picUrl'], idx) for idx, tpk in enumerate(tpk_list) if tpk in self.tpwd_map[file_path]] picture_url = [(ii, idx) for ii, idx in picture_url if not 
os.path.exists('picture/{}.jpg'.format(idx))]
        echo(1, 'Load {} picture Begin'.format(len(picture_url)))
        pp = [self.tpwd_exec.submit(self.load_picture, ii, jj) for ii, jj in picture_url]
        return pp

    def check_overdue(self):
        def check_overdue_once(data: list) -> bool:
            dif_time = time_stamp(data[-2]) - time_stamp()
            return dif_time > 0 and dif_time <= self.ONE_HOURS * self.ONE_DAY

        overdue_article = [(article_id, article_list[4])
                           for article_id, ii in self.tpwd_db_map.items()
                           for article_list in ii.values()
                           if check_overdue_once(article_list)]
        overdue_id = set([article_id for article_id, _ in overdue_article])
        overdue_list = [(article_id, len([1 for a_id, tpwd in overdue_article if article_id == a_id]))
                        for article_id in overdue_id]
        if not len(overdue_list):
            return
        title = '链接需要更新#{}#篇'.format(len(overdue_list))
        content = title + '\n \n'
        for article_id, num in overdue_list:
            content += '{}, 需要更新{}个链接,{}\n'.format(
                self.share2article[article_id][2], num, self.NOTE_URL % article_id)
        content += '\n\nPlease update within 6 hours, Thx!'
        echo('2|debug', title, content)
        send_email(content, title)

    def load_share_total(self):
        self.check_overdue()
        for article_id in self.idx:
            self.get_share_info(article_id)
        self.load_list2db()
        self.__init__()
        self.load_process()

    def load_article_new(self):
        for article_id in self.idx:
            self.load_article(article_id)

    def load_click(self, num=1000000):
        ''' schedule click '''
        for index in range(num):
            threading_list = []
            if index % 12 != 1:
                threading_list.append(threading.Thread(target=self.load_article_new, args=()))
            if index % 12 == 1:
                threading_list.append(threading.Thread(target=self.load_share_total, args=()))
            for work in threading_list:
                work.start()
            time.sleep(self.ONE_HOURS / 2)
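get_tb_h5_token above is the usual mtop h5 signature: an md5 over token&t&appKey&data, where the token is the first underscore-separated segment of the _m_h5_tk cookie. A standalone sketch of that computation; the cookie value and the item payload are made up:

import hashlib
import json
import time

def tb_h5_sign(*parts) -> str:
    ''' md5 over the &-joined parts, mirroring get_tb_h5_token '''
    return hashlib.md5('&'.join(str(p) for p in parts).encode()).hexdigest()

appkey = '12574478'
m_h5_tk = 'abcdef0123456789_1573000000000'          # hypothetical cookie value
token = m_h5_tk.split('_')[0]                        # only the first segment is signed
t = int(time.time() * 1000)
data_str = json.dumps({'itemNumId': '123456789012'}, separators=(',', ':'))  # made-up id

sign = tb_h5_sign(token, t, appkey, data_str)
print(sign)  # goes into the request's `sign` query parameter alongside t and appKey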
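The tpwd patterns (TPWD_REG / TPWD_REG2) lean on the third-party regex module's \p{Sc} currency-symbol class, so they catch codes wrapped in ¥…¥, $…$ and similar symbols. A small extraction sketch over invented note text (requires `pip install regex`):

import regex

TPWD_REG = r'\p{Sc}(\w{8,12}?)\p{Sc}'    # capture the code between currency symbols
TPWD_REG2 = r'(\p{Sc}\w{8,12}\p{Sc})'    # capture the whole token, wrapper included

note_text = 'copy this ¥AbCd1234Ef¥ or this $Zz99Xx88$ into the taobao app'  # hypothetical
print(regex.findall(TPWD_REG, note_text))    # ['AbCd1234Ef', 'Zz99Xx88']
print(regex.findall(TPWD_REG2, note_text))   # ['¥AbCd1234Ef¥', '$Zz99Xx88$']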
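decoder_tpwd_once turns the API's relative validDate ('N天N小时N分N秒') into an absolute expiry by treating it as a duration and adding it to the current time, pinning the expired/malformed cases to a fixed epoch value. A self-contained sketch of that conversion with plain datetime, independent of the project's time_stamp/time_str helpers:

import re
from datetime import datetime, timedelta

def expire_at(valid_date: str) -> str:
    ''' convert a countdown like "1天2小时3分4秒" into an absolute timestamp string '''
    if valid_date == '0天0小时0分0秒' or '-' in valid_date:
        # already expired / malformed: pin to a fixed past timestamp, as the class does
        return datetime.fromtimestamp(1500000000).strftime('%Y-%m-%d %H:%M:%S')
    d, h, m, s = map(int, re.match(r'(\d+)天(\d+)小时(\d+)分(\d+)秒', valid_date).groups())
    expire = datetime.now() + timedelta(days=d, hours=h, minutes=m, seconds=s)
    return expire.strftime('%Y-%m-%d %H:%M:%S')

print(expire_at('1天2小时3分4秒'))
print(expire_at('0天0小时0分0秒'))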
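Most of the fan-out in ActivateArticle follows one loop: submit the missing work to a shared ThreadPoolExecutor, block on as_completed, recompute what is still missing, and repeat at most five times. A stripped-down sketch of that loop; flaky_decode is an invented stand-in for decoder_tpwd_once:

import random
from concurrent.futures import ThreadPoolExecutor, as_completed

results = {}
pool = ThreadPoolExecutor(max_workers=20)

def flaky_decode(tpwd: str):
    ''' stand-in for decoder_tpwd_once: sometimes fails to fill in a result '''
    if random.random() < 0.5:
        results[tpwd] = {'item_id': len(tpwd)}

tpwds = ['AbCd1234Ef', 'Zz99Xx88', 'QqWw5678Rt']   # hypothetical codes
attempt = 0
while len(results) < len(tpwds) and attempt < 5:
    missing = [t for t in tpwds if t not in results]
    futures = [pool.submit(flaky_decode, t) for t in missing]
    list(as_completed(futures))      # block until this round finishes
    attempt += 1

pool.shutdown()
print(len(results), 'of', len(tpwds), 'decoded after', attempt, 'rounds')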