def load(cls):
    if cls.__load__cache__(PathConfig.RAW_KES_CACHE):
        print("Raw KES data cache has been loaded from %s\n" % PathConfig.RAW_KES_CACHE)
        return cls
    db = Db()
    rows = db.select(["new_eventid", "sub_eventid", "text_dbpedia"])
    random.shuffle(rows)
    row_count = len(rows)
    train_rows, test_rows, validation_rows = \
        rows[0: int(row_count * 0.45)], \
        rows[int(row_count * 0.45): int(row_count * 0.95)], \
        rows[int(row_count * 0.95):]
    bar = ProgressBar(
        cls.__self__accumulating__(len(train_rows) - 1) +
        cls.__self__accumulating__(len(test_rows) - 1) +
        cls.__self__accumulating__(len(validation_rows) - 1),
        "Loading raw KES data...")
    progress = 0
    for rows_splitting, set_name in (train_rows, "train"), (test_rows, "test"), (validation_rows, "validation"):
        cls.__fill__(rows_splitting, len(rows_splitting), set_name, bar, progress)
    bar.finish("Raw KES data has been loaded.")
    cls.__cache__(PathConfig.RAW_KES_CACHE)
    return cls
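The loader above shuffles the rows and slices them 45% / 50% / 5% into train, test, and validation sets. A minimal, self-contained sketch of that split with hypothetical data (the helper name and seed handling are illustrative, not from the source):

import random

def split_rows(rows, train=0.45, test=0.50, seed=None):
    """Shuffle rows and cut them into train / test / validation slices.
    Validation is whatever remains after train + test (5% with these defaults)."""
    rows = list(rows)
    random.Random(seed).shuffle(rows)
    n = len(rows)
    train_end = int(n * train)
    test_end = train_end + int(n * test)
    return rows[:train_end], rows[train_end:test_end], rows[test_end:]

train_rows, test_rows, validation_rows = split_rows(range(100), seed=0)
print(len(train_rows), len(test_rows), len(validation_rows))  # 45 50 5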
class DbTestCase(unittest.TestCase):
    def setUp(self):
        self.db_to_test = Db()

    def test_new_user(self):
        self.db_to_test.cursor = MagicMock()
        self.db_to_test.new_user("test_user")
        self.db_to_test.cursor.execute.assert_called_with(
            "INSERT INTO user (name,balance,cc_number) VALUES (?,?,?)",
            ("test_user", 0, None))
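The test mocks the cursor, so it only pins down the SQL string and the bound parameters. A minimal sketch of a new_user implementation that would satisfy it, assuming an sqlite3-style backend (the constructor and the commit call are assumptions):

import sqlite3

class Db:
    """Sketch of the class under test; only new_user is shown."""

    def __init__(self, path=":memory:"):
        self.conn = sqlite3.connect(path)   # assumed storage backend
        self.cursor = self.conn.cursor()

    def new_user(self, name):
        # Parameterized insert, matching the call asserted in the test above.
        self.cursor.execute(
            "INSERT INTO user (name,balance,cc_number) VALUES (?,?,?)",
            (name, 0, None))
        self.conn.commit()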
def __init__(self):
    self.Db = Db("netease")
    self.insert_sql = '''INSERT INTO ip_proxy(`address`, `http_type`) VALUES %s'''
    self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0'''
    self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s'''
    self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s'''
    self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s'''
    self.canuseip = {}
    self.waitjudge = []
    self.cannotuseip = {}
    self.failuredtime = {}
    self.initproxy()
def __init__(self):
    self.Db = Db("china_regions")
    china = pd.read_csv('news/china_city_list.csv', encoding='gbk')
    self.province = list(china.groupby(by=['Province']).count().axes[0])
    self.city = list(china.groupby(by=['City']).count().axes[0])
    self.filelists = [
        'google_steal.txt', 'google_posion.txt', 'bjh', 'bjh_detail',
        'bjh_detail_poison', 'news_steal.txt', 'news_poison.txt'
    ]
    self.city_province = {}
    self.province_map = {}
    self.pre_data()
    for index, row in china.iterrows():
        self.city_province[row['City']] = row['Province']
def __init__(self):
    self.Db = Db("netease")
    self.classifylist = {}
    self.playlists = []
    self.failuredmap = {}
    self.songmap = {}
    self.songlist = []
    self.finishlist = []
    self.get_classify()
    self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' '''
    self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 '''
    self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' '''
    self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s'''
    self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail'
        INTO TABLE playlist_detail
        FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\n'
        (`song_id`, `song_name`, `classify`, `time`, `play_time`)'''  # change to your file absolute address
    self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s'''
    self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s'''
def __init__(self):
    self.Db = Db("blog")
    self.local_views = {}
    self.title_map = {}
    self.title2slug = {}
    self.failured_map = {}
    self.zhihu_views = {}
    self.zhihu_id = {}
    self.jianshu_views = {}
    self.jianshu_id = {}
    self.csdn_views = {}
    self.csdn_id = {}
    self.exist_data = {}
    self.getTitleMap()
    self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
    self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
    self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''
def db(ctrl, queue):
    db = Db(DB_URI, mode='rwc', timeout=DB_TIMEOUT)
    notify('DB', 'up')
    stats = {}

    def _commit(cache):
        db.write(cache)

    def _check(path, url):
        if not db.exists(path):
            ctrl.put((LOAD, path, url))
        else:
            ctrl.put((DISCARD, path, url))

    commands = {
        CHECK: _check,
        COMMIT: _commit,
    }

    def _run():
        cmd, *args = queue.get()
        commands[cmd](*args)

    return worker(_run, "db", stats)
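A hedged sketch of how a producer could drive this command loop, assuming CHECK, COMMIT, LOAD, and DISCARD are simple sentinel constants and the queues are standard queue.Queue objects (both assumptions; the real values come from config.constants):

import queue

CHECK, COMMIT, LOAD, DISCARD = range(4)   # hypothetical sentinels
ctrl, q = queue.Queue(), queue.Queue()

# Ask the db worker whether a path is already stored; it answers on ctrl
# with either (LOAD, path, url) or (DISCARD, path, url).
q.put((CHECK, "items/42.json", "https://example.org/items/42"))

# Hand the worker a cache of rows to persist via db.write().
q.put((COMMIT, {"items/42.json": b"{}"}))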
import os
import sys
import csv
import time
from collections import defaultdict

from utils.db import Db
from config.constants import *

if __name__ == "__main__":
    manifest = os.path.join(EXTRACT_DIR, EXTRACT_MANIFEST)
    db = Db(DB_URI, timeout=DB_TIMEOUT)
    stats = {
        'date': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
        'by-year': defaultdict(lambda: dict(count=0, filename=None, file=None)),
    }

    def writer():
        def _writer(year):
            entry = stats['by-year'][year]
            entry['filename'] = filename = str(year) + '.csv'
            filepath = os.path.join(EXTRACT_DIR, filename)
            entry['file'] = file = open(filepath, 'wt')
            output = csv.writer(file)

            def _write(row):
                date, *tail = row
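The snippet above breaks off inside _write. What follows is only a guess at how a per-year CSV writer of this shape is usually finished (lazily open <year>.csv, append the row, bump the count); every name below is hypothetical:

import csv
from collections import defaultdict

def make_year_writer(stats, extract_dir="."):
    """Return write(year, row): lazily opens <year>.csv and appends rows to it."""
    writers = {}

    def write(year, row):
        if year not in writers:
            entry = stats['by-year'][year]
            entry['filename'] = filename = str(year) + '.csv'
            entry['file'] = file = open(extract_dir + '/' + filename, 'wt', newline='')
            writers[year] = csv.writer(file)
        writers[year].writerow(row)
        stats['by-year'][year]['count'] += 1

    return write

stats = {'by-year': defaultdict(lambda: dict(count=0, filename=None, file=None))}
write = make_year_writer(stats)
write(2020, ["2020-02-01T04:17:00Z", "example", "row"])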
class GetFreeProxy: """ proxy pool """ def __init__(self): self.Db = Db("netease") self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s ''' self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0''' self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s ''' self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5 and http_type in %s''' self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s''' self.canuseip = {} self.waitjudge = [] self.cannotuseip = {} self.failuredtime = {} self.initproxy() def get_request_proxy(self, url: str, types: int, data=None, test_func=None, header=None): """ use proxy to send requests, and record the proxy cann't use @types S0XY: X=0.->get; =1.->post; Y=0.->html; =1.->json; =2.->basic S=0.->basic ;=1.->ss support failured retry && failured auto record """ httptype = url[4] == 's' ss_type = types // 1000 types %= 1000 if ss_type: proxylist = self.proxylists_ss if httptype else self.proxylist_ss else: proxylist = self.proxylists if httptype else self.proxylist if not len(proxylist): if self.Db.db: print( 'Proxy pool empty!!! Please check the db conn & db dataset!!!' ) proxies = {} else: index = random.randint(0, len(proxylist) - 1) proxies_url = proxylist[index] proxies = {type_map[httptype]: proxies_url} try: result = basic_req(url, types, proxies, data, header) if not test_func is None: if not test_func(result): if self.check_retry(url): self.get_request_proxy(url, types + 1000 * ss_type, data, test_func) else: self.failuredtime[url] = 0 return else: return result else: return result except: self.cannotuseip[random.randint(0, MAXN)] = proxies_url if proxies_url in proxylist: proxylist.remove(proxylist.index(proxies_url)) if not len(self.cannotuseip.keys()) % 10: self.cleancannotuse() if self.check_retry(url): self.get_request_proxy(url, types + 1000 * ss_type, data, test_func) else: return def check_retry(self, url): """ check cannt retry """ if url not in self.failuredtime: self.failuredtime[url] = 0 return True elif self.failuredtime[url] < 3: self.failuredtime[url] += 1 return True else: self.log_write(url) self.failuredtime[url] = 0 return False def log_write(self, url): """ failure log """ with codecs.open("proxy.log", 'a', encoding='utf-8') as f: f.write(time_str() + url + '\n') def insertproxy(self, insertlist): """ insert data to db """ results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1]) if results: print('Insert ' + str(len(insertlist)) + ' items Success!') else: pass def updateproxy(self, updatelist, types): """ update data to db """ results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1]) typemap = {0: 'can use ', 1: 'can not use '} if results: print('Update', typemap[types], str(len(updatelist)), ' items Success!') else: pass def selectproxy(self, targetlist): """ select ip proxy by ids """ if not len(targetlist): return [] elif len(targetlist) == 1: waitlist = '(\'' + targetlist[0] + '\')' else: waitlist = tuple(targetlist) return self.Db.select_db(self.select_sql % str(waitlist)) def dbcanuseproxy(self): """ test db have or not this data """ results = self.selectproxy([ii[0] for ii in self.canuseip.values()]) ss_len = len([1 for ii in self.canuseip.values() if ii[1] > 1]) print("SS proxies %d" % ss_len) insertlist = [] updatelist = [] ipmap = {} if results != False: for ip_info in results: ipmap[ip_info[1]] = [ip_info[0], ip_info[2]] for ip_now in 
self.canuseip.values(): http_type = ip_now[1] ip_now = ip_now[0] if ip_now in ipmap: if ipmap[ip_now][1]: updatelist.append( (ipmap[ip_now][0], ip_now, http_type, 0)) else: insertlist.append((ip_now, http_type)) if len(insertlist): self.insertproxy(insertlist) if len(updatelist): self.updateproxy(updatelist, 0) else: pass self.canuseip = {} def cleancannotuse(self): """ update db proxy cann't use """ results = self.selectproxy(self.cannotuseip.values()) updatelist = [] ipmap = {} if results: for ip_info in results: ipmap[ip_info[1]] = [ip_info[0], ip_info[2]] for ip_now in self.cannotuseip.values(): http_type = ip_now[4] == 's' if ip_now in ipmap: updatelist.append((ipmap[ip_now][0], ip_now, http_type, ipmap[ip_now][1] + 1)) if len(updatelist): self.updateproxy(updatelist, 1) else: pass self.cannotuseip = {} def initproxy(self): """ init proxy list """ results = self.Db.select_db(self.select_list) self.proxylist = [] self.proxylists = [] self.proxylist_ss = [] self.proxylists_ss = [] if results != 0: for index in results: if index[1] == 1: self.proxylists.append(index[0]) elif index[1] == 2: self.proxylist.append(index[0]) self.proxylist_ss.append(index[0]) elif index[1] == 3: self.proxylists.append(index[0]) self.proxylists_ss.append(index[0]) else: self.proxylist.append(index[0]) print(len(self.proxylist), ' http proxy can use.') print(len(self.proxylists), ' https proxy can use.') print(len(self.proxylist_ss), ' ss http proxy can use.') print(len(self.proxylists_ss), ' ss https proxy can use.') else: print( '>>>Please check db configure!!! The proxy pool cant use!!!>>>' ) def judgeurl(self, urls, index, times): """ use /api/playlist to judge http; use /discover/playlist judge https 1. don't timeout = 5 2. response.result.tracks.size() != 1 """ http_type = urls[4] == 's' proxies = {type_map[http_type]: urls} test_url = type_map[ http_type] + '://music.163.com/api/playlist/detail?id=432853362' ss_url = 'https://www.google.com/?gws_rd=ssl' try: # print(test_url, proxies) # return data = basic_req(test_url, 1, proxies) result = data['result'] tracks = result['tracks'] if len(tracks) == 56: if times < 2: self.judgeurl(urls, index, times + 1) else: self.canuseip[index] = [urls, int(http_type)] data = basic_req(ss_url, 0) if len(str(data)) > 5000: self.canuseip[index] = [urls, int(http_type) + 2] else: self.cannotuseip[index] = urls except: if not index in self.canuseip: self.cannotuseip[index] = urls pass def threadjude(self): """ threading to judge proxy """ changeJsonTimeout(2) changeHtmlTimeout(3) text = self.waitjudge num = len(text) for block in range(num // 1000 + 1): blockthreads = [] for index in range(block * 1000, min(num, 1000 * (block + 1))): work = threading.Thread(target=self.judgeurl, args=( text[index], index, 0, )) blockthreads.append(work) for work in blockthreads: work.start() for work in blockthreads: work.join() # return self.dbcanuseproxy() self.cleancannotuse() self.waitjudge = [] def testdb(self, types): ''' test proxy in db can use ''' version = begin_time() typestr = '' if types == 2: typestr = '(0,1,2,3)' elif types == 1: typestr = '(1,3)' else: typestr = '(0,2)' results = self.Db.select_db(self.select_all % typestr) if results != 0: for index in results: self.waitjudge.append(index[0]) self.threadjude() else: pass self.initproxy() end_time(version) def xiciproxy(self, page): """ xici proxy http://www.xicidaili.com/nn/{page} The first proxy I use, but now it can not use it mostly. 
""" if not str(page).isdigit(): print("Please input num!") return [] version = begin_time() url = 'http://www.xicidaili.com/nn/%d' for index in range(1, page + 1): html = basic_req(url % (index), 0) tem = html.find_all('tr') for index in range(1, len(tem)): tds = tem[index].find_all('td') ip = tds[5].text.lower() self.waitjudge.append(ip + '://' + tds[1].text + ':' + tds[2].text) self.threadjude() end_time(version) def gatherproxy(self, types): """ :100: very nice website first of all you should download proxy ip txt from: http://www.gatherproxy.com/zh/proxylist/country/?c=China """ version = begin_time() if not os.path.exists('%sgatherproxy' % data_path): print('Gather file not exist!!!') return with codecs.open('%sgatherproxy' % data_path, 'r', encoding='utf-8') as f: file_d = [ii.strip() for ii in f.readlines()] if not types: waitjudge = ['http://' + ii[:-1] for ii in file_d] elif types == 1: waitjudge = ['https://' + ii[:-1] for ii in file_d] else: waitjudge1 = ['http://' + ii[:-1] for ii in file_d] waitjudge2 = ['https://' + ii[:-1] for ii in file_d] waitjudge = [*waitjudge1, *waitjudge2] self.waitjudge = waitjudge print('load gather over!') end_time(version) def goubanjia(self): """ :-1: html tag mixed with invalid data :100:And the most important thing is the port writed in 'class' rather in text. The website is difficult to spider, but the proxys are very goog goubanjia proxy http://www.goubanjia.com """ version = begin_time() host = 'http://www.goubanjia.com' html = self.get_request_proxy(host, 0) if not html: return [] trs = html.find_all('tr', class_=['warning', 'success']) for tr in trs: tds = tr.find_all('td') ip = tds[2].find_all('a')[0].text + '://' iplist = tds[0].find_all(['div', 'span', not 'p'], class_=not 'port') for index in iplist: ip += index.text encode = tds[0].find_all(['div', 'span', 'p'], class_='port')[0]['class'][1] uncode = functools.reduce( lambda x, y: x * 10 + (ord(y) - ord('A')), map(lambda x: x, encode), 0) self.waitjudge.append(ip + ':' + str(int(uncode / 8))) self.threadjude() end_time(version) def schedulegou(self): sched = BlockingScheduler() sched.add_job(self.goubanjia, 'interval', seconds=100) sched.start() def data5u(self): """ data5u proxy http://www.data5u.com/ no one can use """ version = begin_time() url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml'] host = 'http://www.data5u.com/' for uri in url_list: html = self.get_request_proxy(host + uri, 0) if not html: continue table = html.find_all('ul', class_='l2') for index in table: tds = index.find_all('li') ip = tds[3].text self.waitjudge.append(ip + '://' + tds[0].text + ':' + tds[1].text) self.threadjude() end_time(version) def sixsixip(self, area, page): """ 66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html """ version = begin_time() threadings = [] for index in range(1, area + 1): for pageindex in range(1, page + 1): print(str(index) + ' ' + str(pageindex)) work = threading.Thread(target=self.sixsixthread, args=(index, pageindex)) threadings.append(work) for work in threadings: work.start() for work in threadings: work.join() self.threadjude() end_time(version) def sixsixthread(self, index, pageindex): host = '''http://www.66ip.cn/areaindex_%d/%d.html''' html = self.get_request_proxy(host % (index, pageindex), 0) if not html: return [] trs = html.find_all('table')[2].find_all('tr') for test in range(1, len(trs) - 1): tds = trs[test].find_all('td') self.waitjudge.append('http://' + tds[0].text + ':' + tds[1].text) self.waitjudge.append('https://' + tds[0].text + ':' + 
tds[1].text) def kuaidaili(self, page): """ kuaidaili https://www.kuaidaili.com/free/ """ version = begin_time() threadings = [] for index in range(1, page + 1): work = threading.Thread(target=self.kuaidailithread, args=(index, )) threadings.append(work) for work in threadings: work.start() for work in threadings: work.join() self.threadjude() end_time(version) def kuaidailithread(self, index): host = '''https://www.kuaidaili.com/free/inha/%d/''' html = self.get_request_proxy(host % index, 0) if not html: return [] trs = html.find_all('tr') for index in range(1, len(trs)): tds = trs[index].find_all('td') ip = tds[3].text.lower() + "://" + tds[0].text + ':' + tds[1].text self.waitjudge.append(ip) def get_cookie(self): """ make cookie login PS: Though cookie expired time is more than 1 year, but It will be break when the connect close. So you need reactive the cookie by this function. """ headers = { 'pragma': 'no-cache', 'cache-control': 'no-cache', 'Host': 'www.gatherproxy.com', 'Origin': 'http://www.gatherproxy.com', 'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent', 'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57', 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', "Accept-Encoding": "", "Accept-Language": "zh-CN,zh;q=0.9", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36", } login_url = 'http://www.gatherproxy.com/subscribe/login' cookie_html = basic_req(login_url, 0, header=headers) verify_text = cookie_html.find_all('div', class_='label')[2].span.text verify_list = verify_text.replace('= ', '').strip().split() num_map = { 'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Fine': 5, 'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10 } verify_num = [verify_list[0], verify_list[2]] for index, num in enumerate(verify_num): if num.isdigit(): verify_num[index] = int(num) elif num in num_map: verify_num[index] = num_map[num] else: print('Error', index) # return False verify_code = 0 error = True operation = verify_list[1] if operation == '+' or operation == 'plus' or operation == 'add' or operation == 'multiplied': verify_code = verify_num[0] + verify_num[1] error = False if operation == '-' or operation == 'minus': verify_code = verify_num[0] - verify_num[1] error = False if operation == 'X' or operation == 'multiplication': verify_code = verify_num[0] * verify_num[1] error = False if error: print('Error', operation) # return False if not os.path.exists('%spassage' % data_path): print('gather passage not exist!!!') return with codecs.open('%spassage' % data_path, 'r', encoding='utf-8') as f: passage = [index[:-1] for index in f.readlines()] data = { 'Username': passage[0], 'Password': passage[1], 'Captcha': str(verify_code) } time.sleep(2.163) r = requests.session() r.cookies = cj.LWPCookieJar() login_req = r.post(login_url, headers=headers, data=data, verify=False) def load_gather(self): """ load gather proxy pool text If failured, you should reactive the cookie. 
""" headers = { 'pragma': 'no-cache', 'cache-control': 'no-cache', 'Host': 'www.gatherproxy.com', 'Origin': 'http://www.gatherproxy.com', 'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent', 'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57', 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', "Accept-Encoding": "", "Accept-Language": "zh-CN,zh;q=0.9", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36", } url = 'http://www.gatherproxy.com/subscribe/infos' sid_url_req = requests.get(url, headers=headers, verify=False) sid_url_html = BeautifulSoup(sid_url_req.text, 'html.parser') sid_url = sid_url_html.find_all( 'div', class_='wrapper')[1].find_all('a')[0]['href'] if len(sid_url.split('sid=')) < 2: print('cookie error') self.get_cookie() self.load_gather() return sid = sid_url.split('sid=')[1] sid_url = 'http://www.gatherproxy.com' + sid_url data = {'ID': sid, 'C': '', 'P': '', 'T': '', 'U': '0'} gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False) with codecs.open(data_path + 'gatherproxy', 'w', encoding='utf-8') as f: f.write(gatherproxy.text)
def setUp(self):
    self.db_to_test = Db()
def __init__(self):
    Db.__init__(self)
    self.table = "subs_schedule"
    self.connect()
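This and the 'credentials' variant further down follow the same table-per-subclass pattern: the subclass names its table, then asks the shared Db base to connect. A sketch of what such a base might look like, assuming an sqlite3 backend (the base-class internals are assumptions, not taken from the source):

import sqlite3

class Db:
    """Hypothetical base: subclasses set self.table, then call connect()."""

    def __init__(self, path="app.db"):
        self.path = path
        self.table = None
        self.conn = None

    def connect(self):
        self.conn = sqlite3.connect(self.path)

    def all_rows(self):
        # Table names cannot be bound as parameters; self.table comes from
        # trusted subclass code, not user input.
        return self.conn.execute("SELECT * FROM " + self.table).fetchall()

class SubsSchedule(Db):
    def __init__(self):
        Db.__init__(self)
        self.table = "subs_schedule"
        self.connect()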
class Get_playlist_song(): """ 1. get playlist id from classify; 2. get song from play list; use url: """ def __init__(self): self.Db = Db() self.classifylist = {} self.proxyclass = GetFreeProxy() self.playlists = [] self.failuredmap = {} self.songmap = {} self.songlist = [] self.finishlist = [] self.get_classify() self.select_one = '''SELECT playlist_id from playlist_queue WHERE `playlist_id` in %s AND classify = '%s' ''' self.select_ids = '''SELECT `id`, playlist_id from playlist_queue WHERE classify = '%s' AND is_finished = 0 ''' self.select_song = '''SELECT `id`, `song_id`, `time`, `play_time` from playlist_detail WHERE song_id in %s AND classify = '%s' ''' self.insert_sql = '''INSERT INTO playlist_queue(`playlist_id`, `classify`) VALUES %s''' self.insert_song = '''LOAD DATA INFILE '/Users/gunjianpan/Desktop/git/spider/song_detail' INTO TABLE playlist_detail FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' (`song_id`, `song_name`, `classify`, `time`, `play_time`)''' # change to your file absolute address self.replace_song = '''REPLACE INTO playlist_detail(`id`,`song_id`,`classify`,`song_name`,`time`,`play_time`) VALUES %s''' self.replace_queue = '''REPLACE INTO playlist_queue(`id`, `playlist_id`, `classify`, `is_finished`) VALUES %s''' def get_classify(self): """ get classify from /discover/playlist """ begin_time() self.classifylist = {} host = 'https://music.163.com/discover/playlist' html = self.proxyclass.get_request_proxy(host, host[8:21], 0) if not html: print('Empty') self.proxyclass.cleancannotuse() if self.can_retry(host): self.get_classify() return [] alist = html.find_all('a', class_='s-fc1') if not len(alist): if self.can_retry(host): self.get_classify() print(html) for index in alist: self.classifylist[index.text] = index['href'] self.proxyclass.cleancannotuse() end_time() def get_playlist_id(self, classify, offset): """ get playlist id from classify """ host = 'https://music.163.com' allclassify = classify == '全部风格' url = host + self.classifylist[classify] + ( '?' 
if allclassify else '&') + 'order=hot&limit=35&offset=' + str(offset) # html = self.proxyclass.get_request_proxy(url, host[8:], 0) html = get_html(url, {}, host[8:]) if not html: if self.can_retry(url): self.get_playlist_id(classify, offset) else: self.proxyclass.log_write(url) return [] alist = html.find_all('a', class_='icon-play') if not len(alist): if self.can_retry(url): self.get_playlist_id(classify, offset) else: self.proxyclass.log_write(url) for index in alist: self.playlists.append(index['data-res-id']) def can_retry(self, url): """ judge can retry once """ if url not in self.failuredmap: self.failuredmap[url] = 0 # print("Retry " + str(self.failuredmap[url]) + ' ' + url) return True elif self.failuredmap[url] < 2: self.failuredmap[url] += 1 # print("Retry " + str(self.failuredmap[url]) + ' ' + url) return True else: print("Failured " + url) self.proxyclass.log_write(url) self.failuredmap[url] = 0 return False def get_playlist_id_thread(self): """ get play list id in threading """ begin_time() if not len(self.classifylist): self.get_classify() for index in self.classifylist: threadings = [] for offset in range(41): work = threading.Thread(target=self.get_playlist_id, args=( index, offset * 35, )) threadings.append(work) for work in threadings: work.start() for work in threadings: work.join() self.proxyclass.cleancannotuse() print(len(self.playlists)) self.test_queue(index) self.playlists = [] print(index + " Over") end_time() def test_queue(self, classify): """ test data if in playlist_queue """ if len(self.playlists) == 1: waitlist = '(' + str(self.playlists[0]) + ')' else: waitlist = tuple(self.playlists) results = self.Db.select_db(self.select_one % (str(waitlist), classify)) if not results: return [] hadexist = [] for index in results: hadexist.append(index[0]) insertlist = [] for index in self.playlists: if index not in hadexist: # file_d.write(str([index, classify])[1:-1] + '\n') insertlist.append((index, classify)) print('Insert ' + str(len(insertlist)) + ' ' + classify) self.insert_queue(insertlist) def insert_queue(self, ids): """ insert data to playlist_queue """ if not len(ids): return [] results = self.Db.insert_db(self.insert_sql % str(ids)[1:-1]) if results: if len(ids): print('Insert ' + ids[0][1] + ' ' + str(len(ids)) + ' Success!') else: pass def get_list_ids(self, classify): """ get list ids from db """ results = self.Db.select_db(self.select_ids % classify) ids = [] if results: for index in results: ids.append([index[0], index[1]]) return ids def get_song_detail_thread(self): """ get song detail threadings """ begin_time() for classify in self.classifylist: ids = self.get_list_ids(classify) threadings = [] for oneid in ids: work = threading.Thread(target=self.get_song_detail, args=(oneid[1], )) threadings.append(work) for work in threadings: work.start() for work in threadings: work.join() self.clean_data() self.test_song(classify, ids) self.songlist = [] self.songmap = {} self.finishlist = [] self.successtime = 0 print(classify + ' Over!') end_time() def clean_data(self): """ aggregation data """ for song in self.songlist: [songid, songname, playcount] = song if songid not in self.songmap: self.songmap[songid] = [1, playcount, songname] else: orgin = self.songmap[songid] self.songmap[songid] = [ orgin[0] + 1, orgin[1] + playcount, songname ] def get_song_detail(self, id): """ get song detail form playlist """ host = 'http://music.163.com/api/playlist/detail?id=' + str(id) json = self.proxyclass.get_request_proxy(host, host[7:20], 1) if json == 0: if 
self.can_retry(host): self.get_song_detail(id) else: self.proxyclass.log_write(host) return [] result = json['result'] tracks = result['tracks'] if len(tracks) <= 1: if self.can_retry(host): self.get_song_detail(id) else: self.proxyclass.log_write(host) return [] else: playcount = result['playCount'] for track in tracks: songid = track['id'] songname = track['name'] self.songlist.append([songid, songname, playcount]) self.finishlist.append(id) def test_song(self, classify, ids): """ test song if in db """ songs = [] for song in self.songmap: songs.append(song) if not len(songs): return [] elif len(songs) == 1: waitlist = '(' + songs[0] + ')' else: waitlist = tuple(songs) results = self.Db.select_db(self.select_song % (str(waitlist), classify)) resultmap = {} for detail in results: resultmap[detail[1]] = [detail[0], detail[2], detail[3]] replacelist = [] insertlist = [] replacequeue = [] file_d = codecs.open("song_detail", 'a', encoding='utf-8') file_d.seek(0) file_d.truncate() idsmap = {} for indexid in ids: idsmap[indexid[1]] = indexid[0] for song in self.songmap: songdetail = self.songmap[song] if song in resultmap: dbdetail = resultmap[song] replacelist.append( (dbdetail[0], song, classify, songdetail[2], songdetail[0] + dbdetail[1], songdetail[1] + dbdetail[2])) else: file_d.write(u'' + str([ song, u'' + str(u'' + songdetail[2].replace(',', ' ')) [0:20], classify, songdetail[0], songdetail[1] ])[1:-1] + '\n') insertlist.append((song, songdetail[2], classify, songdetail[0], songdetail[1])) for playlist in self.finishlist: replacequeue.append((idsmap[playlist], playlist, classify, 1)) file_d.close() if len(insertlist): self.db_song_detail(insertlist, 'Insert', replacequeue) if len(replacelist): self.db_song_detail(replacelist, 'Update', []) def db_song_detail(self, waitlist, types, replacequeue): """ batch insert/update song detail """ if types == 'Update': results = self.Db.update_db(self.replace_song % str(blocklist)[1:-1]) else: results = self.Db.update_db(self.insert_song) if results: if len(waitlist): print(types + ' song detail for ' + waitlist[0][2] + ' ' + str(len(waitlist)) + ' Success!') if types == 'Insert': self.replace_queue_db(replacequeue) def replace_queue_db(self, replacequeue): """ replace db for fininsh playlist id """ results = self.Db.update_db(self.replace_queue % str(replacequeue)[1:-1]) if results: if len(replacequeue): print('Update queue fininsh for ' + str(len(replacequeue)) + ' item!') else: pass
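can_retry above gives each URL a fixed budget of attempts before logging the failure and resetting its counter. A compact, hedged restatement of that bookkeeping (the factory name and defaults are illustrative, not the author's API):

def make_can_retry(max_retries=3, give_up=lambda url: print("Failured " + url)):
    """Allow each URL max_retries attempts, then report it and reset."""
    attempts = {}

    def can_retry(url):
        count = attempts.get(url, 0)
        if count < max_retries:
            attempts[url] = count + 1
            return True
        give_up(url)
        attempts[url] = 0
        return False

    return can_retry

can_retry = make_can_retry()
print([can_retry("https://example.org") for _ in range(4)])  # [True, True, True, False]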
def __init__(self):
    cmd.Cmd.__init__(self)
    self.prompt = "> "
    self.db = Db()  # i don't feel great about how this db is created and passed around
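The inline comment flags the ad-hoc construction. One common alternative (a suggestion, not the author's design) is to inject the Db instance through the constructor so the shell can be tested with a stub:

import cmd

class Shell(cmd.Cmd):
    """Hypothetical rework: the caller decides how the Db is built."""
    prompt = "> "

    def __init__(self, db):
        super().__init__()
        self.db = db  # injected, so tests can pass a fake or a MagicMock

# production: Shell(Db()).cmdloop()   tests: Shell(FakeDb())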
class find_location(object): """ find location """ def __init__(self): self.Db = Db("china_regions") china = pd.read_csv('news/china_city_list.csv', encoding='gbk') self.province = list(china.groupby(by=['Province']).count().axes[0]) self.city = list(china.groupby(by=['City']).count().axes[0]) self.filelists = [ 'google_steal.txt', 'google_posion.txt', 'bjh', 'bjh_detail', 'bjh_detail_poison', 'news_steal.txt', 'news_poison.txt' ] self.city_province = {} self.province_map = {} self.pre_data() for index, row in china.iterrows(): self.city_province[row['City']] = row['Province'] def search_location(self): word = '' count = 0 for file in self.filelists: temp_word_list = codecs.open(file, 'r', encoding='utf-8').readlines() count += len(temp_word_list) word += " ".join(temp_word_list) # return word print(count) word_province = {} word_city = {} word_city_pro = {} for index in self.province: temp_num = word.count(index) if temp_num: word_province[index] = temp_num for index in self.city: temp_num = word.count(index) if temp_num: word_city[index] = temp_num for index in word_city: province = self.city_province[index] if province in word_city_pro: word_city_pro[province] += word_city[index] else: word_city_pro[province] = word_city[index] print(sum(word_province.values()), sum(word_city.values()), sum(word_city_pro.values())) return word_province, word_city, word_city_pro def participles_word(self): """ participles word """ version = begin_time() for file in self.filelists: pkuseg.test(file, file[:-4] + '_pkuseg.txt', model_name='../Model_retrieval/pkuseg', nthread=20) end_time(version) def pre_data(self): """ load city key-value from mysql """ province = self.Db.select_db( 'select * from china_regions where level=1') self.province_map = { int(index[2]): index[3][:3] if len(index[3]) == 4 or len(index[3]) == 6 else index[3][:2] for index in province } city = self.Db.select_db('select * from china_regions where level=2') city_state = [index for index in city if index[3][-1:] == '州'] seg = pkuseg.pkuseg() city_state = { seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else seg.cut(index[3])[0] + seg.cut(index[3])[1]: int(index[1]) for index in city if index[3][-1:] == '州' } seg = pkuseg.pkuseg(model_name='../Model_retrieval/pkuseg') city_state1 = { seg.cut(index)[0] if len(seg.cut(index)[0]) > 1 else seg.cut(index)[0] + seg.cut(index)[1]: city_state[index] for index in city_state } city_area = { index[3][:-2]: int(index[1]) for index in city if '地区' in index[3] } city_other = { index[3][:-1]: int(index[1]) for index in city if index[3][-1:] == '市' or index[3][-1:] == '盟' } self.city_province = {**city_state1, **city_area, **city_other} self.city_province = { index: self.province_map[self.city_province[index]] for index in self.city_province } county = self.Db.select_db('select * from china_regions where level=3') county_area_pre = {index for index in county if index[3][-1] == '区'} county_area_two = { index[3][:-2]: int(index[1][:2]) for index in county_area_pre if len(index[3]) > 3 and ( index[3][-2] == '矿' or index[3][-2] == '林') } # print('芒' in county_area_two, 'two') county_area_state = { seg.cut(index[3][:-2])[0]: int(index[1][:2]) for index in county_area_pre if len(index[3]) > 2 and index[3][-2] == '族' } # print('芒' in county_area_state, 'state') county_area_other = { index[3][:-1]: int(index[1][:2]) for index in county_area_pre if len(index[3]) > 2 and index[3][-2] != '族' and index[3][-2] != '林' and index[3][-2] != '矿' } # print('芒' in county_area_other, 'other') county_county_pre = 
{index for index in county if index[3][-1] == '县'} county_county_two = { index[3]: int(index[1][:2]) for index in county_county_pre if len(index[3]) == 2 } # print('芒' in county_county_two, 'two') seg = pkuseg.pkuseg() county_county_state = { seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else seg.cut(index[3])[0] + seg.cut(index[3])[1]: int(index[1][:2]) for index in county_county_pre if len(index[3]) > 2 and index[3][-3:-1] == '自治' } county_county_state = { index[:-2] if '族' in index and len(index) > 3 else index: county_county_state[index] for index in county_county_state } # print('芒' in county_county_state, 'state') county_county_other = { index[3][:-1]: int(index[1][:2]) for index in county_county_pre if index[3][-3:-1] != '自治' and len(index[3]) > 2 } # print('芒' in county_county_other, 'other') county_city = { index[3][:-1] if len(index[3]) > 2 else index[3]: int(index[1][:2]) for index in county if index[3][-1] == '市' } # print('芒' in county_city, 'city') county_domain = { index[3][:4]: int(index[1][:2]) for index in county if index[3][-1] == '域' } # print('芒' in county_domain, 'domain') county_other = { index[3]: int(index[1][:2]) for index in county if index[3][-1] == '盟' or index[3][-1] == '岛' } # print('芒' in county_other, 'other') county_province = { **county_area_two, **county_area_state, **county_area_other, **county_county_two, **county_county_state, **county_county_other, **county_city, **county_domain, **county_other } county_province = { index: self.province_map[county_province[index]] for index in county_province } self.city_province = {**self.city_province, **county_province} print({index for index in self.city_province if len(index) == 1}) def test_province(self, maps, words): word_city = {} for index in maps: temp_num = words.count(index) province = maps[index] if temp_num: if province in word_city: word_city[province] += temp_num else: word_city[province] = temp_num print(sum(word_city.values())) return word_city
import pytz
import pandas as pd

from utils.db import Db, get_prices

# connect to Db
_ = Db(host="localhost", user="******", password="******", db="go_finance")


def load(symbols, start, end, is_adj=True):
    data = dict()
    # load the prices
    r = get_prices(symbols=symbols, dt_from=end, period=(end - start).days, is_adj=is_adj)
    for symbol in symbols:
        symbol_data = r['symbol'] == symbol
        data[symbol] = pd.DataFrame({
            'open': r['open'][symbol_data],
            'high': r['high'][symbol_data],
            'low': r['low'][symbol_data],
            'close': r['close'][symbol_data],
            'volume': r['volume'][symbol_data],
        }, index=r['dt'][symbol_data])
    panel = pd.Panel(data)
    panel.major_axis = panel.major_axis.tz_localize(pytz.utc)
    return panel
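pd.Panel was removed in pandas 1.0, so on current pandas the same per-symbol frames are usually kept as a dict or combined into a column MultiIndex frame. A hedged sketch of the latter (the concat-based helper is my substitution, not part of the source):

import pandas as pd
import pytz

def to_multiindex(data):
    """{symbol: DataFrame indexed by datetime} -> one frame with
    (symbol, field) columns and a UTC-localized index."""
    frame = pd.concat(data, axis=1)   # columns become (symbol, field)
    frame.index = pd.to_datetime(frame.index).tz_localize(pytz.utc)
    return frame

# Example with hypothetical prices
idx = pd.date_range("2020-01-01", periods=3, freq="D")
frames = {"AAPL": pd.DataFrame({"close": [1.0, 2.0, 3.0]}, index=idx)}
print(to_multiindex(frames)["AAPL"]["close"])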
def __init__(self):
    Db.__init__(self)
    self.table = 'credentials'
    self.connect()
class GetFreeProxy(object): """ proxy getter """ def __init__(self): self.Db = Db() self.insert_sql = '''INSERT INTO ip_proxy( `address`, `http_type`) VALUES %s ''' self.select_list = '''SELECT address, http_type from ip_proxy WHERE `is_failured` = 0''' self.select_sql = '''SELECT `id`, address, `is_failured` from ip_proxy WHERE `address` in %s ''' self.select_all = '''SELECT `address`, `http_type` from ip_proxy WHERE `is_failured` != 5''' self.replace_ip = '''REPLACE INTO ip_proxy(`id`, `address`, `http_type`, `is_failured`) VALUES %s''' self.typemap = {1: 'https', 0: 'http'} self.canuseip = [] self.waitjudge = [] self.proxylist = [] self.proxylists = [] self.cannotuseip = [] self.failuredtime = {} self.initproxy() def get_request_proxy(self, url, host, types): """ use proxy to send requests, and record the proxy cann't use @types 1:json, 0:html support failured retry """ if not len(self.proxylist): self.initproxy() httptype = url[4] == 's' index = random.randint( 0, len(self.proxylists if httptype else self.proxylist) - 1) if httptype: proxies = {'https': self.proxylists[index]} else: proxies = {'http': self.proxylist[index]} try: if types: json = get_json(url, proxies, host) if 'code' in json and json['code'] != 200: ppap = self.retry(url, host, types) if not ppap: return False else: return json else: html = get_html(url, proxies, host) if 'code' in html or not html: ppap = self.retry(url, host, types) if not ppap: return False else: return html except Exception as e: self.cannotuseip.append(proxies[self.typemap[httptype]]) if httptype: if index < len(self.proxylists) and proxies[ 'https'] == self.proxylists[index]: self.proxylists.remove(proxies['https']) else: if index < len(self.proxylist ) and proxies['http'] == self.proxylist[index]: self.proxylist.remove(proxies['http']) ppap = self.retry(url, host, types) if not ppap: return False def retry(self, url, host, types): """ retry once """ if url not in self.failuredtime: self.failuredtime[url] = 0 # print("retry " + str(self.failuredtime[url])) self.get_request_proxy(url, host, types) elif self.failuredtime[url] < 3: self.failuredtime[url] += 1 # print("retry " + str(self.failuredtime[url])) self.get_request_proxy(url, host, types) else: # print("Request Failured three times!") self.log_write(url) self.failuredtime[url] = 0 return False def log_write(self, url): """ failure log """ file_d = open("log", 'a') file_d.write( time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime()) + url + '\n') file_d.close() def insertproxy(self, insertlist): """ insert data to db """ results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1]) if results: print('Insert ' + str(len(insertlist)) + ' items Success!') else: pass def updateproxy(self, updatelist, types): """ update data to db """ results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1]) typemap = {0: 'can use ', 1: 'can not use '} if results: print('Update ' + typemap[types] + str(len(updatelist)) + ' items Success!') else: pass def selectproxy(self, targetlist): """ select ip proxy by ids """ if not len(targetlist): return [] elif len(targetlist) == 1: waitlist = '(\'' + targetlist[0] + '\')' else: waitlist = tuple(targetlist) return self.Db.select_db(self.select_sql % str(waitlist)) def dbcanuseproxy(self): """ test db have or not this data """ results = self.selectproxy(self.canuseip) insertlist = [] updatelist = [] ipmap = {} if results: for ip_info in results: ipmap[ip_info[1]] = [ip_info[0], ip_info[2]] for ip_now in self.canuseip: http_type = ip_now[4] == 's' if ip_now 
in ipmap: if ipmap[ip_now][1]: updatelist.append( (ipmap[ip_now][0], ip_now, http_type, 0)) else: insertlist.append((ip_now, http_type)) if len(insertlist): self.insertproxy(insertlist) if len(updatelist): self.updateproxy(updatelist, 0) else: pass self.canuseip = [] def cleancannotuse(self): """ update db proxy cann't use """ results = self.selectproxy(self.cannotuseip) updatelist = [] ipmap = {} if results: for ip_info in results: ipmap[ip_info[1]] = [ip_info[0], ip_info[2]] for ip_now in self.cannotuseip: http_type = ip_now[4] == 's' if ip_now in ipmap: updatelist.append((ipmap[ip_now][0], ip_now, http_type, ipmap[ip_now][1] + 1)) if len(updatelist): self.updateproxy(updatelist, 1) else: pass self.cannotuseip = [] def initproxy(self): """ init proxy list """ results = self.Db.select_db(self.select_list) if results != 0: self.proxylist = [] self.proxylists = [] for index in results: if index[1]: self.proxylists.append(index[0]) else: self.proxylist.append(index[0]) print(str(len(self.proxylist)) + ' http proxy can use.') print(str(len(self.proxylists)) + ' https proxy can use.') else: pass def judgeurl(self, urls, times): """ use /api/playlist to judge http; use /discover/playlist judge https 1. don't timeout = 5 2. response.result.tracks.size() != 1 """ http_type = urls[4] == 's' proxies = {self.typemap[http_type]: urls} test_url = 'https://music.163.com/discover/playlist/?order=hot&limit=35&offset=0' if http_type else 'http://music.163.com/api/playlist/detail?id=432853362' if http_type: try: html = get_html(test_url, proxies, test_url[8:21]) alist = html.find_all('a', class_='s-fc1') if len(alist) == 73: self.canuseip.append(urls) else: self.cannotuseip.append(urls) except Exception as e: self.cannotuseip.append(urls) pass else: try: data = get_json(test_url, proxies, test_url[7:20]) result = data['result'] tracks = result['tracks'] if len(tracks) == 56: if times < 2: self.judgeurl(urls, times + 1) else: self.canuseip.append(urls) else: self.cannotuseip.append(urls) except Exception as e: self.cannotuseip.append(urls) pass def threadjude(self): """ threading to judge proxy """ text = self.waitjudge num = len(text) for block in range(int(num / 1000) + 1): blockthreads = [] for index in range(block * 1000, min(num, 1000 * (block + 1))): work = threading.Thread(target=self.judgeurl, args=( text[index], 0, )) blockthreads.append(work) for work in blockthreads: work.start() for work in blockthreads: work.join() self.dbcanuseproxy() self.cleancannotuse() self.waitjudge = [] def testdb(self): ''' test proxy in db can use ''' begin_time() results = self.Db.select_db(self.select_all) if results != 0: for index in results: self.waitjudge.append(index[0]) self.threadjude() else: pass self.initproxy() end_time() def xiciproxy(self, page): """ xici proxy http://www.xicidaili.com/nn/{page} The first proxy I use, but now it can not use it mostly. 
""" if not str(page).isdigit(): print("Please input num!") return [] begin_time() host = 'http://www.xicidaili.com/nn/' for index in range(1, page + 1): html = get_html(host + str(index), {}, host[7:-4]) # html = self.get_request_proxy(host + str(index), host[7:-4], 0) tem = html.find_all('tr') for index in range(1, len(tem)): tds = tem[index].find_all('td') ip = tds[5].text.lower() self.waitjudge.append(ip + '://' + tds[1].text + ':' + tds[2].text) self.threadjude() end_time() def gatherproxy(self, types): """ :100: very nice website first of all you should download proxy ip txt from: http://www.gatherproxy.com/zh/proxylist/country/?c=China """ begin_time() file_d = open('proxy/gatherproxy', 'r') for index in file_d.readlines(): if types == 0: self.waitjudge.append('http://' + index[0:-1]) elif types == 1: self.waitjudge.append('https://' + index[0:-1]) else: self.waitjudge.append('http://' + index[0:-1]) self.waitjudge.append('https://' + index[0:-1]) self.threadjude() end_time() def goubanjia(self): """ :-1: html tag mixed with invalid data :100:And the most important thing is the port writed in 'class' rather in text. The website is difficult to spider, but the proxys are very goog goubanjia proxy http://www.goubanjia.com """ begin_time() host = 'http://www.goubanjia.com' html = self.get_request_proxy(host, host[7:], 0) if not html: return [] trs = html.find_all('tr', class_=['warning', 'success']) for tr in trs: tds = tr.find_all('td') ip = tds[2].find_all('a')[0].text + '://' iplist = tds[0].find_all(['div', 'span', not 'p'], class_=not 'port') for index in iplist: ip += index.text encode = tds[0].find_all(['div', 'span', 'p'], class_='port')[0]['class'][1] uncode = functools.reduce( lambda x, y: x * 10 + (ord(y) - ord('A')), map(lambda x: x, encode), 0) self.waitjudge.append(ip + ':' + str(int(uncode / 8))) self.threadjude() end_time() def schedulegou(self): sched = BlockingScheduler() sched.add_job(self.goubanjia, 'interval', seconds=100) sched.start() def data5u(self): """ data5u proxy http://www.data5u.com/ no one can use """ begin_time() url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml'] host = 'http://www.data5u.com/' for uri in url_list: html = self.get_request_proxy(host + uri, host[7:-1], 0) if not html: continue table = html.find_all('ul', class_='l2') for index in table: tds = index.find_all('li') ip = tds[3].text self.waitjudge.append(ip + '://' + tds[0].text + ':' + tds[1].text) self.threadjude() end_time() def sixsixip(self, area, page): """ 66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html """ begin_time() threadings = [] for index in range(1, area + 1): for pageindex in range(1, page + 1): print(str(index) + ' ' + str(pageindex)) work = threading.Thread(target=self.sixsixthread, args=(index, pageindex)) threadings.append(work) for work in threadings: work.start() for work in threadings: work.join() self.threadjude() end_time() def sixsixthread(self, index, pageindex): host = '''http://www.66ip.cn/areaindex_%d/%d.html''' html = self.get_request_proxy(host % (index, pageindex), host[7:-21], 0) if not html: return [] trs = html.find_all('table')[2].find_all('tr') for test in range(1, len(trs) - 1): tds = trs[test].find_all('td') self.waitjudge.append('http://' + tds[0].text + ':' + tds[1].text) self.waitjudge.append('https://' + tds[0].text + ':' + tds[1].text) def kuaidaili(self, page): """ kuaidaili https://www.kuaidaili.com/free/ """ begin_time() threadings = [] for index in range(1, page + 1): work = 
threading.Thread(target=self.kuaidailithread, args=(index, )) threadings.append(work) for work in threadings: work.start() for work in threadings: work.join() self.threadjude() end_time() def kuaidailithread(self, index): host = '''https://www.kuaidaili.com/free/inha/%d/''' html = self.get_request_proxy(host % index, host[8:25], 0) if not html: return [] trs = html.find_all('tr') for index in range(1, len(trs)): tds = trs[index].find_all('td') ip = tds[3].text.lower() + "://" + tds[0].text + ':' + tds[1].text self.waitjudge.append(ip)
class TitleViews(object): """ update title views """ def __init__(self): self.Db = Db("blog") self.local_views = {} self.title_map = {} self.title2slug = {} self.failured_map = {} self.zhihu_views = {} self.zhihu_id = {} self.jianshu_views = {} self.jianshu_id = {} self.csdn_views = {} self.csdn_id = {} self.exist_data = {} self.getTitleMap() self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s''' self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s''' self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s''' def loadLocalView(self): """ load local view """ if not os.path.exists("%sgoogle" % data_dir): return with codecs.open("%sgoogle" % data_dir, 'r', encoding='utf-8') as f: test = f.readlines() test = test[7:] for index in test: arr = index.split(',') slug = self.matchSlug(arr[0]) if slug is None or slug not in self.title_map: continue print(slug + ' ' + str(arr[1]) + ' ' + arr[0]) if slug in self.local_views: self.local_views[slug] += int(arr[1]) else: self.local_views[slug] = int(arr[1]) def getTitleMap(self): """ get title map """ if os.path.exists('%sslug' % data_dir): with codecs.open('%sslug' % data_dir, 'r', encoding='utf-8') as f: slug = f.readlines() else: slug = [] if os.path.exists('%stitle' % data_dir): with codecs.open('%stitle' % data_dir, 'r', encoding='utf-8') as f: title = f.readlines() else: title = [] self.title_map = { tempslug.split('"')[1]: title[num].split('"')[1] for num, tempslug in enumerate(slug) } title2slug = { self.title_map[index]: index for index in self.title_map.keys() } noemoji_title = { self.filter_emoji(self.title_map[index]).replace('\u200d', ''): index for index in self.title_map.keys() } self.title2slug = {**noemoji_title, **title2slug} def matchSlug(self, pattern): """ match slug """ arr = re.search(r'\/([^\/]+).html', pattern) return None if arr is None else arr.group(1) def getZhihuView(self): if os.path.exists('%scookie' % data_dir): with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f: cookie = f.readline() else: cookie = ' ' changeCookie(cookie[:-1]) url_basic = [ 'https://www.zhihu.com/api/v4/creator/content_statistics/', 'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=', datetime.datetime.now().strftime("%Y-%m-%d"), '&page_no=' ] url = "".join(url_basic) json = self.get_request(url + '1', 1) if not json: return if not 'data' in json: if 'code' in json: print(json) return for index in json['data']: zhihu_title = index['title'] zhihu_id = int(index['url_token']) zhihu_count = int(index['read_count']) if zhihu_title in self.title2slug: temp_slug = self.title2slug[zhihu_title] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count elif zhihu_id in self.zhihu_id_map: temp_slug = self.zhihu_id_map[zhihu_id] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count else: print(index['title']) for index in range(json['count'] // 10): print('zhihu', index) json = self.get_request(url + str(index + 2), 1) if not json: continue for index in json['data']: zhihu_title = index['title'] zhihu_id = int(index['url_token']) zhihu_count = int(index['read_count']) if zhihu_title in self.title2slug: temp_slug = self.title2slug[zhihu_title] self.zhihu_id[temp_slug] = zhihu_id 
self.zhihu_views[temp_slug] = zhihu_count elif zhihu_id in self.zhihu_id_map: temp_slug = self.zhihu_id_map[zhihu_id] self.zhihu_id[temp_slug] = zhihu_id self.zhihu_views[temp_slug] = zhihu_count else: print(index['title']) def get_request(self, url, types): result = basic_req(url, 1) if not result: if can_retry(url): self.get_request(url, types) return return result def get_request_v2(self, url, types, header): result = get_request_proxy(url, 0, header=header) if not result or not len(result.find_all('div', class_='content')): if can_retry(url): self.get_request_v2(url, types, header) return return result def get_request_v3(self, url, types): result = basic_req(url, 0) if result is None or not result or not len( result.find_all('p', class_='content')): if can_retry(url): self.get_request_v3(url, types) return return result def getJianshuViews(self): """ get jianshu views """ header = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'no-cache', 'pragma': 'no-cache', 'sec-ch-ua': 'Google Chrome 75', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'cross-site', 'sec-fetch-user': '******', 'sec-origin-policy': '0', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3736.0 Safari/537.36' } basic_url = 'https://www.jianshu.com/u/2e0f69e4a4f0' for rounds in range(1, 4): url = basic_url if rounds == 1 else basic_url + \ '?order_by=shared_at&page=' + str(rounds) print(url) html = self.get_request_v2(url, 0, header) if html is None: print('None') return for index in html.find_all('li', class_=["", 'have-img']): if len(index.find_all('i')) < 3: continue title = index.find_all('a', class_='title')[0].text.replace( '`', '') jianshu_id = int(index['data-note-id']) jianshu_count = int(index.find_all('a')[-2].text) if title in self.title2slug: temp_slug = self.title2slug[title] self.jianshu_id[temp_slug] = jianshu_id self.jianshu_views[temp_slug] = jianshu_count elif jianshu_id in self.jianshu_id_map: temp_slug = self.jianshu_id_map[jianshu_id] self.jianshu_id[temp_slug] = jianshu_id self.jianshu_views[temp_slug] = jianshu_count else: print(title) def getCsdnViews(self): """ get csdn views """ basic_url = "https://blog.csdn.net/iofu728" for index in range(1, 3): url = basic_url if index == 1 else basic_url + \ '/article/list/' + str(index) + '?' 
html = self.get_request_v3(url, 0) if html is None: print('None') return for div_lists in html.find_all( 'div', class_='article-item-box csdn-tracking-statistics'): if 'style' in div_lists.attrs: continue csdn_id = int(div_lists['data-articleid']) title = div_lists.a.contents[2].replace('\n', '').strip().replace( '`', '') csdn_count = int( div_lists.find_all('span', class_='read-num')[0].span.text) if title in self.title2slug: temp_slug = self.title2slug[title] self.csdn_id[temp_slug] = csdn_id self.csdn_views[temp_slug] = csdn_count elif csdn_id in self.csdn_id_map: temp_slug = self.csdn_id_map[csdn_id] self.csdn_id[temp_slug] = csdn_id self.csdn_views[temp_slug] = csdn_count else: print(title) def filter_emoji(self, desstr, restr=''): ''' filter emoji ''' desstr = str(desstr) try: co = re.compile(u'[\U00010000-\U0010ffff]') except re.error: co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') return co.sub(restr, desstr) def init_db(self): self.loadLocalView() self.getZhihuView() self.getJianshuViews() self.getCsdnViews() insert_list = [] for index in self.title_map.keys(): insert_list.append( (index, self.local_views[index] if index in self.local_views else 0, self.zhihu_views[index] if index in self.zhihu_views else 0, self.csdn_views[index] if index in self.csdn_views else 0, self.jianshu_views[index] if index in self.jianshu_views else 0, self.zhihu_id[index] if index in self.zhihu_id else 0, self.csdn_id[index] if index in self.csdn_id else 0, self.jianshu_id[index] if index in self.jianshu_id else 0)) # return insert_list results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1]) if results: if len(insert_list): print('Insert ' + str(len(insert_list)) + ' Success!') else: pass def select_all(self): result = self.Db.select_db( "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0" ) if result == False: print("SELECT Error!") else: self.exist_data = {index[1]: list(index) for index in result} self.zhihu_id_map = { index[6]: index[1] for index in result if index[6] } self.csdn_id_map = { index[7]: index[1] for index in result if index[7] } self.jianshu_id_map = { index[8]: index[1] for index in result if index[8] } for index in self.exist_data: self.exist_data[index][-1] = self.exist_data[index][ -1].strftime('%Y-%m-%d %H:%M:%S') def update_view(self): changeHtmlTimeout(10) wait_map = {} self.select_all() self.getZhihuView() self.getJianshuViews() self.getCsdnViews() for index in self.zhihu_views.keys(): if self.zhihu_views[index] == self.exist_data[index][ 3] and self.zhihu_id[index] == self.exist_data[index][6]: continue wait_map[index] = self.exist_data[index] wait_map[index][3] = self.zhihu_views[index] wait_map[index][6] = self.zhihu_id[index] for index in self.csdn_views.keys(): if self.csdn_views[index] == self.exist_data[index][ 4] and self.csdn_id[index] == self.exist_data[index][7]: continue if index not in wait_map: wait_map[index] = self.exist_data[index] wait_map[index][4] = self.csdn_views[index] wait_map[index][7] = self.csdn_id[index] for index in self.jianshu_views.keys(): if self.jianshu_views[index] == self.exist_data[index][ 5] and self.jianshu_id[index] == self.exist_data[index][8]: continue wait_map[index] = self.exist_data[index] wait_map[index][5] = self.jianshu_views[index] wait_map[index][8] = self.jianshu_id[index] update_list = [tuple(index) for index in wait_map.values()] # return update_list:q if not len(update_list): return 
results = self.Db.update_db(self.update_sql % str(update_list)[1:-1]) if results: if len(update_list): print('Update ' + str(len(update_list)) + ' Success!') else: pass def new_day(self): day_data = self.Db.select_db( "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1" ) if not os.path.exists('../blog/log/basic'): print('File not exist!!!') return with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f: existed_spider = int(f.readlines()[1]) today_date = datetime.datetime.now().strftime('%Y-%m-%d') new_day_list = [(today_date, day_data[0][0] + day_data[0][1], existed_spider)] results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1]) if results: if len(new_day_list): print('New day update' + str(len(new_day_list)) + ' Success!') else: pass
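filter_emoji above strips astral-plane characters so scraped titles compare cleanly against the local slug map. A standalone version of that helper, with the same wide/narrow-build fallback (the example string is mine):

import re

try:
    EMOJI_RE = re.compile(u'[\U00010000-\U0010ffff]')
except re.error:  # narrow (UCS-2) builds fall back to surrogate pairs
    EMOJI_RE = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

def filter_emoji(text, restr=''):
    return EMOJI_RE.sub(restr, str(text))

print(filter_emoji("Python 🐍 tips"))  # "Python  tips"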
\N{ROBOT FACE} Downloading the full #StackOverflow history from the @waybackmachine for {duration} now

At this point I have read {fcount} files
"""


def timedelta_format(td):
    d = td.days
    s = td.seconds
    h, s = divmod(s, HOUR)
    m, s = divmod(s, MINUTE)
    return "{} days, {} hours and {} minutes".format(d, h, m)


db = Db(DB_URI, timeout=DB_TIMEOUT)

while True:
    try:
        orig = datetime(2020, 2, 1, 4, 17)
        delta = datetime.today() - orig
        ts = timedelta_format(delta)
        fcount = db.fcount()
        msg = MSG.format(duration=ts, fcount=fcount)
        print()
        print(msg)
        TWEET_CMD = ['t', 'update', 'XXXX']
        TWEET_CMD[-1] = msg
        subprocess.run(TWEET_CMD)
    except Exception as e:
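HOUR and MINUTE are used by timedelta_format but not defined in the excerpt; a self-contained version with assumed values for those constants:

from datetime import datetime

HOUR = 3600    # seconds per hour (assumed)
MINUTE = 60    # seconds per minute (assumed)

def timedelta_format(td):
    d = td.days
    s = td.seconds
    h, s = divmod(s, HOUR)
    m, s = divmod(s, MINUTE)
    return "{} days, {} hours and {} minutes".format(d, h, m)

delta = datetime(2020, 2, 15, 10, 0) - datetime(2020, 2, 1, 4, 17)
print(timedelta_format(delta))  # "14 days, 5 hours and 43 minutes"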