def livecollect(num):
    """Fetch up to ``num`` not-yet-processed users and append them to ``user.txt``.

    Entries already present in ``user.txt`` (by their JSON ``id`` field) or
    listed in ``erroruser.txt`` are skipped. Candidates are read from
    ``sample.txt``. Each fetched record is JSON-encoded and appended to
    ``user.txt``; candidates that raise ``ErrorInJson`` are recorded in
    ``erroruser.txt``. Any other failure is reported through ``send`` (up to
    ten attempts) and the function then waits until ``CanConnect`` succeeds
    before moving on to the next candidate.

    Args:
        num: maximum number of candidates to process in this call.
    """
    import time  # local import: only needed for the reconnect back-off below

    retry_delay = 5  # seconds to wait between connectivity probes

    def strip_bom(text):
        # A UTF-8 BOM decodes to the single code point U+FEFF.
        return text[1:] if text.startswith(u'\ufeff') else text

    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('user.txt').read()
    error = temp_stupid('erroruser.txt').read()
    for line in havedone:
        try:
            done_id = json.loads(line)['id']
        except (ValueError, KeyError, TypeError):
            # Malformed or id-less entry: nothing to remember.
            continue
        bloom.add(done_id)
    for failed in error:
        bloom.add(failed)

    see = temp_stupid('sample.txt').read()
    sc = tempflow('user.txt', 'a')

    # Strip the BOM *before* the membership test; otherwise a BOM-prefixed
    # candidate never matches its recorded id and is fetched again every run.
    seed = [token for token in map(strip_bom, see) if token not in bloom]

    try:
        for token in seed[:num]:
            try:
                # NOTE(review): getuseronline calls tiny_people(id, request)
                # with the arguments the other way round - confirm the order.
                userdict = tiny_people(getRequest(), token)
                sc.writein([json.dumps(userdict, ensure_ascii=False)])
                print(token)
            except ErrorInJson:
                temp_stupid('erroruser.txt').save([token])
            except (KeyboardInterrupt, SystemExit):
                # Let the operator stop the crawler.
                raise
            except BaseException as result:
                # BaseException is kept because the project's own exception
                # classes may not derive from Exception - TODO confirm.
                report = (traceback.format_exc() + '\n' + str(result) + '\n in '
                          + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                subject = __name__ + ' throw ' + result.__class__.__name__
                for _ in range(10):
                    try:
                        send(report, subject, '*****@*****.**')
                        break
                    except (KeyboardInterrupt, SystemExit):
                        raise
                    except BaseException as res:
                        print(res)
                # NOTE(review): `cre` is not defined in this function; it is
                # presumably a module-level test URL - confirm it exists.
                while not CanConnect(cre):
                    time.sleep(retry_delay)  # back off instead of spinning
    finally:
        # Always close the writer, even when interrupted.
        sc.end()
def CanConnect(url, header=XHR_HEADER_WZ):
    '''
    Test whether the connection has been banned (i.e. whether a 401/403
    status code comes back).

    url: the link used for the test
    header: defaults to the www.zhihu.com host headers; must be replaced
            when the target is api.zhihu.com
    '''
    response = getRequest().get(url, headers=header)
    banned_codes = (401, 403)
    return int(response.status_code) not in banned_codes
def forlive(num):
    """Fetch up to ``num`` not-yet-processed lives and append them to ``live.txt``.

    Entries already present in ``live.txt`` (by their JSON ``id`` field) or
    listed in ``errorlive.txt`` are skipped. Candidates are read from
    ``need.txt``. Each result of ``tiny_live`` is appended to ``live.txt`` as
    returned; candidates that raise ``ErrorInJson`` are recorded in
    ``errorlive.txt``. Any other failure is reported through ``send`` (up to
    ten attempts) and the function then waits until ``CanConnect`` succeeds
    for that live's URL before moving on.

    Args:
        num: maximum number of candidates to process in this call.
    """
    import time  # local import: only needed for the reconnect back-off below

    retry_delay = 5  # seconds to wait between connectivity probes

    def strip_bom(text):
        # A UTF-8 BOM decodes to the single code point U+FEFF.
        return text[1:] if text.startswith(u'\ufeff') else text

    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('live.txt').read()
    error = temp_stupid('errorlive.txt').read()
    for line in havedone:
        try:
            done_id = json.loads(line)['id']
        except (ValueError, KeyError, TypeError):
            # Malformed or id-less entry: nothing to remember.
            continue
        bloom.add(done_id)
    for failed in error:
        bloom.add(failed)

    see = temp_stupid('need.txt').read()
    sc = tempflow('live.txt', 'a')

    # Strip the BOM *before* the membership test; otherwise a BOM-prefixed
    # candidate never matches its recorded id and is fetched again every run.
    seed = [live_id for live_id in map(strip_bom, see) if live_id not in bloom]

    try:
        for live_id in seed[:num]:
            try:
                userdict = tiny_live(live_id, getRequest())
                sc.writein([userdict])
                print(live_id)
            except ErrorInJson:
                temp_stupid('errorlive.txt').save([live_id])
            except (KeyboardInterrupt, SystemExit):
                # Let the operator stop the crawler.
                raise
            except BaseException as result:
                # BaseException is kept because the project's own exception
                # classes may not derive from Exception - TODO confirm.
                report = (traceback.format_exc() + '\n' + str(result) + '\n in '
                          + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                subject = __name__ + ' throw ' + result.__class__.__name__
                for _ in range(10):
                    try:
                        send(report, subject, '*****@*****.**')
                        break
                    except (KeyboardInterrupt, SystemExit):
                        raise
                    except BaseException as res:
                        print(res)
                while not CanConnect(create_live(live_id), XHR_HEADER_API):
                    time.sleep(retry_delay)  # back off instead of spinning
    finally:
        # Always close the writer, even when interrupted.
        sc.end()
def antcolony_userV4(token, domain, bloom: ScalableBloomFilter, key: str, dataobject, keylist=None, lamda=20,
                     xhr_headers=XHR_HEADER_WZ):
    """Page through a user's ``domain`` listing and insert unseen records.

    The URL prefix comes from ``create_userV4(domain, token)``. A first probe
    request decides how to page: when the response carries a usable
    ``paging.totals`` the page count is derived from it, otherwise pages are
    fetched until one comes back with an empty ``data`` list.

    Args:
        token: user token passed to ``create_userV4``.
        domain: listing name passed to ``create_userV4``.
        bloom: filter of already-seen ``key`` values; updated in place.
        key: record field used for de-duplication.
        dataobject: sink exposing ``insert(record)`` and ``delone(token)``.
        keylist: optional collection of fields to keep from each record.
        lamda: page size.
        xhr_headers: headers sent with every request.

    Returns:
        ``0`` when the probe request answers 410 (after calling
        ``dataobject.delone(token)``), otherwise ``None``.

    Raises:
        WrongStatuCode: the probe request returned a status above 300.
        ErrorInJson: the probe response contains an ``error`` field.
    """

    def pick_keys(wanted, record):
        # Keep only the fields the caller asked for.
        return {name: value for name, value in record.items() if name in wanted}

    def store_page(url):
        # Fetch one page, insert its unseen records, return the page's size.
        # json.loads() no longer accepts `encoding=` (removed in Python 3.9).
        # Only the probe request has its status code checked.
        data = json.loads(req.get(url, headers=xhr_headers).text)['data']
        if keylist:
            data = [pick_keys(keylist, item) for item in data]
        for record in data:
            if record[key] not in bloom:
                dataobject.insert(record)
                bloom.add(record[key])
                print('data import ' + record[key])
            else:
                print('pass')
        return len(data)

    urlhead = create_userV4(domain, token)
    req = getRequest()
    people_url = urlhead + paging(0, 5)
    print(people_url)
    print('before r')
    r = req.get(people_url, headers=xhr_headers)
    print('after r')

    status = int(r.status_code)
    if status == 410:
        dataobject.delone(token)
        return 0
    if status > 300:
        raise WrongStatuCode(str(r.status_code) + ': ' + people_url)

    j = json.loads(r.text)
    if 'error' in j:
        # str(): the error payload may not be a plain string, and a TypeError
        # here would mask the real failure.
        raise ErrorInJson(__name__ + ": from url=" + urlhead + '\n msg=' + str(j['error']))
    print(j['paging'])
    try:
        total = int(j['paging']['totals'])
    except (KeyError, TypeError, ValueError):
        total = None

    if total:
        print(total)
        page_count = round(total / lamda)
        print(page_count)
        # One extra page so a remainder that was rounded down is still fetched.
        for page in range(page_count + 1):
            store_page(urlhead + paging(page * lamda, lamda))
    else:
        # Total unknown (or zero): walk pages until an empty one appears.
        page = 0
        while store_page(urlhead + paging(int(page * lamda), lamda)):
            page += 1
def worm_userV4(token, domain, keylist=None, bloom=None, lamda=20, xhr_headers=XHR_HEADER_API):
    """Page through a user's ``domain`` listing and return the collected records.

    The URL prefix comes from ``create_userV4(domain, token)``. A first probe
    request decides how to page: when the response carries a usable
    ``paging.totals`` the page count is derived from it, otherwise pages are
    fetched until one comes back with an empty ``data`` list.

    Args:
        token: user token passed to ``create_userV4``.
        domain: listing name passed to ``create_userV4``.
        keylist: optional collection of fields to keep from each record.
        bloom: optional container of ``url_token`` values to exclude; it is
            only consulted, never updated. Filtering happens after
            ``keylist`` is applied, so ``keylist`` must keep ``url_token``
            when both are given.
        lamda: page size.
        xhr_headers: headers sent with every request.

    Returns:
        A list with the (optionally trimmed and filtered) records of all pages.

    Raises:
        WrongStatuCode: the probe request returned a status above 300.
        ErrorInJson: the probe response contains an ``error`` field.
    """

    def pick_keys(wanted, record):
        # Keep only the fields the caller asked for.
        return {name: value for name, value in record.items() if name in wanted}

    def fetch_page(url):
        # json.loads() no longer accepts `encoding=` (removed in Python 3.9).
        # Only the probe request has its status code checked.
        return json.loads(req.get(url, headers=xhr_headers).text)['data']

    def refine(data):
        # Apply the optional field selection and bloom exclusion to one page.
        if keylist:
            data = [pick_keys(keylist, item) for item in data]
        if bloom:
            data = [item for item in data if item['url_token'] not in bloom]
        print(data)
        return data

    urlhead = create_userV4(domain, token)
    req = getRequest()
    people_url = urlhead + paging(0, 5)
    r = req.get(people_url, headers=xhr_headers)
    if int(r.status_code) > 300:
        raise WrongStatuCode(str(r.status_code) + ': ' + people_url)

    j = json.loads(r.text)
    if 'error' in j:
        # str(): the error payload may not be a plain string, and a TypeError
        # here would mask the real failure.
        raise ErrorInJson(__name__ + ": from url=" + urlhead + '\n msg=' + str(j['error']))
    print(j['paging'])

    alldata = []
    try:
        total = int(j['paging']['totals'])
    except (KeyError, TypeError, ValueError):
        total = None

    if total:
        print(total)
        page_count = round(total / lamda)
        print(page_count)
        # One extra page so a remainder that was rounded down is still fetched.
        for page in range(page_count + 1):
            alldata.extend(refine(fetch_page(urlhead + paging(page * lamda, lamda))))
    else:
        # Total unknown (or zero): walk pages until an empty one appears.
        page = 0
        while True:
            data = fetch_page(urlhead + paging(int(page * lamda), lamda))
            if len(data) == 0:
                break
            alldata.extend(refine(data))
            page += 1
    return alldata
if i[key] not in bloom: dataobject.insert(i) bloom.add(i[key]) print('data import ' + i[key]) else: print('pass') else: i = 0 while True: urll = urlhead + paging(int(i * lamda), lamda) r = req.get(urll, headers=xhr_headers) jc = json.loads(r.text, encoding='utf-8') data = jc['data'] if len(data) == 0: break if keylist: data = [keyconvert(keylist, x) for x in data] for c in data: if c[key] not in bloom: dataobject.insert(c) bloom.add(c[key]) print('data import ' + c[key]) else: print('pass') i += 1 if __name__ == '__main__': req = getRequest() print(tiny_speciallist('1020409341244997632', req, XHR_HEADER_API))
def getuseronline(id):
    """Fetch user ``id`` with ``tiny_people``, parse the JSON text and wrap it in ``FORWEB``.

    NOTE(review): ``livecollect`` calls ``tiny_people(getRequest(), i)`` with
    the arguments in the opposite order and JSON-encodes the result itself,
    which suggests a dict rather than JSON text - confirm which call is right.
    """
    # json.loads() no longer accepts `encoding=` (removed in Python 3.9);
    # passing it raises TypeError.
    payload = tiny_people(id, getRequest())
    return FORWEB(json.loads(payload))
def getliveonline(id):
    """Fetch live ``id`` with ``tiny_live``, parse the JSON text and wrap it in ``FORWEB``."""
    # json.loads() no longer accepts `encoding=` (removed in Python 3.9);
    # passing it raises TypeError.
    payload = tiny_live(id, getRequest())
    return FORWEB(json.loads(payload))