import json
import requests


def spiders(zhanghao, mima):
    # md5(), get_track() and run() are captcha helpers defined elsewhere in the project.
    try:
        ses = requests.session()
        headers = {
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Length": "24",
            "Content-Type": "application/json; charset=UTF-8",
            "Host": "www.tianyancha.com",
            "Origin": "https://www.tianyancha.com",
            "Pragma": "no-cache",
            "Referer": "https://www.tianyancha.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # Request the geetest captcha parameters.
        url = 'https://www.tianyancha.com/verify/geetest.xhtml'
        data = {"uuid": "1592968082445"}
        res = ses.post(url, headers=headers, json=data, timeout=15)
        json_res = json.loads(res.text)
        gt = json_res["data"]["gt"]
        challenge1 = json_res["data"]["challenge"]
        # Solve the slider captcha: build the drag track, then validate it.
        challenge, gt, c, s, distance, track = get_track(challenge1, gt, ses)
        result = run(challenge, gt, c, s, distance, track)
        challenge1 = result["challenge"]
        validate = result["validate"]
        print(result)
        # Log in with the account, the md5 of the password and the solved captcha.
        data1 = {
            "mobile": "{}".format(zhanghao),
            "cdpassword": "{}".format(md5(mima)),
            "loginway": "PL",
            "autoLogin": False,
            "challenge": "{}".format(challenge1),
            "validate": "{}".format(validate),
            "seccode": "{}|jordan".format(validate),
        }
        login = ses.post(url="https://www.tianyancha.com/cd/login.json",
                         json=data1, headers=headers, timeout=15)
        print(login.text)
    except Exception as e:
        print(e)
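# A minimal usage sketch, not part of the original source: the phone number and
# password below are placeholder values. spiders() hashes the password with md5,
# solves the geetest slider, and then posts the login request.
# spiders("13800000000", "example-password")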
def timing(times):
    print('Timer started')
    login_wechat()
    print('Scheduler started, current time %s,\nplease make sure the JAVA service is already running' % time.strftime("%H:%M:%S"))
    while 1:
        i = time.strftime("%H:%M:00")
        t = time.strftime("%H:%M:%S")
        time.sleep(10)
        if i == times:
            print(i, 'start printing')
            send_news()
            time.sleep(60)
            print("Today's printing is done; it will run again tomorrow")
        elif i == '08:50:00':
            epr = EPRCPRI()
            print('start deleting')
            # epr.delpdf()
            spider.run()
        elif i == '18:01:00':
            print('checking whether WeChat is online')
            send_text()
            time.sleep(60)
        elif times.endswith('00:00'):
            print('Timer running... it is now %s; printing will start at %s' % (t, times))
def main(root_page, firebase_path, meta_json={}):
    print 'root_page:', root_page
    print 'firebase_path:', firebase_path
    firebase_public_path = get_firebase_public_path(firebase_path)
    print 'firebase_public_path:', firebase_public_path

    # Crawl the site and write the results into the Firebase public directory.
    print 'beginning spider...'
    rewrites = spider.run(root_page, results_path=firebase_public_path, meta_json=meta_json)
    print 'completed spider'
    print 'rewrites:', rewrites
    add_to_firebase_json(firebase_path, rewrites)

    # Deploy from the Firebase project directory, then restore the working directory.
    original_path = os.getcwd()
    os.chdir(firebase_path)
    print 'deploying...'
    os.system('firebase deploy')
    os.chdir(original_path)
def index():
    foodlist = spider.run()
    foodnames = []
    for food in foodlist:
        foodnames.append(food.get_name())
    count = counter.Counter()
    data = json.dumps(foodnames)
    # x = {'date': [u'2012-06-28', u'2012-06-29', u'2012-06-30'], 'users': [405, 368, 119]}
    return render_template('index.html', foodlist=foodlist, count=count, data=data)
#! /usr/bin/env python
# -*- encoding=utf8 -*-
import os
import sys
import pdb

if __name__ == "__main__":
    # Make the project root importable before loading the spider package.
    path = "/".join(os.path.dirname(__file__).split("/")[:-1])
    sys.path.append(path)
    import spider

    argv = sys.argv
    arg1 = argv[1]
    if arg1 == "all":
        spider.run()
    elif arg1 == "debug":
        spider.debug = True
        spider.run_cat(argv[2], argv[3])
    elif arg1 == "test":
        if argv[2] == "me":
            spider.test_me()
        elif argv[2] == "modules":
            spider.test_modules()
        else:
            spider.load_worker_and_test(argv[2], argv[3])
    elif arg1 == "redis_proxy":
        from spider import redis_proxy
        redis_proxy.run()
    elif arg1 == "create":
        from spider import template
        template.create_module(argv[2], *argv[3:])
    else:
        # Unrecognized command.
        pass
def thread_function(name, version):
    spider.run(name, version)
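# A minimal usage sketch, assuming thread_function is meant as a thread target;
# the (name, version) pairs below are placeholder data, not from the original source.
import threading

jobs = [("example-package", "1.0.0"), ("another-package", "2.3.1")]
threads = [threading.Thread(target=thread_function, args=(name, version)) for name, version in jobs]
for t in threads:
    t.start()
for t in threads:
    t.join()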
reload(sys)
sys.setdefaultencoding("utf-8")

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='myapp.log',
    filemode='w')

login.login()
res = {}
wordlist = {}
for person in conf.persons:
    res[person] = spider.run(person, 1000)
    logging.info('%s\tdownloaded!' % person)
    f = open('./data/' + person, 'w')
    for word in res[person]:
        # Write each word and its weight to a temporary file.
        f.write('%s\t' % word)
        f.write('%f\n' % res[person][word])
        # Total word-count statistics.
        if word in wordlist:
            wordlist[word] += 1
        else:
            wordlist[word] = 1
    f.close()
# Filter out words with very high or very low repetition rates.
        }
        # Iterate over `iteration` users.
        for i in range(iteration):
            offset = 0
            log(seed)
            while True:
                # Skip users that have already been crawled.
                if seed.encode() in self.done_list:
                    seed, seed_img = self.get_next_user()
                    break
                log('iteration: ' + str(i) + ' limit: ' + str(limit) + ' offset: ' + str(offset))
                data = run_uri(self.convert_uri(seed, offset, limit), headers=headers)
                # Each request returns 20 entries; once everything for this user
                # has been processed, mark the user as done and move on.
                status = self.process_data(data)
                if status == False:
                    self.append_done_list(seed, seed_img)
                    seed, seed_img = self.get_next_user()
                    break
                offset += 20
            if seed == None:
                break
        self.save_result()


if __name__ == "__main__":
    spider = zhihu_spider()
    spider.run(seed="excited-vczh")