Пример #1
0
def spiders(zhanghao,mima):
    """Attempt a tianyancha.com login guarded by a Geetest captcha.

    Steps, all on one shared ``requests`` session:

    1. POST to ``/verify/geetest.xhtml`` to obtain ``gt`` and ``challenge``.
    2. Hand them to ``get_track`` and then ``run`` (both defined elsewhere;
       presumably they solve the slider captcha -- confirm).
    3. POST the credentials plus the ``challenge``/``validate`` tokens to
       ``/cd/login.json``.

    The captcha result and the raw login response body are printed; nothing
    is returned.

    Args:
        zhanghao: Account identifier, sent as the ``mobile`` field.
        mima: Password; it is passed through ``md5`` (defined elsewhere)
            before being sent as ``cdpassword``.

    Any ``Exception`` raised along the way is caught and printed, so callers
    only ever see ``None``.
    """
    try:
        ses=requests.session()
        headers = {
            "Accept-Encoding":"gzip, deflate, br",
            "Accept-Language":"zh-CN,zh;q=0.9",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            # NOTE(review): hard-coded length, yet the two request bodies
            # differ in size -- confirm the HTTP client recomputes it.
            "Content-Length":"24",
            "Content-Type":"application/json; charset=UTF-8",
            "Host":"www.tianyancha.com",
            "Origin":"https://www.tianyancha.com",
            "Pragma":"no-cache",
            "Referer":"https://www.tianyancha.com/",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With":"XMLHttpRequest",
        }

        # Step 1: fetch the captcha parameters.
        url='https://www.tianyancha.com/verify/geetest.xhtml'
        data = {"uuid":"1592968082445"}
        res=ses.post(url,headers=headers,json=data,timeout=15)
        json_res=json.loads(res.text)
        gt=json_res["data"]["gt"]
        challenge1=json_res["data"]["challenge"]

        # Step 2: delegate the captcha to the external helpers.
        challenge, gt, c, s, distance, track=get_track(challenge1,gt,ses)
        result=run(challenge, gt, c, s, distance, track)
        challenge1 = result["challenge"]
        validate = result["validate"]
        print(result)

        # Step 3: submit the credentials with the captcha tokens.
        data1 = {
            "mobile":"{}".format(zhanghao),
            # The previous literal had no "{}" placeholder, so the hash was
            # discarded and a fixed string of asterisks was sent instead.
            "cdpassword":"{}".format(md5(mima)),
            "loginway":"PL",
            "autoLogin":False,
            "challenge":"{}".format(challenge1),
            "validate":"{}".format(validate),
            "seccode":"{}|jordan".format(validate),
        }
        login = ses.post(url="https://www.tianyancha.com/cd/login.json",json=data1,headers=headers,timeout=15)
        print(login.text)
    except Exception as e:
        # Best-effort: report the failure instead of propagating it.
        print(e)
Пример #2
0
def timing(times):
    """Call ``login_wechat()`` and then poll the clock forever, firing jobs.

    Each pass samples the current time, sleeps ten seconds, and then compares
    the sampled minute (``"HH:MM:00"``) against the schedule:

    * equal to ``times``  -> ``send_news()`` followed by a 60 s pause;
    * ``"08:50:00"``      -> build an ``EPRCPRI`` and call ``spider.run()``;
    * ``"18:01:00"``      -> ``send_text()`` followed by a 60 s pause;
    * otherwise, when ``times`` itself ends in ``"00:00"``, print a status line.

    Args:
        times: Trigger time for the print job, as an ``"HH:MM:00"`` string.

    The loop has no exit condition, so this function never returns.
    """
    print('定時程序已啟動')
    login_wechat()
    started_at = time.strftime("%H:%M:%S")
    print('定时程序已开始,当前时间%s,\n请注意已开启JAVA服务' % started_at)

    while True:
        # Both stamps are taken before the sleep; the checks below therefore
        # look at the time as it was ten seconds ago.
        minute_stamp = time.strftime("%H:%M:00")
        clock = time.strftime("%H:%M:%S")
        time.sleep(10)

        if minute_stamp == times:
            print(minute_stamp, '开始列印')
            send_news()
            # Wait out the rest of the minute so the job is not re-triggered.
            time.sleep(60)
            print('今日列印完成 明日将继续列印')
            continue

        if minute_stamp == '08:50:00':
            # The object is only constructed; its delpdf() call is disabled.
            # NOTE(review): unlike the other branches there is no 60 s pause
            # here, so this can fire on several passes in the same minute.
            epr = EPRCPRI()
            print('开始删除')
            spider.run()
            continue

        if minute_stamp == '18:01:00':
            print('测试微信是否在线')
            send_text()
            time.sleep(60)
            continue

        # NOTE(review): this tests the configured trigger time, not the
        # current time -- confirm that is intended.
        if times.endswith('00:00'):
            print('定時程序運行中……現在是%s,将于%s开始列印' % (clock, times))
Пример #3
0
def main(root_page, firebase_path, meta_json=None):
    """Spider a site into a Firebase project directory and deploy it.

    Resolves the public path via ``get_firebase_public_path``, runs
    ``spider.run`` with that path as ``results_path``, records the returned
    rewrites with ``add_to_firebase_json``, and finally runs
    ``firebase deploy`` from inside ``firebase_path``.

    Args:
        root_page: Passed to ``spider.run`` as its first argument.
        firebase_path: Firebase project directory; the deploy command is
            executed with this as the working directory.
        meta_json: Optional value forwarded to ``spider.run``. When omitted a
            fresh empty dict is created for each call, so state can no longer
            leak between calls through a shared default.
    """
    if meta_json is None:
        meta_json = {}

    # The "%s" form prints the same text under both Python 2 and Python 3.
    print('root_page: %s' % (root_page,))
    print('firebase_path: %s' % (firebase_path,))
    firebase_public_path = get_firebase_public_path(firebase_path)
    print('firebase_public_path: %s' % (firebase_public_path,))

    print('beginning spider...')
    rewrites = spider.run(root_page, results_path=firebase_public_path, meta_json=meta_json)
    print('completed spider')
    print('rewrites: %s' % (rewrites,))
    add_to_firebase_json(firebase_path, rewrites)

    original_path = os.getcwd()
    os.chdir(firebase_path)
    try:
        print('deploying...')
        os.system('firebase deploy')
    finally:
        # Always restore the caller's working directory, even on interruption.
        os.chdir(original_path)
Пример #4
0
def index():
	"""Render ``index.html`` from the spider's current results.

	The template receives the objects returned by ``spider.run()`` as
	``foodlist``, a new ``counter.Counter()`` as ``count``, and the JSON-encoded
	list of each item's ``get_name()`` as ``data``.
	"""
	foodlist = spider.run()
	# Collect the display names before anything else touches the items.
	foodnames = [food.get_name() for food in foodlist]
	context = {
		'foodlist': foodlist,
		'count': counter.Counter(),
		'data': json.dumps(foodnames),
	}
	return render_template('index.html', **context)
Пример #5
0
#! /usr/bin/env python
#-*-encoding=utf8-*-
import os
import sys
import pdb

# Command-line entry point: the first argument selects which spider routine runs.
if __name__ == "__main__":
    # Put the parent of this script's directory on sys.path before importing
    # spider (presumably that is where the package lives -- confirm).
    # The path is split on "/" only, so other separators are not handled.
    path = "/".join(os.path.dirname(__file__).split("/")[:-1])
    sys.path.append(path)
    import spider
    argv = sys.argv
    # No length check: running without arguments raises IndexError here.
    arg1 = argv[1]
    if arg1 == "all":
        spider.run()
    elif arg1 == "debug":
        # Set the module-level debug flag, then run with the next two arguments.
        spider.debug = True
        spider.run_cat(argv[2], argv[3])
    elif arg1 == "test":
        # The second argument picks the test target; anything other than
        # "me" or "modules" is forwarded together with a third argument.
        if argv[2] == "me":
            spider.test_me()
        elif argv[2] == "modules":
            spider.test_modules()
        else:
            spider.load_worker_and_test(argv[2], argv[3])
    elif arg1 == "redis_proxy":
        # Imported only when this command is selected.
        from spider import redis_proxy
        redis_proxy.run()
    elif arg1 == "create":
        # The second argument plus all remaining arguments go to create_module.
        from spider import template
        template.create_module(argv[2], *argv[3:])
    else:
Пример #6
0
def thread_function(name, version):
    """Call ``spider.run(name, version)`` and discard its result.

    Both arguments are forwarded unchanged. The name suggests this is meant
    to be used as a thread target -- confirm against the caller.
    """
    spider.run(name, version)
Пример #7
0
#! /usr/bin/env python
#-*-encoding=utf8-*-
import os 
import sys 
import pdb 

# Command-line entry point: the first argument selects which spider routine runs.
if __name__ == "__main__": 
    # Put the parent of this script's directory on sys.path before importing
    # spider (presumably that is where the package lives -- confirm).
    # The path is split on "/" only, so other separators are not handled.
    path = "/".join(os.path.dirname(__file__).split("/")[:-1])
    sys.path.append(path) 
    import spider 
    argv = sys.argv
    # No length check: running without arguments raises IndexError here.
    arg1 = argv[1] 
    if arg1 == "all":
        spider.run() 
    elif arg1 == "debug":
        # Set the module-level debug flag, then run with the next two arguments.
        spider.debug = True
        spider.run_cat(argv[2], argv[3]) 
    elif arg1 == "test":
        # The second argument picks the test target; anything other than
        # "me" or "modules" is forwarded together with a third argument.
        if argv[2] == "me":
            spider.test_me()
        elif argv[2] == "modules":
            spider.test_modules()
        else:
            spider.load_worker_and_test(argv[2], argv[3]) 
    elif arg1 == "redis_proxy":
        # Imported only when this command is selected.
        from spider import redis_proxy
        redis_proxy.run()
    elif arg1 == "create":
        # The second argument plus all remaining arguments go to create_module.
        from spider import template
        template.create_module(argv[2], *argv[3:])
    else: 
Пример #8
0
# Python 2 only: sys.setdefaultencoding does not exist on Python 3.
# NOTE(review): on Python 2 this normally needs a prior reload(sys); none is
# visible in this snippet -- confirm it happens earlier in the file.
sys.setdefaultencoding("utf-8")

# Log everything at DEBUG level to myapp.log, truncating the file on each run.
logging.basicConfig(
    level=logging.DEBUG,
    format=
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='myapp.log',
    filemode='w')

login.login()

# res maps each person to the word -> value mapping returned by spider.run;
# wordlist counts, per word, how many times it was seen across all persons.
res = {}
wordlist = {}
for person in conf.persons:
    res[person] = spider.run(person, 1000)
    logging.info('%s\tdownloaded!', person)
    # The context manager closes the file even when a write fails part-way.
    with open('./data/' + person, 'w') as f:
        for word, weight in res[person].items():
            # Dump to the per-person temporary file as "word<TAB>value".
            f.write('%s\t%f\n' % (word, weight))

            # Running tally of word occurrences.
            wordlist[word] = wordlist.get(word, 0) + 1

#筛选重复率高和低的词
Пример #9
0
# Python 2 only: reload(sys) makes sys.setdefaultencoding reachable again so
# the process-wide default codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Log everything at DEBUG level to myapp.log, truncating the file on each run.
logging.basicConfig(level=logging.DEBUG,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='myapp.log',
    filemode='w')

login.login()

# res maps each person to the word -> value mapping returned by spider.run;
# wordlist counts, per word, how many times it was seen across all persons.
res = {}
wordlist = {}
for person in conf.persons:
    res[person] = spider.run(person, 1000)
    logging.info('%s\tdownloaded!', person)
    # The context manager closes the file even when a write fails part-way.
    with open('./data/'+person, 'w') as f:
        for word, weight in res[person].items():
            # Dump to the per-person temporary file as "word<TAB>value".
            f.write('%s\t%f\n' % (word, weight))

            # Running tally of word occurrences.
            wordlist[word] = wordlist.get(word, 0) + 1

#筛选重复率高和低的词
Пример #10
0
        }
        #迭代iteration个用户
        for i in range(iteration):
            offset = 0
            log(seed)
            while True:
                if (seed.encode() in self.done_list):
                    seed, seed_img = self.get_next_user()
                    break

                log('iteration: ' + str(i) + '  limit: ' + str(limit) +
                    '  offset: ' + str(offset))
                data = run_uri(self.convert_uri(seed, offset, limit),
                               headers=headers)

                #获取了20条数据,处理,如果都处理完毕,则该用户完成
                status = self.process_data(data)
                if status == False:
                    self.append_done_list(seed, seed_img)
                    seed, seed_img = self.get_next_user()
                    break
                offset += 20
            if seed == None:
                break
        self.save_result()


# Script entry point: build a zhihu_spider and start it from a fixed seed value.
if __name__ == "__main__":
    spider = zhihu_spider()
    spider.run(seed="excited-vczh")