async def main():
    """Page through a Zhihu topic timeline feed (40 pages x 10 entries).

    Each feed entry's id, title and author name are appended as one
    "id,title,name" line to zhihu_id.txt and inserted into the `question`
    MySQL table via db.MysqlClient. Network I/O goes through the
    module-level ``fetch`` coroutine.
    """
    async with aiohttp.ClientSession() as session:
        start_url = 'https://www.zhihu.com/api/v4/topics/19592502/feeds/timeline_question?include=data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;data[?(target.type=question)].target.annotation_detail,comment_count;&limit=10&offset={pageToken}'
        mysql_cli = db.MysqlClient()
        # Open the output file once instead of re-opening it for every record.
        with open('zhihu_id.txt', 'a') as out:
            for page in range(40):
                url = start_url.format(pageToken=page * 10)
                html = await fetch(session, url)
                print(html)
                json_obj = json.loads(html)
                for data in json_obj['data']:
                    target = data['target']
                    # `post_id` — the original local was named `id`, shadowing the builtin.
                    post_id = str(target['id'])
                    title = str(target['title'])
                    name = str(target['author']['name'])
                    # Map ASCII commas to fullwidth '，' so each output line keeps
                    # exactly three comma-separated columns (the original
                    # replace(',', ',') was a no-op).
                    save_re = (post_id + ',' + title.replace(',', '，') + ',' +
                               name.replace(',', '，') + '\n')
                    print(save_re)
                    out.write(save_re)
                    # Escape single quotes so titles/names cannot break the raw
                    # SQL string; MysqlClient.save() only accepts a literal SQL
                    # string, otherwise a parameterized query would be preferable.
                    sql = ("insert into question(postId,title,name)"
                           " VALUES ('%s', '%s', '%s')"
                           % (post_id,
                              title.replace("'", "''"),
                              name.replace("'", "''")))
                    print(sql)
                    mysql_cli.save(sql)
def __init__(self):
    """Set up the database client, the resume-search endpoint and the downloader."""
    # Database access object shared by this crawler instance.
    self.mysql = db.MysqlClient()
    # Zhaopin resume search API endpoint (city ids: Shanghai 538, Suzhou 639).
    self.start_url = 'https://rd5.zhaopin.com/api/custom/search/resumeListV2?_=1540959006801&x-zp-page-request-id=50d84efd11d84f49b1b88d22d774142d-1540958517096-790847'
    # HTTP downloader helper.
    self.download = download.Download()
'cityId': 's130900', 'cityName': '沧州市' }, { 'cityId': 's131000', 'cityName': '廊坊市' }, { 'cityId': 's131100', 'cityName': '衡水市' }, { 'cityId': 's139100', 'cityName': '定州市' }, { 'cityId': 's130181', 'cityName': '辛集市' }] dbclient = db.MysqlClient() item_list = [] fileName = '张家口市' with open(fileName + '.txt') as f: results = f.readlines() for res in results: title = res.split(',')[0] id = res.split(',')[1].strip() __EVENTARGUMENT = res.split(',')[2].strip() obj = { 'id': id, 'title': title, '__EVENTARGUMENT': __EVENTARGUMENT, } item_list.append(obj) for item in item_list: ########## 起始位置
for i in range(1, pageNum + 1): print('当前页数:' + str(i)) #拼接url start_url = URL.format(keyword=keyword, queryType=queryType, pageToken=i, startDate=startDate, endDate=endDate) print(start_url) response = down.get_html(start_url) if response: # print(response.text): parse(item, response) else: print('网络请求失败') continue if __name__ == '__main__': #实例化数据库,下载器对象 mysqlCli = db.MysqlClient() down = download.Download() #获取配置参数 item_list = read() # with open('post.csv', 'w', encoding='gbk', errors='ignore') as f: # f.write('关键词,开始时间,结束时间,id,链接,用户名,内容,发布时间,转发数,评论数,点赞数,爬取时间\n') # with open('comment.csv', 'w', encoding='gbk', errors='ignore') as f: # f.write('id,评论id,评论内容,评论时间\n') for obj in item_list: print('当前关键词:' + obj['keyword']) main(obj)
def _save_followees(mysql_cli, name, rows):
    """Insert one page of followee records for `name` into the author table."""
    for data in rows:
        follow_name = data['name']
        # Escape single quotes so names cannot break the raw SQL string
        # (MysqlClient.save() takes a literal SQL string, so a parameterized
        # query is not available here).
        sql = ("insert into author(name,followName)"
               " VALUES ('%s', '%s')"
               % (name.replace("'", "''"), follow_name.replace("'", "''")))
        print(sql)
        mysql_cli.save(sql)


async def main():
    """Read "name,authorId,..." lines from author.txt, fetch every author's
    followee list from the Zhihu v4 API and store (name, followName) pairs
    into the `author` table via db.MysqlClient.

    Network I/O goes through the module-level ``fetch`` coroutine.
    """
    async with aiohttp.ClientSession() as session:
        mysql_cli = db.MysqlClient()

        # Build the work list, skipping anonymous/placeholder accounts.
        item_list = []
        with open('author.txt') as f:
            for line in f.readlines():
                name = line.split(',')[0]
                if name == '匿名用户' or name == '知乎用户':
                    continue
                item_list.append({'name': name, 'authorId': line.split(',')[1]})

        # BUG FIX: the original template hard-coded offset=20 and substituted the
        # paging cursor into `limit` (so the very first request asked for
        # limit=0). The cursor belongs in `offset`; the page size is a fixed 20.
        start_url = ('https://www.zhihu.com/api/v4/members/{authorId}/followees'
                     '?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender'
                     '%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F'
                     '(type%3Dbest_answerer)%5D.topics&offset={pageToken}&limit=20')

        for obj in item_list:
            print(obj)
            name = obj['name']
            authorId = obj['authorId']

            # First page (offset 0) also carries the paging metadata.
            url = start_url.format(authorId=authorId, pageToken=0)
            response = await fetch(session, url)
            print(response)
            json_obj = json.loads(response)
            if 'paging' not in json_obj:
                continue
            total = json_obj['paging']['totals']
            if total == 0:
                print('关注数为0')
                continue
            _save_followees(mysql_cli, name, json_obj['data'])

            # Remaining pages; page 0 is already saved, so ceil(total / 20) - 1
            # further requests are needed (the original looped one page past the
            # end with range(1, pageNum + 1)).
            for page in range(1, math.ceil(total / 20)):
                url = start_url.format(authorId=authorId, pageToken=page * 20)
                response = await fetch(session, url)
                print(response)
                json_obj = json.loads(response)
                _save_followees(mysql_cli, name, json_obj['data'])
def __init__(self):
    """Create the MySQL client used by this instance."""
    self.mysql = db.MysqlClient()
def __init__(self):
    """Initialize the first-stage result accumulator and the MySQL client."""
    # Collected first-stage results.
    self.one_results_list = []
    # Database handle.
    self.mysql = db.MysqlClient()
filePath = 'virusFile/' + fileName sql = "insert into virus2(virusName,virusType,uploadTime,description,filePath) values ('%s','%s','%s','%s','%s')" % ( virusName, virusType, uploadTime, description, filePath ) + "ON DUPLICATE KEY UPDATE uploadTime='%s'" % ( uploadTime) print(sql) dbClient.save(sql) except: print('error') continue if 'No additional results available' in response.text: break except: print('error') continue def start(): login() searchKey_list = ['worm', 'win32', 'win95', 'PE'] for kw in searchKey_list: get_info(kw) if __name__ == '__main__': dbClient = db.MysqlClient() start()
async def main():
    """Crawl the Zhihu question pages listed in zhihu_id.txt.

    For every question id, the JSON embedded in the page's
    <script id="js-initialData"> tag is parsed; question and answer author
    triples ("name,urlToken,hashId") are appended to author.txt, and each
    answer is stored in the `questionDetail` table via db.MysqlClient.
    Network I/O goes through the module-level ``fetch`` coroutine.
    """
    async with aiohttp.ClientSession() as session:
        mysql_cli = db.MysqlClient()

        # zhihu_id.txt lines look like "<questionId>,<questionTitle>,<author>".
        item_list = []
        with open('zhihu_id.txt') as f:
            for line in f.readlines():
                parts = line.split(',')
                item_list.append({'id': parts[0], 'question': parts[1]})

        for obj in item_list:
            print(obj['id'])
            url = 'https://www.zhihu.com/question/' + obj['id']
            print(url)
            response = await fetch(session, url)

            # Robustness fix: the original called .group(1) unconditionally and
            # raised AttributeError whenever the page was blocked or redesigned.
            match = re.search('<script id="js-initialData".*?>(.*?)</script>',
                              response)
            if match is None:
                print('initialData not found, skipping ' + url)
                continue
            json_obj = json.loads(match.group(1))
            print(json.dumps(json_obj))

            entities = json_obj['initialState']['entities']
            # Open author.txt once per question instead of once per record.
            with open('author.txt', 'a') as out:
                # Question authors first (same order as the original code).
                for question_data in entities['questions'].values():
                    author = question_data['author']
                    out.write(author['name'].replace(',', '，') + ',' +
                              author['urlToken'] + ',' + author['id'] + '\n')

                for answer_data in entities['answers'].values():
                    question = obj['question']
                    # `answer_id` — the original local shadowed nothing but used
                    # deep repeated indexing; bind the record once instead.
                    answer_id = str(answer_data['id'])
                    # Strip HTML tags from the answer, keeping only text nodes.
                    answer = ''.join(HTML(answer_data['content']).xpath('//text()'))
                    author = answer_data['author']
                    out.write(author['name'].replace(',', '，') + ',' +
                              author['urlToken'] + ',' + author['id'] + '\n')

                    # Escape backslashes and single quotes so free-text answers
                    # cannot break the statement (the original interpolated raw
                    # text: broken SQL / injection). MysqlClient.save() only
                    # accepts a literal SQL string, so parameterized queries are
                    # not available here.
                    values = [question, answer_id, answer,
                              author['name'], author['urlToken'], author['id'],
                              str(answer_data['commentCount']),
                              str(answer_data['voteupCount'])]
                    values = [v.replace('\\', '\\\\').replace("'", "''")
                              for v in values]
                    sql = ("insert into questionDetail(question,answerId,answer,"
                           "answerAuthor,answerAuthorId,answerAuthor_hashId,"
                           "commentCount,likeCount)"
                           " VALUES ('%s', '%s', '%s','%s', '%s', '%s','%s', '%s')"
                           % tuple(values))
                    print(sql)
                    mysql_cli.save(sql)