예제 #1
0
async def main():
    """Crawl 40 pages of a Zhihu topic feed and persist question metadata.

    For every feed entry, appends ``id,title,author`` to ``zhihu_id.txt``
    and inserts a row into the ``question`` table.
    """
    async with aiohttp.ClientSession() as session:
        start_url = 'https://www.zhihu.com/api/v4/topics/19592502/feeds/timeline_question?include=data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;data[?(target.type=question)].target.annotation_detail,comment_count;&limit=10&offset={pageToken}'

        mysql_cli = db.MysqlClient()
        # Open the output file once, instead of re-opening it per item.
        with open('zhihu_id.txt', 'a') as out_file:
            for page in range(40):
                url = start_url.format(pageToken=page * 10)
                html = await fetch(session, url)
                print(html)
                json_obj = json.loads(html)
                for data in json_obj['data']:
                    post_id = str(data['target']['id'])
                    title = str(data['target']['title'])
                    name = str(data['target']['author']['name'])
                    # Full-width commas keep the line splittable on ',' later.
                    save_re = post_id + ',' + title.replace(',', ',') + ',' + name.replace(',', ',') + '\n'
                    print(save_re)
                    out_file.write(save_re)

                    # Double single quotes so titles/names containing ' do not
                    # break the statement.  NOTE(review): db.MysqlClient.save
                    # appears to take a raw SQL string; a parameterized query
                    # API would be the proper fix — confirm its interface.
                    sql = "insert into question(postId,title,name)" \
                          " VALUES ('%s', '%s', '%s')" \
                          % (post_id,
                             title.replace("'", "''"),
                             name.replace("'", "''"))
                    print(sql)

                    mysql_cli.save(sql)
예제 #2
0
파일: job.py 프로젝트: zack7wong/spiders
    def __init__(self):
        """Set up the MySQL client, the resume-search seed URL and the downloader."""
        # (A results.csv header bootstrap used to run here; left disabled.)
        # Shared database client.
        self.mysql = db.MysqlClient()

        # cityId reference: Shanghai 538, Suzhou 639.
        self.start_url = ('https://rd5.zhaopin.com/api/custom/search/resumeListV2'
                          '?_=1540959006801&x-zp-page-request-id='
                          '50d84efd11d84f49b1b88d22d774142d-1540958517096-790847')
        self.download = download.Download()
예제 #3
0
     'cityId': 's130900',
     'cityName': '沧州市'
 }, {
     'cityId': 's131000',
     'cityName': '廊坊市'
 }, {
     'cityId': 's131100',
     'cityName': '衡水市'
 }, {
     'cityId': 's139100',
     'cityName': '定州市'
 }, {
     'cityId': 's130181',
     'cityName': '辛集市'
 }]
 # Database client plus the work items parsed from the seed file.
 dbclient = db.MysqlClient()
 fileName = '张家口市'
 item_list = []
 # Each line of "<city>.txt" is "title,id,__EVENTARGUMENT".
 with open(fileName + '.txt') as seed_file:
     for line in seed_file.readlines():
         fields = line.split(',')
         item_list.append({
             'id': fields[1].strip(),
             'title': fields[0],
             '__EVENTARGUMENT': fields[2].strip(),
         })
 for item in item_list:  ########## 起始位置
예제 #4
0
파일: start.py 프로젝트: zack7wong/spiders
    # Walk every result page (1..pageNum inclusive).
    for i in range(1, pageNum + 1):
        print('当前页数:' + str(i))
        # Build the page URL from the search parameters.
        start_url = URL.format(keyword=keyword,
                               queryType=queryType,
                               pageToken=i,
                               startDate=startDate,
                               endDate=endDate)
        print(start_url)
        response = down.get_html(start_url)
        if response:
            # print(response.text):
            parse(item, response)
        else:
            # On a failed request, skip this page and keep going.
            print('网络请求失败')
            continue


if __name__ == '__main__':
    # Shared downloader and database client; `down` is read by the page loop,
    # mysqlCli presumably by parse/save code elsewhere — confirm.
    down = download.Download()
    mysqlCli = db.MysqlClient()
    # Load the crawl configuration entries.
    item_list = read()
    # (CSV header bootstrap for post.csv / comment.csv left disabled.)
    for obj in item_list:
        print('当前关键词:' + obj['keyword'])
        main(obj)
예제 #5
0
def _save_followees(mysql_cli, name, data_list):
    """Insert one (author, followee) row per entry in *data_list*.

    Single quotes are doubled so names like "O'Brien" do not break the
    statement.  NOTE(review): db.MysqlClient.save appears to take a raw SQL
    string; switching to parameterized queries would be the proper fix.
    """
    for data in data_list:
        followName = data['name']
        sql = "insert into author(name,followName)" \
              " VALUES ('%s', '%s')" \
              % (name.replace("'", "''"), followName.replace("'", "''"))
        print(sql)
        mysql_cli.save(sql)


async def main():
    """Read author.txt, fetch each author's followee list page by page and
    store (author, followee) pairs in the ``author`` table."""
    async with aiohttp.ClientSession() as session:

        mysql_cli = db.MysqlClient()
        item_list = []
        # Each line of author.txt is "name,authorId,...".
        with open('author.txt') as f:
            for res in f.readlines():
                name = res.split(',')[0]
                # Anonymous/placeholder accounts have no usable id — skip.
                if name == '匿名用户' or name == '知乎用户':
                    continue
                item_list.append({
                    'name': name,
                    'authorId': res.split(',')[1],
                })

        for obj in item_list:
            print(obj)
            name = obj['name']
            authorId = obj['authorId']

            # NOTE(review): {pageToken} fills the `limit` query parameter
            # while `offset` is hard-coded to 20 — offset/limit look swapped;
            # confirm against the Zhihu API before changing.
            start_url = 'https://www.zhihu.com/api/v4/members/{authorId}/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit={pageToken}'
            url = start_url.format(authorId=authorId, pageToken=0)
            response = await fetch(session, url)
            print(response)

            json_obj = json.loads(response)
            if 'paging' not in json_obj:
                continue

            total = json_obj['paging']['totals']
            if total == 0:
                print('关注数为0')
                continue

            # First page (already fetched above).
            _save_followees(mysql_cli, name, json_obj['data'])

            pageNum = math.ceil(total / 20)

            # Remaining pages, 20 entries per page.
            for i in range(1, pageNum + 1):
                url = start_url.format(authorId=authorId, pageToken=i * 20)
                response = await fetch(session, url)
                print(response)

                _save_followees(mysql_cli, name, json.loads(response)['data'])
예제 #6
0
 def __init__(self):
     """Create the shared MySQL client used by this spider."""
     self.mysql = db.MysqlClient()
예제 #7
0
 def __init__(self):
     """Initialize the result buffer and the MySQL client."""
     # Presumably accumulates per-run parsed results — confirm at call sites.
     self.one_results_list = []
     self.mysql = db.MysqlClient()
예제 #8
0
파일: virus.py 프로젝트: zack7wong/spiders
                    # Relative path where the downloaded sample is stored.
                    filePath = 'virusFile/' + fileName

                    # Upsert: insert the sample row, refreshing uploadTime on
                    # duplicate keys.  NOTE(review): values are spliced in via
                    # %s — a quote in `description` would break the SQL; also
                    # there is no space before "ON DUPLICATE" (the text becomes
                    # "...')ON DUPLICATE...") — the closing ')' likely still
                    # terminates the token for MySQL, but confirm.
                    sql = "insert into virus2(virusName,virusType,uploadTime,description,filePath) values ('%s','%s','%s','%s','%s')" % (
                        virusName, virusType, uploadTime, description, filePath
                    ) + "ON DUPLICATE KEY UPDATE uploadTime='%s'" % (
                        uploadTime)
                    print(sql)
                    dbClient.save(sql)
                except:
                    # NOTE(review): bare except swallows everything including
                    # KeyboardInterrupt — narrowing to Exception and logging
                    # the cause would be safer.
                    print('error')
                    continue

            # The page signals exhaustion with this sentinel text.
            if 'No additional results available' in response.text:
                break

        except:
            # NOTE(review): second bare except — same concern as above.
            print('error')
            continue


def start():
    """Log in once, then run a search for every keyword in the fixed list."""
    login()
    for keyword in ('worm', 'win32', 'win95', 'PE'):
        get_info(keyword)


if __name__ == '__main__':
    # Module-global DB client (dbClient.save(...) is called in the scrape
    # code above).
    dbClient = db.MysqlClient()
    start()
예제 #9
0
def _sql_escape(value):
    """Double single quotes so *value* can sit inside a '...' SQL literal.

    NOTE(review): db.MysqlClient.save appears to take a raw SQL string; a
    parameterized query API would be the proper fix — confirm its interface.
    """
    return value.replace("'", "''")


def _append_author(name, url_token, hash_id):
    """Append "name,urlToken,hashId" to author.txt.

    The full-width comma substitution keeps the line splittable on ','.
    """
    line = name.replace(',', ',') + ',' + url_token + ',' + hash_id + '\n'
    with open('author.txt', 'a') as f:
        f.write(line)


async def main():
    """For each question id in zhihu_id.txt, scrape the question page's
    embedded JSON state, record every author to author.txt and insert each
    answer into the ``questionDetail`` table."""
    async with aiohttp.ClientSession() as session:

        mysql_cli = db.MysqlClient()
        item_list = []
        # zhihu_id.txt lines are "questionId,questionTitle,...".
        with open('zhihu_id.txt') as f:
            for res in f.readlines():
                item_list.append({
                    'id': res.split(',')[0],
                    'question': res.split(',')[1],
                })

        for obj in item_list:
            print(obj['id'])
            url = 'https://www.zhihu.com/question/' + obj['id']
            print(url)
            response = await fetch(session, url)

            # The question page embeds its state as JSON in a script tag.
            jsonStr = re.search(
                '<script id="js-initialData".*?>(.*?)</script>',
                response).group(1)
            json_obj = json.loads(jsonStr)
            print(json.dumps(json_obj))

            entities = json_obj['initialState']['entities']

            for data in entities['questions']:
                author = entities['questions'][data]['author']
                _append_author(author['name'], author['urlToken'], author['id'])

            for data in entities['answers']:
                question = obj['question']
                answer_obj = entities['answers'][data]

                answerId = str(answer_obj['id'])
                # Strip markup: keep only the text nodes of the answer body.
                answer = ''.join(HTML(answer_obj['content']).xpath('//text()'))
                author = answer_obj['author']
                answerAuthor = author['name']
                answerAuthorId = author['urlToken']
                answerAuthor_hashId = author['id']
                commentCount = str(answer_obj['commentCount'])
                likeCount = str(answer_obj['voteupCount'])

                _append_author(answerAuthor, answerAuthorId,
                               answerAuthor_hashId)

                # Escape quotes: answer/question/author text routinely
                # contains ' and previously produced broken SQL statements.
                sql = "insert into questionDetail(question,answerId,answer,answerAuthor,answerAuthorId,answerAuthor_hashId,commentCount,likeCount)" \
                      " VALUES ('%s', '%s', '%s','%s', '%s', '%s','%s', '%s')" \
                      % (_sql_escape(question), answerId, _sql_escape(answer),
                         _sql_escape(answerAuthor), answerAuthorId,
                         answerAuthor_hashId, commentCount, likeCount)
                print(sql)

                mysql_cli.save(sql)