Example #1
def fetch_all_url(kw, begin, count, name_cookies):
    # Do one search first and save the result
    max_num = get_total(kw, begin, 1, name_cookies)
    while max_num > begin:
        search_url = get_search_biz_url(query=kw, begin=begin, count=count)
        logger.info(search_url)
        keyword.excute_user_crawl_task(search_url)
        begin += count
Example #2
def search_keyword(kw):
    # Get an account cookie from the queue
    name_cookies = Cookies.fetch_cookies()
    if len(name_cookies) == 0:
        logger.error("No cookie available.")
    else:
        token = get_token(name_cookies)
        fetch_all(kw, 340, 5, name_cookies, token)
        logger.info('Finished searching official accounts =========> keyword: ' + kw)
        # Mark the keyword as disabled
        KeywordDao.set_enable(kw, False)
Example #3
def fetch_all(kw, begin, count, name_cookies, token):
    # Do one search first and save the result
    max_num = get_total(kw, begin, count, name_cookies, token)
    while max_num > begin:
        time.sleep(random.randint(10, 15))
        logger.info('Paging ################### begin=' + str(begin))
        try:
            begin += count
            search_by_page(kw, begin, count, name_cookies, token)
        except Exception:
            logger.error('Crawl error!')
Example #4
def search_by_page(wechat_biz, begin, count, name_cookies):
    logger.info('Start searching official account =========>: ' + str(begin) + ':' + wechat_biz)
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    token = cookies['token']
    search_url = get_search_wechat_url(fakeid=wechat_biz,
                                       begin=begin,
                                       count=count,
                                       token=token)
    logger.info(search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    return search_response
Example #5
def search_by_page(kw, begin, count, name_cookies):
    logger.info('Start searching keyword =========>: ' + str(begin) + ':' + kw)
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    token = cookies['token']
    search_url = get_search_biz_url(query=kw,
                                    begin=begin,
                                    count=count,
                                    token=token)
    logger.info(search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    return search_response
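
Examples #4 and #5 (and several later ones) unpack name_cookies by index and read a token out of the cookie dict. A minimal sketch of the tuple shape these snippets appear to assume; every value below is a hypothetical placeholder, not taken from the source:

name_cookies = ('some_login_user', {'slave_sid': 'placeholder', 'token': '123456789'})
login_user = name_cookies[0]   # account name, used only for logging
cookies = name_cookies[1]      # cookie dict passed to requests.get(..., cookies=cookies)
token = cookies['token']       # token appended to the mp.weixin.qq.com search URLs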
Example #6
def get_article_url_list(search_url):
    # Get an account cookie from the queue
    name_cookies = get_cookie()
    while name_cookies is None:
        logger.info('Cookie is empty, fetching again in 30s')
        time.sleep(30)
        name_cookies = get_cookie()

    cookies = name_cookies[1]
    token = cookies['token']
    search_url = search_url + '&token=' + token
    logger.info("searching article ==========>:" + search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    lists = search_response.json().get('app_msg_list')

    if not os.path.exists(url_save_path):
        os.mkdir(url_save_path)
    urls_file = open(url_save_path + '/urls.txt', 'a+', encoding='utf-8')
    for item in lists:
        json_str = json.dumps(item, ensure_ascii=False)
        logger.info(json_str)
        try:
            link = item.get('link')
            # Write the URL to the file
            urls_file.write(link + '\n')
            # Dispatch the WeChat article crawl task
            wechat_crawl.excute_wechat_crawl_task(link)

        except Exception:
            logger.info("Failed to save: " + json_str)

    urls_file.close()
    random_time = random.randint(15, 30)
    time.sleep(random_time)
Example #7
def search_keyword(kw):
    # Get an account cookie from the queue
    name_cookies = get_cookie()
    while name_cookies is None:
        logger.info('Cookie is empty while searching keyword ' + kw + ', fetching again')
        time.sleep(30)  # avoid a busy loop while waiting for a usable cookie
        name_cookies = get_cookie()
    user_crawl.fetch_all_url(kw, start, page_size, name_cookies)
    logger.info('Official account search task finished =========> keyword: ' + kw)
    # Mark the keyword as disabled
    KeywordDao.set_enable(kw, False)


# if __name__ == '__main__':
#
#     search_keyword('datangleiyin')
Example #8
def fetch_user_all_url(wechat_biz):
    begin = 0
    count = ARTICLE_PAGE_LIMIT
    name_cookies = get_cookie()
    # Do one search first and save the result
    max_num = get_total(wechat_biz, begin, 1, name_cookies)
    while max_num > begin:
        search_url = get_search_wechat_url(fakeid=wechat_biz,
                                           begin=begin,
                                           count=count)
        logger.info(search_url)

        # Dispatch the search URL task
        wechat_crawl.excute_wechat_url_crawl_task(search_url)

        begin += count
    WeChatUserDao.set_history_crawled(wechat_biz)
Example #9
def search_by_page(kw, begin, count, name_cookies, token):
    logger.info('Start searching keyword =========>: ' + str(begin) + ':' + kw)
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    query_id = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': kw,
        'begin': '{}'.format(str(begin)),
        'count': count,
    }
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_response = requests.get(search_url,
                                   cookies=cookies,
                                   headers=header,
                                   params=query_id)
    lists = search_response.json().get('list')
    for item in lists:
        json_str = json.dumps(item, ensure_ascii=False)
        logger.info(json_str)
        try:
            WeChatUserDao.create_by_json(item)
        except Exception:
            logger.info("Failed to save: " + json_str)
    return search_response.json()
Example #10
def get_user_list(search_url):
    # Get an account cookie from the queue
    name_cookies = get_cookie()
    while name_cookies is None:
        time.sleep(30)  # avoid a busy loop while waiting for a usable cookie
        name_cookies = get_cookie()

    cookies = name_cookies[1]
    token = cookies['token']
    search_url = search_url + '&token=' + token
    logger.info("searching==========>:" + search_url)
    search_response = requests.get(search_url, cookies=cookies, headers=header)
    lists = search_response.json().get('list')
    for item in lists:
        json_str = json.dumps(item, ensure_ascii=False)
        logger.info(json_str)
        try:
            WeChatUserDao.create_by_json(item)
            # TODO:
            # Trigger the task that crawls this account's article URLs, keyed by the account's fakeid (its biz)
            logger.info("Start crawling official account: " + item.get('nickname'))
            wechat_crawl.excute_wechat_user_crawl_task(item.get('fakeid'))
        except Exception as e:
            logger.error("Failed to save official account: " + json_str)
            logger.error(e)
    # Sleep for a while after finishing
    random_time = random.randint(20, 50)
    time.sleep(random_time)
Example #11
def put(self, item):
    json_str = json.dumps(item, ensure_ascii=False, cls=CJsonEncoder)
    crawler.info(json_str)
    return item
Example #12
def get_article(article_url):
    is_crawled = Urls.is_crawled_url(article_url)
    if (is_crawled == 1):
        logger.info("ignore crawled page : " + article_url)
        return
    logger.info("crawling page : " + article_url)
    response = requests.get(article_url, headers=header_wechat, timeout=(3, 5))
    soup = BeautifulSoup(response.content, 'lxml')
    html_str = str(soup)
    try:
        # article = soup.find('div', class_='rich_media')
        # title = article.find('h2').get_text()
        # meta_content = soup.find(id='meta_content')
        # time_str = meta_content.find(id='post-date').get_text()
        # nickname = meta_content.find_all('em')[1].get_text()
        # Extract shared content first; if absent, fall back to the main body
        # (for a shared article the content lives in js_share_content and js_content is empty)
        content_div = soup.find(id='js_share_content')
        if content_div is None:
            content_div = soup.find(id='js_content')
        # content = content_div.get_text()
        content = str(content_div)
        msg_title = (re.search('(var msg_title = ")(.*)"', html_str).group(2))
        nickname = (re.search('(var nickname = ")(.*)"', html_str).group(2))
        alias = (re.search('(var user_name = ")(.*)"', html_str).group(2))
        publish_timestamp = (re.search('(var ct = ")(.*)"', html_str).group(2))
        publish_time = timestamp_datetime(publish_timestamp, type='s')
        # publish_time = time.localtime(int(publish_time_long))
        # publish_time = (re.search('(var publish_time = ")(.*)" ', html_str).group(2))
        round_head_img = (re.search('(var round_head_img = ")(.*)"',
                                    html_str).group(2))
        ori_head_img_url = (re.search('(var ori_head_img_url = ")(.*)"',
                                      html_str).group(2))
        msg_desc = (re.search('(var msg_desc = ")(.*)"', html_str).group(2))
        msg_source_url = (re.search('(var msg_source_url = \')(.*)\'',
                                    html_str).group(2))

        if msg_title:
            item = WeChatData()
            item.url = response.url
            item.title = msg_title
            item.nickname = nickname
            item.alias = alias
            item.pub_time = publish_time
            item.round_head_img = round_head_img
            item.ori_head_img_url = ori_head_img_url
            item.msg_desc = msg_desc
            item.msg_source_url = msg_source_url
            item.content = content
            # Process the article
            try:
                dic = class_to_dict(item)
                del dic['_state']
                dic['date_modified'] = dic[
                    'date_created'] = timestamp_datetime(time.time(), type='s')
                # json_str = json.dumps(dic,ensure_ascii=False)
                # Send to Kafka first (it must go to Kafka before saving, otherwise the date format breaks the send)
                MyKafkaProducer.get_instance().send(dic)
                # Save to the database
                WeChatData.save(item)
                Urls.store_crawled_url(article_url)
            except Exception as err:
                logger.error("Error while saving WeChat article:")
                logger.error(err)
                Urls.store_save_failed_url(article_url)
                # dic = class_to_dict(item)
                # urls_file = open(url_save_path + '/articles.txt', 'a+', encoding='utf-8')
                # urls_file.writelines(str(dic) + '\n')
                # urls_file.close()
    except Exception as e:
        Urls.store_crawl_failed_url(article_url)
        logger.error(e)


# if __name__ == '__main__':
# share_article_url  = "https://mp.weixin.qq.com/s?__biz=MzA5MzIwNDk5Ng==&mid=2649719243&idx=2&sn=175caa81361cfea8acbb5ead4211b5b6&chksm=887a46a0bf0dcfb6d4c6c4e12eea067446d7e6035e45404d9a0666ab66822e0f7027e37cbb3e#rd"
# article_url ='http://mp.weixin.qq.com/s?__biz=MzA3NjcwNjgyOQ==&mid=204671295&idx=3&sn=731238aecce2c638c3c1157092650e18#rd'
# get_article(article_url)
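
get_article() above recovers the article metadata from JavaScript variables embedded in the page source. A minimal, self-contained sketch of that regex technique; html_str here is a hypothetical stand-in for a real response body:

import re

# The real page declares each variable on its own line, so a greedy group up to
# the closing quote captures the value.
html_str = '''
var msg_title = "Example Title";
var nickname = "Example Account";
var ct = "1546300800";
'''

msg_title = re.search('(var msg_title = ")(.*)"', html_str).group(2)
nickname = re.search('(var nickname = ")(.*)"', html_str).group(2)
publish_timestamp = re.search('(var ct = ")(.*)"', html_str).group(2)
print(msg_title, nickname, publish_timestamp)  # Example Title Example Account 1546300800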
Example #13
def get_wechat_user_urls(wechat_user_id, name_cookies):
    login_user = name_cookies[0]
    cookies = name_cookies[1]
    # 'url' is a module-level constant (presumably the mp.weixin.qq.com home page);
    # the login token is parsed from this request's redirect URL below
    response = requests.get(url=url, cookies=cookies)
    response_list = re.findall(r'token=(\d+)', str(response.url))
    if (len(response_list)):
        token = response_list[0]
        query = wechat_user_id
        crawler.info('Using account [' + login_user + '] to start crawling official account =========>: ' + query)
        urls_file = open(url_save_path + '/urls_' + query + '.txt',
                         'a+',
                         encoding='utf-8')
        query_id = {
            'action': 'search_biz',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'query': query,
            'begin': '0',
            'count': '5',
        }
        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        search_response = requests.get(search_url,
                                       cookies=cookies,
                                       headers=header,
                                       params=query_id)
        lists = search_response.json().get('list')
        if len(lists) == 0:
            crawler.info("Official account not found: " + query)
        else:
            fakeid = lists[0].get('fakeid')
            query_id_data = {
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'action': 'list_ex',
                'begin': '0',
                'count': '5',
                'query': '',
                'fakeid': fakeid,
                'type': '9'
            }
            appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
            appmsg_response = requests.get(appmsg_url,
                                           cookies=cookies,
                                           headers=header,
                                           params=query_id_data)
            max_num = appmsg_response.json().get('app_msg_cnt')
            num = int(int(max_num) / 5)
            begin = 0
            while num + 1 > 0:
                query_id_data = {
                    'token': token,
                    'lang': 'zh_CN',
                    'f': 'json',
                    'ajax': '1',
                    'random': random.random(),
                    'action': 'list_ex',
                    'begin': '{}'.format(str(begin)),
                    'count': '5',
                    'query': '',
                    'fakeid': fakeid,
                    'type': '9'
                }
                crawler.info('Paging ################### begin=' + str(begin))
                try:
                    query_fakeid_response = requests.get(appmsg_url,
                                                         cookies=cookies,
                                                         headers=header,
                                                         params=query_id_data)
                    fakeid_list = query_fakeid_response.json().get(
                        'app_msg_list')
                    for item in fakeid_list:
                        crawler.info(item.get('link'))
                        urls_file.write(item.get('link') + '\n')
                    urls_file.flush()
                    num -= 1
                    begin = int(begin)
                    begin += 5
                    time.sleep(random.randint(10, 15))
                except Exception:
                    crawler.error('Crawl error!')
            urls_file.close()
            crawler.info('Finished crawling official account =========>: ' + query)
            WeChatUserDao.set_history_crawled(query)
    else:
        crawler.info(login_user + ' cookie has expired, please log in again; deleting it...')
        Cookies.delete_cookies(login_user)
        crawler.info(login_user + ' cookie deleted successfully.')
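
The paging loop above derives the number of appmsg pages from app_msg_cnt at 5 articles per page and steps begin by 5 each iteration. A small worked sketch of that arithmetic with an arbitrary article count:

app_msg_cnt = 23                          # total articles reported by the appmsg endpoint
num = int(app_msg_cnt / 5)                # 4 full pages of 5 articles
begins = [i * 5 for i in range(num + 1)]  # the loop runs num + 1 times
print(begins)                             # [0, 5, 10, 15, 20] -> the last offset covers the partial page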