Example #1
async def crawl_follow(self):
    while True:
        follow_dict = self.redis_job_now.fetch_job(JobType.follower.value)
        if follow_dict:
            try:
                await self.grab_follow(follow_dict)
                LOGGER.info('finished follow crawl for %s' % follow_dict['uid'])
            except TimeoutError as e:
                LOGGER.warning(e)
            except Exception:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
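
All of these crawl loops lean on a JobType enum and a Redis-backed job queue that the listing never shows. A minimal sketch of what they might look like (names and key layout are assumptions inferred from the snippets; the awaited redis_job variant would expose the same interface on an async Redis client such as redis.asyncio):

import json
from enum import Enum

import redis


class JobType(Enum):
    user = 'user'
    tweet = 'tweet'
    repost = 'repost'
    comment = 'comment'
    follower = 'follower'
    search = 'search'
    topic = 'topic'
    superfan = 'superfan'


class RedisJob:
    """Hypothetical synchronous job queue matching redis_job_now."""

    def __init__(self, host='localhost', port=6379):
        self.conn = redis.StrictRedis(host=host, port=port,
                                      decode_responses=True)

    def fetch_job(self, job_type):
        # LPOP returns None on an empty queue, which is why every loop
        # guards with `if job_info:` before working on the job
        raw = self.conn.lpop('job:%s' % job_type)
        return json.loads(raw) if raw else None

    def push_job(self, job_type, job_dict):
        self.conn.rpush('job:%s' % job_type, json.dumps(job_dict))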
Example #2
async def crawl_repost(self):
    while True:
        repost_job_info = await self.redis_job.fetch_job(JobType.repost.value)
        if repost_job_info:
            try:
                await self.grab_tweet_repost(repost_job_info)
            except TimeoutError:
                pass  # timeouts are expected; fetch the next job
            except Exception:
                LOGGER.error('repost crawl failed')
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
Example #3
async def search(self):
    while True:
        search_job_info = await self.redis_job.fetch_job(
            JobType.search.value)
        if search_job_info:
            try:
                await self.search_tweet(search_job_info)
            except TimeoutError:
                pass  # timeouts are expected; fetch the next job
            except Exception:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
Example #4
def topic_finding_now(self):
    # one-shot variant: processes a single topic job instead of looping
    topic_job_info = self.redis_job_now.fetch_job(JobType.topic.value)
    if topic_job_info:
        try:
            LOGGER.info('topic finding: %s' % topic_job_info)
            self.search_topic_user_now(topic_job_info)
        except TimeoutError:
            LOGGER.info('topic finding timeout error')
        except Exception:
            LOGGER.error(traceback.format_exc())
            sleep(5 * 60)  # back off before retrying
Example #5
async def crawl_comment(self):
    while True:
        comment_job_info = await self.redis_job.fetch_job(JobType.comment.value)
        if comment_job_info:
            try:
                # asyncio.run_coroutine_threadsafe(self.grab_tweet_comments(comment_job_info), self.loop)
                await self.grab_tweet_comments(comment_job_info)
            except TimeoutError:
                pass  # timeouts are expected; fetch the next job
            except Exception:
                LOGGER.error('comment crawl failed')
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
Example #6
async def crawl_follow(self):
    while True:
        follow_dict = await self.redis_job.fetch_job(JobType.follower.value)
        if follow_dict:
            try:
                await self.grab_follow(follow_dict)
            except TimeoutError:
                pass  # timeouts are expected; fetch the next job
            except Exception:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
Example #7
async def super_fan_finding(self):
    while True:
        topic_job_info = self.redis_job_now.fetch_job(JobType.superfan.value)
        if topic_job_info:
            try:
                LOGGER.info('super fan finding: %s' % topic_job_info)
                await self.search_super_fan(topic_job_info)
            except TimeoutError:
                LOGGER.info('super fan finding timeout error')
            except Exception:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
Example #8
def main():
    # RedisCookies.clean()
    weibo_login = WeiboLogin()
    success = []
    failed = []
    for account in ACCOUNTS:
        try:
            LOGGER.info('getting cookies for %s' % str(account))
            cookies = weibo_login.login_by_selenium(account['user'], account['password'])
            # a usable session must carry the SSO login cookies
            if cookies is not None and 'SSOLoginState' in cookies and 'SUBP' in cookies and 'SUHB' in cookies:
                success.append(account)
                RedisCookies.save_cookies(account['user'], cookies)
            else:
                failed.append(account)
        except Exception:
            LOGGER.error('getting cookies failed')
            traceback.print_exc()
            failed.append(account)
    LOGGER.info('%d accounts logged in successfully' % len(success))
    LOGGER.info('%d accounts failed to log in' % len(failed))
Example #9
async def crawl_user(self):
    while True:
        user_job_info = await self.redis_job.fetch_job(JobType.user.value)
        if user_job_info:
            try:
                # asyncio.run_coroutine_threadsafe(self.grab_user_info(user_job_info['user_id']), self.loop)
                await self.grab_user_info(user_job_info['user_id'])
                # await self.redis_job.push_job(JobType.tweet.value,
                #                               {'url': 'https://weibo.cn/' + user_job_info['user_id'],
                #                                'uid': user_job_info['user_id']})
                # await self.redis_job.push_job(JobType.follower.value,
                #                               {'url': self.follow_url % user_job_info['user_id'],
                #                                'uid': user_job_info['user_id']})
                # self.weibo_queue.put({'url': self.user_tweet_url % user_id, 'uid': user_id})
                # self.follow_queue.put({'uid': user_id, 'url': self.follow_url % user_id})
            except TimeoutError:
                pass  # timeouts are expected; fetch the next job
            except Exception:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
Example #10
async def crawl_weibo(self):
    r = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)')
    while True:
        tweet_job_info = await self.redis_job.fetch_job(JobType.tweet.value)
        if tweet_job_info:
            m = r.findall(tweet_job_info['url'])
            if m:
                page_no = int(m[0][1])
                if page_no > 200:
                    # skip jobs beyond page 200 of a user's tweet list
                    LOGGER.info('job passed %s' % str(tweet_job_info))
                    continue
            # if 'page=' in tweet_job_info['url']:
            #     LOGGER.info('job passed %s' % str(tweet_job_info))
            #     continue

            try:
                await self.grab_user_tweet(tweet_job_info)
            except TimeoutError:
                pass  # timeouts are expected; fetch the next job
            except Exception:
                LOGGER.error(traceback.format_exc())
                sleep(5 * 60)  # back off before retrying
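
For clarity, here is what the page-cap regex in crawl_weibo extracts from a job URL:

import re

r = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)')
m = r.findall('https://weibo.cn/1316949123?page=201')
# findall yields [('1316949123', '201')]: group 1 is the uid, group 2 the
# page number; crawl_weibo reads m[0][1] and skips anything past page 200
assert m and int(m[0][1]) == 201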
Example #11
def login_by_selenium(self, weibo_user, weibo_password):
    browser = webdriver.Firefox()
    browser.maximize_window()
    try_time = 5
    cookie_got = False
    browser.get('https://weibo.com/login.php')
    username = browser.find_element_by_id("loginname")
    username.clear()
    username.send_keys(weibo_user)
    psd = browser.find_element_by_xpath('//input[@type="password"]')
    psd.clear()
    psd.send_keys(weibo_password)
    commit_btn = browser.find_element_by_xpath(
        '//a[@node-type="submitBtn"]')
    commit_btn.click()
    # login is not instantaneous
    sleep(5)
    while try_time:
        try:
            # if login did not succeed, a captcha box is shown
            browser.find_element_by_xpath(
                '//div[@node-type="verifycode_box"]')
            code_input = browser.find_element_by_xpath(
                '//input[@node-type="verifycode"]')
            LOGGER.info('need to input verify code')
            code_input.send_keys('  ')
            img_path = self.save_verify_code_img(browser, weibo_user)

            # wait until the captcha image has been written to disk
            while not os.path.exists(img_path):
                LOGGER.info('%s does not exist yet' % img_path)
                sleep(1)

            captcha_id, code_text = self.yun_da_ma.recognize(img_path)
            # os.remove(img_path)
            code_str = bytes.decode(code_text)
            LOGGER.info('recognize result: %s' % code_str)

            code_input.clear()
            code_input.send_keys(code_str)
            commit_btn = browser.find_element_by_xpath(
                '//a[@node-type="submitBtn"]')
            commit_btn.click()
            # wait a moment
            sleep(3)
            try_time -= 1
        except (StaleElementReferenceException, NoSuchElementException):
            # no captcha box on the page any more: login succeeded
            cookie_got = True
            LOGGER.info('login success')
            break
        except ElementNotInteractableException:
            sleep(2)
            try_time -= 1

    if cookie_got:
        sleep(2)
        LOGGER.info('get https://weibo.cn/1316949123/info')
        browser.get('https://weibo.cn/1316949123/info')
        sleep(2)
        cookies_dict = {}
        for elem in browser.get_cookies():
            cookies_dict[elem['name']] = elem['value']
        # RedisCookies.save_cookies(weibo_user, cookies_dict)
        browser.quit()  # quit() also stops the driver process; close() does not
        return cookies_dict
    else:
        browser.quit()
        LOGGER.error('get cookies failed: %s' % weibo_user)
        return None
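
Note that this snippet targets the old Selenium API: the find_element_by_* helpers were removed in Selenium 4, so on a current install the same lookups would be written with By locators:

from selenium.webdriver.common.by import By

username = browser.find_element(By.ID, 'loginname')
psd = browser.find_element(By.XPATH, '//input[@type="password"]')
commit_btn = browser.find_element(By.XPATH, '//a[@node-type="submitBtn"]')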
Example #12
def info(self, user_id):
    base_home_url = 'https://weibo.com/p/100306%s/info?mod=pedit_more' % user_id
    LOGGER.info('info task: %s' % base_home_url)
    cookies_json = RedisCookies.fetch_cookies()
    cookies = cookies_json['cookies']
    headers = self.get_header()
    session = requests.Session()
    headers['Host'] = 'weibo.com'
    headers['Referer'] = 'https://weibo.com/p/100306%s/home' % user_id
    # https://weibo.com/p/1003061316949123/home?from=page_100306&mod=TAB&is_all=1
    # 'http://weibo.com/2606356035/fans?from=100505&wvr=6&mod=headfans&current=fans'
    headers['Upgrade-Insecure-Requests'] = '1'
    headers.pop('Connection', None)
    headers.pop('Accept', None)
    headers['Proxy-Connection'] = 'keep-alive'
    try_time = 0
    info_html = None
    info_html_str = ''
    while try_time < 10:
        resp_text = session.get(url=base_home_url,
                                headers=headers,
                                cookies=cookies,
                                verify=False).text
        view_json = self.find_fm_view_json(html=resp_text)
        for r_json in view_json:
            if 'Pl_Official_PersonalInfo__58' == r_json['domid']:
                info_html_str = r_json['html']
                break
        if info_html_str != '':
            info_html = BeautifulSoup(info_html_str, 'html.parser')
            # an iframe in the fragment means an interstitial page; retry
            iframe = info_html.find_all('iframe')
            if not iframe:
                break
        try_time += 1

    if info_html is not None:
        # user = User()
        # user.user_id = user_id
        # if not db_session.query(exists().where(User.user_id == user_id)).scalar():
        #     db_session.add(user)
        #     db_session.commit()
        lis = info_html.find_all('li', 'clearfix')
        info_dict = {}
        for li in lis:
            try:
                title = li.find('span', 'pt_title').text
                pt_detail = li.find('span', 'pt_detail')
                all_a = pt_detail.find_all('a')
                if all_a:
                    detail = ','.join([a.text for a in all_a])
                else:
                    detail = pt_detail.text

                detail = detail.replace('\n', '').replace('\t', '').replace('\r', '')

                # map the visible label (minus its trailing colon) to a field name
                value = self.info_map.get(title[:-1], None)
                if value:
                    info_dict[value] = detail
            except Exception:
                LOGGER.error('info task error: %s' %
                             traceback.format_exc())
                continue
        # app.send_task('tasks.user.fans', args=(user_id,))
        if info_dict:
            LOGGER.info('info task result: %s' % info_dict)
            # try:
            #     db_session.query(User).filter(User.user_id == user_id).update(info_dict)
            #     db_session.commit()
            # except:
            #     db_session.rollback()
            #     LOGGER.error('info task error: %s' % traceback.format_exc())
        return info_dict
    return None
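
find_fm_view_json is not shown in the listing. weibo.com embeds page fragments as FM.view({...}) script calls, so a plausible implementation (pattern and error handling are assumptions) extracts and decodes each payload:

import json
import re

FM_VIEW_PATTERN = re.compile(r'FM\.view\((\{.*?\})\)</script>', re.S)


def find_fm_view_json(html):
    # each FM.view payload is a JSON object; info() scans them for the
    # one whose 'domid' is Pl_Official_PersonalInfo__58 and reads 'html'
    views = []
    for raw in FM_VIEW_PATTERN.findall(html):
        try:
            views.append(json.loads(raw))
        except ValueError:
            continue  # skip fragments that are not valid JSON
    return views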