def get_url_from_web(user_id):
    """Get user info according to user id.

    If the user domain is 100505, the url is just 100505 + userid;
    if the domain is 103505 or 100306, one more request is needed to get
    the info; if the user type is enterprise or service, only the home
    page info is crawled.

    :param user_id: weibo user id
    :return: user entity, or None when the id is empty, the profile page
        is 404, or no user detail could be extracted
    """
    # Guard: nothing to do without an id.
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url, auth_level=1)

    # Guard: profile page does not exist.
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    if domain in ('103505', '100306'):
        # writers (special users): a second request reaches the detail page
        url = BASE_URL.format(domain, user_id)
        html = get_page(url)
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        # Optionally record whether this user is a fan of the configured uids.
        samefollow_uid = get_samefollow_uid()
        if samefollow_uid.strip() != '':
            samefollow_uid = samefollow_uid.split(',')
            url = SAMEFOLLOW_URL.format(user_id)
            isFanHtml = get_page(url, auth_level=2)
            person.get_isFan(isFanHtml, samefollow_uid, user_id)
    else:
        # enterprise or service accounts: only home page info is available
        user = get_enterprise_detail(user_id, html)

    # Guard: detail extraction failed.
    if user is None:
        return None

    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    # Only persist users whose name could be parsed from the page.
    if user.name:
        UserOper.add_one(user)
        storage.info(
            'Has stored user {id} info successfully'.format(id=user_id))
        return user

    return None
def get_url_from_web(user_id):
    """Get user info according to user id.

    If the user domain is 100505, the url is just 100505 + userid;
    if the domain is 103505 or 100306, one more request is needed to get
    the info; if the user type is enterprise or service, only the home
    page info is crawled.

    :param user_id: weibo user id
    :return: user entity, or None when the id is empty, the profile page
        is 404, or no user detail could be extracted
    """
    # Guard: nothing to do without an id.
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url, auth_level=1)

    # Guard: profile page does not exist.
    if is_404(html):
        return None

    domain = public.get_userdomain(html)

    if domain in ('103505', '100306'):
        # writers (special users): a second request reaches the detail page
        url = BASE_URL.format(domain, user_id)
        html = get_page(url)
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        # Optionally record whether this user is a fan of the configured uids.
        samefollow_uid = get_samefollow_uid()
        if samefollow_uid.strip() != '':
            samefollow_uid = samefollow_uid.split(',')
            url = SAMEFOLLOW_URL.format(user_id)
            isFanHtml = get_page(url, auth_level=2)
            person.get_isFan(isFanHtml, samefollow_uid, user_id)
    else:
        # enterprise or service accounts: only home page info is available
        user = get_enterprise_detail(user_id, html)

    # Guard: detail extraction failed.
    if user is None:
        return None

    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    # Only persist users whose name could be parsed from the page.
    if user.name:
        UserOper.add_one(user)
        storage.info('Has stored user {id} info successfully'.format(id=user_id))
        return user

    return None
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """Fetch a page with retries, cookie/proxy selection and ban detection.

    :param url: url to crawl
    :param auth_level: 0 stands for need nothing, 1 stands for no login but
        need cookies, 2 stands for need login.
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request need a http/https proxy
    :return: response text; when an exception is raised or all retries are
        exhausted, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    # Retry loop: each failure path bumps `count` and continues; success
    # returns directly from inside the loop.
    while count < MAX_RETRIES:
        if auth_level == 2:
            # Logged-in request: pull (account, cookies, proxy) from the pool.
            name_cookies = Cookies.fetch_cookies()
            if name_cookies is None:
                crawler.warning(
                    'No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired'
                )
                send_email()
                # Kill the parent worker process — no usable accounts remain.
                os.kill(os.getppid(), signal.SIGTERM)
            # There is no difference between http and https address.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            # Anonymous/cookie-only request: get a proxy without logging in.
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            # crawler.warning('No available ip in ip pools. Using local ip instead.')
        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT, verify=False, proxies=proxy)
            elif auth_level == 1:
                # Shared anonymous cookies, no account login required.
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT, verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            # Network-level failure: back off briefly and retry.
            crawler.warning(
                'Excepitons are raised when crawling {}.Here are details:{}'.
                format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue
        if resp.status_code == 414:
            # 414 is used by weibo as an IP-block signal here.
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                # Without a proxy pool there is no way to recover; alert and
                # kill the parent worker process.
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            # Normalize the text through utf-8, dropping unencodable chars.
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            # Empty body counts as a failed attempt.
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                # Account is banned: freeze it, drop its cookies, retry with
                # another account on the next iteration.
                crawler.warning('Account {} has been banned'.format(
                    name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue
        # Non-ajax pages must be structurally complete to be accepted.
        if not is_ajax and not is_complete(page):
            count += 1
            continue
        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        # Success: record the url as crawled (flag 1) and return the page.
        Urls.store_crawl_url(url, 1)
        return page
    # All retries exhausted: record the url as failed (flag 0).
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """Fetch a page with retries, cookie/proxy selection and ban detection.

    :param url: url to crawl
    :param auth_level: 0 stands for need nothing, 1 stands for no login but
        need cookies, 2 stands for need login.
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request need a http/https proxy
    :return: response text; when an exception is raised or all retries are
        exhausted, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    # Retry loop: each failure path bumps `count` and continues; success
    # returns directly from inside the loop.
    while count < MAX_RETRIES:
        if auth_level == 2:
            # Logged-in request: pull (account, cookies, proxy) from the pool.
            name_cookies = Cookies.fetch_cookies()
            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                # Kill the parent worker process — no usable accounts remain.
                os.kill(os.getppid(), signal.SIGTERM)
            # There is no difference between http and https address.
            proxy = {'http': name_cookies[2], 'https': name_cookies[2], }
        else:
            # Anonymous/cookie-only request: get a proxy without logging in.
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            # crawler.warning('No available ip in ip pools. Using local ip instead.')
        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT, verify=False, proxies=proxy)
            elif auth_level == 1:
                # Shared anonymous cookies, no account login required.
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT, verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            # Network-level failure: back off briefly and retry.
            crawler.warning('Excepitons are raised when crawling {}.Here are details:{}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue
        if resp.status_code == 414:
            # 414 is used by weibo as an IP-block signal here.
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                # Without a proxy pool there is no way to recover; alert and
                # kill the parent worker process.
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        if resp.text:
            # Normalize the text through utf-8, dropping unencodable chars.
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            # Empty body counts as a failed attempt.
            count += 1
            continue
        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)
            if is_banned(resp.url) or is_403(page):
                # Account is banned: freeze it, drop its cookies, retry with
                # another account on the next iteration.
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue
        # Non-ajax pages must be structurally complete to be accepted.
        if not is_ajax and not is_complete(page):
            count += 1
            continue
        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''
        # Success: record the url as crawled (flag 1) and return the page.
        Urls.store_crawl_url(url, 1)
        return page
    # All retries exhausted: record the url as failed (flag 0).
    Urls.store_crawl_url(url, 0)
    return ''