def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain,
                     nested_scrape=False)

def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain,
                     first_page_url=self.first_page_url)

def try_except_function(func):
    """Retry decorator: call the wrapped crawler method up to 10 times,
    logging each failure and sleeping 5 seconds between attempts."""
    def wrapper(self, *args, **kwargs):
        for _ in range(10):
            try:
                return func(self, *args, **kwargs)
            except (ClientError, Exception) as e:
                Crawler.log_error(e)
                if 'Not Found' in str(e):
                    # A missing profile is not worth retrying.
                    raise ValueError('Profile not found')
                time.sleep(5)
                continue
        # Falls through (returns None) once all retries are exhausted.
    return wrapper

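# A minimal usage sketch for the retry decorator above. ExampleCrawler and
# load_profile are hypothetical names added for illustration; only
# try_except_function itself comes from the snippet above.
class ExampleCrawler(Crawler):
    @try_except_function
    def load_profile(self, screen_name):
        # Any exception raised here is logged and retried up to 10 times;
        # a "Not Found" error is re-raised immediately as ValueError.
        return self.get_raw_info(screen_name=screen_name)
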
def main():
    configure_logging(settings={'LOG_LEVEL': 'INFO'})
    logging.basicConfig(level=logging.INFO)
    Crawler().run_newsletter()
    subscriber_list = Subscriber.get_contacts()
    MailSender().send(subscriber_list)

def get_subscribers_count(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    info = self.get_raw_info(screen_name=screen_name)
    subscribers = {
        'updated_at': datetime.now().astimezone().strftime('%Y-%m-%dT%H:%M:%S%z'),
        'count_subscribers': info['edge_followed_by']['count'],
    }
    return subscribers

def __request_json(self, url):
    for _ in range(5):
        try:
            query = requests.get(url, headers=self.get_headers)
            if query.status_code == 200:
                return ujson.loads(query.text)
            elif query.status_code == 403 or query.status_code == 429:
                # Forbidden / rate-limited: the guest token is probably stale,
                # so refresh it and retry.
                logging.warning(
                    f'Error {query.status_code}. The token may be invalid')
                self.guest_token = self.get_tokens()
            else:
                raise RequestError(
                    f'Error {query.status_code} while requesting JSON')
        except RequestError as e:
            time.sleep(self.delay_after_request_error)
            Crawler.log_error(e)
        except (URLError, ConnectionError, Exception) as e:
            time.sleep(self.delay_after_request_error)
            Crawler.log_error(e)
    raise GetInfoError('Unable to retrieve information about the source')

def get_subscribers_count(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    user_info = self.get_raw_info(screen_name=screen_name)
    key_exist = 'legacy' in user_info['data']['user']
    if not key_exist:
        raise GetInfoError('Failed to fetch subscriber count')
    info = user_info['data']['user']['legacy']
    subscribers = {
        'updated_at': datetime.now().astimezone().strftime('%Y-%m-%dT%H:%M:%S%z'),
        'count_subscribers': info['followers_count'],
    }
    return subscribers

def get_info(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    info = self.get_raw_info(screen_name=screen_name)
    parsed_info = {
        'name': info['full_name'],
        'link': f'https://www.instagram.com/{info["username"]}',
        'internal_id': info['id'],
        'avatar': info['profile_pic_url'],
        'type_social': 'IN',
    }
    if info['full_name'] == '':
        parsed_info.update({'name': info['username']})
    if not parsed_info['avatar']:
        parsed_info.update({'avatar': info['profile_pic_url_hd']})
    return parsed_info

def get_info(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    user_info = self.get_raw_info(screen_name=screen_name)
    user_id = user_info['data']['user']['rest_id']
    key_exist = 'legacy' in user_info['data']['user']
    if not key_exist:
        raise GetInfoError('Failed to fetch account information')
    info = user_info['data']['user']['legacy']
    parsed_info = {
        'name': info['name'],
        'link': f'https://twitter.com/{info["screen_name"]}',
        'internal_id': str(user_id),
        'avatar': info['profile_image_url_https'].replace('_normal', ''),
        'type_social': 'TW',
    }
    if info['name'] == '':
        parsed_info.update({'name': info['screen_name']})
    return parsed_info

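# Illustrative shape of the user_info payload that the Twitter get_info /
# get_subscribers_count methods above expect. The values are made up for the
# example; only the key layout mirrors the lookups in the code
# ('data' -> 'user' -> 'rest_id' / 'legacy').
sample_user_info = {
    'data': {
        'user': {
            'rest_id': '123456',
            'legacy': {
                'name': 'Example Account',
                'screen_name': 'example',
                'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1/photo_normal.jpg',
                'followers_count': 42,
            },
        },
    },
}
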
def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain)

def main():
    Crawler(baseurl="https://movie.douban.com/top250?start=",
            save_path="database/douban_top250.db",
            max_page=10,
            max_per_page=25,
            patterns=DoubanPatterns).get_data().savedata()

def get_internal_id(self, link):
    screen_name = Crawler.get_screen_name(link)
    info = self.get_raw_info(screen_name=screen_name)
    return info['id']

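# Usage sketch showing how these crawler methods fit together for a single
# profile. InstagramCrawler and the profile link are assumed names/values;
# only the method signatures come from the snippets above.
if __name__ == '__main__':
    crawler = InstagramCrawler()
    link = 'https://www.instagram.com/example'
    print(crawler.get_internal_id(link))
    print(crawler.get_info(link))
    print(crawler.get_subscribers_count(link))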