def get(self, profile_name): '''Processes the request. We don't use VKontakte API because it requires authorization and gives tokens with expiration time which is not suitable for RSS generator. ''' headers = self.__get_headers() user_agent = headers.get('user-agent', '').strip() if user_agent and ( # Google Reader bot still crawls the Web. Reject it to save # bandwidth. user_agent.startswith('Feedfetcher-Google;') or # FeedNotifier updates feeds every minute user_agent.startswith('FeedNotifier/') or # YandexBlogs bot sends a lot of requests (2/minute) for some # feeds. The support doesn't respond adequately. 'YandexBlogs' in user_agent): self.error(httplib.FORBIDDEN) return user_error = None http_status = httplib.OK unknown_user_error = False try: show_photo = (self.request.get('show_photo', '1') != '0') foreign_posts = (self.request.get('foreign_posts', '0') != '0') hash_tag_title = (self.request.get('hash_tag_title', '0') != '0') text_title = (self.request.get('text_title', '0') != '0') big_photos = (self.request.get('big_photos', '0') != '0') LOG.info( 'Requested feed for "%s" (foreign_posts = %s, show_photo = %s, hash_tag_title = %s, text_title = %s, big_photos = %s).', profile_name, foreign_posts, show_photo, hash_tag_title, text_title, big_photos) use_api = True if_modified_since = None if use_api: # Use VKontakte API from vkfeed.tools import wall_reader cur_time = int(time.time()) latency = constants.MINUTE_SECONDS min_timestamp = cur_time - constants.WEEK_SECONDS ## This confuses Google Reader users because it always requests ## feeds with 'Cache-Control: max-age=3600' when adding ## subscriptions and users often gen an empty feed. #for cache_control in headers.get('cache-control', '').split(','): # cache_control = cache_control.strip() # if cache_control.startswith('max-age='): # LOG.info('Applying Cache-Control: %s...', cache_control) # try: # cache_max_age = int(cache_control[len('max-age='):]) # except ValueError: # LOG.error('Invalid header: Cache-Control = %s.', cache_control) # else: # if cache_max_age: # min_timestamp = max(min_timestamp, cur_time - cache_max_age - latency) if 'if-modified-since' in headers and headers[ 'if-modified-since'] != '0': LOG.info('Applying If-Modified-Since: %s...', headers['if-modified-since']) try: if_modified_since = vkfeed.utils.http_timestamp( headers['if-modified-since']) except Exception as e: LOG.error('Invalid header: If-Modified-Since = %s.', headers['if-modified-since']) else: min_timestamp = max(min_timestamp, if_modified_since - latency) max_age = cur_time - min_timestamp if max_age > constants.DAY_SECONDS: max_posts_num = 10 else: max_posts_num = 50 if user_agent and vkfeed.utils.zero_subscribers(user_agent): max_posts_num /= 2 LOG.info( 'Applying the following limits: max_age=%s, max_posts_num=%s', max_age, max_posts_num) try: data = wall_reader.read(profile_name, min_timestamp, max_posts_num, foreign_posts, show_photo, hash_tag_title, text_title, big_photos) except wall_reader.ConnectionError as e: http_status = httplib.BAD_GATEWAY user_error = 'Ошибка соединения с сервером <a href="{0}" target="_blank">{0}</a>.'.format( constants.API_URL) raise except wall_reader.ServerError as e: http_status = httplib.NOT_FOUND user_error = unicode(e) raise else: # Parse HTML from site from vkfeed.tools.wall_parser import WallPageParser, ParseError, PrivateGroupError, ProfileNotAvailableError, ServerError url = constants.VK_URL + cgi.escape(profile_name) url_html = '<a href="{0}" target="_blank">{0}</a>'.format(url) if profile_name == 'feed': http_status = httplib.NOT_FOUND user_error = 'Страница {0} не является профилем пользователя или группы.'.format( url_html) raise Error('Unsupported page.') try: profile_page = vkfeed.utils.fetch_url(url) except vkfeed.utils.HTTPNotFoundError: http_status = httplib.NOT_FOUND user_error = 'Пользователя или группы {0} не существует.'.format( url_html) raise except Error: http_status = httplib.BAD_GATEWAY user_error = 'Не удалось загрузить страницу {0}.'.format( url_html) unknown_user_error = True raise try: data = WallPageParser().parse(profile_page) except PrivateGroupError as e: http_status = httplib.NOT_FOUND user_error = 'Группа {0} является закрытой группой.'.format( url_html) raise except ProfileNotAvailableError as e: http_status = httplib.NOT_FOUND user_error = 'Страница пользователя {0} удалена или доступна только авторизованным пользователям.'.format( url_html) raise except ServerError as e: LOG.debug('Page contents:\n%s', profile_page) http_status = httplib.BAD_GATEWAY user_error = 'Сервер {0} вернул ошибку{1}'.format( url_html, ':<br />' + e.server_error if e.server_error else '.') unknown_user_error = True raise except ParseError as e: LOG.debug('Page contents:\n%s', profile_page) http_status = httplib.NOT_FOUND user_error = 'Сервер вернул страницу, на которой не удалось найти стену с сообщениями пользователя.' unknown_user_error = True raise data['url'] = url if 'user_photo' not in data: data[ 'user_photo'] = constants.APP_URL + 'images/vk-rss-logo.png' LOG.info('Return %s items.', len(data['posts'])) if if_modified_since is not None and not data['posts']: http_status = httplib.NOT_MODIFIED else: feed = self.__generate_feed(data) except Exception as e: if isinstance(e, Error): if user_error and not unknown_user_error: log_function = LOG.warning else: log_function = LOG.error else: log_function = LOG.exception log_function('Unable to generate a feed for "%s": %s', profile_name, e) if user_error: self.error(http_status) error = '<p>Ошибка при генерации RSS-ленты:</p><p>{0}</p>'.format( user_error) if unknown_user_error: error += '''<p> Пожалуйста, убедитесь, что вы правильно указали профиль пользователя или группы, и что данный профиль является общедоступным. Если все указано верно, и ошибка повторяется, пожалуйста, свяжитесь с <a href="mailto:{0}">администратором</a>. </p>'''.format( cgi.escape(constants.ADMIN_EMAIL, quote=True)) else: self.error(httplib.INTERNAL_SERVER_ERROR) error = ''' При генерации RSS-ленты произошла внутренняя ошибка сервера. Если ошибка повторяется, пожалуйста, свяжитесь с <a href="mailto:{0}">администратором</a>. '''.format(cgi.escape(constants.ADMIN_EMAIL, quote=True)) self.response.headers[ b'Content-Type'] = b'text/html; charset=utf-8' self.response.out.write( vkfeed.utils.render_template('error.html', {'error': error})) else: if http_status == httplib.OK: self.response.headers[b'Content-Type'] = b'application/rss+xml' self.response.out.write(feed) else: self.error(http_status)
def setUp(self): self.__parser = WallPageParser(ignore_errors = False)
class TestWallParser(unittest.TestCase): '''Tests vk.com wall parser.''' def setUp(self): self.__parser = WallPageParser(ignore_errors = False) def test_invalid_page(self): '''Testing parsing of invalid page.''' self.assertRaises(ParseError, lambda: self.__parser.parse(open('wall_parser/invalid_page.html').read().decode('cp1251'))) def test_group_wall(self): '''Testing parsing of group wall''' self.__test_parsing( open('wall_parser/group_profile_page.html').read().decode('cp1251'), { 'user_name': u'Хабрахабр', 'user_photo': 'http://cs11159.vk.com/g20629724/a_ba3bb3dc.jpg', 'posts': 10, } ) def test_user_empty_wall(self): '''Testing parsing of empty wall''' self.__test_parsing( open('wall_parser/user_profile_page_with_empty_wall.html').read().decode('cp1251'), { 'user_name': u'Григорий Бакунов', 'user_photo': 'http://cs4383.vk.com/u78983895/a_912f563f.jpg', 'posts': 0, } ) def test_user_wall(self): '''Testing parsing of user wall''' self.__test_parsing( open('wall_parser/user_profile_page.html').read().decode('cp1251'), { 'user_name': u'Павел Дуров', 'user_photo': 'http://vk.com/u00001/a_a964f9a2.jpg', 'posts': 10, } ) def __test_parsing(self, html, etalon): '''Runs the test with the specified data.''' clear_run_data = self.__normalize_data(self.__parser.parse(html)) self.assertEqual(etalon, clear_run_data) dirty_run_data = self.__normalize_data(self.__parser.parse(html)) self.assertEqual(etalon, clear_run_data) self.assertEqual(etalon, dirty_run_data) def __normalize_data(self, data): '''Removes fields that don't exist in the etalon.''' if 'posts' in data: for post in data['posts']: self.assertNotEqual(post['text'].strip(), '') data['posts'] = len(data['posts']) return data
class TestWallParser(unittest.TestCase): '''Tests vk.com wall parser.''' def setUp(self): self.__parser = WallPageParser(ignore_errors = False) def test_invalid_page(self): '''Testing parsing of invalid page.''' self.assertRaises(ParseError, lambda: self.__parser.parse(open('wall_parser/invalid_page.html').read().decode('cp1251'))) def test_group_wall(self): '''Testing parsing of group wall''' self.__test_parsing( open('wall_parser/group_profile_page.html').read().decode('cp1251'), { 'user_name': 'Хабрахабр', 'user_photo': 'http://cs11159.vk.com/g20629724/a_ba3bb3dc.jpg', 'posts': 10, } ) def test_user_empty_wall(self): '''Testing parsing of empty wall''' self.__test_parsing( open('wall_parser/user_profile_page_with_empty_wall.html').read().decode('cp1251'), { 'user_name': 'Григорий Бакунов', 'user_photo': 'http://cs4383.vk.com/u78983895/a_912f563f.jpg', 'posts': 0, } ) def test_user_wall(self): '''Testing parsing of user wall''' self.__test_parsing( open('wall_parser/user_profile_page.html').read().decode('cp1251'), { 'user_name': 'Павел Дуров', 'user_photo': 'http://vk.com/u00001/a_a964f9a2.jpg', 'posts': 10, } ) def __test_parsing(self, html, etalon): '''Runs the test with the specified data.''' clear_run_data = self.__normalize_data(self.__parser.parse(html)) self.assertEqual(etalon, clear_run_data) dirty_run_data = self.__normalize_data(self.__parser.parse(html)) self.assertEqual(etalon, clear_run_data) self.assertEqual(etalon, dirty_run_data) def __normalize_data(self, data): '''Removes fields that don't exist in the etalon.''' if 'posts' in data: for post in data['posts']: self.assertNotEqual(post['text'].strip(), '') data['posts'] = len(data['posts']) return data