def fetch_url(url, content_type = 'text/html'):
    '''Fetches the specified URL and returns the body of the response.

    The response must have HTTP status 200 and carry a Content-Type header
    whose media type equals content_type (compared case-insensitively). For
    text/* content types the body is decoded using the charset parameter of
    that header (UTF-8 when the parameter is absent); for any other content
    type the body is returned exactly as received.

    Raises HTTPNotFoundError when the server answers with 404, and Error
    when the request fails, the server answers with any other non-200
    status, the Content-Type header is missing or unexpected, or the body
    can't be decoded.
    '''

    LOG.info('Fetching "%s"...', url)

    try:
        page = _fetch_url(url, headers = { 'Accept-Language': 'ru,en' })
    except urlfetch.Error as e:
        raise Error('Failed to fetch the page: {0}.', e)

    if page.status_code != httplib.OK:
        error_class = HTTPNotFoundError if page.status_code == httplib.NOT_FOUND else Error
        raise error_class('The server returned error: {0} ({1}).',
            httplib.responses.get(page.status_code, 'Unknown error'), page.status_code)

    LOG.info('"%s" has been successfully fetched.', url)

    # Header names are case-insensitive, so look the header up by hand
    # instead of relying on the exact spelling used by the server.
    header = next((
        page.headers[name] for name in page.headers
        if name.lower() == 'content-type'), None)

    if header is None:
        raise Error('The server returned a page with missing content type.')

    value, params = cgi.parse_header(header)
    expected_type = content_type.lower()

    # Media types are case-insensitive ("Text/HTML" is a valid spelling).
    if value.lower() != expected_type:
        raise Error('The server returned a page with invalid content type: {0}.', value)

    content = page.content

    if expected_type.startswith('text/'):
        # cgi.parse_header() lowercases parameter names.
        content_encoding = params.get('charset', 'UTF-8')

        try:
            content = content.decode(content_encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the server announced a charset Python doesn't
            # know. Treat it the same way as undecodable data instead of
            # letting a non-Error exception escape to the caller.
            raise Error('The server returned a page in invalid encoding.')

    return content
def __fix_html(self, html):
    '''Cleans up markup that may confuse the Python's HTML parser.

    Removes everything matched by script_regex, repeatedly applies the two
    tag attribute fixes until they stop changing the text and finally
    rewrites mis-opened tags as self-closing ones.

    Raises Error if one of the repeated fixes still changes the text after
    1000 passes.
    '''

    fixed = self.script_regex.sub('', html)

    # (pattern, replacement) pairs that have to be applied until the text
    # stops changing.
    repeated_fixes = (
        (self.__invalid_tag_attr_spacing_regex, r'\1 \2'),
        (self.__invalid_tag_attr_regex, r'\1 '),
    )

    for pattern, replacement in repeated_fixes:
        passes_left = 1000

        while passes_left:
            fixed, substitutions = pattern.subn(replacement, fixed)
            if not substitutions:
                break
            passes_left -= 1
        else:
            # Every allowed pass has changed something.
            raise Error('Too many errors in the HTML or infinite loop.')

    return self.__misopened_tag_regex.sub(r'<\1 />', fixed)
def _api(method, **kwargs):
    '''Calls the specified VKontakte API method.

    The keyword arguments are sent as the query parameters of the request.
    Returns the value of the "response" member of the decoded JSON reply.

    Raises ConnectionError if the reply can't be fetched or parsed, and
    ServerError (carrying the API error code, if any) if the reply reports
    an error or contains no "response" member.
    '''

    # Messages shown to the user instead of the well-known API errors.
    known_errors = {
        'Access denied: group is blocked': (
            'Страница временно заблокирована и проверяется администраторами, '
            'так как некоторые пользователи считают, что она не соответствует правилам сайта.'
        ),
        'Access denied: this wall available only for community members':
            'Это частное сообщество. Доступ только по приглашениям администраторов.',
        'User was deleted or banned':
            'Пользователь удален или забанен.',
    }

    url = '{0}method/{1}?language=0&'.format(constants.API_URL, method) + urllib.urlencode(kwargs)

    try:
        data = vkfeed.utils.fetch_url(url, content_type='application/json')

        try:
            data = json.loads(data)
        except Exception as e:
            raise Error('Failed to parse JSON data: {0}.', e)
    except Exception as e:
        raise ConnectionError('API call {0} failed: {1}', url, e)

    # A syntactically valid reply isn't necessarily a JSON object. Treat
    # anything else as an error reply without details instead of crashing
    # on a missing .get() method.
    reply = data if isinstance(data, dict) else {}

    if 'error' in reply or 'response' not in reply:
        details = reply.get('error')
        if not isinstance(details, dict):
            details = {}

        try:
            error = details.get('error_msg', '').strip()
        except AttributeError:
            # error_msg is null or not a string.
            error = ''

        if not error:
            error = 'Ошибка вызова API.'
        elif error in known_errors:
            error = known_errors[error]
        elif not error.endswith('.'):
            error += '.'

        raise ServerError(details.get('error_code'), error)

    return reply['response']
def __init__(self, code, *args, **kwargs):
    '''Creates the error and remembers the code it was raised with.

    All arguments except code are passed to Error.__init__() unchanged.
    '''

    Error.__init__(self, *args, **kwargs)

    # Kept as an attribute for the code that handles the error.
    self.code = code
def __init__(self, *args, **kwargs):
    '''Creates the error, passing all arguments to Error.__init__().'''

    Error.__init__(self, *args, **kwargs)
def __init__(self, server_error):
    '''Creates the error with a fixed message.

    server_error is stored in the attribute of the same name.
    '''

    message = 'Server returned an error.'
    Error.__init__(self, message)

    self.server_error = server_error
def __init__(self):
    '''Creates the error with a fixed "profile is not available" message.'''

    message = 'The user\'s profile page is not available.'
    Error.__init__(self, message)
def __init__(self):
    '''Creates the error with a fixed "private group" message.'''

    message = 'This is a private group.'
    Error.__init__(self, message)
def get(self, profile_name):
    '''Processes a request for the RSS feed of the specified profile.

    Requests from a few blacklisted feed readers are rejected with 403
    Forbidden. For all other requests the wall is read and the response is
    one of:
      * the generated feed (application/rss+xml);
      * an empty 304 Not Modified response when the client has sent
        If-Modified-Since and no posts have been found;
      * an HTML error page with an appropriate HTTP status.

    The look of the feed is controlled by the show_photo, foreign_posts,
    hash_tag_title, text_title and big_photos request parameters.

    The wall is currently always read via the VKontakte API (use_api is
    hardcoded to True). The code that parses the HTML of the profile page
    is kept but is never executed.
    '''

    headers = self.__get_headers()
    user_agent = headers.get('user-agent', '').strip()

    if user_agent and (
        # Google Reader bot still crawls the Web. Reject it to save
        # bandwidth.
        #
        # FeedNotifier updates feeds every minute.
        user_agent.startswith(('Feedfetcher-Google;', 'FeedNotifier/')) or

        # YandexBlogs bot sends a lot of requests (2/minute) for some
        # feeds. The support doesn't respond adequately.
        'YandexBlogs' in user_agent
    ):
        self.error(httplib.FORBIDDEN)
        return

    # These are filled in just before an exception is (re)raised inside
    # of the try block and tell the handler below what to show to the
    # user.
    user_error = None
    http_status = httplib.OK
    unknown_user_error = False

    try:
        show_photo = (self.request.get('show_photo', '1') != '0')
        foreign_posts = (self.request.get('foreign_posts', '0') != '0')
        hash_tag_title = (self.request.get('hash_tag_title', '0') != '0')
        text_title = (self.request.get('text_title', '0') != '0')
        big_photos = (self.request.get('big_photos', '0') != '0')

        LOG.info(
            'Requested feed for "%s" (foreign_posts = %s, show_photo = %s, hash_tag_title = %s, text_title = %s, big_photos = %s).',
            profile_name, foreign_posts, show_photo, hash_tag_title, text_title, big_photos)

        use_api = True
        if_modified_since = None

        if use_api:
            # Use VKontakte API
            from vkfeed.tools import wall_reader

            cur_time = int(time.time())
            latency = constants.MINUTE_SECONDS
            min_timestamp = cur_time - constants.WEEK_SECONDS

            ## This confuses Google Reader users because it always requests
            ## feeds with 'Cache-Control: max-age=3600' when adding
            ## subscriptions and users often get an empty feed.
            #for cache_control in headers.get('cache-control', '').split(','):
            #    cache_control = cache_control.strip()
            #    if cache_control.startswith('max-age='):
            #        LOG.info('Applying Cache-Control: %s...', cache_control)
            #        try:
            #            cache_max_age = int(cache_control[len('max-age='):])
            #        except ValueError:
            #            LOG.error('Invalid header: Cache-Control = %s.', cache_control)
            #        else:
            #            if cache_max_age:
            #                min_timestamp = max(min_timestamp, cur_time - cache_max_age - latency)

            if 'if-modified-since' in headers and headers['if-modified-since'] != '0':
                LOG.info('Applying If-Modified-Since: %s...', headers['if-modified-since'])

                try:
                    if_modified_since = vkfeed.utils.http_timestamp(
                        headers['if-modified-since'])
                except Exception:
                    # An invalid header is ignored: if_modified_since
                    # stays None.
                    LOG.error('Invalid header: If-Modified-Since = %s.',
                        headers['if-modified-since'])
                else:
                    min_timestamp = max(min_timestamp, if_modified_since - latency)

            max_age = cur_time - min_timestamp

            if max_age > constants.DAY_SECONDS:
                max_posts_num = 10
            else:
                max_posts_num = 50

            if user_agent and vkfeed.utils.zero_subscribers(user_agent):
                # Floor division keeps the limit an integer even when true
                # division is in effect.
                max_posts_num //= 2

            LOG.info(
                'Applying the following limits: max_age=%s, max_posts_num=%s',
                max_age, max_posts_num)

            try:
                data = wall_reader.read(profile_name, min_timestamp, max_posts_num,
                    foreign_posts, show_photo, hash_tag_title, text_title, big_photos)
            except wall_reader.ConnectionError:
                http_status = httplib.BAD_GATEWAY
                user_error = 'Ошибка соединения с сервером <a href="{0}" target="_blank">{0}</a>.'.format(
                    constants.API_URL)
                raise
            except wall_reader.ServerError as e:
                http_status = httplib.NOT_FOUND
                # NOTE(review): the text of the error is inserted into the
                # HTML error page below without escaping - confirm that
                # it can't contain untrusted markup.
                user_error = unicode(e)
                raise
        else:
            # Parse HTML from site
            from vkfeed.tools.wall_parser import WallPageParser, ParseError, PrivateGroupError, ProfileNotAvailableError, ServerError

            url = constants.VK_URL + cgi.escape(profile_name)
            url_html = '<a href="{0}" target="_blank">{0}</a>'.format(url)

            if profile_name == 'feed':
                http_status = httplib.NOT_FOUND
                user_error = 'Страница {0} не является профилем пользователя или группы.'.format(
                    url_html)
                raise Error('Unsupported page.')

            try:
                profile_page = vkfeed.utils.fetch_url(url)
            except vkfeed.utils.HTTPNotFoundError:
                http_status = httplib.NOT_FOUND
                user_error = 'Пользователя или группы {0} не существует.'.format(
                    url_html)
                raise
            except Error:
                http_status = httplib.BAD_GATEWAY
                user_error = 'Не удалось загрузить страницу {0}.'.format(
                    url_html)
                unknown_user_error = True
                raise

            try:
                data = WallPageParser().parse(profile_page)
            except PrivateGroupError:
                http_status = httplib.NOT_FOUND
                user_error = 'Группа {0} является закрытой группой.'.format(
                    url_html)
                raise
            except ProfileNotAvailableError:
                http_status = httplib.NOT_FOUND
                user_error = 'Страница пользователя {0} удалена или доступна только авторизованным пользователям.'.format(
                    url_html)
                raise
            except ServerError as e:
                LOG.debug('Page contents:\n%s', profile_page)
                http_status = httplib.BAD_GATEWAY
                user_error = 'Сервер {0} вернул ошибку{1}'.format(
                    url_html,
                    ':<br />' + e.server_error if e.server_error else '.')
                unknown_user_error = True
                raise
            except ParseError:
                LOG.debug('Page contents:\n%s', profile_page)
                http_status = httplib.NOT_FOUND
                user_error = 'Сервер вернул страницу, на которой не удалось найти стену с сообщениями пользователя.'
                unknown_user_error = True
                raise

            data['url'] = url
            if 'user_photo' not in data:
                data['user_photo'] = constants.APP_URL + 'images/vk-rss-logo.png'

        LOG.info('Return %s items.', len(data['posts']))

        if if_modified_since is not None and not data['posts']:
            http_status = httplib.NOT_MODIFIED
        else:
            feed = self.__generate_feed(data)
    except Exception as e:
        # Expected errors with a message for the user are logged as
        # warnings, other application errors - as errors, and everything
        # else - with a traceback.
        if isinstance(e, Error):
            if user_error and not unknown_user_error:
                log_function = LOG.warning
            else:
                log_function = LOG.error
        else:
            log_function = LOG.exception

        log_function('Unable to generate a feed for "%s": %s', profile_name, e)

        if user_error:
            self.error(http_status)
            error = '<p>Ошибка при генерации RSS-ленты:</p><p>{0}</p>'.format(
                user_error)

            if unknown_user_error:
                error += (
                    '<p> Пожалуйста, убедитесь, что вы правильно указали профиль '
                    'пользователя или группы, и что данный профиль является '
                    'общедоступным. Если все указано верно, и ошибка повторяется, '
                    'пожалуйста, свяжитесь с <a href="mailto:{0}">администратором</a>. </p>'
                ).format(cgi.escape(constants.ADMIN_EMAIL, quote=True))
        else:
            self.error(httplib.INTERNAL_SERVER_ERROR)
            error = (
                ' При генерации RSS-ленты произошла внутренняя ошибка сервера. '
                'Если ошибка повторяется, пожалуйста, свяжитесь с '
                '<a href="mailto:{0}">администратором</a>. '
            ).format(cgi.escape(constants.ADMIN_EMAIL, quote=True))

        self.response.headers[b'Content-Type'] = b'text/html; charset=utf-8'
        self.response.out.write(
            vkfeed.utils.render_template('error.html', {'error': error}))
    else:
        if http_status == httplib.OK:
            self.response.headers[b'Content-Type'] = b'application/rss+xml'
            self.response.out.write(feed)
        else:
            # 304 Not Modified: there is nothing to send.
            self.error(http_status)
def __init__(self, server_error):
    '''Creates the error with a fixed message.

    The passed server_error value becomes available as the server_error
    attribute of the exception.
    '''

    description = "Server returned an error."
    Error.__init__(self, description)

    self.server_error = server_error
def __handle_post_date(self, tag, data):
    '''Handles data inside of post date tag.

    Parses a human-readable date (English or Russian) and stores the
    result as the 'date' item of the current post. The following forms are
    recognized:
      * "<N> <units> ago" / "<N> <units> назад" (N may be omitted or
        written as a word from two to ten);
      * "today at HH:MM" and "yesterday at HH:MM";
      * "<day> <month> at HH:MM" (the current year is assumed);
      * "<day> <month> <year>".
    The time may be followed by "am" or "pm".

    tag is not used.

    If the date can't be parsed, the error is logged when errors are
    ignored and is raised to the caller otherwise.
    '''

    # The replacements are applied in this order to the lowercased text.
    replacements = (
        ('jan.', '1'), ('feb.', '2'), ('mar.', '3'), ('apr.', '4'),
        ('may', '5'), ('jun.', '6'), ('jul.', '7'), ('aug.', '8'),
        ('sep.', '9'), ('oct.', '10'), ('nov.', '11'), ('dec.', '12'),

        ('янв', '1'), ('фев', '2'), ('мар', '3'), ('апр', '4'),
        ('мая', '5'), ('июн', '6'), ('июл', '7'), ('авг', '8'),
        ('сен', '9'), ('окт', '10'), ('ноя', '11'), ('дек', '12'),

        ('два', '2'), ('две', '2'), ('три', '3'), ('четыре', '4'),
        ('пять', '5'), ('шесть', '6'), ('семь', '7'), ('восемь', '8'),
        ('девять', '9'), ('десять', '10'),

        ('two', '2'), ('three', '3'), ('four', '4'), ('five', '5'),
        ('six', '6'), ('seven', '7'), ('eight', '8'), ('nine', '9'),
        ('ten', '10'),

        ('вчера', 'yesterday'), ('сегодня', 'today'), (' в ', ' at '))

    date_string = data.strip().lower()

    is_am = date_string.endswith(' am')
    is_pm = date_string.endswith(' pm')
    if is_am or is_pm:
        date_string = date_string[:-3]

    # NOTE(review): a fixed UTC+4 offset is used for Moscow time - confirm
    # that it still matches the time zone of the dates on the page.
    tz_delta = datetime.timedelta(hours=4) # MSK timezone
    today = datetime.datetime.utcnow() + tz_delta

    for token, replacement in replacements:
        date_string = date_string.replace(token, replacement)

    try:
        match = re.match(u'(\\d+ ){0,1}([^ ]+) (?:назад|ago)', date_string)

        if match:
            value = match.group(1)
            if value:
                value = int(value.strip())
            else:
                # "hour ago" means "1 hour ago"
                value = 1

            unit = match.group(2)

            if unit in ('секунд', 'секунду', 'секунды', 'second', 'seconds'):
                date = today - datetime.timedelta(seconds=value)
            elif unit in ('минут', 'минуту', 'минуты', 'minute', 'minutes'):
                date = today - datetime.timedelta(minutes=value)
            elif unit in ('час', 'часа', 'часов', 'hour', 'hours'):
                date = today - datetime.timedelta(hours=value)
            elif unit in ('день', 'дня', 'дней', 'day', 'days'):
                date = today - datetime.timedelta(days=value)
            elif unit in ('неделю', 'недели', 'недель', 'week', 'weeks'):
                date = today - datetime.timedelta(weeks=value)
            else:
                raise Error('Invalid time dimension: {0}.', unit)
        else:
            try:
                date = datetime.datetime.strptime(date_string, 'today at %H:%M')
                date = datetime.datetime.combine(today, date.time())
            except ValueError:
                try:
                    date = datetime.datetime.strptime(
                        date_string, 'yesterday at %H:%M')
                    date = datetime.datetime.combine(
                        today - datetime.timedelta(days=1), date.time())
                except ValueError:
                    try:
                        date = datetime.datetime.strptime(
                            '{0} {1}'.format(today.year, date_string),
                            '%Y %d %m at %H:%M')
                    except ValueError:
                        date = datetime.datetime.strptime(
                            date_string, '%d %m %Y')

                        # A date without time: compensate the time zone
                        # conversion below.
                        date += tz_delta

            # Convert the 12-hour clock to the 24-hour one: 12:xx pm is
            # already 12:xx and 12:xx am is 00:xx.
            if is_pm and date.hour < 12:
                date += datetime.timedelta(hours=12)
            elif is_am and date.hour == 12:
                date -= datetime.timedelta(hours=12)

        date -= tz_delta

        # A date in the future means that we've guessed the day or the
        # year wrong.
        if date - datetime.timedelta(minutes=1) > today:
            if date - datetime.timedelta(days=1) <= today:
                date -= datetime.timedelta(days=1)
            else:
                last_year_date = date.replace(year=date.year - 1)
                if last_year_date <= today:
                    date = last_year_date

        self.__get_cur_post()['date'] = date
    except Exception:
        if self.__ignore_errors:
            LOG.exception('Failed to parse date %s.', data)
        else:
            # A bare raise preserves the original traceback.
            raise
def __init__(self):
    '''Creates the error with a fixed message about a private group.'''

    description = "This is a private group."
    Error.__init__(self, description)