def _fetch_and_create_feed(
    uri: str, process_html: bool = True
) -> Tuple[models.Feed, Optional[ParsedFeed]]:
    """Fetch *uri* and create (or look up) the corresponding Feed.

    If the URI points to an HTML page, a feed link is searched in the page
    and followed once (``process_html=False`` on the recursive call prevents
    chains of HTML pages from recursing indefinitely).

    Returns a ``(feed, parsed_feed)`` tuple; ``parsed_feed`` is ``None``
    when the URI resolved to an already existing feed.

    Raises:
        FeedFetchError: the URI could not be fetched, is an HTML page with
            no usable feed link, or its content is not a valid feed.
    """
    try:
        feed_request = http_fetcher.fetch_feed(uri, None, None, 1, None)
    except (requests.exceptions.RequestException,
            http_fetcher.FetchFileTooBigError) as e:
        # Chain the underlying error so the HTTP failure cause stays
        # visible in tracebacks (was previously dropped).
        raise FeedFetchError(
            f'Could not create feed "{uri}", HTTP get failed'
        ) from e

    if feed_request.is_html:
        if not process_html:
            # An HTML page gave a feed link that is another HTML page
            raise FeedFetchError(
                f'Could not find valid feed in HTML page "{uri}"'
            )
        found_uri = html_processing.find_feed_in_html(
            feed_request.content, feed_request.final_url
        )
        if not found_uri:
            # An HTML page does not contain a feed link
            raise FeedFetchError(f'Could not find feed in HTML page "{uri}"')
        # An HTML page contains a feed link, let's fetch it
        return _fetch_and_create_feed(found_uri, process_html=False)

    try:
        parsed_feed = simple_parse_bytes(feed_request.content)
    except FeedDocumentError as e:
        raise FeedFetchError(
            f'Could not create feed "{uri}", content is not a valid feed'
        ) from e

    # Key on the final (post-redirect) URL so redirected submissions
    # deduplicate onto the same Feed row.
    feed, created = models.Feed.objects.update_or_create(
        defaults={'name': utils.shrink_str(parsed_feed.title)},
        uri=feed_request.final_url
    )
    if created:
        logger.info('Created feed %s', feed)
        return feed, parsed_feed
    else:
        logger.info('Submitted URI %s points to existing %s', uri, feed)
        return feed, None
def synchronize_feed(feed_id: int, force: bool = False) -> None:
    """Synchronize one feed: fetch its URI, parse it and persist updates.

    When *force* is true, the conditional-fetch hints (``last_fetched_at``
    and ``last_hash``) are not passed to the fetcher, so the feed is
    re-fetched unconditionally.

    Fetch/parse failures are recorded on ``feed.last_failure`` and logged
    instead of being raised; a missing feed id is a no-op.
    """
    # Recorded before the fetch so the stored timestamp reflects the start
    # of this synchronization run, not its end.
    task_start_date = now()
    try:
        # Fetch the feed from db as well as its subscribers count in a single
        # query, this approach may give incorrect results if other annotations
        # are added in the future.
        # See https://code.djangoproject.com/ticket/10060
        feed = (
            models.Feed.objects.annotate(Count('subscribers')).get(pk=feed_id)
        )
    except ObjectDoesNotExist:
        logger.info('Not synchronizing feed %d, does not exist', feed_id)
        return
    else:
        logger.info('Starting synchronization of %s', feed)

    try:
        # last_fetched_at / last_hash act as conditional-fetch hints and are
        # suppressed when force is set, so the content is always re-fetched.
        feed_request = http_fetcher.fetch_feed(
            feed.uri,
            feed.last_fetched_at if not force else None,
            bytes(feed.last_hash) if feed.last_hash and not force else None,
            feed.subscribers__count,
            feed_id
        )
    except (requests.exceptions.RequestException,
            http_fetcher.FetchFileTooBigError) as e:
        # Record the failure on the feed and bail out; the task does not raise.
        logger.warning('Could not synchronize %s: %s', feed, e)
        feed.last_failure = repr(e)
        feed.save()
        return

    if feed_request is None:
        # Feed did not change since last synchronization
        feed.last_fetched_at = task_start_date
        feed.last_failure = ''
        feed.save()
        return

    if feed_request.is_html:
        # Warn only — parsing still proceeds below; if the HTML is not a
        # valid feed, the FeedDocumentError branch records the failure.
        logger.warning('Fetch of %s gave an HTML page', feed)

    try:
        parsed_feed = simple_parse_bytes(feed_request.content)
    except FeedDocumentError as e:
        logger.warning('Could not synchronize %s: %s', feed, e)
        feed.last_failure = repr(e)
        feed.save()
        return

    # Keep the stored feed name in sync with the feed's self-declared title.
    parsed_feed_title = utils.shrink_str(parsed_feed.title)
    if feed.name != parsed_feed_title:
        logger.info('Renaming feed %d from "%s" to "%s"', feed_id, feed.name,
                    parsed_feed_title)
        feed.name = parsed_feed_title

    synchronize_parsed_feed(feed, parsed_feed)

    # Successful run: refresh bookkeeping fields and clear any prior failure.
    feed.last_fetched_at = task_start_date
    feed.last_hash = feed_request.hash
    feed.last_failure = ''
    feed.frequency_per_year = calculate_frequency_per_year(feed)
    feed.save()

    # Update feed URI if it was redirected
    if feed_request.final_url != feed.uri:
        logger.info(
            'Feed was redirected: %s -> %s', feed.uri, feed_request.final_url
        )
        feed.uri = feed_request.final_url
        try:
            feed.save(update_fields=['uri'])
        except IntegrityError as e:
            # Only swallow unique-constraint violations (another feed already
            # has the redirected URI); re-raise anything else.
            # NOTE(review): assumes e.__cause__ is a psycopg2 error exposing
            # pgcode — verify against the project's DB backend.
            if e.__cause__.pgcode != pg_error_codes.UNIQUE_VIOLATION:
                raise
            logger.warning(
                'Could not change feed %d URI to %s, another feed '
                'already has this URI', feed.id, feed.uri,
            )