Пример #1
0
def _fetch_and_create_feed(uri: str, process_html: bool=True
                           ) -> Tuple[models.Feed, Optional[ParsedFeed]]:
    """Fetch ``uri`` and store the feed it points to.

    When the response is an HTML page and ``process_html`` is true, the page
    is searched for a feed link and that link is fetched instead. The
    follow-up fetch is done with ``process_html=False``, so at most one level
    of HTML indirection is followed.

    :param uri: address submitted for the feed (feed document or HTML page).
    :param process_html: whether an HTML response may be searched for a feed
        link.
    :returns: ``(feed, parsed_feed)`` when a new feed row was created, or
        ``(feed, None)`` when a feed with the final URL already existed.
    :raises FeedFetchError: if the HTTP fetch fails, if no usable feed link
        is found in an HTML page, or if the content is not a valid feed.
    """
    try:
        # NOTE(review): the positional arguments presumably mean "no previous
        # fetch date, no previous hash, 1 subscriber, no feed id" -- confirm
        # against http_fetcher.fetch_feed.
        feed_request = http_fetcher.fetch_feed(uri, None, None, 1, None)
    except (requests.exceptions.RequestException,
            http_fetcher.FetchFileTooBigError) as e:
        # Chain explicitly so the underlying network error stays attached
        # as the cause of the domain error.
        raise FeedFetchError(
            f'Could not create feed "{uri}", HTTP get failed'
        ) from e

    if feed_request.is_html:
        if not process_html:
            # An HTML page gave a feed link that is another HTML page
            raise FeedFetchError(
                f'Could not find valid feed in HTML page "{uri}"'
            )

        found_uri = html_processing.find_feed_in_html(
            feed_request.content, feed_request.final_url
        )
        if not found_uri:
            # An HTML page does not contain a feed link
            raise FeedFetchError(f'Could not find feed in HTML page "{uri}"')

        # An HTML page contains a feed link, let's fetch it
        return _fetch_and_create_feed(found_uri, process_html=False)

    try:
        parsed_feed = simple_parse_bytes(feed_request.content)
    except FeedDocumentError as e:
        raise FeedFetchError(
            f'Could not create feed "{uri}", content is not a valid feed'
        ) from e

    # Feeds are keyed on the final (post-redirect) URL; the name passed in
    # ``defaults`` is applied whether the row is created or already exists.
    feed, created = models.Feed.objects.update_or_create(
        defaults={'name': utils.shrink_str(parsed_feed.title)},
        uri=feed_request.final_url
    )
    if created:
        logger.info('Created feed %s', feed)
        return feed, parsed_feed

    logger.info('Submitted URI %s points to existing %s', uri, feed)
    return feed, None
Пример #2
0
def synchronize_feed(feed_id: int, force: bool = False) -> None:
    """Fetch the feed with primary key ``feed_id`` and store what changed.

    Outcomes, in order of evaluation:

    * the feed does not exist: an info message is logged and nothing else
      happens;
    * the HTTP fetch or the parsing fails: ``repr()`` of the error is stored
      in ``last_failure`` and the function returns;
    * the fetcher returns ``None`` (feed unchanged since the last
      synchronization): only ``last_fetched_at`` and ``last_failure`` are
      refreshed;
    * otherwise the feed name, its content (via ``synchronize_parsed_feed``),
      hash, fetch date and frequency are updated, and the stored URI is
      replaced by the final URL if the request was redirected.

    :param feed_id: primary key of the feed to synchronize.
    :param force: when true, the last fetch date and last hash are not passed
        to the fetcher.
    """
    task_start_date = now()

    try:
        # Fetch the feed from db as well as its subscribers count in a single
        # query, this approach may give incorrect results if other annotations
        # are added in the future.
        # See https://code.djangoproject.com/ticket/10060
        feed = (
            models.Feed.objects.annotate(Count('subscribers')).get(pk=feed_id)
        )
    except ObjectDoesNotExist:
        logger.info('Not synchronizing feed %d, does not exist', feed_id)
        return
    else:
        logger.info('Starting synchronization of %s', feed)

    try:
        feed_request = http_fetcher.fetch_feed(
            feed.uri,
            feed.last_fetched_at if not force else None,
            bytes(feed.last_hash) if feed.last_hash and not force else None,
            feed.subscribers__count,
            feed_id
        )
    except (requests.exceptions.RequestException,
            http_fetcher.FetchFileTooBigError) as e:
        logger.warning('Could not synchronize %s: %s', feed, e)
        feed.last_failure = repr(e)
        feed.save()
        return

    if feed_request is None:
        # Feed did not change since last synchronization
        feed.last_fetched_at = task_start_date
        feed.last_failure = ''
        feed.save()
        return

    if feed_request.is_html:
        # Only a warning: parsing is still attempted below.
        logger.warning('Fetch of %s gave an HTML page', feed)

    try:
        parsed_feed = simple_parse_bytes(feed_request.content)
    except FeedDocumentError as e:
        logger.warning('Could not synchronize %s: %s', feed, e)
        feed.last_failure = repr(e)
        feed.save()
        return

    parsed_feed_title = utils.shrink_str(parsed_feed.title)
    if feed.name != parsed_feed_title:
        logger.info('Renaming feed %d from "%s" to "%s"', feed_id, feed.name,
                    parsed_feed_title)
        feed.name = parsed_feed_title

    synchronize_parsed_feed(feed, parsed_feed)

    feed.last_fetched_at = task_start_date
    feed.last_hash = feed_request.hash
    feed.last_failure = ''
    feed.frequency_per_year = calculate_frequency_per_year(feed)
    feed.save()

    # Update feed URI if it was redirected. This is saved separately from the
    # fields above so that a URI conflict does not discard the rest of the
    # synchronization results.
    if feed_request.final_url != feed.uri:
        logger.info(
            'Feed was redirected: %s -> %s', feed.uri, feed_request.final_url
        )
        feed.uri = feed_request.final_url
        try:
            feed.save(update_fields=['uri'])
        except IntegrityError as e:
            # The driver error is expected on __cause__; read pgcode
            # defensively so that a missing cause or attribute re-raises the
            # IntegrityError itself instead of an unrelated AttributeError.
            pgcode = getattr(e.__cause__, 'pgcode', None)
            if pgcode != pg_error_codes.UNIQUE_VIOLATION:
                raise

            logger.warning(
                'Could not change feed %d URI to %s, another feed '
                'already has this URI', feed.id, feed.uri,
            )