示例#1
0
def update_posts_for_feed_task(partner):
	"""
	Load and parse the RSS or ATOM feed associated with the given feed url, and for each entry, parse out the individual
	entries and save each one as a partner_feeds.models.Post.

	:param partner: partner model instance providing ``feed_url`` and ``id``.
	"""
	from feedparser import parse
	from partner_feeds.models import Post
	import timelib, re, time

	feed = parse(partner.feed_url)

	for entry in feed.entries:
		p = Post()
		try:

			p.partner_id = partner.id
			p.title = entry.title

			p.subheader = entry.summary

			# author is optional in both RSS and ATOM
			try:
				p.author = entry.author
			except AttributeError:
				pass

			# prefer the stable entry id as the GUID; fall back to the link
			try:
				p.guid = entry.id
			except AttributeError:
				p.guid = entry.link

			p.url = entry.link

			# try to get the date of the entry, otherwise, try the date of the feed
			try:
				# raw string so the regex escape is not treated as an (invalid) string escape
				entry_date = re.sub(r'\|', '', entry.date)
				entry_date = timelib.strtotime(entry_date) # convert to a timestamp
				entry_date = time.localtime(entry_date) # converts to a time.struct_time (with regards to local timezone)
				entry_date = time.strftime("%Y-%m-%d %H:%M:%S", entry_date) # converts to mysql date format
				p.date = entry_date
			except AttributeError:
				# feed.date is a time.struct_time, so it can be formatted directly
				p.date = time.strftime("%Y-%m-%d %H:%M:%S", feed.date)

			p.save()
		except AttributeError:
			# a required attribute (e.g. title or summary) was missing; skip this entry
			# TODO: log skipped entries instead of silently ignoring them
			pass
示例#2
0
def update_posts_for_feed_task(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url, and for each entry, parse out the individual
    entries and save each one as a partner_feeds post.

    Returns the number of newly created posts.
    """
    logger.debug("Updating posts for partner feed: {} - {}.".format(partner, partner.pk))

    run_started_at = datetime.now()
    created_count = 0
    parsed_feed = parse(partner.feed_url)

    for entry in parsed_feed.entries:
        post = Post()
        exception_data = {'entry': entry}
        try:

            post.partner_id = partner.id
            post.title = entry.title

            # entries without a usable title are skipped outright
            if not post.title:
                continue

            post.subheader = entry.summary if hasattr(entry, 'summary') else ''

            # author is optional in both RSS and ATOM
            if hasattr(entry, 'author'):
                post.author = entry.author

            # prefer the stable entry id as the GUID, falling back to the link
            post.guid = entry.id if hasattr(entry, 'id') else entry.link

            # only create the post when no entry with this guid exists yet
            try:
                Post.objects.get(guid=post.guid, partner_id=partner.id)
                logger.debug("Prexisting partner_feed.Post with partner id: {}, guid: {}.".format(partner.id, post.guid))
                # TODO check to see if the story has been updated
            except ObjectDoesNotExist:
                logger.debug("partner_feed.Post does not exist with partner id: {}, guid: {}".format(partner.id, post.guid))
                # skip if URL is too long for database field
                max_length = 500
                if len(entry.link) > max_length:
                    logger.debug("Entry link is longer than {}. Skipping entry link {}.".format(max_length, entry.link))
                    continue

                post.url = entry.link

                # prefer the published date, then the updated date, then "now"
                if getattr(entry, 'published_parsed', None):
                    post.date = strftime("%Y-%m-%d %H:%M:%S", utc_time_struct_to_local_time_struct(entry.published_parsed))
                elif getattr(entry, 'updated_parsed', None):
                    post.date = strftime("%Y-%m-%d %H:%M:%S", utc_time_struct_to_local_time_struct(entry.updated_parsed))
                else:
                    post.date = run_started_at

                logger.debug("Saving partner_feed.Post with partner id: {}, guid: {}".format(partner.id, post.guid))
                post.save()
                logger.debug("Finished saving partner_feed.Post with partner id: {}, guid: {}".format(partner.id, post.guid))

                created_count += 1

        except Exception:
            # best-effort error reporting; never let one bad entry abort the run
            client = Client(dsn=settings.RAVEN_CONFIG['dsn'])
            client.captureException(exc_info=sys.exc_info(), data=exception_data)

    # return number of added posts
    return created_count
示例#3
0
def update_posts_for_feed(partner):
    """ Load and parse the RSS or ATOM feed associated with the given feed url, and
    for each entry, parse out the individual entries and save each one as a partner_feeds.

    feedparser does a good job normalizing the data, but for a couple of fields we need to
    do a little more work

    :param partner: partner model instance providing ``feed_url`` and ``id``;
        its ``date_feed_updated`` is stamped when the run completes.
    """
    from feedparser import parse
    from partner_feeds.models import Post, Partner
    import timelib
    import time
    from datetime import datetime
    from django.utils.text import get_text_list

    feed = parse(partner.feed_url)

    for entry in feed.entries:

        # Required: title AND link; skip the entry if either is missing.
        # (Was `or`, which let through entries missing one of the two and
        # then raised AttributeError when accessing the absent field.)
        if 'title' in entry and 'link' in entry:
            p = Post(partner_id=partner.id, title=entry.title)

            # Links and GUID: prefer the stable entry id, fall back to the link
            if 'id' in entry:
                p.guid = entry.id
            else:
                p.guid = entry.link
            p.url = entry.link

            # Date: entry date, then published date, then the feed's own date
            if 'date' in entry:
                entry_date = entry.date
            elif 'published' in entry:
                entry_date = entry.published
            elif 'date' in feed:
                entry_date = feed.date
            else:
                entry_date = None

            # entry.date and entry.published appear to be strings while
            # feed.date is a time.struct_time for some reason. Only run the
            # string -> struct_time conversion when we actually have a value;
            # previously timelib.strtotime(None) was reachable and would blow up.
            if entry_date is not None and not isinstance(entry_date, time.struct_time):
                entry_date = timelib.strtotime(entry_date)  # convert to a timestamp
                entry_date = time.localtime(entry_date)  # converts to a time.struct_time (with regards to local timezone)

            if entry_date is not None:
                entry_date = time.strftime("%Y-%m-%d %H:%M:%S", entry_date)  # converts to mysql date format
            else:
                entry_date = time.strftime("%Y-%m-%d %H:%M:%S")  # no date available anywhere: use "now"

            p.date = entry_date

            # feedparser doesn't seem to save the ATOM summary tag to
            # entry.description, but the summary is saved as one of the
            # rows in the entry.content list
            #
            # To find the summary, we loop through the list and
            # use the smallest field
            if 'content' in entry and len(entry.content) > 1:
                summary = entry.content.pop(0)['value']
                for content in entry.content:
                    if len(content['value']) < len(summary):
                        summary = content['value']
                p.description = summary
            elif 'description' in entry:
                p.description = entry.description

            # first media attachment, if any, becomes the post image
            if 'media_content' in entry and 'url' in entry.media_content[0]:
                p.image_url = entry.media_content[0]['url']

            # byline: join the structured author list, else use the plain author
            if 'authors' in entry and entry.authors[0]:
                authors = [a['name'] for a in entry.authors if 'name' in a]
                p.byline = get_text_list(authors, 'and')
            elif 'author' in entry:
                p.byline = entry.author

            p.save()

    # Set the current time as when the partner feed was last retrieved
    # Needs to be an UPDATE and not a SAVE or else we will get an infinite loop
    Partner.objects.filter(
        pk=partner.pk).update(date_feed_updated=datetime.now())
示例#4
0
def update_posts_for_feed_task(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url, and for each entry, parse out the individual
    entries and save each one as a partner_feeds post.

    Returns the number of newly created posts.
    """
    logger.debug(u"Updating posts for partner feed: {} - {}.".format(
        partner, partner.pk))

    task_started_at = datetime.now()
    created_count = 0
    parsed_feed = parse(partner.feed_url)

    for entry in parsed_feed.entries:
        post = Post()
        exception_data = {'entry': entry}
        try:

            post.partner_id = partner.id
            post.title = entry.title

            # entries without a usable title are skipped outright
            if not post.title:
                continue

            post.subheader = entry.summary if hasattr(entry, 'summary') else ''

            # author is optional in both RSS and ATOM
            if hasattr(entry, 'author'):
                post.author = entry.author

            # prefer the stable entry id as the GUID, falling back to the link
            post.guid = entry.id if hasattr(entry, 'id') else entry.link

            # only create the post when no entry with this guid exists yet
            try:
                Post.objects.get(guid=post.guid, partner_id=partner.id)
                logger.debug(
                    u"Prexisting partner_feed.Post with partner id: {}, guid: {}."
                    .format(partner.id, post.guid))
                # TODO check to see if the story has been updated
            except ObjectDoesNotExist:
                logger.debug(
                    u"partner_feed.Post does not exist with partner id: {}, guid: {}"
                    .format(partner.id, post.guid))
                # skip if URL is too long for database field
                max_length = 500
                if len(entry.link) > max_length:
                    logger.debug(
                        u"Entry link is longer than {}. Skipping entry link {}."
                        .format(max_length, entry.link))
                    continue

                post.url = entry.link

                # prefer the published date, then the updated date, then "now"
                if getattr(entry, 'published_parsed', None):
                    post.date = strftime(
                        "%Y-%m-%d %H:%M:%S",
                        utc_time_struct_to_local_time_struct(
                            entry.published_parsed))
                elif getattr(entry, 'updated_parsed', None):
                    post.date = strftime(
                        "%Y-%m-%d %H:%M:%S",
                        utc_time_struct_to_local_time_struct(
                            entry.updated_parsed))
                else:
                    post.date = task_started_at

                logger.debug(
                    u"Saving partner_feed.Post with partner id: {}, guid: {}".
                    format(partner.id, post.guid))
                post.save()
                logger.debug(
                    u"Finished saving partner_feed.Post with partner id: {}, guid: {}"
                    .format(partner.id, post.guid))

                created_count += 1

        except Exception:
            # report and keep going; one bad entry must not abort the run
            raven_client.captureException(exc_info=sys.exc_info(),
                                          data=exception_data)

    # return number of added posts
    return created_count
示例#5
0
def update_posts_for_feed_task(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url, and for each entry, parse out the individual
    entries and save each one as a partner_feeds.models.Post.

    :param partner: partner model instance providing ``feed_url`` and ``id``.
    """
    from feedparser import parse
    from partner_feeds.models import Post
    import timelib, re, time

    feed = parse(partner.feed_url)

    for entry in feed.entries:
        p = Post()
        try:

            p.partner_id = partner.id
            p.title = entry.title

            p.subheader = entry.summary

            # author is optional in both RSS and ATOM
            try:
                p.author = entry.author
            except AttributeError:
                pass

            # prefer the stable entry id as the GUID; fall back to the link
            try:
                p.guid = entry.id
            except AttributeError:
                p.guid = entry.link

            p.url = entry.link

            # try to get the date of the entry, otherwise, try the date of the feed
            try:
                # raw string so the regex escape is not treated as an
                # (invalid) string escape sequence
                entry_date = re.sub(r'\|', '', entry.date)
                entry_date = timelib.strtotime(
                    entry_date)  # convert to a timestamp
                entry_date = time.localtime(
                    entry_date
                )  # converts to a time.struct_time (with regards to local timezone)
                entry_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    entry_date)  # converts to mysql date format
                p.date = entry_date
            except AttributeError:
                # feed.date is a time.struct_time, so it can be formatted directly
                p.date = time.strftime("%Y-%m-%d %H:%M:%S", feed.date)

            p.save()
        except AttributeError:
            # a required attribute (e.g. title or summary) was missing; skip this entry
            # TODO: log skipped entries instead of silently ignoring them
            pass