Example #1
File: blogs.py Project: ning/ning-sync
    def get(self):
        """
        Query the DB and queue any feeds that haven't been processed within
        the last update_interval
        """

        update_interval = timedelta(hours=1)

        current_datetime = timeutils.now_utc()

        query = ContentFeed.all()
        query.filter("last_update <", current_datetime - update_interval)

        if query.count() == 0:
            logging.debug("No entries to queue")
        else:
            for feed in query:
                # Build the parameters for the feed consumer task

                last_update = timeutils.add_utc_tzinfo(feed.last_update)
                feed_consumer_params = {
                    "feed_key": feed.key(),
                    "owner_id": feed.owner.user_id()
                }

                try:
                    taskqueue.add(url="/blogs/feed/consumer",
                                  params=feed_consumer_params)
                    logging.debug("Queued feed: \"%s\" %s" %
                                  (feed.url, last_update.ctime()))
                except taskqueue.Error:
                    logging.error("Unable to queue feed: \"%s\"", feed.url)
                    return
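
The ContentFeed model itself isn't shown on this page. As a hedged sketch only, the attributes the handler above reads (feed.url, feed.last_update, feed.owner, plus key()) suggest a db.Model along these lines; the property types are assumptions, not the project's actual definitions:

# Hypothetical ContentFeed model, inferred from the attribute accesses above.
from google.appengine.ext import db

class ContentFeed(db.Model):
    url = db.LinkProperty(required=True)    # feed URL to poll
    owner = db.UserProperty(required=True)  # member who registered the feed
    last_update = db.DateTimeProperty()     # stored naive, treated as UTC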
Example #2
File: blogs.py Project: ning/ning-sync
    def get(self):
        """
        Query the DB and queue any entries that haven't been uploaded yet
        """

        query = ContentEntry.all()
        query.filter("pub_date <", timeutils.now_utc())
        query.filter("ning_id =", None)

        if query.count() == 0:
            logging.debug("No entries to queue")
        else:
            for entry in query:

                # Quadratic backoff: wait retry_count**2 minutes after
                # pub_date before retrying, and give up after 100 retries
                if entry.retry_count > 100:
                    logging.info("Too many retries, deleting \"%s\"" %
                                 entry.title)
                    entry.delete()
                    continue
                next_try = timeutils.add_utc_tzinfo(entry.pub_date + timedelta(
                    minutes=entry.retry_count**2))
                if next_try > timeutils.now_utc():
                    logging.debug("Too soon to retry, will try again at %s" %
                                  next_try.ctime())
                    continue

                entry_consumer_params = {"entry_key": entry.key()}
                try:
                    taskqueue.add(url="/blogs/entry/consumer",
                                  params=entry_consumer_params)
                    logging.debug("Queued entry: \"%s\" %s" %
                                  (entry.title, entry.pub_date.ctime()))
                except taskqueue.Error:
                    logging.error("Unable to queue entry: \"%s\"",
                                  entry.title)
                    return
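
Note that the backoff above is quadratic: an entry only becomes eligible for another attempt retry_count**2 minutes after pub_date (1, 4, 9, ... minutes), and it is deleted outright after 100 retries. As with ContentFeed, the ContentEntry model isn't shown here; a minimal sketch consistent with the fields these handlers touch (types and defaults are assumptions) would be:

# Hypothetical ContentEntry model, inferred from usage in the examples above.
from google.appengine.ext import db

class ContentEntry(db.Model):
    title = db.StringProperty(required=True)
    description = db.TextProperty()              # post body, may contain HTML
    link = db.LinkProperty()                     # URL of the original post
    owner = db.UserProperty()                    # member who owns the feed
    pub_date = db.DateTimeProperty()             # stored naive, treated as UTC
    ning_id = db.StringProperty()                # set once uploaded to Ning
    retry_count = db.IntegerProperty(default=0)  # presumably incremented on
                                                 # failed uploads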
Example #3
class FeedConsumer(webapp.RequestHandler):
    def post(self):
        """
        Use feedparser to save any blog posts from the given URL since
        the given timestamp
        """

        feed_key = self.request.get("feed_key", None)
        if not feed_key:
            logging.error("No feed URL provided")
            return

        feed = ContentFeed.get(feed_key)

        if not feed:
            logging.error("Couldn't find feed in the DB \"%s\"" % feed_key)
            return

        logging.debug("Dequeued feed: \"%s\"" % (feed.url))

        last_update = timeutils.add_utc_tzinfo(feed.last_update)
        logging.debug("Last processed feed on: %s" % last_update.ctime())

        try:
            result = urlfetch.fetch(feed.url)
        except urlfetch.Error, e:
            logging.warn("Exception when fetching feed: \"%s\" %s" %
                         (feed.url, e))
            return

        if result.status_code != 200:
            logging.warn("Unable to fetch feed: (%s) \"%s\"" %
                         (result.status_code, feed.url))
            return

        current_datetime = timeutils.now_utc()

        d = feedparser.parse(result.content)
        if not d.feed:
            logging.error("Unable to parse feed: \"%s\"" % feed.url)
            return

        for entry in d.entries:
            if not "updated_parsed" in entry:
                logging.warning("Entry doesn't have an updated date, skipping")
                continue
            if not "title" in entry:
                logging.warning("Entry is missing a title, skipping")
                continue
            if not "description" in entry:
                logging.warning("Entry is missing a description, skipping")
                continue
            if not "link" in entry:
                logging.warning("Entry is missing a link, skipping")
                continue

            entry_datetime = timeutils.struct_to_datetime(entry.updated_parsed)

            if entry_datetime < last_update:
                logging.debug("Stopping processing with: \"%s\" @ %s" %
                              (entry.title, entry_datetime.ctime()))
                break

            if "content" in entry and len(entry.content) > 0:
                entry_content = entry.content[0].value
            else:
                entry_content = ""

            if "description" in entry:
                entry_description = entry.description
            else:
                entry_description = ""

            # Choose a body that has the most content
            if len(entry_content) > len(entry_description):
                body = entry_content
            else:
                body = entry_description

            body = '%s\n\n<a href="%s">Original post</a>' % (body, entry.link)

            # Save the entry to the DB
            db_entry = ContentEntry(title=entry.title,
                                    description=body,
                                    owner=feed.owner,
                                    link=entry.link)
            db_entry.put()

            logging.info("Saved entry: \"%s\" @ %s" %
                         (entry.title, entry_datetime.ctime()))

        feed.last_update = current_datetime
        feed.put()
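
All three examples lean on a small timeutils helper that this page doesn't include. Judging purely from the call sites (now_utc(), add_utc_tzinfo(), struct_to_datetime()), a plausible minimal implementation looks like the sketch below; the project's real module may differ. The tzinfo juggling exists because the App Engine datastore hands back naive datetimes, while feedparser's updated_parsed is a UTC time.struct_time.

# Hypothetical timeutils module, reconstructed from how it is called above.
import calendar
from datetime import datetime, timedelta, tzinfo

class _UTC(tzinfo):
    """Minimal UTC tzinfo (Python 2 has no datetime.timezone.utc)."""
    def utcoffset(self, dt):
        return timedelta(0)
    def tzname(self, dt):
        return "UTC"
    def dst(self, dt):
        return timedelta(0)

UTC = _UTC()

def now_utc():
    # Current time as a timezone-aware UTC datetime
    return datetime.now(UTC)

def add_utc_tzinfo(dt):
    # Attach UTC tzinfo to a naive datetime (e.g. one loaded from the datastore)
    return dt.replace(tzinfo=UTC)

def struct_to_datetime(st):
    # Convert a UTC time.struct_time (as produced by feedparser) to an
    # aware datetime
    return datetime.fromtimestamp(calendar.timegm(st), UTC)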