Пример #1
0
def twitter_bot(rss_guid=None):
    """
    Consume a PubMed RSS feed and tweet entries not yet stored in the db.

    If ``rss_guid`` is None, the most recently stored GUID is looked up;
    otherwise the given GUID is persisted as the latest one.  For each new
    PMID a shortened url is obtained and a status is tweeted.

    Returns a list of dicts ({'title': ..., 'url': ...}) for the new entries.
    """
    if rss_guid is None:
        # Re-use the GUID recorded by a previous invocation.
        query = FeedConsume.gql("WHERE entry = :1", "latest")
        result = query.get()
        rss_guid = result.rss_guid
    else:
        # Remember this GUID so later calls without an argument can find it.
        consumer = FeedConsume(parent=ndb.Key("RSS_GUID", rss_guid or "*norss*"),
                               rss_guid=rss_guid, entry="latest")
        consumer.put()

    url = "{}erss.cgi?rss_guid={}".format(conf("pubmed_rss"), rss_guid)
    feeds = feedparser.parse(url)
    tweets = []
    for feed in feeds["items"]:
        # The PMID is the last path segment, before any query string.
        # (Previously rstrip("?dopt=Abstract") was used, which strips a
        # *character set* from the end, not a suffix; splitting on "?" is
        # the correct way to drop the "?dopt=Abstract" query string.)
        last_segment = feed["link"].split("/")[-1]
        if "entrez?" in last_segment:
            # Not an article link -- skip it.
            continue
        pmid = last_segment.split("?")[0]
        query = FeedItem.gql("WHERE pmid = :1", pmid)
        if query.count() == 0:  # pmid not yet in db
            title = feed["title"]
            otitle = title
            url = feed["link"]
            category = feed["category"]
            item = FeedItem()
            item.pmid = pmid

            # shorten the url with Bitly.com
            shorturl = shorten_url_bitly(url)

            # Truncate the title so the status fits in a 140-char tweet;
            # 7 chars are reserved for the "#", ": ", "... " separators.
            max_length = 140 - len(category) - len(shorturl) - 7
            if len(title) > max_length:
                title = title[0:max_length]
            status = "#{}: {}... {}".format("".join(category.split()), title.rstrip(". "), shorturl)
            try:
                # Python 2 only: `unicode` does not exist on Python 3.
                status = unicode(status).encode("utf-8")
            except UnicodeEncodeError:
                pass
                # TODO: add logging

            ttitle = "#{}: {}...".format("".join(category.split()), otitle[0:100].rstrip(". "))
            tweets.append({'title': ttitle, 'url': shorturl})
            try:
                update_status_twitter(status)
                item.put()
            except Exception:
                # Best effort: a failed tweet must not abort the whole batch.
                # TODO: log the failure instead of silently swallowing it.
                pass

    return tweets
Пример #2
0
    def on_post(self, req, resp):
        """Toggle the pinned state of a feed item posted as raw form data."""
        # Body arrives as "item=<id>&pinned=<bool>"; pull both values out.
        fields = req.stream.read().split('&')
        item_id = fields[0].split('=')[1]
        pinned_raw = fields[1].split('=')[1]
        # Anything other than the literal string 'False' counts as pinned.
        pinned = pinned_raw != 'False'

        success = FeedItem.pin(item_id, pinned)

        resp.content_type = 'text/json'
        if success:
            resp.body = json.dumps(FeedItem.get(item_id))
            resp.status = falcon.HTTP_200
        else:
            resp.status = falcon.HTTP_400
Пример #3
0
  def update_feed(self, feed):
    """Fetch `feed`, store entries not seen before, and record its HTTP
    caching metadata (status / last-modified / etag).

    `last_polled` is refreshed even when parsing fails, so the poller
    does not immediately retry a broken feed.
    """
    d = self.parse_feed(feed)
    if d is None:
      # logging.warn is a deprecated alias of logging.warning.
      logging.warning("Parsing failed")
      feed.last_polled = datetime.utcnow()
      feed.put()
      return

    to_put = []
    for entry in d['entries']:
      item = FeedItem.process_entry(entry, feed)
      if item is not None:
        # Only brand-new items are stored; existing ones are left alone.
        item_exists = FeedItem.get_by_key_name(item._key_name)
        if item_exists is None:
          to_put.append(item)
          # TODO: what about updates?

    if to_put:
      db.put(to_put)  # single batch write
      # self.update_mavenn_activity(feed.stream_id, to_put)

    # Record HTTP caching headers for conditional requests on the next poll.
    if hasattr(d, 'status'):
      feed.http_status = str(d.status)
      if hasattr(d, 'modified'):
        feed.http_last_modified = datetime(*d.modified[:6])
      if hasattr(d, 'etag'):
        feed.http_etag = d.etag
    feed.last_polled = datetime.utcnow()
    feed.put()
    return
Пример #4
0
def get_feed_item(url, description, user_id):
    """Fetch `url`, scrape its <title>, and build a FeedItem from it.

    `description` may be None (treated as empty).  A browser-like
    User-Agent is sent because some sites reject the urllib default.
    """
    if description is None:
        description = ""
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozila/5.0"})
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(request) as client:
        web_page = BeautifulSoup(client.read(), "html.parser")
    # NOTE(review): web_page.title is None for pages without a <title>,
    # which would raise AttributeError here -- confirm callers expect that.
    title = web_page.title.string
    item = FeedItem(url, title)
    item.description = description
    item.user_id = user_id
    return item
Пример #5
0
def setup_database(db_file_path=settings.DB_FILE_PATH):
    """Open the database connection and ensure all tables exist."""
    # Imported lazily to avoid a circular import with the models module.
    from models import BaseModel, Feed, FeedItem

    BaseModel.db.connect()

    for model in (Feed, FeedItem):
        model.create_table()
Пример #6
0
    def run(self, feed_id):
        """Replace every stored item of the feed `feed_id` with the feed's
        current entries, then schedule the next update five minutes out."""

        # Get feed object
        feed = Feed.objects.get(id=feed_id)

        LOGGER.debug("Updating feed %s", feed.name)

        # Identify ourselves to feed servers.
        feedparser.USER_AGENT = "Element43Feedreader/Git +https://element-43.com/"

        # NOTE(review): `document.status` is absent when the fetch itself
        # fails (e.g. a network error) -- that would raise AttributeError.
        document = feedparser.parse(feed.url)

        if document.status == 200:
            # Full refresh: drop every existing entry of this feed first.
            FeedItem.objects.filter(feed=feed).delete()

            # Add current content
            for item in document.entries:
                feed_item = FeedItem(
                    feed=feed,
                    # Titles longer than 100 chars are cut to 97 + ellipsis.
                    title=(item.title[:97] +
                           '...') if len(item.title) > 100 else item.title,
                    description=item.summary,
                    link=item.link,
                    published=pytz.utc.localize(
                        datetime.fromtimestamp(mktime(item.published_parsed))))

                # Save item to DB
                feed_item.save()

        else:
            # Logger.warn is a deprecated alias of Logger.warning.
            LOGGER.warning(
                "Error fetching feed %s - server returned status code %d .",
                feed.name, document.status)

        # Set next update to be 5 minutes from now
        five_minutes_from_now = datetime.utcnow() + timedelta(minutes=5)
        feed.next_update = pytz.utc.localize(five_minutes_from_now)
        feed.save()

        LOGGER.debug("Finished updating feed %s", feed.name)
Пример #7
0
    def run(self, feed_id):
        """Refresh the stored items for the feed identified by `feed_id`."""

        feed = Feed.objects.get(id=feed_id)
        LOGGER.debug("Updating feed %s", feed.name)

        # Announce ourselves to the remote server when fetching.
        feedparser.USER_AGENT = "Element43Feedreader/Git +https://element-43.com/"

        document = feedparser.parse(feed.url)

        if document.status != 200:
            LOGGER.warn("Error fetching feed %s - server returned status code %d .", feed.name, document.status)
        else:
            # Wipe the old entries, then store the current ones.
            FeedItem.objects.filter(feed=feed).delete()

            for entry in document.entries:
                published = pytz.utc.localize(datetime.fromtimestamp(mktime(entry.published_parsed)))
                FeedItem(feed=feed,
                         title=entry.title,
                         description=entry.summary,
                         link=entry.link,
                         published=published).save()

        # Schedule the next poll for five minutes from now.
        feed.next_update = pytz.utc.localize(datetime.utcnow() + timedelta(minutes=5))
        feed.save()

        LOGGER.debug("Finished updating feed %s", feed.name)
Пример #8
0
        def _init(self):
                """Trigger a feed refresh when any active feed is older than
                four hours, then load the latest items for display."""
                logging.debug("ShowFeed::_init()")
                feeds = Feed.get_active_feeds()
                if feeds:
                        for feed in feeds:
                                # One stale feed triggers a refresh of all.
                                if time.time() - time.mktime(datetime.datetime.timetuple(feed.last_updated)) >= 3600*4:
                                        logging.debug("Feed '%s' Triggered feed update" % feed.name)
                                        self._updateFeeds()
                                        break

                # NOTE: the original mixed tabs and spaces below this point,
                # which is a TabError under Python 3; indentation is now
                # uniform spaces throughout.
                self.items = FeedItem.get_latest()
                logging.debug("Got %d feed items" % self.items.count())
Пример #9
0
	def get(self):
		"""Serve a feed item's enclosure image, or a placeholder on miss."""
		item_id = self.request.get("id")
		if not item_id:
			self.redirect("/assets/hello.jpg")
			return

		item = FeedItem.get_by_id(int(item_id))
		if not item:
			self.redirect("/assets/hello.jpg")
			return

		# Entities stored before content_type existed are patched up with
		# the historical default on first access.
		if 'content_type' not in item.__dict__:
			item.content_type = 'image/png'
			item.put()

		self.response.headers["Content-Type"] = item.content_type
		self.response.out.write(item.enclosure)
Пример #10
0
 def on_get(self, req, resp):
     """Return a JSON page of feed items plus pagination metadata."""
     collection_type = req.params.get('sort') or 'default'
     start = int(req.params.get('start') or 0)
     count = int(req.params.get('count') or 10)
     # Clamp the paging window: non-negative start, at most 10 items.
     start = max(start, 0)
     if not 0 <= count <= 10:
         count = 10

     items = FeedItem.all(collection_type, start, count)
     next_start = start + count
     data = {
         'pagination': {
             'next_url': '/api/feed/?sort=%s&start=%s&count=%s' % (collection_type, next_start, count),
             'sort': collection_type,
             'start': start,
             'next_start': next_start,
             'count': count,
             'total': FeedItem.count()
         },
         'items': items
     }
     resp.content_type = 'text/json'
     resp.body = json.dumps(data)
     resp.status = falcon.HTTP_200
Пример #11
0
def refresh_rss(url):
    ''' Deletes and re-fills the blog feed database with RSS entries '''
    feed = parse(url)
    if feed['bozo'] == 1:
        # feedparser sets bozo when the feed is malformed.  The original
        # Python-2-only `raise Exception, msg` syntax is replaced with the
        # call form, which works on both Python 2 and 3.
        raise Exception('%s is not a valid RSS stream!' % url)

    # Keep only the five newest entries.
    items = []
    for item in islice(feed['items'], 5):
        feeditem = FeedItem(title=item.title,
                            description=truncate(strip_html(item.description, True)),
                            date=datetime.fromtimestamp(mktime(item.updated_parsed)),
                            url=item.link)
        items.append(feeditem)

    # Replace the old rows within a single session commit.
    FeedItem.query.delete()
    db.session.add_all(items)
    db.session.commit()
Пример #12
0
  def post(self, stream_id):
    """Handles Content Distribution (PubSubHubbub) notifications."""
    logging.debug(self.request.headers)
    logging.debug(self.request.body)

    feed = feedparser.parse(self.request.body)
    if feed.bozo:
      # Malformed feed: log as much diagnostic detail as we can get from
      # the parser, then answer 500 so the hub retries later.
      logging.error('Bozo feed data. %s: %r',
                     feed.bozo_exception.__class__.__name__,
                     feed.bozo_exception)
      if (hasattr(feed.bozo_exception, 'getLineNumber') and
          hasattr(feed.bozo_exception, 'getMessage')):
        line = feed.bozo_exception.getLineNumber()
        logging.error('Line %d: %s', line, feed.bozo_exception.getMessage())
        segment = self.request.body.split('\n')[line-1]
        logging.info('Body segment with error: %r', segment.decode('utf-8'))
      return self.response.set_status(500)

    feedstream = FeedStream.get_by_key_name("z%s" % stream_id)
    if feedstream is None:
      # logging.warn is a deprecated alias of logging.warning.
      logging.warning("Discarding update from unknown feed '%s'", stream_id)
      self.error(404)
      return

    logging.info("Processing update for feed '%s'", feedstream.url)
    logging.info('Found %d entries', len(feed.entries))

    to_put = []  # batch datastore updates
    for entry in feed.entries:
      item = FeedItem.process_entry(entry, feedstream)
      if item is not None:
        to_put.append(item)
    if to_put:
      db.put(to_put)
      # update feed last_polled or http_last_modified so feed poller doesn't have to check this feed for a while
      feedstream.last_polled = datetime.utcnow()
      feedstream.put()
      #self.update_mavenn_activity(feedstream.stream_id, to_put)

    # Response headers (body can be empty)
    # X-Hub-On-Behalf-Of
    self.response.set_status(200)
    self.response.out.write("ok")  # stray trailing semicolon removed
Пример #13
0
def create_feed_item(url, description, user_id, title):
    """Build a FeedItem for `url` with the given metadata attached."""
    feed_item = FeedItem(url, title)
    feed_item.description = description
    feed_item.user_id = user_id
    return feed_item
Пример #14
0
	def get(self):
		"""Poll every feed, fetch new explosm comic entries, and store each
		comic image as a FeedItem enclosure.

		Feeds are skipped when updated within the last four hours, unless
		the request carries force=1.
		"""
		# Compiled once, outside the loop (hoisted loop-invariant work),
		# and as raw strings so the escapes are not interpreted by Python.
		linkre = re.compile(r"http://(?:www\.)?explosm.net/comics/\d+/?")
		comicre = re.compile(r'(http://(?:www\.)?explosm.net/db/files/Comics/[A-z0-9_\-\+]+/[A-z0-9\-_\+]+\.(gif|png))')

		feeds = self._getFeeds()
		logging.debug("Got %d feeds" % feeds.count())
		for feed in feeds:
			logging.debug("Feed %s last updated %s" % (feed.name, feed.last_updated))
			force = self.request.get("force") == "1"
			if force:
				logging.debug("Force option enabled")

			if not force and time.time() - time.mktime(datetime.datetime.timetuple(feed.last_updated)) < 3600*4:
				logging.debug("Feed %s doesn't need updates, skipping" % feed.name)
				continue

			logging.debug("Fetching %s" % feed.url)
			feed_content = urlfetch.fetch(feed.url)
			logging.debug("Fetched, status = %d" % feed_content.status_code)
			if feed_content.status_code != 200:
				logging.error("Failed to load feed %s" % feed.name)
				self.error(500)
				# BUG FIX: execution previously fell through here and used
				# a possibly-undefined parsed_feed; skip this feed instead.
				continue

			parsed_feed = feedparser.parse(feed_content.content)
			feed.last_updated = datetime.datetime.now()
			feed.put()

			logging.debug("Got %d entries" % len(parsed_feed.entries))
			for e in parsed_feed.entries:
				# Guard clauses replace the original deeply nested ifs.
				if not linkre.match(e.link):
					logging.debug("Skipping unknown link %s" % e.link)
					continue
				if FeedItem.is_fetched(e.link):
					logging.debug("Skipping already fetch item %s" % e.link)
					continue

				logging.debug("Going to fetch entry %s" % e.link)
				result = urlfetch.fetch(e.link)
				logging.debug("Fetched, status = %d" % result.status_code)
				if result.status_code != 200:
					logging.debug("Failed to download %s" % e.link)
					continue

				results = comicre.findall(result.content)
				if not results:
					logging.debug("Got no enclosure in %s" % e.link)
					continue

				logging.debug("Going to fetch enclosure %s" % results[0][0])
				enclosure = urlfetch.fetch(results[0][0])
				logging.debug("Fetched, status = %d" % enclosure.status_code)
				if enclosure.status_code != 200:
					logging.error("Failed to fetch enclosure %s" % results[0])
					continue

				feed_item = FeedItem()
				feed_item.title = e.title
				feed_item.url = e.link
				feed_item.content_type = "image/"+results[0][1]
				feed_item.feed = feed
				feed_item.date = datetime.datetime.fromtimestamp(time.mktime(e.updated_parsed))
				feed_item.content = db.Text(e.description)
				feed_item.enclosure = enclosure.content
				feed_item.put()
Пример #15
0
    def check(self):
        """Fetch the feed, diff each entry against the stored content, and
        send notifications for changed entries (skipped on the first run)."""

        logger.debug("{name} 正在检查更新", name=self.name)

        @retry(stop_max_attempt_number=3)
        def fetch():
            # Pull the raw feed text and parse it; retried up to 3 times.
            logger.debug(f"{self.name} 正在抓取" + self.feed_url)
            feed_text = requests.get(self.feed_url).text
            logger.debug(f"{self.name} 抓取成功")
            parsed = feedparser.parse(feed_text)
            assert not parsed.get("bozo_exception")
            entries = parsed['entries']
            logger.debug(f"{self.name} 获取到{len(entries)}条")

            if not entries:
                # An empty feed is treated as a fetch failure so @retry fires.
                logger.error(f"{self.name} 获取到的feed为空")
                raise Exception()

            return entries

        entries = fetch()

        changed_items = []
        for entry in entries:
            item = FeedItem()
            item.title = entry["title"]
            item.uuid = entry['id']
            item.link = entry['link']
            item.content = entry['summary']
            item.full_content = f"【{item.title}】 {item.content}"

            previous = self.storage.get(item.uuid)
            if previous == item.full_content:
                continue  # unchanged since last check

            # Persist the new content before notifying.
            self.storage.set(item.uuid, item.full_content)
            item.is_new = previous is None
            item.old_full_content = previous

            logger.info(
                f"{self.name} {'新增' if item.is_new else '修改'}内容 {item.title} - {item.uuid}"
            )

            changed_items.append(item)

        logger.info(f"{self.name} 检测到变化条目%d" % len(changed_items))

        if self.first_run:
            # First run just primes the storage; no notifications go out.
            logger.info(f"{self.name} 第一次运行,不通知")
            self.first_run = False
            return

        if len(changed_items) > 5:
            logger.warning(f"{self.name} 周期内更新的条目太多")
            return

        for changed in changed_items:
            for notifier in self.notifies:

                @retry(stop_max_attempt_number=3)
                def deliver():
                    logger.info(f"{self.name} 正在通过{notifier}发送通知")
                    notifier.notify(changed)

                deliver()