def twitter_bot(rss_guid=None):
    """
    Consumes a feed and checks if there are new entries in the db.
    If so, gets a shortened URL and tweets the new status.
    """
    if rss_guid is None:
        # ancestor_key = ndb.Key("RSS_GUID", rss_guid or "*norss*")
        # consumer = FeedConsume.get_last_rss_guid(ancestor_key)
        # rss_guid = consumer[0].rss_guid
        query = FeedConsume.gql("WHERE entry = :1", "latest")
        result = query.get()
        rss_guid = result.rss_guid
    else:
        consumer = FeedConsume(parent=ndb.Key("RSS_GUID", rss_guid or "*norss*"),
                               rss_guid=rss_guid,
                               entry="latest")
        consumer.put()

    url = "{}erss.cgi?rss_guid={}".format(conf("pubmed_rss"), rss_guid)
    feeds = feedparser.parse(url)

    tweets = []
    for feed in feeds["items"]:
        pmid = (feed["link"].split("/")[-1]).rstrip("?dopt=Abstract")
        if "entrez?" in pmid:
            continue

        query = FeedItem.gql("WHERE pmid = :1", pmid)
        # Only tweet entries whose PMID is not yet in the db.
        if query.count() == 0:
            title = feed["title"]
            otitle = title
            url = feed["link"]
            category = feed["category"]

            item = FeedItem()
            item.pmid = pmid

            # Shorten the URL with Bitly.com.
            shorturl = shorten_url_bitly(url)

            # Budget for the tweet: 140 chars minus hashtag, short URL and separators.
            max_length = 140 - len(category) - len(shorturl) - 7
            if len(title) > max_length:
                title = title[0:max_length]
            status = "#{}: {}... {}".format("".join(category.split()),
                                            title.rstrip(". "), shorturl)
            try:
                status = unicode(status).encode("utf-8")
            except UnicodeEncodeError:
                pass  # TODO: add logging

            ttitle = "#{}: {}...".format("".join(category.split()),
                                         otitle[0:100].rstrip(". "))
            tweets.append({'title': ttitle, 'url': shorturl})

            # Tweet the new status and persist the item; Twitter failures are swallowed.
            try:
                update_status_twitter(status)
                item.put()
            except Exception:
                pass  # TODO: log failed tweets

    return tweets
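# The 140-character budget in twitter_bot() above reserves 7 characters for the
# literal "#", ": " and "... " separators in the status template. A quick
# illustrative calculation with made-up lengths: a 14-character category hashtag
# and a 20-character short URL leave 140 - 14 - 20 - 7 = 99 characters for the title.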
def on_post(self, req, resp):
    # The request body is a form-encoded string such as "item_id=<id>&pinned=<True|False>".
    result = req.stream.read().split('&')
    item_id = result[0].split('=')[1]
    pinned = result[1].split('=')[1]
    if pinned == 'False':
        pinned = False
    else:
        pinned = True

    success = FeedItem.pin(item_id, pinned)
    resp.content_type = 'text/json'
    if success:
        resp.body = json.dumps(FeedItem.get(item_id))
        resp.status = falcon.HTTP_200
    else:
        resp.status = falcon.HTTP_400
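# The manual split('&')/split('=') parsing in on_post() above breaks on URL-encoded
# values. A minimal sketch of a more robust variant using the standard library's
# urllib.parse.parse_qs; the helper name parse_pin_request is an assumption for
# illustration, not part of the original handler.
from urllib.parse import parse_qs

def parse_pin_request(body):
    """Parse a form-encoded body like 'item_id=3&pinned=False' into (item_id, pinned)."""
    fields = parse_qs(body)
    item_id = fields['item_id'][0]
    pinned = fields['pinned'][0] != 'False'
    return item_id, pinned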
def update_feed(self, feed):
    """Fetch the feed and process new items"""
    d = self.parse_feed(feed)
    if d is None:
        logging.warn("Parsing failed")
        feed.last_polled = datetime.utcnow()
        feed.put()
        return

    to_put = []
    for entry in d['entries']:
        item = FeedItem.process_entry(entry, feed)
        if item is not None:
            item_exists = FeedItem.get_by_key_name(item._key_name)
            if item_exists is None:
                to_put.append(item)
            # TODO: what about updates?

    if len(to_put) > 0:
        db.put(to_put)
        # self.update_mavenn_activity(feed.stream_id, to_put)

    # update feedstream properties
    if hasattr(d, 'status'):
        feed.http_status = str(d.status)
    if hasattr(d, 'modified'):
        feed.http_last_modified = datetime(*d.modified[:6])
    if hasattr(d, 'etag'):
        feed.http_etag = d.etag
    feed.last_polled = datetime.utcnow()
    feed.put()
    return
def get_feed_item(url, description, user_id):
    if description is None:
        description = ""
    # Fetch the page and use its <title> as the item title.
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0"})
    client = urllib.request.urlopen(request)
    web_page = BeautifulSoup(client.read(), "html.parser")
    title = web_page.title.string

    item = FeedItem(url, title)
    item.description = description
    item.user_id = user_id
    return item
def setup_database(db_file_path=settings.DB_FILE_PATH):
    """
    Connect the database. Create the database tables if they
    don't exist already.
    """
    # import models here to avoid circular references
    from models import BaseModel, Feed, FeedItem

    BaseModel.db.connect()
    Feed.create_table()
    FeedItem.create_table()
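# A minimal usage sketch for setup_database(): call it once at process start-up,
# before any queries run. The __main__ guard is illustrative only and not part of
# the original module.
if __name__ == "__main__":
    setup_database()
    # ... start the application once the tables exist ...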
def run(self, feed_id):
    # Get feed object
    feed = Feed.objects.get(id=feed_id)

    # Debug log
    LOGGER.debug("Updating feed %s", feed.name)

    # Set User-Agent header
    feedparser.USER_AGENT = "Element43Feedreader/Git +https://element-43.com/"

    # Fetch feed
    document = feedparser.parse(feed.url)

    if document.status == 200:
        # Remove all existing entries of that feed
        FeedItem.objects.filter(feed=feed).delete()

        # Add current content
        for item in document.entries:
            # Create item
            feed_item = FeedItem(
                feed=feed,
                title=(item.title[:97] + '...') if len(item.title) > 100 else item.title,
                description=item.summary,
                link=item.link,
                published=pytz.utc.localize(
                    datetime.fromtimestamp(mktime(item.published_parsed))))

            # Save item to DB
            feed_item.save()
    else:
        # Log HTTP error
        LOGGER.warn(
            "Error fetching feed %s - server returned status code %d.",
            feed.name, document.status)

    # Set next update to be 5 minutes from now
    five_minutes_from_now = datetime.utcnow() + timedelta(minutes=5)
    feed.next_update = pytz.utc.localize(five_minutes_from_now)
    feed.save()

    LOGGER.debug("Finished updating feed %s", feed.name)
def run(self, feed_id):
    # Get feed object
    feed = Feed.objects.get(id=feed_id)

    # Debug log
    LOGGER.debug("Updating feed %s", feed.name)

    # Set User-Agent header
    feedparser.USER_AGENT = "Element43Feedreader/Git +https://element-43.com/"

    # Fetch feed
    document = feedparser.parse(feed.url)

    if document.status == 200:
        # Remove all existing entries of that feed
        FeedItem.objects.filter(feed=feed).delete()

        # Add current content
        for item in document.entries:
            # Create item
            feed_item = FeedItem(feed=feed,
                                 title=item.title,
                                 description=item.summary,
                                 link=item.link,
                                 published=pytz.utc.localize(datetime.fromtimestamp(mktime(item.published_parsed))))

            # Save item to DB
            feed_item.save()
    else:
        # Log HTTP error
        LOGGER.warn("Error fetching feed %s - server returned status code %d.",
                    feed.name, document.status)

    # Set next update to be 5 minutes from now
    five_minutes_from_now = datetime.utcnow() + timedelta(minutes=5)
    feed.next_update = pytz.utc.localize(five_minutes_from_now)
    feed.save()

    LOGGER.debug("Finished updating feed %s", feed.name)
def _init(self):
    logging.debug("ShowFeed::_init()")
    feeds = Feed.get_active_feeds()
    if feeds:
        for feed in feeds:
            # Trigger a refresh if any active feed is older than four hours.
            if time.time() - time.mktime(datetime.datetime.timetuple(feed.last_updated)) >= 3600 * 4:
                logging.debug("Feed '%s' triggered feed update" % feed.name)
                self._updateFeeds()
                break
    self.items = FeedItem.get_latest()
    logging.debug("Got %d feed items" % self.items.count())
def get(self):
    if self.request.get("id"):
        item = FeedItem.get_by_id(int(self.request.get("id")))
        if item:
            # Older items may predate the content_type property; default to PNG.
            if 'content_type' not in item.__dict__:
                item.content_type = 'image/png'
                item.put()
            self.response.headers["Content-Type"] = item.content_type
            self.response.out.write(item.enclosure)
        else:
            self.redirect("/assets/hello.jpg")
    else:
        self.redirect("/assets/hello.jpg")
def on_get(self, req, resp):
    collection_type = req.params.get('sort') or 'default'
    start = int(req.params.get('start') or 0)
    count = int(req.params.get('count') or 10)

    # Clamp paging parameters to sane bounds.
    if start < 0:
        start = 0
    if count < 0 or count > 10:
        count = 10

    items = FeedItem.all(collection_type, start, count)
    data = {
        'pagination': {
            'next_url': '/api/feed/?sort=%s&start=%s&count=%s' % (collection_type, start + count, count),
            'sort': collection_type,
            'start': start,
            'next_start': start + count,
            'count': count,
            'total': FeedItem.count()
        },
        'items': items
    }

    resp.content_type = 'text/json'
    resp.body = json.dumps(data)
    resp.status = falcon.HTTP_200
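# Wiring sketch for the paginated feed endpoint above. The resource class name
# FeedCollectionResource is an assumption; only the '/api/feed/' route shape comes
# from the handler's next_url. falcon.App() is the falcon 3.x entry point
# (falcon.API() on older releases).
import falcon

class FeedCollectionResource:
    # The on_get handler from the snippet above would live on this class.
    pass

app = falcon.App()
app.add_route('/api/feed/', FeedCollectionResource())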
def refresh_rss(url):
    '''
    Deletes and re-fills the blog feed database with RSS entries.
    '''
    feed = parse(url)
    if feed['bozo'] == 1:
        raise Exception('%s is not a valid RSS stream!' % url)

    # Build FeedItem objects from the five most recent entries.
    items = []
    for item in islice(feed['items'], 5):
        feeditem = FeedItem(title=item.title,
                            description=truncate(strip_html(item.description, True)),
                            date=datetime.fromtimestamp(mktime(item.updated_parsed)),
                            url=item.link)
        items.append(feeditem)

    # Replace the existing entries and commit in one go.
    FeedItem.query.delete()
    db.session.add_all(items)
    db.session.commit()
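# Hypothetical scheduled refresh, assuming the Flask-SQLAlchemy setup implied by
# db.session above; `app` and the feed URL are placeholders, not values from the
# original project.
with app.app_context():
    refresh_rss("https://example.com/blog/rss.xml")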
def post(self, stream_id):
    """Handles Content Distribution notifications."""
    logging.debug(self.request.headers)
    logging.debug(self.request.body)

    feed = feedparser.parse(self.request.body)
    if feed.bozo:
        logging.error('Bozo feed data. %s: %r',
                      feed.bozo_exception.__class__.__name__,
                      feed.bozo_exception)
        if (hasattr(feed.bozo_exception, 'getLineNumber') and
                hasattr(feed.bozo_exception, 'getMessage')):
            line = feed.bozo_exception.getLineNumber()
            logging.error('Line %d: %s', line, feed.bozo_exception.getMessage())
            segment = self.request.body.split('\n')[line - 1]
            logging.info('Body segment with error: %r', segment.decode('utf-8'))
        return self.response.set_status(500)

    feedstream = FeedStream.get_by_key_name("z%s" % stream_id)
    if feedstream is None:
        logging.warn("Discarding update from unknown feed '%s'", stream_id)
        self.error(404)
        return

    logging.info("Processing update for feed '%s'", feedstream.url)
    logging.info('Found %d entries', len(feed.entries))

    to_put = []  # batch datastore updates
    for entry in feed.entries:
        item = FeedItem.process_entry(entry, feedstream)
        if item is not None:
            to_put.append(item)
    if len(to_put) > 0:
        db.put(to_put)

    # update feed last_polled or http_last_modified so feed poller doesn't have to check this feed for a while
    feedstream.last_polled = datetime.utcnow()
    feedstream.put()
    # self.update_mavenn_activity(feedstream.stream_id, to_put)

    # Response headers (body can be empty): X-Hub-On-Behalf-Of
    self.response.set_status(200)
    self.response.out.write("ok")
def create_feed_item(url, description, user_id, title):
    item = FeedItem(url, title)
    item.description = description
    item.user_id = user_id
    return item
def get(self):
    feeds = self._getFeeds()
    logging.debug("Got %d feeds" % feeds.count())
    for feed in feeds:
        logging.debug("Feed %s last updated %s" % (feed.name, feed.last_updated))
        force = self.request.get("force") == "1"
        if force:
            logging.debug("Force option enabled")
        # Skip feeds refreshed within the last four hours unless forced.
        if not force and time.time() - time.mktime(datetime.datetime.timetuple(feed.last_updated)) < 3600 * 4:
            logging.debug("Feed %s doesn't need updates, skipping" % feed.name)
            continue

        logging.debug("Fetching %s" % feed.url)
        feed_content = urlfetch.fetch(feed.url)
        logging.debug("Fetched, status = %d" % feed_content.status_code)
        if feed_content.status_code == 200:
            parsed_feed = feedparser.parse(feed_content.content)
            feed.last_updated = datetime.datetime.now()
            feed.put()
        else:
            logging.error("Failed to load feed %s" % feed.name)
            self.error(500)
            continue  # skip this feed; there is nothing to parse

        linkre = re.compile(r"http://(?:www\.)?explosm.net/comics/\d+/?")
        comicre = re.compile(r'(http://(?:www\.)?explosm.net/db/files/Comics/[A-z0-9_\-\+]+/[A-z0-9\-_\+]+\.(gif|png))')
        logging.debug("Got %d entries" % len(parsed_feed.entries))
        for e in parsed_feed.entries:
            if linkre.match(e.link):
                if not FeedItem.is_fetched(e.link):
                    logging.debug("Going to fetch entry %s" % e.link)
                    result = urlfetch.fetch(e.link)
                    logging.debug("Fetched, status = %d" % result.status_code)
                    if result.status_code == 200:
                        results = comicre.findall(result.content)
                        if results and len(results) > 0:
                            logging.debug("Going to fetch enclosure %s" % results[0][0])
                            enclosure = urlfetch.fetch(results[0][0])
                            logging.debug("Fetched, status = %d" % enclosure.status_code)
                            if enclosure.status_code == 200:
                                feed_item = FeedItem()
                                feed_item.title = e.title
                                feed_item.url = e.link
                                feed_item.content_type = "image/" + results[0][1]
                                feed_item.feed = feed
                                feed_item.date = datetime.datetime.fromtimestamp(time.mktime(e.updated_parsed))
                                feed_item.content = db.Text(e.description)
                                feed_item.enclosure = enclosure.content
                                feed_item.put()
                            else:
                                logging.error("Failed to fetch enclosure %s" % results[0])
                        else:
                            logging.debug("Got no enclosure in %s" % e.link)
                    else:
                        logging.debug("Failed to download %s" % e.link)
                else:
                    logging.debug("Skipping already fetched item %s" % e.link)
            else:
                logging.debug("Skipping unknown link %s" % e.link)
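# For reference, the two patterns in get() above are meant to work together:
# linkre matches comic permalinks such as "http://www.explosm.net/comics/1234/",
# while comicre's findall() returns (full_image_url, extension) tuples, e.g.
# ("http://www.explosm.net/db/files/Comics/Author/strip.png", "png"), which is why
# the code reads results[0][0] for the URL and results[0][1] for the content type.
# The example URLs are illustrative, not taken from a real feed.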
def check(self):
    logger.debug("{name} checking for updates", name=self.name)

    @retry(stop_max_attempt_number=3)
    def fetch():
        logger.debug(f"{self.name} fetching " + self.feed_url)
        feed_text = requests.get(self.feed_url).text
        logger.debug(f"{self.name} fetch succeeded")
        f = feedparser.parse(feed_text)
        assert not f.get("bozo_exception")
        feed_entries = f['entries']
        logger.debug(f"{self.name} got {len(feed_entries)} entries")
        if len(feed_entries) == 0:
            logger.error(f"{self.name} feed returned no entries")
            raise Exception()
        return feed_entries

    feed_entries = fetch()

    changed_items = []
    for e in feed_entries:
        item = FeedItem()
        item.title = e["title"]
        item.uuid = e['id']
        item.link = e['link']
        item.content = e['summary']
        item.full_content = f"【{item.title}】 {item.content}"

        # Compare against the stored version to detect new or modified entries.
        last = self.storage.get(item.uuid)
        if last != item.full_content:
            self.storage.set(item.uuid, item.full_content)
            item.is_new = (last is None)
            item.old_full_content = last
            logger.info(
                f"{self.name} {'new' if item.is_new else 'modified'} entry {item.title} - {item.uuid}"
            )
            changed_items.append(item)

    logger.info(f"{self.name} detected {len(changed_items)} changed entries")

    if self.first_run:
        logger.info(f"{self.name} first run, skipping notifications")
        self.first_run = False
        return

    if len(changed_items) > 5:
        logger.warning(f"{self.name} too many entries updated in this cycle")
    else:
        for item in changed_items:
            for n in self.notifies:
                @retry(stop_max_attempt_number=3)
                def do_notify():
                    logger.info(f"{self.name} sending notification via {n}")
                    n.notify(item)
                do_notify()
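# check() above relies on self.storage exposing get(key) and set(key, value). The
# real backing store is not shown here, so this dictionary-backed stand-in is an
# assumption, useful mainly for exercising the change-detection logic.
class DictStorage:
    """In-memory key/value store matching the get/set interface used by check()."""

    def __init__(self):
        self._data = {}

    def get(self, key):
        return self._data.get(key)

    def set(self, key, value):
        self._data[key] = value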