def downloadSource(self, source):
    """Fetch *source*'s feed (conditionally, via its stored ETag), build an
    Article entity per entry, and persist them when the server returned
    fresh content with a new ETag.

    source -- datastore entity exposing .url, .etag, .name, .key.
    """
    # Only send the ETag when one exists; otherwise do a plain fetch.
    if source.etag is not None:
        d = feedparser.parse(source.url, etag=source.etag)
    else:
        d = feedparser.parse(source.url)
    save_in_db = False
    articles = []
    # A missing status or a 304 (not modified) means nothing new to store.
    if 'status' in d and d.status == 200:
        if 'etag' in d:
            # Remember the fresh ETag so the next poll can be conditional.
            save_in_db = True
            source.etag = d.etag
        for entry in d.entries:
            # Feed entries are not guaranteed to carry every field.
            title = entry.title if 'title' in entry else None
            summary = entry.summary if 'summary' in entry else None
            description = entry.description if 'description' in entry else None
            link = entry.link if 'link' in entry else None
            article = Article(parent=ndb.Key('Article', source.name),
                              published=datetime(*entry.published_parsed[:6]),
                              source=source.key, title=title,
                              description=description, summary=summary,
                              link=link)
            articles.append(article)
    if save_in_db:
        self.bulkEntityInsertion(articles)
        source.put()
    # NOTE(review): removed a stray debug `print articles` that dumped every
    # entity to stdout on each fetch.
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % ( identity, self.feed.title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None address = self.feed.feed_address if (self.options.get('force') or random.random() <= .01): modified = None etag = None address = cache_bust_url(address) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (self.feed.title[:30], address)) elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None USER_AGENT = 'NewsBlur Feed Fetcher - %s subscriber%s - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL) if self.options.get('feed_xml'): logging.debug( u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (self.feed.title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug( u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (self.feed.title[:30])) return FEED_OK, self.fpf try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError), e: logging.debug(u' ***> [%-30s] ~FR%s, turning off microformats.' % (self.feed.title[:30], e)) feedparser.PARSE_MICROFORMATS = False self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) feedparser.PARSE_MICROFORMATS = True
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity, self.feed.title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag=self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None address = self.feed.feed_address if (self.options.get('force') or random.random() <= .01): modified = None etag = None address = cache_bust_url(address) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % ( self.feed.title[:30], address)) elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None USER_AGENT = 'NewsBlur Feed Fetcher - %s subscriber%s - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL ) if self.options.get('feed_xml'): logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % ( self.feed.title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % ( self.feed.title[:30])) return FEED_OK, self.fpf try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError), e: logging.debug(u' ***> [%-30s] ~FR%s, turning off microformats.' % (self.feed.title[:30], e)) feedparser.PARSE_MICROFORMATS = False self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) feedparser.PARSE_MICROFORMATS = True
def fetch(self): """ Downloads and parses a feed. """ socket.setdefaulttimeout(30) identity = self.get_identity() log_msg = u'%2s ---> [%-30s] Fetching feed (%d)' % (identity, unicode(self.feed)[:30], self.feed.id) logging.debug(log_msg) # Check if feed still needs to be updated # feed = Feed.objects.get(pk=self.feed.pk) # if feed.next_scheduled_update > datetime.datetime.now() and not self.options.get('force'): # log_msg = u' ---> Already fetched %s (%d)' % (self.feed.feed_title, # self.feed.id) # logging.debug(log_msg) # feed.save_feed_history(303, "Already fetched") # return FEED_SAME, None # else: self.feed.set_next_scheduled_update() etag=self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None if self.options.get('force'): modified = None etag = None self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) return FEED_OK, self.fpf
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity, unicode(self.feed)[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) self.feed.set_next_scheduled_update() etag=self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None if self.options.get('force') or not self.feed.fetched_once: modified = None etag = None USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL ) self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) return FEED_OK, self.fpf
def _blog_fetch_callback(self, rpc):
    """Render the moderation page: each fetched post is paired with its
    approval time (if any) and new tags are folded into the Tags entity."""
    feed_xml = rpc.get_result().content
    parsed = feedparser.parse(StringIO(feed_xml))
    sig_entity = Signatures.get_single()
    # Map post signature -> approval timestamp.
    approval_times = dict(zip(sig_entity.hashes, sig_entity.times))
    tags_entity = Tags.get_single()
    known_tags = set(tags_entity.available)
    posts = []
    for entry in parsed['entries']:
        post = BlogPost.blog_post_from_feed_entry(entry)
        approved_at = approval_times.get(post.signature)
        if approved_at:
            # Present the raw timestamp as a human-readable string.
            approved_at = datetime.datetime.fromtimestamp(approved_at).strftime('%m/%d/%Y %H:%M')
        posts.append((post, approved_at))
        known_tags.update(tag.lower() for tag in post.tags)
    tags_entity.available = list(known_tags)
    tags_entity.save()
    template = jinja_environment.get_template('moderate.html')
    self.response.out.write(template.render({"posts": posts}))
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ identity = self.get_identity() log_msg = u'%2s ---> [%-30s] Fetching feed (%d)' % ( identity, unicode(self.feed)[:30], self.feed.id) logging.debug(log_msg) self.feed.set_next_scheduled_update() etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None if self.options.get('force') or not self.feed.fetched_once: modified = None etag = None USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', URL) self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) return FEED_OK, self.fpf
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ identity = self.get_identity() log_msg = u'%2s ---> [%-30s] Fetching feed (%d)' % (identity, unicode(self.feed)[:30], self.feed.id) logging.debug(log_msg) self.feed.set_next_scheduled_update() etag=self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None if self.options.get('force') or not self.feed.fetched_once: modified = None etag = None USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', URL ) self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) return FEED_OK, self.fpf
def main(self): """ 处理数据 """ rss_list = session.query(CheckList).filter(CheckList.status == 1).all() for item in rss_list: try: article_list = (feedparser.parse(item.url)).entries # 倒序输出 for _c in reversed(article_list): black_list_key = False # 关键词黑名单 for _black in self.black_list: if _black in _c.title: black_list_key = True if black_list_key: continue _hash = self._url_hash(_c.link) _status = self._check_hash(item.id, _hash) if not _status: # 入库 _time = int(time.time()) read_info = ReadList(checklist_id=int(item.id), hash=_hash, add_time=_time) session.add(read_info) session.commit() # 钉钉机器人 AliHook().ali_hook(item.nickname, item.tag, _c.title, _c.link) except Exception as e: logging.warn("_data error: {}".format(str(e)))
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % ( identity, unicode(self.feed)[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) self.feed.set_next_scheduled_update() etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None if self.options.get('force') or not self.feed.fetched_once: modified = None etag = None USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL) self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) return FEED_OK, self.fpf
# Parse *xml* as a feed and merge its items into the shared self.items map
# (keyed by link, valued (title, summary)), bumping self.articles by the
# item count and signalling a consumer via the u_sem semaphore.
# NOTE(review): nesting is ambiguous in this collapsed source — u_sem may be
# released once per item or once per feed; confirm against the consumer.
# NOTE(review): u_mut is acquired/released without try/finally, so an
# exception between them (e.g. a missing 'link' key) would leak the lock.
def add_feed(self, xml): feeds = feedparser.parse(xml) self.articles += len(feeds['items']) for f in feeds['items']: self.u_mut.acquire () self.items[f['link']] = (f['title'], f['summary']) self.u_mut.release() self.u_sem.release()
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY)' % (identity, self.feed.title[:30], self.feed.id) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None address = self.feed.feed_address # If is forced or random is less than 1%, set modified = None and etag = None, # means it will fetch new if (self.options.get('force') or random.random() <= .01): modified = None etag = None address = cache_bust_url(address) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % ( self.feed.title[:30], address)) # If this feed_id in not fetched once before or not known_good elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None USER_AGENT = ('NewsBlur Feed Fetcher - %s ' '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) ' 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 ' 'Safari/534.48.3)' % ( self.feed.permalink, )) try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError, KeyError), e: logging.debug(u' ***> [%-30s] ~FR%s, turning off headers.' % (self.feed.title[:30], e)) self.fpf = feedparser.parse(address, agent=USER_AGENT)
def _blog_fetch_callback(self, rpc):
    """Render main.html with the blog posts whose signatures are approved."""
    feed_xml = rpc.get_result().content
    parsed = feedparser.parse(StringIO(feed_xml))
    approved = Signatures.signatures()
    # Keep only posts whose signature has been approved.
    posts = [post for post in
             (BlogPost.blog_post_from_feed_entry(entry) for entry in parsed['entries'])
             if post.signature in approved]
    template = jinja_environment.get_template('main.html')
    self.response.out.write(template.render({"posts": posts, "DEBUG": DEBUG}))
def _blog_fetch_callback(self, rpc):
    """Write out main.html containing only approved blog posts."""
    body = rpc.get_result().content
    feed = feedparser.parse(StringIO(body))
    known_signatures = Signatures.signatures()
    approved_posts = []
    for entry in feed['entries']:
        candidate = BlogPost.blog_post_from_feed_entry(entry)
        # Unapproved posts never reach the public page.
        if candidate.signature not in known_signatures:
            continue
        approved_posts.append(candidate)
    context = {"posts": approved_posts, "DEBUG": DEBUG}
    page = jinja_environment.get_template('main.html')
    self.response.out.write(page.render(context))
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % ( identity, self.feed.title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None if self.options.get( 'force' ) or not self.feed.fetched_once or not self.feed.known_good: modified = None etag = None USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL) if self.options.get('feed_xml'): logging.debug( u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (self.feed.title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug( u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (self.feed.title[:30])) else: self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (self.feed.title[:30], time.time() - start)) return FEED_OK, self.fpf
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity, unicode(self.feed)[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag=self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None if self.options.get('force') or not self.feed.fetched_once or not self.feed.known_good: modified = None etag = None USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL ) if self.options.get('feed_xml'): logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % ( unicode(self.feed)[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % ( unicode(self.feed)[:30])) else: self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) if self.options['verbose'] and getattr(self.fpf, 'status', None) == 200: logging.debug(u' ---> [%-30s] ~FBTIME: feed fetch in ~FM%.4ss' % ( unicode(self.feed)[:30], time.time() - start)) return FEED_OK, self.fpf
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ socket.setdefaulttimeout(30) identity = self.get_identity() log_msg = u"%2s ---> [%-30s] Fetching feed (%d)" % (identity, unicode(self.feed)[:30], self.feed.id) logging.debug(log_msg) self.feed.set_next_scheduled_update() etag = self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None if self.options.get("force") or not self.feed.fetched_once: modified = None etag = None self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) return FEED_OK, self.fpf
def fetch(self): """ Uses feedparser to download the feed. Will be parsed later. """ start = time.time() identity = self.get_identity() log_msg = u"%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s" % ( identity, self.feed.title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update, ) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None if self.options.get("force") or not self.feed.fetched_once or not self.feed.known_good: modified = None etag = None USER_AGENT = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2 (NewsBlur Feed Fetcher - %s subscriber%s - %s)" % (self.feed.num_subscribers, "s" if self.feed.num_subscribers != 1 else "", settings.NEWSBLUR_URL) ) if self.options.get("feed_xml"): logging.debug( u" ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s" % (self.feed.title[:30], len(self.options.get("feed_xml"))) ) if self.options.get("fpf"): self.fpf = self.options.get("fpf") logging.debug(u" ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping." % (self.feed.title[:30])) else: self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified) logging.debug(u" ---> [%-30s] ~FYFeed fetch in ~FM%.4ss" % (self.feed.title[:30], time.time() - start)) return FEED_OK, self.fpf
def parse(self):
    """Fetch and parse the feed at self.url, then create the book
    directory tree (images/, articles/) named after the feed title.

    Returns self so calls can be chained.  Raises the underlying
    feedparser bozo_exception when the feed is malformed.
    """
    logging.info('Parse feed: %s' % self.url)
    # Some feeds only respond properly to Google Reader-looking requests.
    referrer = "https://www.google.com/reader/view/"
    self.feed = feedparser.parse(self.url, agent=self.user_agent, referrer=referrer)
    if self.feed.bozo == 1:
        raise self.feed.bozo_exception
    self.ffname = ascii_filename(self.feed.feed.title)
    self.book_dir = '%s%s/' % (self.data_dir, self.ffname)
    # Build the output tree.  0o777 replaces the Python-2-only 0777
    # literal; same mode, valid on Python 2.6+ and 3.
    for subdir in ('', 'images/', 'articles/'):
        path = self.book_dir + subdir
        if not os.path.isdir(path):
            os.mkdir(path, 0o777)
    return self
def fetch(self): """ Uses requests to download the feed, parsing it in feedparser. Will be storified later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % ( identity, self.feed.log_title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag = self.feed.etag modified = self.feed.last_modified.utctimetuple( )[:7] if self.feed.last_modified else None address = self.feed.feed_address if (self.options.get('force') or random.random() <= .01): self.options['force'] = True modified = None etag = None if address.startswith('http'): address = qurl(address, add={"_": random.randint(0, 10000)}) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (self.feed.log_title[:30], address)) elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None if self.options.get('feed_xml'): logging.debug( u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (self.feed.log_title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug( u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (self.feed.log_title[:30])) return FEED_OK, self.fpf if 'youtube.com' in address: try: youtube_feed = self.fetch_youtube(address) except (requests.adapters.ConnectionError): youtube_feed = None if not youtube_feed: logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' 
% (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(youtube_feed) elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])): twitter_feed = self.fetch_twitter(address) if not twitter_feed: logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' % (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(twitter_feed) elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])): facebook_feed = self.fetch_facebook() if not facebook_feed: logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' % (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(facebook_feed) if not self.fpf: try: headers = self.feed.fetch_headers() if etag: headers['If-None-Match'] = etag if modified: # format into an RFC 1123-compliant timestamp. We can't use # time.strftime() since the %a and %b directives can be affected # by the current locale, but RFC 2616 states that dates must be # in English. 
short_weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ] months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ] modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % ( short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]) headers['If-Modified-Since'] = modified_header if etag or modified: headers['A-IM'] = 'feed' raw_feed = requests.get(address, headers=headers) if raw_feed.status_code >= 400: logging.debug( " ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers)) raw_feed = requests.get( self.feed.feed_address, headers=self.feed.fetch_headers(fake=True)) if raw_feed.content and 'application/json' in raw_feed.headers.get( 'Content-Type', ""): # JSON Feed json_feed = self.fetch_json_feed(address, raw_feed) if not json_feed: logging.debug( u' ***> [%-30s] ~FRJSON fetch failed: %s' % (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(json_feed) elif raw_feed.content and raw_feed.status_code < 400: response_headers = raw_feed.headers response_headers['Content-Location'] = raw_feed.url self.raw_feed = smart_unicode(raw_feed.content) self.fpf = feedparser.parse( self.raw_feed, response_headers=response_headers) if self.options.get('debug', False): logging.debug( " ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode( raw_feed.content)), raw_feed.headers)) except Exception, e: logging.debug( " ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100])) if not self.fpf or self.options.get('force_fp', False): try: self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> 
[%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e)) pass
if not self.fpf or self.options.get('force_fp', False): try: self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e)) pass if not self.fpf: try: logging.debug(u' ***> [%-30s] ~FRTurning off headers...' % (self.feed.log_title[:30])) self.fpf = feedparser.parse(address, agent=self.feed.user_agent) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' % (self.feed.log_title[:30], e)) return FEED_ERRHTTP, None logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % ( self.feed.log_title[:30], time.time() - start)) return FEED_OK, self.fpf def get_identity(self): identity = "X" current_process = multiprocessing.current_process() if current_process._identity:
def update(self): """ Update Uses FeedParser to grab and parse feed. """ if self.enabled is False: return if self.last_modified: modified = self.last_modified.timetuple() else: modified = None data = feedparser.parse(self.url, etag=self.etag, modified=modified, agent=USER_AGENT) if data.bozo != 1: if data.status != 304: data.entries.reverse() while data.entries: dentry = data.entries.pop() if dentry.has_key('published'): created_on = datetime.datetime(*dentry.published_parsed[0:6]) else: created_on = datetime.datetime(*dentry.updated_parsed[0:6]) if self.last_modified is not None and (created_on <= self.last_modified): break if dentry.has_key('title'): text = dentry.title elif dentry.has_key('summary'): text = dentry.summary else: dentry.get('content', 'None') if self.auto_link: text = self._auto_link(text) if self.feed_type == 't': text = self._twitter_parser(text) Entry.objects.create( source=self, url=dentry.link, text=text, created_on=created_on ) if data.has_key('etag'): self.etag = data.etag if data.has_key('modified'): self.last_modified = datetime.datetime(*data.modified[0:6]) else: try: latest = Entry.objects.filter(source=self).latest('created_on') self.last_modified = latest.created_on except ObjectDoesNotExist: pass self.last_update_successful = True self.last_status_code = data.status else: self.last_update_successful = False self.error_message = pprint(data.bozo_exception) self.save()
def fetch(self): """ Uses requests to download the feed, parsing it in feedparser. Will be storified later. """ start = time.time() identity = self.get_identity() log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity, self.feed.title[:30], self.feed.id, datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) etag=self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None address = self.feed.feed_address if (self.options.get('force') or random.random() <= .01): self.options['force'] = True modified = None etag = None address = qurl(address, add={"_": random.randint(0, 10000)}) logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % ( self.feed.title[:30], address)) elif (not self.feed.fetched_once or not self.feed.known_good): modified = None etag = None USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s ' '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) ' 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 ' 'Safari/534.48.3)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', self.feed.permalink, )) if self.options.get('feed_xml'): logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % ( self.feed.title[:30], len(self.options.get('feed_xml')))) if self.options.get('fpf'): self.fpf = self.options.get('fpf') logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % ( self.feed.title[:30])) return FEED_OK, self.fpf if 'youtube.com' in address: try: youtube_feed = self.fetch_youtube(address) except (requests.adapters.ConnectionError): youtube_feed = None if not youtube_feed: logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' 
% (self.feed.title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(youtube_feed) elif re.match('(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])): # try: twitter_feed = self.fetch_twitter(address) # except Exception, e: # logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s: %e' % # (self.feed.title[:30], address, e)) # twitter_feed = None if not twitter_feed: logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' % (self.feed.title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(twitter_feed) if not self.fpf: try: headers = { 'User-Agent': USER_AGENT, 'Accept-encoding': 'gzip, deflate', 'A-IM': 'feed', } if etag: headers['If-None-Match'] = etag if modified: # format into an RFC 1123-compliant timestamp. We can't use # time.strftime() since the %a and %b directives can be affected # by the current locale, but RFC 2616 states that dates must be # in English. short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]) headers['If-Modified-Since'] = modified_header raw_feed = requests.get(address, headers=headers) if raw_feed.content: response_headers = raw_feed.headers response_headers['Content-Location'] = raw_feed.url self.fpf = feedparser.parse(smart_unicode(raw_feed.content), response_headers=response_headers) except Exception, e: logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.title[:30], unicode(e)[:100])) if not self.fpf: try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.title[:30], e)) pass
if not self.fpf: try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.title[:30], e)) pass if not self.fpf: try: logging.debug(u' ***> [%-30s] ~FRTurning off headers...' % (self.feed.title[:30])) self.fpf = feedparser.parse(address, agent=USER_AGENT) except (TypeError, ValueError, KeyError, EOFError), e: logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' % (self.feed.title[:30], e)) return FEED_ERRHTTP, None logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % ( self.feed.title[:30], time.time() - start)) return FEED_OK, self.fpf def get_identity(self): identity = "X" current_process = multiprocessing.current_process() if current_process._identity:
def fetch(self):
    """Download the feed with requests, then parse it with feedparser.

    Returns a (status, parsed_feed) tuple: FEED_OK plus the feedparser
    result when a feed document was obtained, or FEED_ERRHTTP plus None
    when a special-cased service (YouTube, Twitter, Facebook, JSON Feed)
    could not be fetched. The parsed document is also kept on self.fpf
    to be storified later.
    """
    start = time.time()
    identity = self.get_identity()
    log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                                                  self.feed.log_title[:30],
                                                                                  self.feed.id,
                                                                                  datetime.datetime.now() - self.feed.last_update)
    logging.debug(log_msg)

    etag = self.feed.etag
    # feedparser expects `modified` as a 7-tuple (from utctimetuple), not a datetime.
    modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
    address = self.feed.feed_address

    # Roughly 1% of fetches (or an explicit force) bypass HTTP caching:
    # drop the conditional-GET validators and cache-bust the URL with a
    # random "_" query parameter.
    if (self.options.get('force') or random.random() <= .01):
        self.options['force'] = True
        modified = None
        etag = None
        address = qurl(address, add={"_": random.randint(0, 10000)})
        logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
                      self.feed.log_title[:30], address))
    elif (not self.feed.fetched_once or not self.feed.known_good):
        # Unproven feeds always get a full (non-conditional) fetch.
        modified = None
        etag = None

    if self.options.get('feed_xml'):
        logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                      self.feed.log_title[:30], len(self.options.get('feed_xml'))))

    if self.options.get('fpf'):
        # A real-time "fat ping" already delivered a parsed feed; use it as-is.
        self.fpf = self.options.get('fpf')
        logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                      self.feed.log_title[:30]))
        return FEED_OK, self.fpf

    # Service-specific fetchers return a feed document (or a falsy value on
    # failure), which is then handed to feedparser.
    if 'youtube.com' in address:
        try:
            youtube_feed = self.fetch_youtube(address)
        except (requests.adapters.ConnectionError):
            youtube_feed = None
        if not youtube_feed:
            logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                          (self.feed.log_title[:30], address))
            return FEED_ERRHTTP, None
        self.fpf = feedparser.parse(youtube_feed)
    elif re.match(r'(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])):
        twitter_feed = self.fetch_twitter(address)
        if not twitter_feed:
            logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' %
                          (self.feed.log_title[:30], address))
            return FEED_ERRHTTP, None
        self.fpf = feedparser.parse(twitter_feed)
    elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
        facebook_feed = self.fetch_facebook()
        if not facebook_feed:
            logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' %
                          (self.feed.log_title[:30], address))
            return FEED_ERRHTTP, None
        self.fpf = feedparser.parse(facebook_feed)

    if not self.fpf:
        try:
            headers = self.feed.fetch_headers()
            if etag:
                headers['If-None-Match'] = etag
            if modified:
                # format into an RFC 1123-compliant timestamp. We can't use
                # time.strftime() since the %a and %b directives can be affected
                # by the current locale, but RFC 2616 states that dates must be
                # in English.
                short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
                months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
                headers['If-Modified-Since'] = modified_header
            if etag or modified:
                # RFC 3229 delta encoding; only meaningful alongside a validator.
                headers['A-IM'] = 'feed'
            raw_feed = requests.get(address, headers=headers)
            if raw_feed.status_code >= 400:
                # Some servers block the bot user agent; retry once with a
                # browser-like (fake) one.
                logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
                raw_feed = requests.get(self.feed.feed_address, headers=self.feed.fetch_headers(fake=True))
            if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
                # JSON Feed
                json_feed = self.fetch_json_feed(address, raw_feed)
                if not json_feed:
                    logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
                                  (self.feed.log_title[:30], address))
                    return FEED_ERRHTTP, None
                self.fpf = feedparser.parse(json_feed)
            elif raw_feed.content and raw_feed.status_code < 400:
                response_headers = raw_feed.headers
                response_headers['Content-Location'] = raw_feed.url
                self.raw_feed = smart_unicode(raw_feed.content)
                self.fpf = feedparser.parse(self.raw_feed,
                                            response_headers=response_headers)
                if self.options.get('debug', False):
                    logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
        except Exception, e:
            # Best-effort: any requests failure falls through to feedparser's
            # own fetching below.
            logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))

    if not self.fpf or self.options.get('force_fp', False):
        try:
            self.fpf = feedparser.parse(address,
                                        agent=self.feed.user_agent,
                                        etag=etag,
                                        modified=modified)
        except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
            logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e))
            pass
    # NOTE(review): the snippet appears truncated here — no return on this
    # path; the original method presumably continues. Confirm against upstream.
def feedParsedItems(request, feed_id):
    """Render the raw feedparser entries for a single feed.

    Looks up the Feed by primary key, downloads and parses its feed
    address with feedparser, and renders the entries with the
    ``feed_parsed_items.html`` template.

    Raises Feed.DoesNotExist if no feed has the given id.
    """
    feed = Feed.objects.get(id=feed_id)
    items = feedparser.parse(feed.feed_address).entries
    # NOTE(review): locals() also exposes request/feed_id to the template;
    # kept as-is to preserve the existing template contract — confirm which
    # names the template actually uses before narrowing to an explicit dict.
    return render_to_response('feed_parsed_items.html', locals())
def fetch(self):
    """Download the feed with feedparser; entry parsing happens later.

    Returns a (status, parsed_feed) tuple: FEED_OK with the feedparser
    result, or FEED_ERRHTTP with None when a YouTube feed cannot be
    fetched. The parsed document is also stored on self.fpf.
    """
    start = time.time()
    identity = self.get_identity()
    log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                                                  self.feed.title[:30],
                                                                                  self.feed.id,
                                                                                  datetime.datetime.now() - self.feed.last_update)
    logging.debug(log_msg)

    etag=self.feed.etag
    # feedparser expects `modified` as a 7-tuple (from utctimetuple), not a datetime.
    modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
    address = self.feed.feed_address

    # Roughly 1% of fetches (or an explicit force) skip the conditional-GET
    # validators and cache-bust the URL.
    if (self.options.get('force') or random.random() <= .01):
        modified = None
        etag = None
        address = cache_bust_url(address)
        logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
                      self.feed.title[:30], address))
    elif (not self.feed.fetched_once or not self.feed.known_good):
        # Unproven feeds always get a full (non-conditional) fetch.
        modified = None
        etag = None

    # Advertise the subscriber count and feed permalink in the user agent,
    # wrapped in a Safari-like string for servers that block unknown bots.
    USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                  '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                  'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                  'Safari/534.48.3)' % (
                      self.feed.num_subscribers,
                      's' if self.feed.num_subscribers != 1 else '',
                      self.feed.permalink,
                  ))

    if self.options.get('feed_xml'):
        logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                      self.feed.title[:30], len(self.options.get('feed_xml'))))

    if self.options.get('fpf'):
        # A real-time "fat ping" already delivered a parsed feed; use it as-is.
        self.fpf = self.options.get('fpf')
        logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                      self.feed.title[:30]))
        return FEED_OK, self.fpf

    if 'youtube.com' in address:
        try:
            youtube_feed = self.fetch_youtube(address)
        except (requests.adapters.ConnectionError):
            youtube_feed = None
        if not youtube_feed:
            logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                          (self.feed.title[:30], address))
            return FEED_ERRHTTP, None
        self.fpf = feedparser.parse(youtube_feed)

    if not self.fpf:
        try:
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
        except (TypeError, ValueError, KeyError, EOFError), e:
            logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.title[:30], e))
            pass
    # NOTE(review): the snippet appears truncated here — no return on this
    # path; the original method presumably continues. Confirm against upstream.
def feedParsedItems(request, feed_id):
    """Render the raw feedparser entries for a single feed.

    Looks up the Feed by primary key, downloads and parses its feed
    address with feedparser, and renders the entries with the
    ``feed_parsed_items.html`` template.

    Raises Feed.DoesNotExist if no feed has the given id.
    """
    feed = Feed.objects.get(id=feed_id)
    items = feedparser.parse(feed.feed_address).entries
    # NOTE(review): locals() also exposes request/feed_id to the template;
    # kept as-is to preserve the existing template contract — confirm which
    # names the template actually uses before narrowing to an explicit dict.
    return render_to_response('feed_parsed_items.html', locals())
try: self.fpf = feedparser.parse(address, agent=self.feed.user_agent, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.log_title[:30], e)) pass if not self.fpf: try: logging.debug(u' ***> [%-30s] ~FRTurning off headers...' % (self.feed.log_title[:30])) self.fpf = feedparser.parse(address, agent=self.feed.user_agent) except (TypeError, ValueError, KeyError, EOFError, MemoryError), e: logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' % (self.feed.log_title[:30], e)) return FEED_ERRHTTP, None logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (self.feed.log_title[:30], time.time() - start)) return FEED_OK, self.fpf def get_identity(self): identity = "X" current_process = multiprocessing.current_process() if current_process._identity:
def _run(self):
    """Fetch and parse this source's feed, honoring HTTP conditional GET.

    Passes the stored ETag and last-modified value through so that an
    unchanged feed can short-circuit with a 304 instead of a full body.
    Returns the parsed result from _feed.parse() (presumably the
    feedparser module aliased as _feed — confirm at the import site).
    """
    # PEP 8: no spaces around '=' in keyword arguments.
    return _feed.parse(self.url, etag=self.etag, modified=self.modified)
if not self.fpf: try: self.fpf = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified) except (TypeError, ValueError, KeyError, EOFError), e: logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' % (self.feed.title[:30], e)) pass if not self.fpf: try: logging.debug(u' ***> [%-30s] ~FRTurning off headers...' % (self.feed.title[:30])) self.fpf = feedparser.parse(address, agent=USER_AGENT) except (TypeError, ValueError, KeyError, EOFError), e: logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' % (self.feed.title[:30], e)) return FEED_ERRHTTP, None logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (self.feed.title[:30], time.time() - start)) return FEED_OK, self.fpf def get_identity(self): identity = "X" current_process = multiprocessing.current_process() if current_process._identity:
def fetch(self):
    """Download the feed with requests, then parse it with feedparser.

    Returns a (status, parsed_feed) tuple: FEED_OK with the feedparser
    result, or FEED_ERRHTTP with None when a YouTube feed cannot be
    fetched. The parsed document is also stored on self.fpf to be
    storified later.
    """
    start = time.time()
    identity = self.get_identity()
    log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
        identity, self.feed.title[:30], self.feed.id,
        datetime.datetime.now() - self.feed.last_update)
    logging.debug(log_msg)

    etag = self.feed.etag
    # feedparser expects `modified` as a 7-tuple (from utctimetuple), not a datetime.
    modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
    address = self.feed.feed_address

    # Roughly 1% of fetches (or an explicit force) bypass HTTP caching:
    # drop the conditional-GET validators and cache-bust the URL with a
    # random "_" query parameter.
    if (self.options.get('force') or random.random() <= .01):
        self.options['force'] = True
        modified = None
        etag = None
        address = qurl(address, add={"_": random.randint(0, 10000)})
        logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' %
                      (self.feed.title[:30], address))
    elif (not self.feed.fetched_once or not self.feed.known_good):
        # Unproven feeds always get a full (non-conditional) fetch.
        modified = None
        etag = None

    # Advertise the subscriber count and feed permalink in the user agent,
    # wrapped in a Safari-like string for servers that block unknown bots.
    USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                  '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                  'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                  'Safari/534.48.3)' % (
                      self.feed.num_subscribers,
                      's' if self.feed.num_subscribers != 1 else '',
                      self.feed.permalink,
                  ))

    if self.options.get('feed_xml'):
        logging.debug(
            u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
            % (self.feed.title[:30], len(self.options.get('feed_xml'))))

    if self.options.get('fpf'):
        # A real-time "fat ping" already delivered a parsed feed; use it as-is.
        self.fpf = self.options.get('fpf')
        logging.debug(
            u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' %
            (self.feed.title[:30]))
        return FEED_OK, self.fpf

    if 'youtube.com' in address:
        try:
            youtube_feed = self.fetch_youtube(address)
        except (requests.adapters.ConnectionError):
            youtube_feed = None
        if not youtube_feed:
            logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                          (self.feed.title[:30], address))
            return FEED_ERRHTTP, None
        self.fpf = feedparser.parse(youtube_feed)

    if not self.fpf:
        try:
            headers = {
                'User-Agent': USER_AGENT,
                'Accept-encoding': 'gzip, deflate',
                # RFC 3229 delta encoding.
                'A-IM': 'feed',
            }
            if etag:
                headers['If-None-Match'] = etag
            if modified:
                # format into an RFC 1123-compliant timestamp. We can't use
                # time.strftime() since the %a and %b directives can be affected
                # by the current locale, but RFC 2616 states that dates must be
                # in English.
                short_weekdays = [
                    'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
                ]
                months = [
                    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                    'Sep', 'Oct', 'Nov', 'Dec'
                ]
                modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
                    short_weekdays[modified[6]], modified[2],
                    months[modified[1] - 1], modified[0], modified[3],
                    modified[4], modified[5])
                headers['If-Modified-Since'] = modified_header
            raw_feed = requests.get(address, headers=headers)
            if raw_feed.content:
                response_headers = raw_feed.headers
                response_headers['Content-Location'] = raw_feed.url
                self.fpf = feedparser.parse(
                    smart_unicode(raw_feed.content),
                    response_headers=response_headers)
        except Exception, e:
            # Best-effort: any requests failure falls through to feedparser's
            # own fetching below.
            logging.debug(
                " ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
                % (self.feed.title[:30], unicode(e)[:100]))

    if not self.fpf:
        try:
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
        except (TypeError, ValueError, KeyError, EOFError), e:
            logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' %
                          (self.feed.title[:30], e))
            pass
    # NOTE(review): the snippet appears truncated here — no return on this
    # path; the original method presumably continues. Confirm against upstream.