예제 #1
0
 def downloadSource(self, source):
     """Fetch a feed with conditional-GET (ETag) and store any new articles.

     Args:
         source: entity with .url, .etag, .name and .key attributes.

     Side effects: persists articles via self.bulkEntityInsertion() and
     updates/saves source.etag when the server returned one.
     """
     # Send the stored ETag so unchanged feeds come back as HTTP 304.
     if source.etag is not None:
         d = feedparser.parse(source.url, etag=source.etag)
     else:
         d = feedparser.parse(source.url)
     save_in_db = False
     articles = []
     # feedparser omits 'status' entirely on network/parse failures,
     # so guard before reading it. ('in' replaces deprecated has_key().)
     if 'status' in d and d.status == 200:
         if 'etag' in d:
             save_in_db = True
             source.etag = d.etag
         for entry in d.entries:
             # Entries may lack any of these fields; .get() defaults to None,
             # matching the original has_key()/else-None dance.
             title = entry.get('title')
             summary = entry.get('summary')
             description = entry.get('description')
             link = entry.get('link')
             article = Article(parent=ndb.Key('Article', source.name),
                               published=datetime(*entry.published_parsed[:6]),
                               source=source.key, title=title,
                               description=description, summary=summary,
                               link=link)
             articles.append(article)
         if save_in_db:
             self.bulkEntityInsertion(articles)
             source.put()
     # print() form is valid on both Python 2 and 3 (single argument).
     print(articles)
예제 #2
0
    def fetch(self):
        """
        Download the feed with feedparser using conditional-GET headers
        (ETag / Last-Modified). The parsed result is stored on self.fpf;
        entry processing happens later.

        Returns (FEED_OK, self.fpf) early only on the fat-ping path;
        otherwise falls through after populating self.fpf.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        # utctimetuple()[:7] is the (y, m, d, H, M, S, wday) tuple form
        # feedparser accepts for its `modified` parameter.
        modified = self.feed.last_modified.utctimetuple(
        )[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        # Forced fetches -- and a random ~1% of all fetches -- drop the
        # conditional-GET headers and cache-bust the URL so stale caches
        # can't serve an old copy forever.
        if (self.options.get('force') or random.random() <= .01):
            modified = None
            etag = None
            address = cache_bust_url(address)
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' %
                          (self.feed.title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            # Unproven feeds always get a full (non-conditional) fetch.
            modified = None
            etag = None

        USER_AGENT = 'NewsBlur Feed Fetcher - %s subscriber%s - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % (
            self.feed.num_subscribers, 's'
            if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL)
        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                % (self.feed.title[:30], len(self.options.get('feed_xml'))))

        # A pre-parsed feed handed in via options short-circuits the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.title[:30]))
            return FEED_OK, self.fpf

        try:
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
        except (TypeError, ValueError), e:
            # Some feeds crash feedparser's microformats handling; retry once
            # with it globally disabled, then restore the module-level flag.
            logging.debug(u'   ***> [%-30s] ~FR%s, turning off microformats.' %
                          (self.feed.title[:30], e))
            feedparser.PARSE_MICROFORMATS = False
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
            feedparser.PARSE_MICROFORMATS = True
예제 #3
0
    def fetch(self):
        """
        Download the feed with feedparser, honoring conditional-GET headers
        (ETag / Last-Modified). The result is stored on self.fpf and parsed
        later; only the fat-ping path returns early.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                            self.feed.title[:30],
                                                            self.feed.id,
                                                            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag=self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        # Forced fetches and a random ~1% of fetches skip conditional-GET
        # and cache-bust the URL to defeat stale intermediary caches.
        if (self.options.get('force') or random.random() <= .01):
            modified = None
            etag = None
            address = cache_bust_url(address)
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                          self.feed.title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            # Never-fetched or unproven feeds get a full refetch.
            modified = None
            etag = None

        USER_AGENT = 'NewsBlur Feed Fetcher - %s subscriber%s - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % (
            self.feed.num_subscribers,
            's' if self.feed.num_subscribers != 1 else '',
            settings.NEWSBLUR_URL
        )
        if self.options.get('feed_xml'):
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                          self.feed.title[:30], len(self.options.get('feed_xml'))))

        # A pre-parsed feed handed in via options bypasses the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                          self.feed.title[:30]))
            return FEED_OK, self.fpf

        try:
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
        except (TypeError, ValueError), e:
            # Retry once with feedparser's microformats parsing disabled,
            # restoring the module-level flag afterwards.
            logging.debug(u'   ***> [%-30s] ~FR%s, turning off microformats.' % 
                          (self.feed.title[:30], e))
            feedparser.PARSE_MICROFORMATS = False
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
            feedparser.PARSE_MICROFORMATS = True
예제 #4
0
 def fetch(self):
     """Download and parse a feed, returning (FEED_OK, parsed_feed)."""
     socket.setdefaulttimeout(30)
     identity = self.get_identity()
     logging.debug(u'%2s ---> [%-30s] Fetching feed (%d)' % (identity,
                                              unicode(self.feed)[:30],
                                              self.feed.id))

     # Bump the next scheduled update before fetching, so a crash here
     # doesn't leave the feed stuck at the front of the queue.
     self.feed.set_next_scheduled_update()

     etag = self.feed.etag
     last_modified = self.feed.last_modified
     modified = last_modified.utctimetuple()[:7] if last_modified else None

     # A forced fetch ignores conditional-GET headers entirely.
     if self.options.get('force'):
         etag = None
         modified = None

     self.fpf = feedparser.parse(self.feed.feed_address,
                                 agent=USER_AGENT,
                                 etag=etag,
                                 modified=modified)
     return FEED_OK, self.fpf
예제 #5
0
    def fetch(self):
        """
        Download the feed with feedparser (conditional-GET via ETag /
        Last-Modified) and return (FEED_OK, parsed_feed). Parsing of
        entries happens later.
        """
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                            unicode(self.feed)[:30],
                                                            self.feed.id,
                                                            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        # Reschedule first so a failed fetch can't wedge the queue.
        self.feed.set_next_scheduled_update()
        etag=self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

        # Forced or first-time fetches skip conditional-GET headers.
        if self.options.get('force') or not self.feed.fetched_once:
            modified = None
            etag = None

        USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % (
            self.feed.num_subscribers,
            's' if self.feed.num_subscribers != 1 else '',
            settings.NEWSBLUR_URL
        )

        self.fpf = feedparser.parse(self.feed.feed_address,
                                    agent=USER_AGENT,
                                    etag=etag,
                                    modified=modified)

        return FEED_OK, self.fpf
예제 #6
0
    def _blog_fetch_callback(self, rpc):
        """
        Async-RPC callback: parse the fetched blog feed, pair each post with
        the time its signature was recorded, accumulate the tag set, and
        render the moderation page.

        Args:
            rpc: completed RPC whose result body is the raw feed XML.
        """
        content = rpc.get_result().content
        d = feedparser.parse(StringIO(content))
        s = Signatures.get_single()
        # Map signature hash -> timestamp it was recorded (parallel lists).
        signatures_and_times = dict(zip(s.hashes, s.times))
        posts = []

        tags_entity = Tags.get_single()
        tags = set(tags_entity.available)

        for entry in d['entries']:
            blog_post = BlogPost.blog_post_from_feed_entry(entry)
            signature_time = signatures_and_times.get(blog_post.signature)
            if signature_time:
                # Format as a human-readable timestamp for the template;
                # posts without a recorded signature keep None.
                signature_time = datetime.datetime.fromtimestamp(signature_time).strftime('%m/%d/%Y %H:%M')
            posts.append((blog_post, signature_time))

            # Tags are stored lower-cased so case variants dedupe.
            for tag in blog_post.tags:
                tags.add(tag.lower())

        template_values = {"posts":posts}

        # Persist any newly-seen tags back to the datastore.
        tags_entity.available = list(tags)
        tags_entity.save()

        template = jinja_environment.get_template('moderate.html')
        self.response.out.write(template.render(template_values))
예제 #7
0
    def fetch(self):
        """
        Download the feed via feedparser; entry parsing happens later.
        Returns (FEED_OK, parsed_feed).
        """
        who = self.get_identity()
        logging.debug(u'%2s ---> [%-30s] Fetching feed (%d)' % (
            who, unicode(self.feed)[:30], self.feed.id))

        # Reschedule before fetching so failures don't wedge the queue.
        self.feed.set_next_scheduled_update()

        feed = self.feed
        etag = feed.etag
        modified = feed.last_modified.utctimetuple()[:7] if feed.last_modified else None

        # Forced fetches and never-fetched feeds bypass conditional-GET.
        if self.options.get('force') or not feed.fetched_once:
            etag = None
            modified = None

        subscribers = feed.num_subscribers
        plural = 's' if subscribers != 1 else ''
        USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
            subscribers, plural, URL)

        self.fpf = feedparser.parse(feed.feed_address,
                                    agent=USER_AGENT,
                                    etag=etag,
                                    modified=modified)
        return FEED_OK, self.fpf
예제 #8
0
 def fetch(self):
     """
     Download the feed with feedparser (conditional-GET via ETag /
     Last-Modified) and return (FEED_OK, parsed_feed). Entries are
     parsed later.
     """
     identity = self.get_identity()
     log_msg = u'%2s ---> [%-30s] Fetching feed (%d)' % (identity,
                                                         unicode(self.feed)[:30],
                                                         self.feed.id)
     logging.debug(log_msg)

     # Reschedule first so a failed fetch can't wedge the queue.
     self.feed.set_next_scheduled_update()
     etag=self.feed.etag
     # Time-tuple prefix is the form feedparser accepts for `modified`.
     modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

     # Forced or first-time fetches bypass conditional-GET headers.
     if self.options.get('force') or not self.feed.fetched_once:
         modified = None
         etag = None

     USER_AGENT = 'NewsBlur Feed Fetcher (%s subscriber%s) - %s' % (
         self.feed.num_subscribers,
         's' if self.feed.num_subscribers != 1 else '',
         URL
     )
     self.fpf = feedparser.parse(self.feed.feed_address,
                                 agent=USER_AGENT,
                                 etag=etag,
                                 modified=modified)

     return FEED_OK, self.fpf
예제 #9
0
 def main(self):
     """
     Process every enabled feed: fetch each RSS URL, skip blacklisted or
     already-seen articles, persist new ones, and push a DingTalk
     notification for each.
     """
     rss_list = session.query(CheckList).filter(CheckList.status == 1).all()
     for item in rss_list:
         try:
             article_list = feedparser.parse(item.url).entries
             # Oldest first, so notifications arrive in publish order.
             for _c in reversed(article_list):
                 # Skip titles containing any blacklisted keyword.
                 # any() short-circuits on the first match.
                 if any(_black in _c.title for _black in self.black_list):
                     continue
                 _hash = self._url_hash(_c.link)
                 _status = self._check_hash(item.id, _hash)
                 if not _status:
                     # Unseen article: record it so it is never re-sent...
                     _time = int(time.time())
                     read_info = ReadList(checklist_id=int(item.id),
                                          hash=_hash, add_time=_time)
                     session.add(read_info)
                     session.commit()
                     # ...then notify via the DingTalk robot.
                     AliHook().ali_hook(item.nickname, item.tag, _c.title, _c.link)
         except Exception as e:
             # warning(), not the deprecated warn() alias.
             logging.warning("_data error: {}".format(str(e)))
예제 #10
0
    def fetch(self):
        """
        Download the feed with feedparser (conditional-GET via ETag /
        Last-Modified) and return (FEED_OK, parsed_feed). Entries are
        parsed later.
        """
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, unicode(self.feed)[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        # Reschedule first so a failed fetch can't wedge the queue.
        self.feed.set_next_scheduled_update()
        etag = self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple(
        )[:7] if self.feed.last_modified else None

        # Forced or first-time fetches bypass conditional-GET headers.
        if self.options.get('force') or not self.feed.fetched_once:
            modified = None
            etag = None

        USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % (
            self.feed.num_subscribers, 's'
            if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL)

        self.fpf = feedparser.parse(self.feed.feed_address,
                                    agent=USER_AGENT,
                                    etag=etag,
                                    modified=modified)

        return FEED_OK, self.fpf
예제 #11
0
File: feedman.py Project: enry86/NewSearch
 def add_feed(self, xml):
     """
     Parse a feed document and queue its items for consumers.

     Args:
         xml: feed source accepted by feedparser (URL, file path/object,
              or raw string).

     Each item is stored in self.items keyed by link while holding the
     mutex; the consumer semaphore is released once per item.
     """
     feeds = feedparser.parse(xml)
     self.articles += len(feeds['items'])
     for f in feeds['items']:
         # `with` guarantees the mutex is released even if an item is
         # missing a key (the bare acquire()/release() pair leaked the
         # lock on exception).
         with self.u_mut:
             self.items[f['link']] = (f['title'], f['summary'])
         self.u_sem.release()
예제 #12
0
    def fetch(self):
        """
        Download the feed with feedparser using conditional-GET headers
        (ETag / Last-Modified). The result is stored on self.fpf and
        parsed later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY)' % (identity,
                                                            self.feed.title[:30],
                                                            self.feed.id)
        logging.debug(log_msg)

        etag = self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        # If is forced or random is less than 1%, set modified = None and etag = None,
        # means it will fetch new
        if (self.options.get('force') or random.random() <= .01):
            modified = None
            etag = None
            address = cache_bust_url(address)
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                          self.feed.title[:30], address))
        # If this feed_id in not fetched once before or not known_good
        elif (not self.feed.fetched_once or not self.feed.known_good):
            modified = None
            etag = None

        USER_AGENT = ('NewsBlur Feed Fetcher - %s '
                      '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                      'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                      'Safari/534.48.3)' % (
                          self.feed.permalink,
                     ))

        try:
            self.fpf = feedparser.parse(address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)
        except (TypeError, ValueError, KeyError), e:
            # Some feeds crash feedparser with these; retry once without the
            # conditional-GET headers (plain fetch).
            logging.debug(u'   ***> [%-30s] ~FR%s, turning off headers.' % 
                          (self.feed.title[:30], e))
            self.fpf = feedparser.parse(address, agent=USER_AGENT)
예제 #13
0
    def _blog_fetch_callback(self, rpc):
        """Render main.html with the blog posts whose signatures are approved."""
        body = rpc.get_result().content
        parsed = feedparser.parse(StringIO(body))
        approved = Signatures.signatures()

        # Keep only posts with an approved signature.
        posts = [
            post for post in
            (BlogPost.blog_post_from_feed_entry(e) for e in parsed['entries'])
            if post.signature in approved
        ]

        context = {"posts": posts, "DEBUG": DEBUG}
        template = jinja_environment.get_template('main.html')
        self.response.out.write(template.render(context))
예제 #14
0
    def _blog_fetch_callback(self, rpc):
        """
        Async-RPC callback: parse the fetched blog feed and render main.html
        with only the posts whose signatures are approved.

        Args:
            rpc: completed RPC whose result body is the raw feed XML.
        """
        content = rpc.get_result().content
        d = feedparser.parse(StringIO(content))
        signatures = Signatures.signatures()
        posts = []
        for entry in d['entries']:
            blog_post = BlogPost.blog_post_from_feed_entry(entry)
            # Only approved (known-signature) posts are shown.
            if blog_post.signature in signatures:
                posts.append(blog_post)

        template_values = {"posts": posts, "DEBUG": DEBUG}

        template = jinja_environment.get_template('main.html')
        self.response.out.write(template.render(template_values))
예제 #15
0
    def fetch(self):
        """
        Download the feed with feedparser using conditional-GET headers
        (ETag / Last-Modified) and return (FEED_OK, parsed_feed). A
        fat-pinged feed supplied via options bypasses the fetch.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple(
        )[:7] if self.feed.last_modified else None

        # Forced, never-fetched, or unproven feeds skip conditional-GET.
        if self.options.get(
                'force'
        ) or not self.feed.fetched_once or not self.feed.known_good:
            modified = None
            etag = None

        USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % (
            self.feed.num_subscribers, 's'
            if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL)
        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                % (self.feed.title[:30], len(self.options.get('feed_xml'))))
        # A pre-parsed feed handed in via options bypasses the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.title[:30]))
        else:
            self.fpf = feedparser.parse(self.feed.feed_address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)

        logging.debug(u'   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' %
                      (self.feed.title[:30], time.time() - start))

        return FEED_OK, self.fpf
예제 #16
0
    def fetch(self):
        """
        Download the feed with feedparser using conditional-GET headers
        (ETag / Last-Modified) and return (FEED_OK, parsed_feed). A
        fat-pinged feed supplied via options bypasses the fetch.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                            unicode(self.feed)[:30],
                                                            self.feed.id,
                                                            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag=self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

        # Forced, never-fetched, or unproven feeds skip conditional-GET.
        if self.options.get('force') or not self.feed.fetched_once or not self.feed.known_good:
            modified = None
            etag = None

        USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3 (NewsBlur Feed Fetcher - %s subscriber%s - %s)' % (
            self.feed.num_subscribers,
            's' if self.feed.num_subscribers != 1 else '',
            settings.NEWSBLUR_URL
        )
        if self.options.get('feed_xml'):
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                          unicode(self.feed)[:30], len(self.options.get('feed_xml'))))
        # A pre-parsed feed handed in via options bypasses the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                          unicode(self.feed)[:30]))
        else:
            self.fpf = feedparser.parse(self.feed.feed_address,
                                        agent=USER_AGENT,
                                        etag=etag,
                                        modified=modified)

        # Only time successful (HTTP 200) fetches, and only in verbose mode.
        if self.options['verbose'] and getattr(self.fpf, 'status', None) == 200:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed fetch in ~FM%.4ss' % (
                          unicode(self.feed)[:30], time.time() - start))

        return FEED_OK, self.fpf
예제 #17
0
    def fetch(self):
        """
        Download the feed with feedparser; entry parsing happens later.
        Returns (FEED_OK, parsed_feed).
        """
        socket.setdefaulttimeout(30)
        logging.debug(u"%2s ---> [%-30s] Fetching feed (%d)" % (
            self.get_identity(), unicode(self.feed)[:30], self.feed.id))

        # Reschedule before fetching so a failure can't wedge the queue.
        self.feed.set_next_scheduled_update()

        etag = self.feed.etag
        modified = None
        if self.feed.last_modified:
            modified = self.feed.last_modified.utctimetuple()[:7]

        # Forced or first-time fetches bypass conditional-GET headers.
        if self.options.get("force") or not self.feed.fetched_once:
            etag = None
            modified = None

        self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT,
                                    etag=etag, modified=modified)
        return FEED_OK, self.fpf
예제 #18
0
    def fetch(self):
        """
        Download the feed with feedparser using conditional-GET headers
        (ETag / Last-Modified) and return (FEED_OK, parsed_feed). A
        fat-pinged feed supplied via options bypasses the fetch.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u"%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s" % (
            identity,
            self.feed.title[:30],
            self.feed.id,
            datetime.datetime.now() - self.feed.last_update,
        )
        logging.debug(log_msg)

        etag = self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None

        # Forced, never-fetched, or unproven feeds skip conditional-GET.
        if self.options.get("force") or not self.feed.fetched_once or not self.feed.known_good:
            modified = None
            etag = None

        USER_AGENT = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2 (NewsBlur Feed Fetcher - %s subscriber%s - %s)"
            % (self.feed.num_subscribers, "s" if self.feed.num_subscribers != 1 else "", settings.NEWSBLUR_URL)
        )
        if self.options.get("feed_xml"):
            logging.debug(
                u"   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s"
                % (self.feed.title[:30], len(self.options.get("feed_xml")))
            )
        # A pre-parsed feed handed in via options bypasses the fetch.
        if self.options.get("fpf"):
            self.fpf = self.options.get("fpf")
            logging.debug(u"   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping." % (self.feed.title[:30]))
        else:
            self.fpf = feedparser.parse(self.feed.feed_address, agent=USER_AGENT, etag=etag, modified=modified)

        logging.debug(u"   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss" % (self.feed.title[:30], time.time() - start))

        return FEED_OK, self.fpf
예제 #19
0
    def parse(self):
        """
        Download and parse the feed at self.url, derive the book directory
        name from the feed title, and create the directory layout
        (book_dir/, book_dir/images/, book_dir/articles/).

        Returns:
            self, for call chaining.

        Raises:
            feedparser's bozo_exception when the feed is malformed.
        """
        logging.info('Parse feed: %s' % self.url)

        referrer = "https://www.google.com/reader/view/"
        self.feed = feedparser.parse(self.url, agent=self.user_agent,
                                     referrer=referrer)

        # bozo == 1 means feedparser hit a parse error; surface it.
        if self.feed.bozo == 1:
            raise self.feed.bozo_exception

        self.ffname = ascii_filename(self.feed.feed.title)

        self.book_dir = '%s%s/' % (self.data_dir, self.ffname)

        # Create the output tree if missing. 0o777 replaces the
        # Python-2-only literal 0777 (valid on 2.6+ and 3.x).
        for subdir in ('', 'images/', 'articles/'):
            path = self.book_dir + subdir
            if not os.path.isdir(path):
                os.mkdir(path, 0o777)

        return self
예제 #20
0
    def fetch(self):
        """
        Download the feed with requests (falling back to feedparser) and
        parse it with feedparser. Special-cases fat pings, YouTube,
        Twitter, and Facebook addresses. The parsed result lands on
        self.fpf; only the fat-ping and service-failure paths return here.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.log_title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        # Time-tuple prefix is the form feedparser accepts for `modified`.
        modified = self.feed.last_modified.utctimetuple(
        )[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        # Forced fetches and a random ~1% of fetches drop conditional-GET
        # and cache-bust the URL with a throwaway "_" query parameter.
        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            if address.startswith('http'):
                address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' %
                          (self.feed.log_title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            # Unproven feeds always get a full (non-conditional) fetch.
            modified = None
            etag = None

        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                %
                (self.feed.log_title[:30], len(self.options.get('feed_xml'))))

        # A pre-parsed feed handed in via options bypasses all fetching.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.log_title[:30]))
            return FEED_OK, self.fpf

        # Service-specific fetchers synthesize a feed for non-RSS sources.
        # The "_" cache-bust param is stripped before URL pattern matching.
        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except (requests.adapters.ConnectionError):
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)
        elif re.match(r'(https?)?://twitter.com/\w+/?',
                      qurl(address, remove=['_'])):
            twitter_feed = self.fetch_twitter(address)
            if not twitter_feed:
                logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s' %
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(twitter_feed)
        elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address,
                                                        remove=['_'])):
            facebook_feed = self.fetch_facebook()
            if not facebook_feed:
                logging.debug(u'   ***> [%-30s] ~FRFacebook fetch failed: %s' %
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(facebook_feed)

        # Generic path: fetch with requests, building conditional-GET
        # headers by hand, then parse the raw body with feedparser.
        if not self.fpf:
            try:
                headers = self.feed.fetch_headers()
                if etag:
                    headers['If-None-Match'] = etag
                if modified:
                    # format into an RFC 1123-compliant timestamp. We can't use
                    # time.strftime() since the %a and %b directives can be affected
                    # by the current locale, but RFC 2616 states that dates must be
                    # in English.
                    short_weekdays = [
                        'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
                    ]
                    months = [
                        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                        'Sep', 'Oct', 'Nov', 'Dec'
                    ]
                    modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
                        short_weekdays[modified[6]], modified[2],
                        months[modified[1] - 1], modified[0], modified[3],
                        modified[4], modified[5])
                    headers['If-Modified-Since'] = modified_header
                if etag or modified:
                    headers['A-IM'] = 'feed'
                raw_feed = requests.get(address, headers=headers)
                # On 4xx/5xx, retry once with a browser-like user agent.
                if raw_feed.status_code >= 400:
                    logging.debug(
                        "   ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s"
                        % (self.feed.log_title[:30], raw_feed.status_code,
                           raw_feed.headers))
                    raw_feed = requests.get(
                        self.feed.feed_address,
                        headers=self.feed.fetch_headers(fake=True))

                if raw_feed.content and 'application/json' in raw_feed.headers.get(
                        'Content-Type', ""):
                    # JSON Feed
                    json_feed = self.fetch_json_feed(address, raw_feed)
                    if not json_feed:
                        logging.debug(
                            u'   ***> [%-30s] ~FRJSON fetch failed: %s' %
                            (self.feed.log_title[:30], address))
                        return FEED_ERRHTTP, None
                    self.fpf = feedparser.parse(json_feed)
                elif raw_feed.content and raw_feed.status_code < 400:
                    response_headers = raw_feed.headers
                    response_headers['Content-Location'] = raw_feed.url
                    self.raw_feed = smart_unicode(raw_feed.content)
                    self.fpf = feedparser.parse(
                        self.raw_feed, response_headers=response_headers)
                    if self.options.get('debug', False):
                        logging.debug(
                            " ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
                            % (self.feed.log_title[:30], raw_feed.status_code,
                               len(smart_unicode(
                                   raw_feed.content)), raw_feed.headers))
            except Exception, e:
                logging.debug(
                    "   ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
                    % (self.feed.log_title[:30], unicode(e)[:100]))

            # Last resort (or forced): let feedparser do the fetch itself.
            if not self.fpf or self.options.get('force_fp', False):
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=self.feed.user_agent,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError,
                        MemoryError), e:
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.log_title[:30], e))
                    pass
예제 #21
0
            # Fallback 1: the requests-based fetch above produced nothing (or a
            # feedparser re-parse was explicitly forced), so let feedparser
            # fetch the URL itself, still using conditional-GET state.
            if not self.fpf or self.options.get('force_fp', False):
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=self.feed.user_agent,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
                    # Non-fatal: fall through to the header-less retry below.
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' % 
                                  (self.feed.log_title[:30], e))
                    pass
                
        # Fallback 2 (last resort): retry without etag/modified headers; a
        # failure here aborts the fetch with an HTTP error status.
        if not self.fpf:
            try:
                logging.debug(u'   ***> [%-30s] ~FRTurning off headers...' % 
                              (self.feed.log_title[:30]))
                self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
            except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
                logging.debug(u'   ***> [%-30s] ~FRFetch failed: %s.' % 
                              (self.feed.log_title[:30], e))
                return FEED_ERRHTTP, None
            
        logging.debug(u'   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
                      self.feed.log_title[:30], time.time() - start))

        return FEED_OK, self.fpf
        
    def get_identity(self):
        # Short identifier for the current worker, used to tag log lines.
        identity = "X"

        current_process = multiprocessing.current_process()
        if current_process._identity:
예제 #22
0
File: models.py  Project: jobscry/vz-stream
    def update(self):
        """
        Fetch this source's feed and store any new entries.

        Performs a conditional GET (etag / last-modified) via feedparser,
        creates one ``Entry`` per new item in chronological order, then
        records the fetch outcome (``last_update_successful``,
        ``last_status_code`` or ``error_message``) and saves the model.
        Does nothing when the source is disabled.
        """
        
        if self.enabled is False:
            return

        # Send If-Modified-Since only when we have a previous date.
        if self.last_modified:
            modified = self.last_modified.timetuple()
        else:
            modified = None

        data = feedparser.parse(self.url, etag=self.etag,
            modified=modified, agent=USER_AGENT)

        if data.bozo != 1:
            if data.status != 304:
                # Reverse then pop() from the end so entries are processed
                # oldest-first; `break` below can then skip already-seen items.
                data.entries.reverse()
                while data.entries:
                    dentry = data.entries.pop()

                    if dentry.has_key('published'):
                        created_on = datetime.datetime(*dentry.published_parsed[0:6])
                    else:
                        created_on = datetime.datetime(*dentry.updated_parsed[0:6])

                    # Anything at or before last_modified was stored previously.
                    if self.last_modified is not None and (created_on <= self.last_modified):
                        break

                    if dentry.has_key('title'):
                        text = dentry.title
                    elif dentry.has_key('summary'):
                        text = dentry.summary
                    else:
                        # BUGFIX: the result was previously discarded, leaving
                        # `text` unbound (NameError) when neither title nor
                        # summary was present.
                        text = dentry.get('content', 'None')

                    if self.auto_link:
                        text = self._auto_link(text)

                    # 't' marks a Twitter-type feed needing extra parsing.
                    if self.feed_type == 't':
                        text = self._twitter_parser(text)

                    Entry.objects.create(
                        source=self,
                        url=dentry.link,
                        text=text,
                        created_on=created_on
                    )

                # Persist the conditional-GET state for the next fetch.
                if data.has_key('etag'):
                    self.etag = data.etag
                if data.has_key('modified'):
                    self.last_modified = datetime.datetime(*data.modified[0:6])
                else:
                    # No server date: fall back to the newest stored entry.
                    try:
                        latest = Entry.objects.filter(source=self).latest('created_on')
                        self.last_modified = latest.created_on
                    except ObjectDoesNotExist:
                        pass

            self.last_update_successful = True
            self.last_status_code = data.status
        else:
            self.last_update_successful = False
            # BUGFIX: pprint() prints to stdout and returns None, so
            # error_message was always None; store a string representation.
            self.error_message = repr(data.bozo_exception)
        self.save()
예제 #23
0
 def fetch(self):
     """ 
     Uses requests to download the feed, parsing it in feedparser. Will be storified later.
     """
     start = time.time()
     identity = self.get_identity()
     log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                         self.feed.title[:30],
                                                         self.feed.id,
                                                         datetime.datetime.now() - self.feed.last_update)
     logging.debug(log_msg)
     
     etag=self.feed.etag
     modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
     address = self.feed.feed_address
     
     # ~1% of fetches (or an explicit force) are cache-busted: drop the
     # conditional-GET state and append a random "_" query parameter.
     if (self.options.get('force') or random.random() <= .01):
         self.options['force'] = True
         modified = None
         etag = None
         address = qurl(address, add={"_": random.randint(0, 10000)})
         logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                       self.feed.title[:30], address))
     elif (not self.feed.fetched_once or not self.feed.known_good):
         # Feeds that never fetched cleanly always get a full fetch.
         modified = None
         etag = None
     
     USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                   '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                   'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                   'Safari/534.48.3)' % (
                       self.feed.num_subscribers,
                       's' if self.feed.num_subscribers != 1 else '',
                       self.feed.permalink,
                  ))
     if self.options.get('feed_xml'):
         logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                       self.feed.title[:30], len(self.options.get('feed_xml'))))
     
     # A pre-parsed feed pushed in real time ("fat ping") short-circuits the fetch.
     if self.options.get('fpf'):
         self.fpf = self.options.get('fpf')
         logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                       self.feed.title[:30]))
         return FEED_OK, self.fpf
     
     # Services that need dedicated fetchers instead of plain HTTP.
     if 'youtube.com' in address:
         try:
             youtube_feed = self.fetch_youtube(address)
         except (requests.adapters.ConnectionError):
             youtube_feed = None
         if not youtube_feed:
             logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' % 
                           (self.feed.title[:30], address))
             return FEED_ERRHTTP, None
         self.fpf = feedparser.parse(youtube_feed)
     elif re.match('(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])):
         # try:
         twitter_feed = self.fetch_twitter(address)
         # except Exception, e:
         #     logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s: %e' %
         #                   (self.feed.title[:30], address, e))
         #     twitter_feed = None
         if not twitter_feed:
             logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s' % 
                           (self.feed.title[:30], address))
             return FEED_ERRHTTP, None
         self.fpf = feedparser.parse(twitter_feed)
     
     # Plain HTTP path: fetch with requests, then parse the bytes with feedparser.
     if not self.fpf:
         try:
             headers = {
                 'User-Agent': USER_AGENT,
                 'Accept-encoding': 'gzip, deflate',
                 'A-IM': 'feed',
             }
             if etag:
                 headers['If-None-Match'] = etag
             if modified:
                 # format into an RFC 1123-compliant timestamp. We can't use
                 # time.strftime() since the %a and %b directives can be affected
                 # by the current locale, but RFC 2616 states that dates must be
                 # in English.
                 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
                 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                 modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
                 headers['If-Modified-Since'] = modified_header
             raw_feed = requests.get(address, headers=headers)
             if raw_feed.content:
                 response_headers = raw_feed.headers
                 response_headers['Content-Location'] = raw_feed.url
                 self.fpf = feedparser.parse(smart_unicode(raw_feed.content),
                                             response_headers=response_headers)
         except Exception, e:
             # Any requests-level failure falls back to feedparser's own fetch.
             logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.title[:30], unicode(e)[:100]))
         
         if not self.fpf:
             try:
                 self.fpf = feedparser.parse(address,
                                             agent=USER_AGENT,
                                             etag=etag,
                                             modified=modified)
             except (TypeError, ValueError, KeyError, EOFError), e:
                 logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' % 
                               (self.feed.title[:30], e))
                 pass
예제 #24
0
            # Fallback 1: the requests-based fetch produced nothing, so let
            # feedparser fetch the URL itself with conditional-GET state.
            if not self.fpf:
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=USER_AGENT,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError), e:
                    # Non-fatal: fall through to the header-less retry below.
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' % 
                                  (self.feed.title[:30], e))
                    pass
                
        # Fallback 2 (last resort): retry without etag/modified headers.
        if not self.fpf:
            try:
                logging.debug(u'   ***> [%-30s] ~FRTurning off headers...' % 
                              (self.feed.title[:30]))
                self.fpf = feedparser.parse(address, agent=USER_AGENT)
            except (TypeError, ValueError, KeyError, EOFError), e:
                logging.debug(u'   ***> [%-30s] ~FRFetch failed: %s.' % 
                              (self.feed.title[:30], e))
                return FEED_ERRHTTP, None
            
        logging.debug(u'   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
                      self.feed.title[:30], time.time() - start))

        return FEED_OK, self.fpf
        
    def get_identity(self):
        # Short identifier for the current worker, used to tag log lines.
        identity = "X"

        current_process = multiprocessing.current_process()
        if current_process._identity:
예제 #25
0
    def fetch(self):
        """ 
        Uses requests to download the feed, parsing it in feedparser. Will be storified later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                            self.feed.log_title[:30],
                                                            self.feed.id,
                                                            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)
        
        etag = self.feed.etag
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address
        
        # ~1% of fetches (or an explicit force) are cache-busted: drop the
        # conditional-GET state and append a random "_" query parameter.
        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                          self.feed.log_title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            # Feeds that never fetched cleanly always get a full fetch.
            modified = None
            etag = None
        
        if self.options.get('feed_xml'):
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                          self.feed.log_title[:30], len(self.options.get('feed_xml'))))
        
        # A pre-parsed feed pushed in real time ("fat ping") short-circuits the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                          self.feed.log_title[:30]))
            return FEED_OK, self.fpf

        # Services that need dedicated API-backed fetchers instead of plain HTTP.
        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except (requests.adapters.ConnectionError):
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' % 
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)
        elif re.match(r'(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])):
            twitter_feed = self.fetch_twitter(address)
            if not twitter_feed:
                logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s' % 
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(twitter_feed)
        elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
            facebook_feed = self.fetch_facebook()
            if not facebook_feed:
                logging.debug(u'   ***> [%-30s] ~FRFacebook fetch failed: %s' % 
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(facebook_feed)
        
        # Plain HTTP path: fetch with requests, then parse with feedparser.
        if not self.fpf:
            try:
                headers = self.feed.fetch_headers()
                if etag:
                    headers['If-None-Match'] = etag
                if modified:
                    # format into an RFC 1123-compliant timestamp. We can't use
                    # time.strftime() since the %a and %b directives can be affected
                    # by the current locale, but RFC 2616 states that dates must be
                    # in English.
                    short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
                    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                    modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
                    headers['If-Modified-Since'] = modified_header
                # RFC 3229 delta encoding is only requested on conditional fetches.
                if etag or modified:
                    headers['A-IM'] = 'feed'
                raw_feed = requests.get(address, headers=headers)
                # Some servers reject our UA; retry once with a faked one.
                if raw_feed.status_code >= 400:
                    logging.debug("   ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
                    raw_feed = requests.get(self.feed.feed_address, headers=self.feed.fetch_headers(fake=True))
                
                if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
                    # JSON Feed
                    json_feed = self.fetch_json_feed(address, raw_feed)
                    if not json_feed:
                        logging.debug(u'   ***> [%-30s] ~FRJSON fetch failed: %s' % 
                                      (self.feed.log_title[:30], address))
                        return FEED_ERRHTTP, None
                    self.fpf = feedparser.parse(json_feed)
                elif raw_feed.content and raw_feed.status_code < 400:
                    response_headers = raw_feed.headers
                    response_headers['Content-Location'] = raw_feed.url
                    self.raw_feed = smart_unicode(raw_feed.content)
                    self.fpf = feedparser.parse(self.raw_feed,
                                                response_headers=response_headers)
                    if self.options.get('debug', False):
                        logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
            except Exception, e:
                # Any requests-level failure falls back to feedparser's own fetch.
                logging.debug("   ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))
            
            if not self.fpf or self.options.get('force_fp', False):
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=self.feed.user_agent,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' % 
                                  (self.feed.log_title[:30], e))
                    pass
예제 #26
0
File: views.py  Project: cash2one/rssEngine
def feedParsedItems(request, feed_id):
    """Fetch a feed's address with feedparser and render its raw entries."""
    feed = Feed.objects.get(id=feed_id)
    items = feedparser.parse(feed.feed_address).entries
    # Same four names `locals()` used to expose at this point.
    context = {'request': request, 'feed_id': feed_id, 'feed': feed, 'items': items}
    return render_to_response('feed_parsed_items.html', context)
예제 #27
0
    def fetch(self):
        """ 
        Uses feedparser to download the feed. Will be parsed later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                            self.feed.title[:30],
                                                            self.feed.id,
                                                            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)
                                                 
        etag=self.feed.etag
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address
        
        # ~1% of fetches (or an explicit force) are cache-busted: drop the
        # conditional-GET state and rewrite the URL to defeat caches.
        if (self.options.get('force') or random.random() <= .01):
            modified = None
            etag = None
            address = cache_bust_url(address)
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                          self.feed.title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            # Feeds that never fetched cleanly always get a full fetch.
            modified = None
            etag = None
        
        USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                      '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                      'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                      'Safari/534.48.3)' % (
                          self.feed.num_subscribers,
                          's' if self.feed.num_subscribers != 1 else '',
                          self.feed.permalink,
                     ))
        if self.options.get('feed_xml'):
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                          self.feed.title[:30], len(self.options.get('feed_xml'))))
        
        # A pre-parsed feed pushed in real time ("fat ping") short-circuits the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                          self.feed.title[:30]))
            return FEED_OK, self.fpf
        
        # YouTube needs a dedicated API-backed fetcher instead of plain HTTP.
        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except (requests.adapters.ConnectionError):
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' % 
                              (self.feed.title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)

        # Default path: feedparser fetches the URL itself with conditional-GET.
        if not self.fpf:
            try:
                self.fpf = feedparser.parse(address,
                                            agent=USER_AGENT,
                                            etag=etag,
                                            modified=modified)
            except (TypeError, ValueError, KeyError, EOFError), e:
                logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' % 
                              (self.feed.title[:30], e))
                pass
예제 #28
0
def feedParsedItems(request, feed_id):
    """Look up a feed by id, parse its address, and render the parsed entries."""
    feed = Feed.objects.get(id=feed_id)
    parsed = feedparser.parse(feed.feed_address)
    items = parsed.entries
    del parsed  # keep the template context identical to the original locals()
    return render_to_response('feed_parsed_items.html', locals())
예제 #29
0
                # Fallback 1: let feedparser fetch the URL itself, still using
                # conditional-GET state (etag / modified).
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=self.feed.user_agent,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError,
                        MemoryError), e:
                    # Non-fatal: fall through to the header-less retry below.
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.log_title[:30], e))
                    pass

        # Fallback 2 (last resort): retry without etag/modified headers.
        if not self.fpf:
            try:
                logging.debug(u'   ***> [%-30s] ~FRTurning off headers...' %
                              (self.feed.log_title[:30]))
                self.fpf = feedparser.parse(address,
                                            agent=self.feed.user_agent)
            except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
                logging.debug(u'   ***> [%-30s] ~FRFetch failed: %s.' %
                              (self.feed.log_title[:30], e))
                return FEED_ERRHTTP, None

        logging.debug(u'   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' %
                      (self.feed.log_title[:30], time.time() - start))

        return FEED_OK, self.fpf

    def get_identity(self):
        # Short identifier for the current worker, used to tag log lines.
        identity = "X"

        current_process = multiprocessing.current_process()
        if current_process._identity:
예제 #30
0
File: crawler.py  Project: CMGS/Collapsar
 def _run(self):
     """Parse this crawler's URL via feedparser, reusing stored etag/modified
     values so unchanged feeds come back as conditional-GET responses."""
     parsed = _feed.parse(self.url, etag=self.etag, modified=self.modified)
     return parsed
예제 #31
0
            # Fallback 1: let feedparser fetch the URL itself with
            # conditional-GET state (etag / modified).
            if not self.fpf:
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=USER_AGENT,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError), e:
                    # Non-fatal: fall through to the header-less retry below.
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.title[:30], e))
                    pass

        # Fallback 2 (last resort): retry without etag/modified headers.
        if not self.fpf:
            try:
                logging.debug(u'   ***> [%-30s] ~FRTurning off headers...' %
                              (self.feed.title[:30]))
                self.fpf = feedparser.parse(address, agent=USER_AGENT)
            except (TypeError, ValueError, KeyError, EOFError), e:
                logging.debug(u'   ***> [%-30s] ~FRFetch failed: %s.' %
                              (self.feed.title[:30], e))
                return FEED_ERRHTTP, None

        logging.debug(u'   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' %
                      (self.feed.title[:30], time.time() - start))

        return FEED_OK, self.fpf

    def get_identity(self):
        # Short identifier for the current worker, used to tag log lines.
        identity = "X"

        current_process = multiprocessing.current_process()
        if current_process._identity:
예제 #32
0
    def fetch(self):
        """ 
        Uses requests to download the feed, parsing it in feedparser. Will be storified later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        modified = self.feed.last_modified.utctimetuple(
        )[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        # ~1% of fetches (or an explicit force) are cache-busted: drop the
        # conditional-GET state and append a random "_" query parameter.
        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' %
                          (self.feed.title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            # Feeds that never fetched cleanly always get a full fetch.
            modified = None
            etag = None

        USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                      '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                      'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                      'Safari/534.48.3)' % (
                          self.feed.num_subscribers,
                          's' if self.feed.num_subscribers != 1 else '',
                          self.feed.permalink,
                      ))
        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                % (self.feed.title[:30], len(self.options.get('feed_xml'))))

        # A pre-parsed feed pushed in real time ("fat ping") short-circuits the fetch.
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.title[:30]))
            return FEED_OK, self.fpf

        # YouTube needs a dedicated API-backed fetcher instead of plain HTTP.
        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except (requests.adapters.ConnectionError):
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                              (self.feed.title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)

        # Plain HTTP path: fetch with requests, then parse with feedparser.
        if not self.fpf:
            try:
                headers = {
                    'User-Agent': USER_AGENT,
                    'Accept-encoding': 'gzip, deflate',
                    'A-IM': 'feed',
                }
                if etag:
                    headers['If-None-Match'] = etag
                if modified:
                    # format into an RFC 1123-compliant timestamp. We can't use
                    # time.strftime() since the %a and %b directives can be affected
                    # by the current locale, but RFC 2616 states that dates must be
                    # in English.
                    short_weekdays = [
                        'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
                    ]
                    months = [
                        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                        'Sep', 'Oct', 'Nov', 'Dec'
                    ]
                    modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
                        short_weekdays[modified[6]], modified[2],
                        months[modified[1] - 1], modified[0], modified[3],
                        modified[4], modified[5])
                    headers['If-Modified-Since'] = modified_header
                raw_feed = requests.get(address, headers=headers)
                if raw_feed.content:
                    response_headers = raw_feed.headers
                    response_headers['Content-Location'] = raw_feed.url
                    self.fpf = feedparser.parse(
                        smart_unicode(raw_feed.content),
                        response_headers=response_headers)
            except Exception, e:
                # Any requests-level failure falls back to feedparser's own fetch.
                logging.debug(
                    " ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
                    % (self.feed.title[:30], unicode(e)[:100]))

            if not self.fpf:
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=USER_AGENT,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError), e:
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.title[:30], e))
                    pass