def test_entries_coverage(self):
    success = 0
    fperrors = 0
    sperrors = 0
    errcompats = 0
    # cap the run at 1000 feeds so the totals match the slice below
    total = min(len(self.files), 1000)
    failedentries = []
    bozoentries = []
    for f in self.files[:total]:
        fperror = False
        with open(f) as fo:
            document = fo.read()
        try:
            fpresult = load_cache(f)
            if fpresult is None:
                fpresult = feedparser.parse(document)
        except Exception:
            fperrors += 1
            fperror = True
            fpresult = {}  # keep the bozo check below from raising NameError
        if fpresult.get('bozo', 0):
            fperrors += 1
            fperror = True
        try:
            spresult = speedparser.parse(document)
        except Exception:
            if fperror:
                errcompats += 1
            else:
                sperrors += 1
            bozoentries.append(f)
            continue
        if 'bozo_exception' in spresult:
            if fperror:
                errcompats += 1
            else:
                sperrors += 1
            bozoentries.append(f)
            continue
        try:
            entry_equivalence(self, fpresult, spresult)
            success += 1
        except Exception:
            import traceback
            print("Failure: %s" % f)
            traceback.print_exc()
            failedentries.append(f)
    print("Success: %d out of %d (%0.2f %%, fpe: %d, spe: %d, both: %d)" % (
        success, total, (100 * success) / float(total - fperrors),
        fperrors, sperrors, errcompats))
    print("Failed entries:\n%s" % pformat(failedentries))
    print("Bozo entries:\n%s" % pformat(bozoentries))
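# `load_cache` is not defined in this excerpt. A minimal sketch, assuming it
# memoizes feedparser results on disk with pickle, keyed by the feed file's
# name and invalidated by mtime (the cache location is hypothetical):
import os
import pickle

CACHE_DIR = '/tmp/fpcache'  # hypothetical location; adjust to taste

def load_cache(path):
    """Return a cached feedparser result for ``path``, or None on a miss.

    A miss is reported when no cache file exists or when the feed file has
    been modified since the result was cached.
    """
    cachefile = os.path.join(CACHE_DIR, os.path.basename(path) + '.pkl')
    if not os.path.exists(cachefile):
        return None
    if os.path.getmtime(cachefile) < os.path.getmtime(path):
        return None
    with open(cachefile, 'rb') as f:
        return pickle.load(f)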
def test_single_feed(self):
    fpresult = feedparser.parse(self.doc)
    spresult = speedparser.parse(self.doc)
    d = dict(fpresult)
    d['entries'] = d['entries'][:4]
    pprint(d)
    d = dict(spresult)
    d['entries'] = d['entries'][:4]
    pprint(d)
    feed_equivalence(self, fpresult, spresult)
    entry_equivalence(self, fpresult, spresult)
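# Neither `feed_equivalence` nor `entry_equivalence` is shown in this
# excerpt. A minimal sketch, assuming they assert that speedparser
# reproduces the fields feedparser produced; the real helpers presumably
# compare many more keys than the handful listed here:
def feed_equivalence(testcase, fpresult, spresult):
    """Assert that top-level feed metadata matches between the two parsers."""
    fpfeed, spfeed = fpresult['feed'], spresult['feed']
    for key in ('title', 'link', 'subtitle'):
        if key in fpfeed:
            testcase.assertEqual(fpfeed[key], spfeed.get(key))

def entry_equivalence(testcase, fpresult, spresult):
    """Assert entry counts and per-entry fields match between the parsers."""
    testcase.assertEqual(len(fpresult['entries']), len(spresult['entries']))
    for fpe, spe in zip(fpresult['entries'], spresult['entries']):
        for key in ('title', 'link', 'author'):
            if key in fpe:
                testcase.assertEqual(fpe[key], spe.get(key))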
def test_feeds(self):
    for path in self.filenames:
        with open(path) as f:
            doc = f.read()
        fpresult = feedparser.parse(doc)
        spresult = speedparser.parse(doc)
        try:
            feed_equivalence(self, fpresult, spresult)
            entry_equivalence(self, fpresult, spresult)
        except Exception:
            import traceback
            print("Comp Failure: %s" % path)
            traceback.print_exc()
def test_feed_coverage(self):
    success = 0
    fperrors = 0
    sperrors = 0
    total = 300
    failedpaths = []
    failedentries = []
    for f in self.files[:total]:
        with open(f) as fo:
            document = fo.read()
        try:
            fpresult = feedparser.parse(document)
        except Exception:
            fperrors += 1
            continue  # nothing to compare against without a feedparser result
        try:
            spresult = speedparser.parse(document)
        except Exception:
            sperrors += 1
            continue
        try:
            feed_equivalence(self, fpresult, spresult)
            success += 1
        except Exception:
            failedpaths.append(f)
        try:
            entry_equivalence(self, fpresult, spresult)
        except Exception:
            failedentries.append(f)
    print("Success: %d out of %d (%0.2f %%, fpe: %d, spe: %d)" % (
        success, total, (100 * success) / float(total - fperrors),
        fperrors, sperrors))
    print("Entry Success: %d out of %d (%0.2f %%)" % (
        success - len(failedentries), success,
        (100 * (success - len(failedentries))) / float(total - fperrors)))
    print("Failed Paths:\n%s" % pformat(failedpaths))
    print("Failed entries:\n%s" % pformat(failedentries))
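# The coverage tests above iterate over `self.files`, which this excerpt
# never builds. A plausible setUp, assuming a local directory of previously
# downloaded feed documents (the feeds/ path and *.xml pattern are guesses):
import glob
import unittest

class CoverageTestBase(unittest.TestCase):
    def setUp(self):
        # every feed document previously downloaded into feeds/
        self.files = sorted(glob.glob('feeds/*.xml'))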
def update_feeds(num=10):
    with SimpleBufferObject(Entry) as new_entry_buffer:
        current_time = now()
        # get all active feeds with subscribers that have not been checked
        # or are due to be checked based on "next_checked"
        feeds = Feed.active.filter(
            Q(next_checked=None) | Q(next_checked__lte=current_time))[:num]
        for feed in feeds:
            # update last checked to current time
            feed.last_checked = now()
            # set "next_checked" based on "check_frequency"
            feed.next_checked = feed.last_checked + timedelta(
                hours=feed.check_frequency)
            # create new FeedLog object
            log = FeedLog(feed=feed)
            notes = []
            # build conditional GET headers from the feed object; copy so
            # one feed's etag does not leak into the next feed's request
            headers = HEADERS.copy()
            if feed.etag:
                headers['If-None-Match'] = feed.etag
            if feed.last_modified:
                last_modified = make_naive(feed.last_modified)
                headers['If-Modified-Since'] = http_date(
                    last_modified.timestamp())
            try:
                req = requests.get(feed.feed_url, headers=headers,
                                   allow_redirects=True)
                log.status_code = req.status_code
                log.headers = ', '.join("{!s}={!r}".format(key, val)
                                        for (key, val) in headers.items())
                log.headers += "--\n"
                log.headers += ', '.join("{!s}={!r}".format(key, val)
                                         for (key, val) in req.headers.items())
                notes.append('updating {0}'.format(feed))
                # update feed URL if permanently redirected
                if (req.url != feed.feed_url) and (
                        req.history[-1].status_code == 301):
                    # if the updated feed URL already exists, something is wrong
                    if Feed.objects.filter(feed_url=req.url).exists():
                        feed.disabled = True
                        notes.append('Feed URL does not match response, '
                                     'but new feed already exists with '
                                     '{0}.'.format(req.url))
                    else:
                        notes.append('Updating feed url from {0} to {1}.'.format(
                            feed.feed_url, req.url))
                        feed.feed_url = req.url
                if req.status_code == requests.codes.not_modified:
                    notes.append('not modified')
                elif req.status_code == requests.codes.ok:
                    notes.append('status OK, parsing')
                    # update conditional GET data
                    feed.etag = alphanum.sub('', req.headers.get('etag', ''))
                    feed.last_modified = parse_http_date(
                        req.headers.get('last-modified', None),
                        default=feed.last_checked)
                    # must remove the encoding declaration from the feed
                    # or lxml will pitch a fit
                    text = XML_DECLARATION.sub('', req.text, 1)
                    parsed = speedparser.parse(text, encoding=req.encoding)
                    if parsed.bozo == 1:
                        notes.append('bozo feed')
                        notes.append(parsed.bozo_tb)
                        feed.increment_error_count()
                    else:
                        # update feed metadata, reset error count
                        feed.reset_error_count()
                        feed.title = shorten_string(
                            parsed.feed.get('title', feed.title))
                        feed.description = parsed.feed.get(
                            'description', parsed.feed.get('subtitle', None))
                        # icon/logo are not working in speedparser
                        # feed.icon = parsed.feed.get('logo', feed.icon)
                        # get the latest existing entry for the feed
                        try:
                            latest_entry = feed.entry_set.latest()
                        except Entry.DoesNotExist:
                            latest_entry = None
                        for entry in parsed.entries:
                            published = feed_datetime(
                                entry.get('published_parsed',
                                          entry.get('updated_parsed', None)),
                                default=feed.last_checked)
                            # only proceed if the entry is newer than the
                            # latest entry stored for the feed
                            if latest_entry is None or published > latest_entry.published:
                                # entry ID is a hash of the entry id or link
                                entry_id = hashlib.sha1(
                                    entry.get('id', entry.link).encode(
                                        'utf-8')).hexdigest()
                                author = bleach.clean(
                                    entry.get('author', 'no author'),
                                    strip=True, strip_comments=True)
                                author = shorten_string(author)
                                content = None
                                content_items = entry.get('content', None)
                                if content_items is None:
                                    content = entry.get('summary', 'No summary.')
                                else:
                                    # concatenate all text-like content items
                                    for c in content_items:
                                        if c.get('type', None) in (
                                                'text', 'html', 'xhtml', None):
                                            if content is None:
                                                content = c.get('value', '')
                                            else:
                                                content += c.get('value', '')
                                content = bleach.clean(
                                    content, tags=BLEACH_TAGS,
                                    attributes=BLEACH_ATTRS,
                                    strip=True, strip_comments=True)
                                title = bleach.clean(
                                    entry.get('title', 'no title'),
                                    strip=True, strip_comments=True)
                                title = shorten_string(title)
                                new_entry_buffer.add(Entry(
                                    feed=feed,
                                    entry_id=entry_id,
                                    link=entry.get('link', ''),
                                    title=title,
                                    author=author,
                                    content=content,
                                    published=published,
                                    updated=feed_datetime(
                                        entry.get('updated_parsed', None),
                                        default=feed.last_checked)))
                                log.entries += 1
                            else:
                                # entries are newest-first, so stop at the
                                # first one we have already stored
                                break
                        if log.entries > 0:
                            feed.has_new_feeds = True
                else:
                    notes.append('error: {0}'.format(req.status_code))
                    feed.increment_error_count()
            # append errors to notes so the join below records them instead
            # of overwriting them
            except requests.exceptions.Timeout:  # pragma: no cover
                notes.append('timeout error')
                feed.increment_error_count()
            except requests.exceptions.ConnectionError:  # pragma: no cover
                notes.append('connection error')
                feed.increment_error_count()
            except requests.exceptions.HTTPError:  # pragma: no cover
                notes.append('HTTP error')
                feed.increment_error_count()
            except requests.exceptions.TooManyRedirects:  # pragma: no cover
                notes.append('too many redirects')
                feed.increment_error_count()
            log.notes = '\n'.join(notes)
            duration = now() - feed.last_checked
            # note: .microseconds is only the sub-second component; use
            # duration.total_seconds() if the full elapsed time is wanted
            log.duration = duration.microseconds
            feed.save()
            log.save()
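# `SimpleBufferObject` is not shown in this excerpt. A minimal sketch,
# assuming it is a context manager that buffers model instances and flushes
# them with Django's bulk_create; the flush threshold is a guess:
class SimpleBufferObject:
    """Buffer model instances and bulk-insert them on exit.

    A sketch only: the real class may flush incrementally, handle
    integrity errors, and so on.
    """

    def __init__(self, model, flush_at=500):  # flush_at is a guess
        self.model = model
        self.flush_at = flush_at
        self._buffer = []

    def add(self, instance):
        # queue an unsaved instance; flush once the buffer gets large
        self._buffer.append(instance)
        if len(self._buffer) >= self.flush_at:
            self.flush()

    def flush(self):
        if self._buffer:
            self.model.objects.bulk_create(self._buffer)
            self._buffer = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # only persist the remaining buffered entries on a clean exit
        if exc_type is None:
            self.flush()
        return False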