Example #1
import hashlib
import time

import bleach
import feedparser
from django.utils.encoding import smart_str

# get_feed_entries, parse_entry, log, and the FeedEntry model are assumed
# to be provided by the surrounding application.


def parse_feed(feed_url, page):
    ids = []
    entries = get_feed_entries(feed_url)
    for entry in entries:
        parsed = parse_entry(entry)
        if not parsed:
            log.warn("Parsing entry failed; continuing")
            continue
        # Only index into parsed once we know the entry parsed at all.
        log.debug(parsed['title'])
        # Rich content arrives as a FeedParserDict with the text under
        # 'value'; otherwise parsed['content'] is already a string. Using
        # .get() keeps body defined (as None) when 'value' is missing.
        if isinstance(parsed['content'], feedparser.FeedParserDict):
            body = parsed['content'].get('value')
        else:
            body = parsed['content']
        if not body:
            log.warn("Parsing feed failed - no body found")
            continue
        # Strip all HTML tags from the body before hashing and storing it.
        cleaned_body = smart_str(bleach.clean(body, tags=(), strip=True))
        try:
            # [Bugzilla-670890]
            # Needed to allow for the same article from different sources:
            # hashing the body together with the page gives a unique
            # checksum per source.
            checksum = hashlib.md5(cleaned_body + page).hexdigest()
            exists = FeedEntry.objects.filter(checksum=checksum)
            if not exists:
                log.debug('Logging - %s' % parsed['title'])
                entry = FeedEntry(
                    title=parsed['title'].encode('utf-8'),
                    link=parsed['link'].encode('utf-8'),
                    body=cleaned_body,
                    page=page,
                    checksum=checksum,
                    created_on=time.strftime(
                        "%Y-%m-%d %H:%M:%S", parsed['updated']))
                entry.save()
                feed_id = entry.id
            else:
                # If it's already in the feed we still want to keep a
                # reference to its ID so we know to display it.
                log.debug('Found a duplicate entry')
                feed_id = exists[0].id
            ids.append(feed_id)
        except Exception:
            log.warn("Encountered an error creating FeedEntry. Skipping.")
            continue
    log.debug(ids)
    return ids
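
The per-source checksum from [Bugzilla-670890] is the one subtle step above. A minimal sketch of the idea in plain Python (no Django; the body text and page names are invented for illustration):

import hashlib

body = "Same article text"

# Hashing the body alone collapses every source to one checksum...
# (hashlib needs bytes on Python 3, hence the .encode() calls)
print(hashlib.md5(body.encode('utf-8')).hexdigest())

# ...while appending the page gives each source its own checksum, which
# is what lets the same article be stored once per source.
for page in ('firefox', 'mobile'):
    print(page, hashlib.md5((body + page).encode('utf-8')).hexdigest())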
Example #2
def parse_feed(feed_url, page):
    ids = []
    entries = get_feed_entries(feed_url)

    for entry in entries:
        parsed = parse_entry(entry)
        if not parsed:
            log.warn("Parsing feed failed. continuing")
            continue
        # Same content handling as in Example #1: use .get() so body stays
        # defined when the FeedParserDict has no 'value' key.
        if isinstance(parsed['content'], feedparser.FeedParserDict):
            body = parsed['content'].get('value')
        else:
            body = parsed['content']
        if not body:
            log.warn("Parsing feed failed - no body found")
            continue
        # strip_comments=True is spelled out here, though it is bleach's
        # default.
        cleaned_body = smart_str(
            bleach.clean(body, tags=(), strip=True, strip_comments=True))
        try:
            # Unlike Example #1, the checksum covers the body only, so the
            # same article appearing on different pages is stored once.
            checksum = hashlib.md5(cleaned_body).hexdigest()
            exists = FeedEntry.objects.filter(checksum=checksum)
            if not exists:
                entry = FeedEntry(
                    title=parsed['title'].encode('utf-8'),
                    link=parsed['link'].encode('utf-8'),
                    body=cleaned_body,
                    page=page,
                    checksum=checksum,
                    created_on=time.strftime(
                        "%Y-%m-%d %H:%M:%S", parsed['updated']))
                entry.save()
                ids.append(entry.id)
            else:
                ids.append(exists[0].id)
        except Exception:
            log.warn("Encountered an error creating FeedEntry. Skipping.")
            continue
    return ids
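
Both examples use a filter-then-save pattern, which leaves a small window between the existence check and entry.save() where a concurrent worker could insert the same checksum. A sketch of the same dedup step using Django's get_or_create (field names taken from the examples above; this is an alternative, not what the original code does):

entry, created = FeedEntry.objects.get_or_create(
    checksum=checksum,
    defaults={
        'title': parsed['title'],
        'link': parsed['link'],
        'body': cleaned_body,
        'page': page,
        'created_on': time.strftime(
            "%Y-%m-%d %H:%M:%S", parsed['updated']),
    })
ids.append(entry.id)

If checksum is backed by a unique constraint, get_or_create retries the lookup when a concurrent insert raises an integrity error, instead of failing outright.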