示例#1
0
def feed():
    """Return a canned FeedParserDict resembling a fully parsed podcast feed."""
    channel = FeedParserDict(
        link='https://sample.com',
        subtitle='This is the best podcast show in the entire universe',
        title='PodcastTitle')
    episode_one = FeedParserDict(
        title='Episode1',
        subtitle='episode of week 1',
        published=str(datetime.datetime(2021, 1, 1)),
        links=[
            {'href': 'https://somesite.com/episode1.mp3', 'type': 'audio/mpeg'},
            {'href': 'https://somesite.com/episode1', 'type': 'text'},
        ])
    episode_two = FeedParserDict(
        title='Episode2',
        subtitle='episode of week 2',
        published=str(datetime.datetime(2021, 1, 7)),
        links=[
            {'href': 'https://somesite.com/episode2', 'type': 'text'},
            {'href': 'https://somesite.com/episode2.mp3', 'type': 'audio/mpeg'},
        ])
    return FeedParserDict(
        feed=channel,
        entries=[episode_one, episode_two],
        etag='5f77c6d7-45f1e',
        href='https://sample.podcast.tv/test.xml',
        updated=str(datetime.datetime(2021, 1, 1)),
        status=200)
示例#2
0
def get_url(item: FeedParserDict) -> str:
    """Return the item's link if it passes URL validation, else ''."""
    link = item.get("link", "")
    if link:
        try:
            validate_url(link)
        except ValidationError:
            # Invalid URLs are treated as if the item had no link at all.
            link = ""
    return link
示例#3
0
 def __init__(self, entry: FeedParserDict):
     """Capture the entry and the fields derived from it."""
     self.entry = entry
     self.title = entry.title
     self.url = entry.link
     self.summary = entry.get('summary', '')
     published = entry.get('published_parsed')
     updated = entry.get('update_parsed')
     self.published_parsed = published
     self.update_parsed = updated
     # Derived once here so later code can use a single date field.
     self.published_date = self._define_published_date(published, updated)
示例#4
0
    def parse_single_entry(entry: FeedParserDict) -> Article:
        """Convert one parsed feed entry into an Article object."""
        article = Article()
        article.title = entry.get('title')
        article.link = entry.get('link')
        article.desc = entry.get('summary')
        article.pub_date = BlogAggregatorService.parse_pub_date(
            entry.get('published_parsed'))
        return article
def rss_feed_missing_feed_title():
    """An invalid rss feed for tests"""
    entries = [
        FeedParserDict({"title": "test1", "link": "https://test.com/item1"}),
        FeedParserDict({"title": "test1", "description": "foo"}),
        FeedParserDict({"title": "test3", "summary": "bar"}),
    ]
    channel = FeedParserDict({"title": "test", "description": "testing"})
    return FeedParserDict({"feed": channel, "entries": entries})
示例#6
0
def make_fake_feedparser_dict(feed_url, n_items=30):
    """Creates a fake but valid FeedParserDict object.

    Args:
        feed_url (str): Fake URL for fake Feed
        n_items (int): number of fake entries to generate
    """
    channel = FeedParserDict(title='Sample Feed',
                             description='This is a sample feed',
                             link=feed_url)
    items = make_feed_entries_list(n_items=n_items, feed_url=feed_url)
    return FeedParserDict(feed=channel,
                          entries=items,
                          version='rss20',
                          bozo=0)
示例#7
0
def test_feeds(mock_feedparse, patch_try_shorten):
    """Exercise feeds.rss against empty, titled, and untitled feeds."""
    from plugins import feeds

    # An empty feed yields the not-found message.
    mock_feedparse.return_value = FeedParserDict(entries=[])
    assert feeds.rss('xkcd') == "Feed not found."
    mock_feedparse.assert_called_with('http://xkcd.com/rss.xml')
    mock_feedparse.reset_mock()

    # A feed with a title gets the bolded title prefix.
    mock_feedparse.return_value = FeedParserDict(
        feed=FeedParserDict(title='test'),
        entries=[FeedParserDict(title='foo1', link='http://example.com')],
    )
    expected = "\x02test\x02: foo1 (http://example.com)"
    assert feeds.rss('http://rss.example.com/feed.xml') == expected
    mock_feedparse.assert_called_with('http://rss.example.com/feed.xml')
    mock_feedparse.reset_mock()

    # A feed without a title shows only the entry.
    mock_feedparse.return_value = FeedParserDict(
        feed=FeedParserDict(),
        entries=[FeedParserDict(title='foo1', link='http://example.com')],
    )
    expected = "foo1 (http://example.com)"
    assert feeds.rss('http://rss.example.com/feed.xml') == expected
    mock_feedparse.assert_called_with('http://rss.example.com/feed.xml')
    mock_feedparse.reset_mock()
示例#8
0
文件: rss.py 项目: asiaron/aicrawler
 def entry_to_page(self, entry: FeedParserDict) -> Info:
     """Map a parsed feed entry onto an Info record for this source."""
     # Renamed from `time` so the local no longer shadows the stdlib module.
     published = entry.published_parsed
     summary = entry.get('summary', '')
     return Info(
         action=1,
         type='rss',
         source=netloc_to_source(self.netloc),
         source_url=self.netloc,
         title=entry.title,
         guid=self.get_guid(entry.guid),
         link=entry.link,
         description=summary,
         preview=summary,
         subjects=[tag.term.lower() for tag in entry.get('tags', [])],
         time=struct_time_to_datetime(published),
         zone=TimeZone(
             offset=3,  # fixed MSK offset; was time.tm_gmtoff / 1800
             name='MSK'  # was time.tm_zone
         ))
示例#9
0
def check_parsed(parsed: "feedparser.FeedParserDict", req_keys: list) -> bool:
    """
    Checks either the feed or entry of a parsed RSS feed

    :param parsed: valid, utf-8 feedparsed RSS
    :param req_keys: keys that the `parsed` must contain (non-None values)
    :return: whether the parsed has the needed elements
    """
    # Generator instead of a list: no intermediate allocation, and all()
    # short-circuits on the first missing key.
    return all(parsed.get(key) is not None for key in req_keys)
示例#10
0
def test_fetch_feed_unknown_issue(mocked_parse, hacks_feed):
    """If a feed encounters an unknown issue, it is disabled."""
    mocked_parse.return_value = FeedParserDict(bozo=1)

    assert fetch_feed(hacks_feed) is None

    feed = Feed.objects.get()
    assert not feed.enabled
    assert feed.disabled_reason == "Error while reading the feed: 500 __ "
示例#11
0
def test_fetch_feed_exception(mocked_parse, hacks_feed):
    """If a feed encounters an exception, it is disabled."""
    mocked_parse.return_value = FeedParserDict(
        bozo=1, bozo_exception=Exception("I am grumpy today."))

    assert fetch_feed(hacks_feed) is None

    feed = Feed.objects.get()
    assert not feed.enabled
    assert feed.disabled_reason == (
        "Error while reading the feed: 500 __ I am grumpy today.")
示例#12
0
 def _update_feed_data(self,
                       feed_data_obj: feedparser.FeedParserDict) -> None:
     """
     Updates feed data given a "feedparser.FeedParserDict.feed" object

     :param feed_data_obj: parsed feed metadata; must contain 'title',
         may contain 'link'
     :return: None
     """
     # 'title' is mandatory -- a KeyError here flags a malformed feed object.
     self.title = feed_data_obj['title']
     # .get() already defaults to None; only overwrite when a link exists.
     site_url = feed_data_obj.get('link')
     if site_url:
         self.site_url = site_url
def test_parse_rss_error(mocker):
    """
    Test exceptions are handled for error while parsing rss
    """
    # A bozo feed must raise during parsing.
    broken = FeedParserDict({"bozo": True, "bozo_exception": "details"})
    mocker.patch("apps.feeds.feed_parser.feedparser.parse",
                 return_value=broken)
    with pytest.raises(ParseContentError):
        parse("foo")

    # A structurally valid feed missing required fields must also raise.
    incomplete = FeedParserDict({"bozo": False, "feed": "test"})
    mocker.patch("apps.feeds.feed_parser.feedparser.parse",
                 return_value=incomplete)
    mocker.patch("apps.feeds.feed_parser.has_required_fields",
                 return_value=False)
    with pytest.raises(ParseContentError):
        parse("foo")
示例#14
0
def getFeed(url, request_headers=None, handlers=None):
    """Fetch and parse an RSS feed, returning an empty result on failure.

    :param url: feed URL (normalized before the request)
    :param request_headers: optional extra HTTP headers for feedparser
    :param handlers: optional urllib handlers passed to feedparser
    :return: parsed FeedParserDict, or an empty FeedParserDict on any error
    """
    try:
        return feedparser.parse(
            sickrage.srCore.srWebSession.normalize_url(url),
            agent=random.choice(USER_AGENTS),
            etag=False,
            modified=False,
            request_headers=request_headers,
            handlers=handlers)
    except Exception:
        # Was `except Exception as e` with `e` unused. Best-effort contract:
        # callers always receive a FeedParserDict, even on fetch failure.
        return FeedParserDict()
示例#15
0
def test_fetch_feed_timeout(mocked_parse, hacks_feed, settings):
    """If a feed times out, it is disabled."""
    settings.FEEDER_TIMEOUT = 10
    timeout_error = URLError(reason=socket.timeout("timed out"))
    mocked_parse.return_value = FeedParserDict(bozo=1,
                                               bozo_exception=timeout_error)

    assert fetch_feed(hacks_feed) is None

    feed = Feed.objects.get()
    assert feed.etag == ""
    assert not feed.enabled
    assert feed.disabled_reason == "This feed didn't respond after 10 seconds"
示例#16
0
def rss_feed():
    """
    Create a rss FeedParserDict obj for tests
    """
    entries = [
        FeedParserDict({"title": "test1", "link": "https://test.com/item1"}),
        FeedParserDict({"title": "test2", "description": "foo"}),
        FeedParserDict({"title": "test3", "summary": "bar"}),
    ]
    channel = FeedParserDict({
        "title": "test",
        "link": "https://test.com",
        "description": "testing"
    })
    return FeedParserDict({"feed": channel, "entries": entries, "bozo": 0})
示例#17
0
 def _from_feed_entry(entry: feedparser.FeedParserDict) -> 'Result':
     """
     Build a Result object from a feedparser entry of an arXiv search
     result feed.
     """
     authors = [Result.Author._from_feed_author(a) for a in entry.authors]
     links = [Result.Link._from_feed_link(link) for link in entry.links]
     categories = [tag.get('term') for tag in entry.tags]
     # Collapse runs of whitespace (newlines included) to single spaces.
     normalized_title = re.sub(r'\s+', ' ', entry.title)
     return Result(
         entry_id=entry.id,
         updated=Result._to_datetime(entry.updated_parsed),
         published=Result._to_datetime(entry.published_parsed),
         title=normalized_title,
         authors=authors,
         summary=entry.summary,
         comment=entry.get('comment'),
         journal_ref=entry.get('arxiv_journal_ref'),
         doi=entry.get('arxiv_doi'),
         primary_category=entry.arxiv_primary_category.get('term'),
         categories=categories,
         links=links,
         _raw=entry)
示例#18
0
def prepare_link_title(
    item: "feedparser.FeedParserDict") -> "feedparser.FeedParserDict":
    """
    For an RSS item, normalize its link and title in place and return it.

    Strips the Google News redirect prefix and '&ct=ga...' tracking suffix
    from the link, and removes <b> markup from the title.

    :param item: RSS item with truthy `title` and `link` attributes
    :return: the mutated item, or None when `item` is falsy
    """
    result = None
    if item:
        assert item.title, 'Not found title in item'
        assert item.link, 'Not found link in item'

        link = item.link.replace(
            'https://www.google.com/url?rct=j&sa=t&url=', '')
        # Drop the tracking suffix Google News appends after the real URL.
        ge_ind = link.find('&ct=ga')
        if ge_ind > -1:
            # Idiom fix: link[0:ge_ind] -> link[:ge_ind]
            link = link[:ge_ind]
        title = item.title.replace('<b>', '').replace('</b>', '')
        item.link = link
        item.title = title
        result = item
    return result
示例#19
0
def prepare_link_title(
        item: feedparser.FeedParserDict) -> feedparser.FeedParserDict:
    """
    Return the RSS item with its link and title cleaned up.

    :param item: RSS item carrying `title` and `link`
    :return: the same item, mutated, or None when `item` is falsy
    """
    if not item:
        return None

    assert item.title, 'Not found title in item'
    assert item.link, 'Not found link in item'

    cleaned = item.link.replace(
        'https://www.google.com/url?rct=j&sa=t&url=', '')
    # Cut everything from the '&ct=ga' tracking marker onwards.
    tracker_pos = cleaned.find('&ct=ga')
    if tracker_pos > -1:
        cleaned = cleaned[0:tracker_pos]
    item.link = cleaned
    item.title = item.title.replace('<b>', '').replace('</b>', '')
    return item
示例#20
0
def parse_content(item: FeedDict) -> str:
    """Return a plain-text, markup-safe teaser for a feed item.

    Extracts the text of the item's HTML summary, shortens it to 300
    characters, and strips characters with special markdown meaning.

    :param item: feed item; 'summary' may be absent or falsy
    :return: cleaned summary text, possibly ending with '...'
    """
    content = get_text_from_html(item.get('summary') or '')
    content = textwrap.shorten(content, width=300, placeholder="...")
    # One C-level translate pass instead of seven chained str.replace calls.
    return content.translate(str.maketrans('', '', '_*`[]()'))
示例#21
0
    def load_rss_info(self, parsed: feedparser.FeedParserDict) -> None:
        """
        Load some RSS subscription elements into this feed state.

        :param parsed: parsed feed; 'entries' are read for title/enclosures
        :return: None
        """
        self.entries = []
        # Default to [] so a parse result without 'entries' loads cleanly
        # instead of raising TypeError when iterating None.
        for entry in parsed.get("entries", []):
            new_entry = {
                "title": entry["title"],
                # One URL per enclosure attached to the entry.
                "urls": [enc["href"] for enc in entry["enclosures"]],
                "metadata": {},
            }
            self.entries.append(new_entry)
示例#22
0
def getFeed(url, params=None, request_headers=None, handlers=None):
    """Fetch a feed over the app web session and parse it.

    :param url: feed URL
    :param params: optional query parameters for the HTTP request
    :param request_headers: optional extra headers passed to feedparser
    :param handlers: optional urllib handlers passed to feedparser
    :return: parsed FeedParserDict, or an empty one on any error
    """
    try:
        resp = sickrage.app.wsession.get(url, params=params)
        if resp.ok:
            return feedparser.parse(resp.text,
                                    agent=sickrage.app.user_agent,
                                    etag=False,
                                    modified=False,
                                    request_headers=request_headers,
                                    handlers=handlers)
    except Exception as e:
        # BUGFIX: Exception has no `.message` attribute on Python 3, so the
        # old `e.message` raised AttributeError inside this handler and hid
        # the real error. Format the exception itself instead.
        sickrage.app.log.debug("RSS Error: {}".format(e))

    return FeedParserDict()
示例#23
0
def getFeed(url, params=None, request_headers=None, handlers=None):
    """Fetch a feed over the core web session and parse it.

    :param url: feed URL
    :param params: optional query parameters for the HTTP request
    :param request_headers: optional extra headers passed to feedparser
    :param handlers: optional urllib handlers passed to feedparser
    :return: parsed FeedParserDict, or an empty one on any error
    """
    try:
        resp = sickrage.srCore.srWebSession.get(url, params=params)
        if resp.ok:
            return feedparser.parse(resp.text,
                                    agent=random.choice(USER_AGENTS),
                                    etag=False,
                                    modified=False,
                                    request_headers=request_headers,
                                    handlers=handlers)
    except Exception as e:
        # BUGFIX: Exception has no `.message` attribute on Python 3, so the
        # old `e.message` raised AttributeError inside this handler and hid
        # the real error. Format the exception itself instead.
        sickrage.srCore.srLogger.debug("RSS Error: {}".format(e))

    return FeedParserDict()
示例#24
0
def check_source(parsed: feedparser.FeedParserDict) -> bool:
    """
    Checks the parsed feed for encoding and bozo flag

    :param parsed: potentially invalid RSS feed
    :return: whether the RSS feed is actually valid or not
    """
    # feedparser sets bozo when the XML was malformed.
    if parsed.bozo == 1:
        return False
    if not parsed.get('encoding'):
        return False
    # Case-insensitive compare; `!=` reads clearer than `not ... ==`,
    # and 'utf-8'.upper() is just the constant 'UTF-8'.
    if parsed.encoding.upper() != 'UTF-8':
        return False

    return True
示例#25
0
def getFeed(url, request_headers=None, handlers=None):
    """Fetch and parse a feed, logging feedparser-reported errors.

    :param url: feed URL (normalized before parsing)
    :param request_headers: optional headers forwarded to feedparser
    :param handlers: optional urllib handlers forwarded to feedparser
    :return: parsed feed, or an empty FeedParserDict when fetching failed
    """
    feed = FeedParserDict()
    try:
        try:
            feed = feedparser.parse(normalize_url(url),
                                    False,
                                    False,
                                    request_headers,
                                    handlers=handlers)
        except AttributeError:
            sickrage.LOGGER.debug('RSS ERROR:[{}] CODE:[{}]'.format(
                feed.feed[b'error'][b'description'],
                feed.feed[b'error'][b'code']))
    except Exception:
        # Was a bare `except:`; narrowed to Exception so SystemExit and
        # KeyboardInterrupt still propagate. Deliberate best-effort:
        # callers always get a FeedParserDict back.
        pass

    return feed
示例#26
0
    def article(self) -> Article:
        """Return the article for this entry, downloading content if needed."""
        if self.processed:
            # Already handled once: hand back an empty stand-in object.
            return FeedParserDict(authors='', text='')

        article = self.article_supplier(self.input_entry.link)

        if "content" in self.input_entry:
            # The feed embedded the full content; no network fetch needed.
            article.set_html(self.input_entry.content[0].value)
            logging.info("Using inline content")
        else:
            logging.info(f"Getting content from: {self.input_entry.link}")
            article.download()

        article.parse()
        logging.debug("Just retrieved the following article: ")
        logging.debug(article)
        return article
示例#27
0
def make_feed_entries_list(n_items=10, feed_url=''):
    """Generates a list of feed entries.

    Args:
        n_items (int): how many feed entries to make
        feed_url (str): base URL
    """
    timezones = ['+0800', 'GMT']
    date_fmt = '%a, %d %b %Y %H:%M:%S'
    now = datetime.now()
    one_minute = timedelta(minutes=1)

    entries = []
    for idx in range(n_items):
        # Each entry is published a random 1-180 minutes in the past.
        stamp = now - one_minute * random.randint(1, 180)
        entries.append(FeedParserDict(
            link=urljoin(feed_url, f'story-{idx + 1:05d}.html'),
            published=(
                f'{stamp.strftime(date_fmt)} '
                f'{random.choice(timezones)}'),
            author=f'Author {idx + 1}',
            summary=f'Summary {idx + 1}',
            title=f'Title {idx + 1}'))
    return entries
示例#28
0
def parse_vacancies(data: feedparser.FeedParserDict) -> Iterator[Vacancy]:
    """Yield Vacancy objects built from feed entries, skipping broken ones."""
    for entry in data.get('entries', []):
        try:
            y, mo, d, hh, mm, ss, *_ = entry.published_parsed
            date = datetime(y, mo, d, hh, mm, ss)
            text = prepare_text(entry.description)
            url = entry.link
        except Exception as exception:
            # A malformed entry is logged and skipped, not fatal.
            app.logger.exception(
                msg='Exception during parsing job post',
                exc_info=exception,
            )
            continue
        title = remove_markdown_symbols(entry.title)
        body = f'*{title}*\n\n' + text
        link = f'*Посилання*\n[{title}]({entry.link})'

        result = body + link
        if len(result) > MESSAGE_LIMIT:
            # Trim the body so the link block always fits the limit.
            strip_to = MESSAGE_LIMIT - len(link) - 10
            result = body[:strip_to] + '...\n\n' + link

        yield Vacancy(url=url, title=entry.title, text=result, date=date)
示例#29
0
def make_validate_dict(item: "feedparser.FeedParserDict") -> dict:
    """
    Build a dict suitable for saving from an RSS item.

    Extracts as much information as possible from the element.

    :param item: RSS item
    :return: dict with title/description/link/published_at, or {} when
        the item lacks any required attribute
    """
    # `_` conventionally means "discarded"; give the value a real name.
    parsed_time = item.get('published_parsed', None)
    if parsed_time:
        published_at = datetime.fromtimestamp(mktime(parsed_time))
    else:
        # No publication time on the item: fall back to "now".
        published_at = datetime.now()

    try:
        result = {
            'title': item.title,
            'description': item.summary,
            'link': item.link,
            'published_at': published_at,
        }
    except Exception:
        # A missing attribute makes the item unusable; signal with {}.
        result = {}
    return result
示例#30
0
def make_validate_dict(item: feedparser.FeedParserDict) -> dict:
    """
    Turn an RSS item into a dict ready for saving, pulling as much
    information out of the element as possible.

    :param item: RSS item
    :return: dict of fields, or {} when required attributes are missing
    """
    ts = item.get('published_parsed', None)
    published_at = (datetime.fromtimestamp(mktime(ts))
                    if ts else datetime.now())

    try:
        return {
            'title': item.title,
            'description': item.summary,
            'link': item.link,
            'published_at': published_at,
        }
    except Exception:
        return {}
示例#31
0
def get_tags(item: "FeedParserDict") -> str:
    """Return the item's tag terms, sorted and comma-joined.

    :param item: feed item; 'tags' is an optional list of mappings
    :return: e.g. "python, rss"; '' when no tag carries a truthy 'term'
    """
    # Generators instead of a list inside sorted(): no intermediate list,
    # and falsy/missing terms are skipped.
    terms = (tag.get("term") for tag in item.get("tags", []))
    return ", ".join(sorted(term for term in terms if term))
示例#32
0
def get_summary(item: FeedParserDict) -> str:
    """Return the item's summary with HTML entities unescaped ('' if absent)."""
    raw_summary = item.get("summary", "")
    return unescape(raw_summary)