Exemplo n.º 1
0
async def parse_feed(user, feed):
    """
    Fetch and parse a feed.

    Retrieves the feed at ``feed.link``, updates the feed's metadata
    (last_retrieved, error bookkeeping, reconstructed feed info) through
    FeedController, and returns the list of parsed entries.

    Returns None (early) when the feed cannot be retrieved or parsed;
    returns the (possibly empty) list of entries otherwise.
    """
    up_feed = {}
    articles = []

    # --- Retrieval -------------------------------------------------------
    # NOTE: the original version put everything below inside a `finally`
    # block whose `return` statements would silently swallow in-flight
    # BaseExceptions (e.g. task cancellation) — restructured as straight-line
    # code with identical behavior on all Exception paths.
    try:
        logger.info("Retrieving feed {}".format(feed.link))
        resp = newspipe_get(feed.link, timeout=5)
    except Exception:
        logger.info("Problem when reading feed {}".format(feed.link))
        return
    if resp is None:
        return

    # --- Parsing ---------------------------------------------------------
    parsed_feed = None
    try:
        content = io.BytesIO(resp.content)
        parsed_feed = feedparser.parse(content)
    except Exception as e:
        up_feed["last_error"] = str(e)
        up_feed["error_count"] = feed.error_count + 1
        logger.exception("error when parsing feed: " + str(e))

    # Record the retrieval time whether parsing succeeded or not.
    up_feed["last_retrieved"] = datetime.now(dateutil.tz.tzlocal())
    if parsed_feed is None:
        # Parsing blew up: persist the error bookkeeping and bail out.
        try:
            FeedController().update({"id": feed.id}, up_feed)
        except Exception as e:
            logger.exception("something bad here: " + str(e))
        return

    if not is_parsing_ok(parsed_feed):
        # feedparser flagged the document as malformed (bozo).
        up_feed["last_error"] = str(parsed_feed["bozo_exception"])
        up_feed["error_count"] = feed.error_count + 1
        FeedController().update({"id": feed.id}, up_feed)
        return
    if parsed_feed["entries"]:
        articles = parsed_feed["entries"]

    # Successful parse: reset the error counters.
    up_feed["error_count"] = 0
    up_feed["last_error"] = ""

    # --- Feed information -------------------------------------------------
    try:
        up_feed.update(construct_feed_from(feed.link, parsed_feed))
    except Exception:
        logger.exception("error when constructing feed: {}".format(feed.link))
    if feed.title and "title" in up_feed:
        # do not override the title set by the user
        del up_feed["title"]
    try:
        FeedController().update({"id": feed.id}, up_feed)
    except Exception:
        logger.exception("error when updating feed: {}".format(feed.link))

    return articles
Exemplo n.º 2
0
async def get_article_details(entry, fetch=True):
    """
    Return an ``(article_link, article_title)`` tuple for a feed entry.

    Optionally fetches the entry's URL to resolve redirects (e.g. feed
    proxies) and to scrape a missing title from the page's <head>.
    The title falls back to the literal string "No title" when empty.
    """
    article_link = entry.get("link")
    # Feed titles may contain HTML entities; decode them.
    article_title = html.unescape(entry.get("title", ""))
    # NOTE(review): `and` binds tighter than `or`, so this condition reads as
    # (fetch AND CRAWLER_RESOLV AND article_link) OR (not article_title).
    # An entry with an empty title therefore triggers a fetch even when
    # `fetch` is False or `article_link` is None — confirm this is intended.
    if (
        fetch
        and application.config["CRAWLER_RESOLV"]
        and article_link
        or not article_title
    ):
        try:
            # resolves URL behind proxies (like feedproxy.google.com)
            response = newspipe_get(article_link, timeout=5)
        except MissingSchema:
            # The link has no scheme ("example.com/a") — probe https then http.
            split, failed = urlsplit(article_link), False
            for scheme in "https", "http":
                try:
                    new_link = urlunsplit(SplitResult(scheme, *split[1:]))
                    response = newspipe_get(new_link, timeout=5)
                except Exception:
                    failed = True
                    continue
                failed = False
                article_link = new_link
                break
            if failed:
                # Neither scheme worked; return what we have untouched.
                return article_link, article_title or "No title"
        except Exception as error:
            logger.info(
                "Unable to get the real URL of %s. Won't fix "
                "link or title. Error: %s",
                article_link,
                error,
            )
            return article_link, article_title or "No title"
        # Use the post-redirect URL as the canonical article link.
        article_link = response.url
        if not article_title:
            # Only parse <head> — cheaper than the full document.
            bs_parsed = BeautifulSoup(
                response.content, "html.parser", parse_only=SoupStrainer("head")
            )
            try:
                article_title = bs_parsed.find_all("title")[0].text
            except IndexError:  # no title
                pass
    return article_link, article_title or "No title"
Exemplo n.º 3
0
 def _build_from_url(self, attrs):
     if "url" in attrs and "content" not in attrs:
         try:
             resp = newspipe_get(attrs["url"], timeout=5)
             attrs.update({
                 "url":
                 resp.url,
                 "mimetype":
                 resp.headers.get("content-type", None),
                 "content":
                 base64.b64encode(resp.content).decode("utf8"),
             })
         except requests.exceptions.ConnectionError:
             pass
     return attrs