Пример #1
0
def add_links_from_html(url, modify_link_url=None, modify_source_name=None):
    """Scan the page at ``url`` for feed-like links and offer each as a source.

    Every ``<a>`` tag's ``href`` is resolved against ``url``.  Links whose
    URL contains ``"rss"`` or ``".xml"`` are collected together with the
    link's ``title`` attribute and its text, then handed to
    ``add_source_interactive`` in sorted URL order.

    Args:
        url: Address of the HTML page to fetch and scan.
        modify_link_url: Optional callable applied to each resolved link URL
            before the Yahoo and feed filters run.
        modify_source_name: Optional callable applied to the name that
            ``url_to_name`` derives from each link URL.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text)

    # Maps each candidate feed URL to the set of descriptive strings found
    # for it (the URL itself, the link title, the link text).
    hrefs = defaultdict(set)
    for link in soup.find_all('a'):
        href = link.get("href")

        if href is None:
            # Anchor without a target: nothing to follow.
            continue

        # Resolve relative URLs
        href = urljoin(url, href)

        if not href.startswith("http"):
            # Remove junk links (anything that is not an http(s) URL)
            continue

        if href == url:
            # Don't include the same page!
            continue

        if modify_link_url:
            href = modify_link_url(href)

        if "rd.yahoo." in href or "my.yahoo." in href:
            # Yahoo Reader add button
            continue

        if "rss" in href or ".xml" in href:
            # could also go up the tree trying to get the text
            details = (href, link.get("title"), link.string)
            # Missing titles/text come back as None; keep only real values.
            hrefs[href].update(d for d in details if d is not None)

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for href, info in sorted(hrefs.items()):
        source_name = url_to_name(href)
        # The hook is optional; calling the None default would raise TypeError.
        if modify_source_name:
            source_name = modify_source_name(source_name)
        if SOURCE_NAME_REGEX.match(source_name) is None:
            continue
        # Sort the details so the description is stable between runs.
        add_source_interactive(source_name, href, " ".join(sorted(info)))
Пример #2
0
def add_links_from_html(url, modify_link_url=None, modify_source_name=None):
    """Scan the page at ``url`` for feed-like links and offer each as a source.

    Every ``<a>`` tag's ``href`` is resolved against ``url``.  Links whose
    URL contains ``"rss"`` or ``".xml"`` are collected together with the
    link's ``title`` attribute and its text, then handed to
    ``add_source_interactive`` in sorted URL order.

    Args:
        url: Address of the HTML page to fetch and scan.
        modify_link_url: Optional callable applied to each resolved link URL
            before the Yahoo and feed filters run.
        modify_source_name: Optional callable applied to the name that
            ``url_to_name`` derives from each link URL.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text)

    # Maps each candidate feed URL to the set of descriptive strings found
    # for it (the URL itself, the link title, the link text).
    hrefs = defaultdict(set)
    for link in soup.find_all('a'):
        href = link.get("href")

        if href is None:
            # Anchor without a target: nothing to follow.
            continue

        # Resolve relative URLs
        href = urljoin(url, href)

        if not href.startswith("http"):
            # Remove junk links (anything that is not an http(s) URL)
            continue

        if href == url:
            # Don't include the same page!
            continue

        if modify_link_url:
            href = modify_link_url(href)

        if "rd.yahoo." in href or "my.yahoo." in href:
            # Yahoo Reader add button
            continue

        if "rss" in href or ".xml" in href:
            # could also go up the tree trying to get the text
            details = (href, link.get("title"), link.string)
            # Missing titles/text come back as None; keep only real values.
            hrefs[href].update(d for d in details if d is not None)

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for href, info in sorted(hrefs.items()):
        source_name = url_to_name(href)
        # The hook is optional; calling the None default would raise TypeError.
        if modify_source_name:
            source_name = modify_source_name(source_name)
        if SOURCE_NAME_REGEX.match(source_name) is None:
            continue
        # Sort the details so the description is stable between runs.
        add_source_interactive(source_name, href, " ".join(sorted(info)))
Пример #3
0
#!/usr/bin/env python
from datetime import datetime
from sys import argv
from time import mktime

import feedparser

from rss_catalog.sources import add_source_interactive, url_to_name

# Usage: add_rss.py rss_url [source_name]
#
# Parses the feed at rss_url and registers it through
# add_source_interactive.  The source name is taken from the command line
# when given, otherwise it is derived from the feed's own link.

if len(argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    raise SystemExit("Usage: add_rss.py rss_url [source_name]")

feed_url = argv[1]
d = feedparser.parse(feed_url)

try:
    updated = str(datetime.fromtimestamp(mktime(d['updated_parsed'])))
except KeyError:
    # The parsed result has no 'updated_parsed' entry.
    updated = "???"

info = "Updated {}".format(updated)

if len(argv) > 2:
    # An explicit name wins, so the feed does not need to expose a link.
    name = argv[2]
else:
    name = url_to_name(d['feed']['link'])

add_source_interactive(name, feed_url, source_info=info)
Пример #4
0
#!/usr/bin/env python
from datetime import datetime
from sys import argv
from time import mktime

import feedparser

from rss_catalog.sources import add_source_interactive, url_to_name


# Usage: add_rss.py rss_url [source_name]
#
# Parses the feed at rss_url and registers it through
# add_source_interactive.  The source name is taken from the command line
# when given, otherwise it is derived from the feed's own link.

if len(argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    raise SystemExit("Usage: add_rss.py rss_url [source_name]")

feed_url = argv[1]
d = feedparser.parse(feed_url)

try:
    updated = str(datetime.fromtimestamp(mktime(d["updated_parsed"])))
except KeyError:
    # The parsed result has no "updated_parsed" entry.
    updated = "???"

info = "Updated {}".format(updated)

if len(argv) > 2:
    # An explicit name wins, so the feed does not need to expose a link.
    name = argv[2]
else:
    name = url_to_name(d["feed"]["link"])

add_source_interactive(name, feed_url, source_info=info)