def add_links_from_html(url, modify_link_url=None, modify_source_name=None):
    """Collect feed-looking links from the page at ``url`` and offer each one
    to ``add_source_interactive``.

    The page is fetched with ``requests.get`` and parsed with BeautifulSoup.
    Every ``<a>`` element whose resolved URL contains ``"rss"`` or ``".xml"``
    is kept, together with its ``title`` attribute and its text (when present).

    Args:
        url: Address of the HTML page to scan; also the base used to resolve
            relative links.
        modify_link_url: Optional callable applied to each resolved link URL
            before the Yahoo and feed filters run. Skipped when falsy.
        modify_source_name: Optional callable applied to the name produced by
            ``url_to_name``. Skipped when falsy.

    Returns:
        None. Candidates are handed to ``add_source_interactive``.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text)

    # Maps each candidate feed URL to the distinct strings describing it.
    hrefs = defaultdict(set)
    for link in soup.find_all('a'):
        href = link.get("href")
        if href is None:
            # Remove junk links
            continue

        # Resolve relative URLs
        href = urljoin(url, href)
        if not href.startswith("http"):
            # Remove junk links
            continue
        if href == url:
            # Don't include the same page!
            continue

        if modify_link_url:
            href = modify_link_url(href)

        if "rd.yahoo." in href or "my.yahoo." in href:
            # Yahoo Reader add button
            continue

        if "rss" in href or ".xml" in href:
            # could also go up the tree trying to get the text
            candidates = (href, link.get("title"), link.string)
            # A missing title or text comes back as None; leave those out.
            hrefs[href].update(c for c in candidates if c is not None)

    # items() rather than iteritems() so this runs on Python 2 and Python 3.
    for href, info in sorted(hrefs.items()):
        source_name = url_to_name(href)
        # The hook defaults to None, so only call it when one was supplied
        # (same convention as modify_link_url above).
        if modify_source_name:
            source_name = modify_source_name(source_name)
        if SOURCE_NAME_REGEX.match(source_name) is None:
            continue
        # info is a set, so the order of the joined pieces is arbitrary.
        add_source_interactive(source_name, href, " ".join(info))
#!/usr/bin/env python
"""Interactively add a single RSS feed as a source.

Usage: add_rss.py rss_url [source_name]

When source_name is omitted, the name is derived from the feed's own link
via url_to_name.
"""
from datetime import datetime
from sys import argv
from time import mktime

import feedparser

from rss_catalog.sources import add_source_interactive, url_to_name

# Fail with a usage hint instead of a bare IndexError when no URL is given.
if len(argv) < 2:
    raise SystemExit('Usage: add_rss.py rss_url [source_name]')

feed_url = argv[1]
d = feedparser.parse(feed_url)

# The timestamp is informational only, so any failure to read or convert it
# falls back to a placeholder rather than aborting the script.
try:
    updated = str(datetime.fromtimestamp(mktime(d['updated_parsed'])))
except (KeyError, TypeError, ValueError, OverflowError):
    updated = "???"
info = "Updated {}".format(updated)

if len(argv) > 2:
    # An explicit name wins; the feed's link is not needed in this case,
    # so a feed without one no longer aborts the script.
    name = argv[2]
else:
    try:
        name = url_to_name(d['feed']['link'])
    except KeyError:
        raise SystemExit(
            'Feed has no link to derive a name from; '
            'pass source_name explicitly.'
        )

add_source_interactive(name, feed_url, source_info=info)
#!/usr/bin/env python
"""Interactively add a single RSS feed as a source.

Usage: add_rss.py rss_url [source_name]

When source_name is omitted, the name is derived from the feed's own link
via url_to_name.
"""
from datetime import datetime
from sys import argv
from time import mktime

import feedparser

from rss_catalog.sources import add_source_interactive, url_to_name

# Fail with a usage hint instead of a bare IndexError when no URL is given.
if len(argv) < 2:
    raise SystemExit("Usage: add_rss.py rss_url [source_name]")

feed_url = argv[1]
d = feedparser.parse(feed_url)

# The timestamp is informational only, so any failure to read or convert it
# falls back to a placeholder rather than aborting the script.
try:
    updated = str(datetime.fromtimestamp(mktime(d["updated_parsed"])))
except (KeyError, TypeError, ValueError, OverflowError):
    updated = "???"
info = "Updated {}".format(updated)

if len(argv) > 2:
    # An explicit name wins; the feed's link is not needed in this case,
    # so a feed without one no longer aborts the script.
    name = argv[2]
else:
    try:
        name = url_to_name(d["feed"]["link"])
    except KeyError:
        raise SystemExit(
            "Feed has no link to derive a name from; "
            "pass source_name explicitly."
        )

add_source_interactive(name, feed_url, source_info=info)