def clean_fields(self, exclude=None):
    """Validate the model's fields, resolving ``self.url`` to a real feed.

    If ``self.url`` does not parse as a feed, fall back to crawling the page
    with ``find_feeds`` and adopt the first discovered feed URL.  Also fills
    in ``self.title`` (when empty) and ``self.source`` from the parsed feed.

    :param exclude: field names to skip, passed through to the parent.
    :raises ValidationError: when no feed can be found at ``self.url``.
    """
    super(Feed, self).clean_fields(exclude)
    errors = {}
    parsed_feed = feedparser.parse(self.url)
    # parsed feed is an invalid feed
    # TODO add more robust error handling
    if parsed_feed.bozo and not parsed_feed.entries:
        # try finding a feed at the site
        feeds = find_feeds(self.url)
        if feeds:
            self.url = feeds[0]
            parsed_feed = feedparser.parse(self.url)
        else:
            urlname = self._meta.get_field('url').name
            message = "Unable to find a feed at '{0}'".format(self.url)
            errors[urlname] = [message]
            raise ValidationError(errors)
    # if no title then use the feed's title; .get avoids an AttributeError
    # when the feed omits a <title> element
    if not self.title:
        self.title = parsed_feed.feed.get('title', '')
    # set the source of the feed.
    # fix: the original guarded on feed.title but then dereferenced
    # feed.link, crashing for feeds with a title and no link -- guard on
    # the value actually used.
    link = parsed_feed.feed.get('link')
    if link:
        self.source = urlparse.urlparse(link).hostname
    else:
        self.source = urlparse.urlparse(self.url).hostname
def main():
    """Flask view: accept a URL or a direct feed, load its entries into a
    temporary ``feedinfo`` table, and render them.

    On GET (or when neither field is filled) re-renders the input form.
    """
    form = info()
    if request.method == 'POST':
        url = form.url.data
        feed = form.feed.data
        if url:
            # discover feed URLs by crawling the submitted page
            rssfeed = find_feeds(url)
        elif feed:
            rssfeed = feed
        else:
            return render_template('main.html', form=form)
        print(rssfeed)
        # scratch table; dropped again below once the rows are fetched
        c.execute("CREATE TABLE feedinfo(sno serial,title character varying,link character varying)")
        rss = feedparser.parse(rssfeed[0])
        # fix: iterate entries directly instead of indexing via range(len(...))
        for entry in rss.entries:
            c.execute("INSERT INTO feedinfo (title,link) VALUES (%s,%s)",
                      (entry['title'], entry['link']))
        c.execute("SELECT * FROM feedinfo")
        data = c.fetchall()
        c.execute("DROP TABLE feedinfo")
        conn.commit()
        return render_template('sucess.html', data=data)
    else:
        return render_template('main.html', form=form)
def get_feed(url):
    """
    Given a URL, retrieves the default feed and gets a pubsubhubbub hub url.
    Returns the hub url, self url, and parsed feed.
    :param url: Any Url for a site
    :type url: str
    :return: tuple
    """
    try:
        candidates = find_feeds(url)
        logging.info('Feed Urls: {0}'.format(candidates))
        feed_url = candidates[0]
        logging.info('Feed Url: {0}'.format(feed_url))
    except Exception as err:
        logging.exception(err)
        raise GetFeedError(err)

    # guard clauses: bail out as soon as any stage fails
    if feed_url is None:
        raise GetFeedError('Could not find feed.')

    parsed_feed = get_parsed_feed(feed_url)
    if parsed_feed is None:
        raise GetFeedError('Could not parse feed.')

    links = get_links(parsed_feed)
    logging.info('Links: {0}'.format(links))
    if links is None:
        raise GetFeedError('Could not get links from feed.')

    hub_url, self_url = links[0], links[1]
    return (hub_url, self_url, parsed_feed)
def __init__(self, site: str):
    """Remember the site URL and resolve where its feed link(s) come from.

    :param site: URL of the site to inspect.
    """
    self.__site = site
    # If the URL already looks like a feed, use it directly; otherwise crawl
    # the page for candidate feed URLs (find_feeds returns a list).
    # NOTE(review): the explicit `!= False` comparison is preserved because
    # is_feedlike_url may return non-boolean truthy values -- confirm its
    # return type before simplifying to plain truthiness.
    self.__getLink = (self.__site
                      if url.URL.is_feedlike_url(self.__site) != False
                      else find_feeds(self.__site, check_all=False, user_agent=None))
    # fix: removed the redundant `return None` -- __init__ always returns None
def collect_feeds(self, link): html = self.make_request(link) self.extract_links(html) if len(self.links) == 0: print "process sleeping for 6 seconds" time.sleep(6) return find_feeds(link) else: return self.links
def parse(self):
    # Parse the configured feed URL; when fuzzy matching is enabled and the
    # first parse reports an error, retry with every feed discovered by
    # feedfinder2.  Returns a list of FeedInfo records, one per parsed feed.
    first_pass = feedparser.parse(self.feed_url)
    if self.accept_fuzzy and FeedInformationService.parse_exception_key in first_pass:
        discovered = feedfinder2.find_feeds(self.feed_url)
        parsed_feeds = [feedparser.parse(u) for u in discovered]
    else:
        parsed_feeds = [first_pass]
    return [
        FeedInfo(
            feed_type=p['version'],
            feed_url=p['href'],
            html_url=p['feed']['link'],
            title=p['feed']['title'])
        for p in parsed_feeds
    ]
def subscribe():
    """Subscribe the current user to the feed found at the requested URL."""
    requested = flask.request.values.get("url")
    if requested is None:
        return flask.jsonify(message="You must provide a URL."), 400

    # Enforce the per-user subscription cap, when one is configured.
    user = _get_user()
    limit = flask.current_app.config.get("MAX_FEEDS", -1)
    if limit > 0 and len(user.feeds) >= limit:
        return flask.jsonify(message="You're already subscribed to the "
                                     "maximum number of feeds."), 400

    # Discover a feed at (or below) the requested resource.
    discovered = feedfinder2.find_feeds(requested)
    if not len(discovered):
        return flask.jsonify(message="The robot can't find a feed at that "
                                     "URL. Could you help it with a more specific "
                                     "link?"), 400
    url = discovered[0]

    # Already subscribed to a feed at that URL?
    feed = (db.session.query(Feed).join(User.feeds)
            .filter(User.id == user.id)
            .filter(Feed.url == url)
            .first())
    if feed is not None:
        return flask.jsonify(
            message="You've already subscribed to {0}.".format(feed.title),
            feed=feed.to_dict(),
        )

    # Reuse an existing Feed row for that URL, or create and populate one.
    feed = Feed.query.filter(Feed.url == url).first()
    if feed is None:
        feed = Feed(url)
        # Fetch title etc. right away so the response below is meaningful.
        feed.update_info()

    user.feeds.append(feed)
    db.session.commit()
    return flask.jsonify(
        message="Successfully subscribed to {0}.".format(feed.title),
        feed=feed.to_dict(),
    )
def loadAndFindRssUrlCandidates(inProspectiveRssHostPageUrl): #load URL(inProspectiveRssHostPageUrl), search for potential RSS URLs on loaded URL, and score based on link existance and etc. assert (type(inProspectiveRssHostPageUrl) is unicode), "loadAndFindRssUrlCandidates: inProspectiveRssHostPageUrl is not unicode" assert (inProspectiveRssHostPageUrl), "loadAndFindRssUrlCandidates: inProspectiveRssHostPageUrl is empty" scoredCandidates = [] try: print 'finding feeds for: ' + inProspectiveRssHostPageUrl feeds = find_feeds(inProspectiveRssHostPageUrl) #returns list of RSS feeds found from input URL, through some reliable methods print 'found feeds for: ' + inProspectiveRssHostPageUrl except KeyboardInterrupt: f = open('no_rss_found.txt', 'a') f.write('Feed not found for: '+inProspectiveRssHostPageUrl.encode('ascii','ignore')+'\n') f.close() print('inProspectiveRssHostPageUrl skipped:' + inProspectiveRssHostPageUrl) return "" if not feeds: #checks if feeds list is empty, due to empty lists evaluating as false return "" assert (feeds), "loadAndFindRssUrlCandidates: list of found feeds is empty. \n Tried: " + inProspectiveRssHostPageUrl for rssCandidate in feeds: #assign score to feeds. Use feedparser to find title and other info. Remove points for SoundCloud, etc. if "itunes" not in rssCandidate: parseRss = feedparser.parse(rssCandidate) score = 0 rssKeyWords= ["feed", "rss", "atom", "xml"] if any(x in rssCandidate for x in rssKeyWords): score += 1 if parseRss.version != "": score += 1 if "soundcloud" not in rssCandidate: score += 1 scoredCandidates.append((rssCandidate, score)) assert (scoredCandidates), "loadAndFindRssUrlCandidates: scoredCandidates is empty" assert (type(scoredCandidates[0]) is tuple), "loadAndFindRssUrlCandidates: scoredCandidates is not list of tuples" + scoredCandidates return scoredCandidates
def store(request):
    """Create a Feed record from the posted url/category/app_id payload."""
    url = request.json.get('url')
    category = request.json.get('category')
    app_id = request.json.get('app_id')

    # Reject duplicates by fuzzy URL match against existing feeds.
    existing_feed = Feed.query.filter(
        Feed.url.ilike(r"%{}%".format(url))).first()
    if existing_feed:
        return jsonify({'error': 'Feed already added.'})

    url = url.strip()
    feed = feedparser.parse(url)
    if feed.entries:
        # The submitted URL is itself a feed.
        feed_url = url
    else:
        # Not a feed: crawl the page for one and take the first hit.
        feed_urls = find_feeds(url)
        if not feed_urls:
            return jsonify({'error': 'Invalid feed url'})
        feed_url = feed_urls[0]
        feed = feedparser.parse(feed_url)

    title = feed.feed.title
    icon = None
    if 'icon' in feed.feed:
        icon = feed.feed.icon
    elif 'image' in feed.feed:
        icon = feed.feed.image.href

    ext = tldextract.extract(url)
    domain_name = ext.domain + '.' + ext.suffix

    new_feed = Feed(id=helpers.generate_unique_code(),
                    title=title,
                    url=url,
                    alexa_rank=get_alexa_rank(domain_name),
                    domain=domain_name,
                    app_id=app_id,
                    rss_url=feed_url,
                    icon=icon)
    db.session.add(new_feed)
    if category:
        category_data = add_or_fetch_category(category)
        new_feed.categories.append(category_data)
    db.session.commit()
    return respondWithItem(new_feed)
def subscribe():
    """Subscribe the authenticated user to a feed discovered at ?url=."""
    target = flask.request.values.get("url")
    if target is None:
        return flask.jsonify(message="You must provide a URL."), 400

    user = _get_user()

    # Respect the configured per-user feed cap (-1 disables the check).
    cap = flask.current_app.config.get("MAX_FEEDS", -1)
    if cap > 0 and len(user.feeds) >= cap:
        return flask.jsonify(message="You're already subscribed to the "
                                     "maximum number of feeds."), 400

    # Ask feedfinder2 for feeds at or below the requested resource.
    matches = feedfinder2.find_feeds(target)
    if not len(matches):
        return (
            flask.jsonify(
                message="The robot can't find a feed at that "
                        "URL. Could you help it with a more specific "
                        "link?"
            ),
            400,
        )
    url = matches[0]

    # Bail out if this user already follows a feed at that URL.
    existing = (
        db.session.query(Feed)
        .join(User.feeds)
        .filter(User.id == user.id)
        .filter(Feed.url == url)
        .first()
    )
    if existing is not None:
        return flask.jsonify(message="You've already subscribed to {0}.".format(existing.title), feed=existing.to_dict())

    # Reuse an existing Feed row, or create one and populate its metadata.
    feed = Feed.query.filter(Feed.url == url).first()
    if feed is None:
        feed = Feed(url)
        feed.update_info()

    user.feeds.append(feed)
    db.session.commit()
    return flask.jsonify(message="Successfully subscribed to {0}.".format(feed.title), feed=feed.to_dict())
# Benchmark: compare feedsearch vs feedfinder2 discovery time on one URL.
# fix: `time` was used below (time.perf_counter) but never imported.
import time
from pprint import pprint

from feedfinder2 import find_feeds
from feedsearch import search

# url = 'http://nymag.com/newyork/rss/'
url = 'http://arstechnica.com'

start1 = time.perf_counter()
result1 = search(url, info=True, check_all=False)
time1 = int((time.perf_counter() - start1) * 1000)

start2 = time.perf_counter()
result2 = find_feeds(url, check_all=False)
time2 = int((time.perf_counter() - start2) * 1000)

print()
print(f'Feedsearch searched url in {time1}ms: Found: {result1}')
print(f'Feedfinder2 searched url in {time2}ms: Found: {result2}')
print()

for r in result1:
    pprint(vars(r))
print()
pprint(result2)
# Run find_feeds for one URL on an asyncio event loop and print the result.
import asyncio

from feedfinder2 import find_feeds

url = "xkcd.com"

# NOTE(review): the PyPI feedfinder2 `find_feeds` is synchronous -- it returns
# a plain list, and asyncio.ensure_future on a non-awaitable raises TypeError.
# This script presumably targets an async fork whose find_feeds is a
# coroutine -- confirm which package is installed.
# NOTE(review): asyncio.get_event_loop() for this pattern is deprecated since
# Python 3.10; asyncio.run() is the modern entry point.
loop = asyncio.get_event_loop()
task = asyncio.ensure_future(find_feeds(url))
feeds = loop.run_until_complete(task)
print(feeds)
# NOTE(review): this looks like the tail of a larger script -- `sitelist`,
# `blocked_sites`, `feedparser`, `find_feeds` and `tqdm` are defined or
# imported elsewhere, and the per-entry inner loop appears truncated at the
# end of this chunk.
if 'base_urls' not in sitelist.keys():
    raise ValueError(
        "HJSON file doesn't have sitelist key. Check News Please input format."
    )
sitelist = sitelist['base_urls']
for site in sitelist:
    # skip malformed entries and explicitly blocked sites
    if 'url' not in site:
        continue
    if site['url'] in blocked_sites:
        continue
    # discover the site's feed URLs and parse the first one found
    rss_links = find_feeds(site['url'])
    if len(rss_links) == 0:
        continue
    feed = feedparser.parse(rss_links[0])
    if 'entries' not in feed:
        continue
    print("Processing feed for", site['url'])
    for entry in tqdm(feed['entries']):
        if 'link' not in entry:
            continue
# Demo: discover and pretty-print the feed URLs linked from one page.
# from feedfinder2_upgraded import find_feeds
from feedfinder2 import find_feeds
from pprint import pprint

# url = "http://boingboing.net"
url = "http://localhost:5000/author/1"

pprint(find_feeds(url))
from bs4 import BeautifulSoup as bs4
import urllib3
import requests
import feedparser
import urllib.parse
from feedfinder2 import find_feeds

# Example article URL; find_feeds crawls it for linked RSS/Atom feeds.
pag = "https://www.cooperativa.cl/noticias/pais/region-de-nuble/este-viernes-se-inicio-el-juicio-oral-por-fraude-al-fisco-en-chillan/2019-06-07/075336.html"
feeds = find_feeds(pag)
print(feeds)

# NOTE(review): the triple-quoted block below holds an earlier hand-rolled
# feed finder kept as reference text; it is opened here but never closed
# within this chunk, so the rest of the file continues inside the string.
'''
def findfeed(site):
    raw = requests.get(site).text
    result = []
    possible_feeds = []
    html = bs4(raw, 'lxml')
    feed_urls = html.findAll("link", rel="alternate")
    if len(feed_urls) > 1:
        for f in feed_urls:
            t = f.get("type",None)
            if t:
                if "rss" in t or "xml" in t:
                    href = f.get("href",None)
                    if href:
                        possible_feeds.append(href)
    parsed_url = urllib.parse.urlparse(site)
    base = parsed_url.scheme+"://"+parsed_url.hostname
    atags = html.findAll("a")
    for a in atags:
        href = a.get("href",None)