def get_feeds(old_links, url):
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data)
    news_source_list = []
    twitter_feed_list = []
    for link in soup.find_all('a'):
        full_link = link.get('href')
        try:
            o = urlparse(full_link)
            news_source = o.hostname
            if news_source is None:
                continue
            if news_source == 'twitter.com':
                # keep the first path segment (the account name) together
                # with the hostname, e.g. twitter.com/somefeed
                locs = [m.start() for m in re.finditer('/', o.path)]
                news_source += o.path[:locs[1]]
                twitter_feed_list.append(news_source)
            else:
                news_source_list.append(news_source)
        except:
            pass
    print 'Links Found: ', len(set(news_source_list))
    print 'Old Links: ', len(old_links)
    news_source_list = list(set(news_source_list) - set(old_links))
    print 'New Links: ', len(news_source_list)
    twitter_feed_list = list(set(twitter_feed_list))
    news_rss_list = []
    for index, item in enumerate(news_source_list):
        print 'Feed: %d/%d' % (index, len(news_source_list))
        try:
            # put a timeout as feedfinder can stall indefinitely
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(10)
            rss = feedfinder.feeds(item)
            # cancel timeout if feedfinder returned
            signal.alarm(0)
            if rss is not None and len(rss) > 0:
                # sort by URL length and keep the shortest, a simple
                # heuristic for picking the site's main feed
                rss.sort(key=lambda s: len(s))
                news_rss_list.append((item, rss[0]))
        except Exception, exc:
            print exc
            pass
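# The snippet above wires SIGALRM to a `handler` that is not shown. A minimal
# sketch of what such a handler might look like, assuming its only job is to
# abort a stalled feedfinder.feeds() call so the except block can log and move
# on (FeedTimeout is a hypothetical name, not from the original code):
import signal

class FeedTimeout(Exception):
    """Raised by the alarm handler when feedfinder takes too long."""
    pass

def handler(signum, frame):
    # interrupt whatever feedfinder is doing; the caller logs it and continues
    raise FeedTimeout('feedfinder timed out')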
def url_to_subscription(subscription_url):
    """Create a subscription object from a url, if possible."""
    if not subscription_url.startswith('http://'):
        subscription_url = 'http://%s' % subscription_url
    feeds = feedfinder.feeds(subscription_url)
    if feeds:
        subscription = Subscription()
        subscription.feed_url = feeds[0]
        data = feedparser.parse(subscription.feed_url)
        links = data.feed.get('links', [])
        if links:
            # look for a PubSubHubbub hub advertised by the feed
            hubs = [link for link in data.feed.links if link['rel'] == u'hub']
            logging.info(hubs)
            if len(hubs) > 0:
                subscription.hub = hubs[0]['href']
            else:
                subscription.hub = ''
        subscription.feed_id = data.feed.get('id', 'No ID')
        subscription.title = data.feed.get('title', 'No title')
        subscription.url = data.feed.get('link', subscription_url)
        subscription.etag = data.get('etag', '')
        updated_tuple = data.feed.get('date_parsed', None)
        if updated_tuple:
            subscription.updated = datetime.datetime(*updated_tuple[:6])
        else:
            subscription.updated = datetime.datetime.today()
    else:
        subscription = None
    return subscription
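# A minimal usage sketch for the function above; the site address is purely
# illustrative and it assumes the Subscription model and imports used above
# are available:
subscription = url_to_subscription('example.com')
if subscription is not None:
    print subscription.title, subscription.feed_url
else:
    print 'no feed found for that address'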
def add_feed(filename, url, rawdog, config):
    """Try to add a feed to the config file."""
    feeds = feedfinder.feeds(url)
    if feeds == []:
        print >>sys.stderr, "Cannot find any feeds in " + url
    else:
        feed = feeds[0]
        if feed in rawdog.feeds:
            print >>sys.stderr, "Feed " + feed + " is already in the config file"
        else:
            print >>sys.stderr, "Adding feed " + feed
            feedline = "feed %s %s\n" % (config["newfeedperiod"], feed)
            edit_file(filename, AddFeedEditor(feedline).edit)
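# Given the format string above, the line appended to the config file has the
# shape "feed <newfeedperiod> <feed URL>", e.g. "feed 30 http://example.com/atom.xml"
# (the period and URL here are only illustrative).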
def scan_url(url, depth):
    if url.strip().endswith('robots.txt'):
        # Ignore such URLs as they are fruitless.
        print "Skipping {}".format(url)
        return
    u2 = urlscanner.url_scanner(url.strip(), depth)
    for U in u2:
        if U.strip().lower().endswith('robots.txt'):
            continue
        logger.info("searching for feeds in: %s", U)
        feeds = feedfinder.feeds(U.strip())
        logger.info("found %s feeds", len(feeds))
        if feeds:
            logger.info(str(feeds))
            feedfinder.store_feeds(feeds)
def DoFeed(self, base_uri, cooked):
    feedds = []
    if feedfinder:
        feed_uris = feedfinder.feeds(self.page_uri)
        for feed_uri in feed_uris:
            feedd = {
                "@uf": "feed",
                "@link": feed_uri,
                "@title": "feed",
                "src": feed_uri,
            }
            feedds.append(feedd)
    self.PutData("feed", feedds)
def parsefeed(url):
    """This just attaches to feedfinder. Keep this function as a hook, though,
    to add more functionality down the line: e.g. multiple results per
    feed_find."""
    try:
        logger.info('using feed finder')
        res = feedfinder.feeds(url)
        # res = feedfinder_new.feeds(url)
        feed_len = len(res)
        if res:
            logger.info('{0} feeds found: {1}'.format(feed_len, res))
            return res, feed_len
        else:
            logger.warn("no results found...")
            return False, 0
    except Exception, e:
        logger.exception("error!")
        # return the same (results, count) shape as the other branches
        return False, 0
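# A short usage sketch for the hook above; the URL is purely illustrative:
feeds, count = parsefeed('http://example.com/')
if feeds:
    print 'picked first of %d feeds: %s' % (count, feeds[0])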
def add_feed(request):
    account = request.account
    unread_feeds = account.get_unread_feeds()
    if request.POST and request.POST.has_key('link'):
        data = request.POST.copy()
        try:
            fp = feedparser.parse(data['link'])
            if fp.has_key('feed') and fp.feed.has_key('title'):
                feeds = [data['link']]
            else:
                feeds = feedfinder.feeds(data['link'])
            if not feeds:
                request.session['error'] = "No feeds found."
                return HttpResponseRedirect('/feeds/unread/')
            elif len(feeds) == 1:
                try:
                    feed = Feed.objects.get(link=feeds[0])
                except Feed.DoesNotExist:
                    # add the feed to the db
                    fp = feedparser.parse(feeds[0])
                    feed = Feed(title=fp.feed.title, link=feeds[0], is_active=True)
                    feed.save()
                    feed.update()
                else:
                    pass  # already added
                try:
                    AccountFeed(account=account, feed=feed, title=feed.title).save()
                except:
                    pass
                return HttpResponseRedirect("/feeds/%s/" % (feed.slug,))
            else:
                if data.has_key('feed'):
                    form = SelectFeedForm(data, account=account, link=data['link'], feeds=feeds)
                    if form.is_valid():
                        feed = form.save()
                        return HttpResponseRedirect("/feeds/%s/" % (feed.slug,))
                else:
                    initial = {'link': data['link'], 'feed': feeds[0]}
                    form = SelectFeedForm(initial=initial, account=account, link=data['link'], feeds=feeds)
                return render_to_response('feeds/select.html', {
                    'feeds': unread_feeds,
                    'form': form,
                    'collapse_sidebar': True,
                }, context_instance=RequestContext(request))
        except UnicodeDecodeError:
            request.session['error'] = """
                Encoding error. <a href="http://feedparser.org/">Universal Feed
                Parser</a> threw an exception while trying to make sense of the
                feed. When this happens it is almost certainly the feed's fault.
                """.strip().replace('\n', ' ')
        except feedfinder.TimeoutError:
            request.session['error'] = "Timeout"  # TODO: handle properly
    return HttpResponseRedirect('/feeds/unread/')
def getFeedsFromUrl(url_addr):
    '''Checks url_addr for any rss feeds and returns their urls as a json array.'''
    feeds = feedfind.feeds(url_addr)
    return json.dumps(feeds)
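# A short usage sketch; it assumes `feedfind` is the imported feedfinder module
# and the URL is purely illustrative:
feeds = json.loads(getFeedsFromUrl('http://example.com/blog'))
print '%d feed(s) found' % len(feeds)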
        blogs.append(l1)
        a = crawl(l1)
    except:
        pass
    # else:
    #     count = 0

for url in urls:
    try:
        html = requests.get(url)
        soup = BeautifulSoup(html.content)
        name = soup.title.string
        fr = feedfinder.feeds(url)
        print (fr)
        ill = ill + 1
        # filname = url.replace('http://' or 'https://', '')
        # filname = filname.replace('.*www.', '')
        # filname = filname.replace('.com', '')
        # filname = filname.replace('.org.*', '')
        # print filname
        blogs = []
        with open('link%i.json' % ill, "w") as outfile:
            html = requests.get(url)