Example No. 1
import re
import signal

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

import feedfinder


def handler(signum, frame):
    # SIGALRM handler assumed by the snippet: abort a stalled feedfinder call
    raise TimeoutError('feedfinder timed out')


def get_feeds(old_links, url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    news_source_list = []
    twitter_feed_list = []

    # collect the hostname of every outgoing link on the page
    for link in soup.find_all('a'):
        full_link = link.get('href')
        try:
            o = urlparse(full_link)
            news_source = o.hostname
            if news_source is None:
                continue

            if news_source == 'twitter.com':
                # keep only the account portion of the path, e.g. twitter.com/user
                locs = [m.start() for m in re.finditer('/', o.path)]
                news_source += o.path[:locs[1]]
                twitter_feed_list.append(news_source)
            else:
                news_source_list.append(news_source)
        except (ValueError, IndexError):
            pass

    print('Links Found:', len(set(news_source_list)))
    print('Old Links:', len(old_links))
    news_source_list = list(set(news_source_list) - set(old_links))
    print('New Links:', len(news_source_list))
    twitter_feed_list = list(set(twitter_feed_list))

    news_rss_list = []
    for index, item in enumerate(news_source_list):
        print('Feed: %d/%d' % (index, len(news_source_list)))
        try:
            # set a timeout, as feedfinder can stall indefinitely
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(10)

            rss = feedfinder.feeds(item)

            # cancel the timeout once feedfinder has returned
            signal.alarm(0)

            if rss:
                # prefer the shortest candidate feed URL
                rss.sort(key=len)
                news_rss_list.append((item, rss[0]))
        except Exception as exc:
            print(exc)

    # the original snippet ends without a return; returning the collected
    # lists is an assumed, natural completion
    return news_rss_list, twitter_feed_list
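A minimal usage sketch of the function as completed above (the seed values are illustrative); note that signal.alarm() is Unix-only and must be called from the main thread:

saved_links = ['example.com', 'news.example.org']  # hostnames seen on a previous run
rss_feeds, twitter_feeds = get_feeds(saved_links, 'http://example.com/news')
for source, rss in rss_feeds:
    print(source, '->', rss)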
Example No. 2
import datetime
import logging

import feedfinder
import feedparser

# Subscription is the application's own model class


def url_to_subscription(subscription_url):
    """Create a subscription object from a URL, if possible."""
    if not subscription_url.startswith('http://'):
        subscription_url = 'http://%s' % subscription_url
    feeds = feedfinder.feeds(subscription_url)
    if not feeds:
        return None

    subscription = Subscription()
    subscription.feed_url = feeds[0]
    data = feedparser.parse(subscription.feed_url)

    # look for a PubSubHubbub hub among the feed's advertised links
    links = data.feed.get('links', [])
    hubs = [link for link in links if link['rel'] == u'hub']
    logging.info(hubs)
    subscription.hub = hubs[0]['href'] if hubs else ''

    subscription.feed_id = data.feed.get('id', 'No ID')
    subscription.title = data.feed.get('title', 'No title')
    subscription.url = data.feed.get('link', subscription_url)
    subscription.etag = data.get('etag', '')

    updated_tuple = data.feed.get('date_parsed', None)
    if updated_tuple:
        subscription.updated = datetime.datetime(*updated_tuple[:6])
    else:
        subscription.updated = datetime.datetime.today()

    return subscription
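For reference, feedparser exposes parsed dates as time.struct_time values, which is why the code above unpacks the first six fields into datetime.datetime. A standalone illustration (the timestamp is made up):

import datetime
import time

# the first six fields of a struct_time map directly onto datetime.datetime
updated_tuple = time.strptime('2010-05-01 12:30:00', '%Y-%m-%d %H:%M:%S')
print(datetime.datetime(*updated_tuple[:6]))  # 2010-05-01 12:30:00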
Example No. 3
import sys

import feedfinder


def add_feed(filename, url, rawdog, config):
	"""Try to add a feed to the config file."""
	feeds = feedfinder.feeds(url)
	if not feeds:
		print("Cannot find any feeds in " + url, file=sys.stderr)
	else:
		feed = feeds[0]
		if feed in rawdog.feeds:
			print("Feed " + feed + " is already in the config file", file=sys.stderr)
		else:
			print("Adding feed " + feed, file=sys.stderr)
			feedline = "feed %s %s\n" % (config["newfeedperiod"], feed)
			edit_file(filename, AddFeedEditor(feedline).edit)
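The feedline built above follows rawdog's config syntax, feed <period> <url>. With newfeedperiod set to, say, 180, the appended config line would look like this (values illustrative):

feed 180 http://example.com/atom.xml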
Example No. 4
def scan_url(url, depth):
    if url.strip().endswith('robots.txt'):
        # ignore such URLs as they are fruitless
        print("Skipping {}".format(url))
        return
    u2 = urlscanner.url_scanner(url.strip(), depth)
    for U in u2:
        if U.strip().lower().endswith('robots.txt'):
            continue
        logger.info("searching for feeds in: %s", U)
        feeds = feedfinder.feeds(U.strip())
        logger.info("found %s feeds", len(feeds))
        if feeds:
            logger.info(str(feeds))
            feedfinder.store_feeds(feeds)
Example No. 5
	def DoFeed(self, base_uri, cooked):
		"""Discover feeds for the current page and record them as "feed" data."""
		feedds = []
		# feedfinder is an optional dependency; skip discovery if it is absent
		if feedfinder:
			feed_uris = feedfinder.feeds(self.page_uri)
			for feed_uri in feed_uris:
				feedd = {
					"@uf": "feed",
					"@link": feed_uri,
					"@title": "feed",
					"src": feed_uri,
				}
				feedds.append(feedd)

		self.PutData("feed", feedds)
Example No. 6
def parsefeed(url):
    """Thin wrapper around feedfinder. Kept as a hook so more functionality
    can be added later, e.g. multiple results per feed_find."""

    try:
        logger.info('using feed finder')
        res = feedfinder.feeds(url)
        feed_len = len(res)

        if res:
            logger.info('{0} feeds found: {1}'.format(feed_len, res))
            return res, feed_len
        else:
            logger.warning("no results found...")
            return False, 0

    except Exception:
        logger.exception("error!")
        # return the same (result, count) shape as the success paths
        return False, 0
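A minimal usage sketch (the URL is illustrative; logger setup is assumed elsewhere in the module):

feeds, count = parsefeed('http://example.com/blog')
if feeds:
    print('picked', feeds[0], 'out of', count)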
Example No. 7
def add_feed(request):
    account = request.account
    unread_feeds = account.get_unread_feeds()

    if request.POST and 'link' in request.POST:
        data = request.POST.copy()

        try:
            # if the submitted link is itself a feed, use it directly;
            # otherwise let feedfinder discover feeds on the page
            fp = feedparser.parse(data['link'])
            if 'feed' in fp and 'title' in fp.feed:
                feeds = [data['link']]
            else:
                feeds = feedfinder.feeds(data['link'])

            if not feeds:
                request.session['error'] = "No feeds found."
                return HttpResponseRedirect('/feeds/unread/')

            elif len(feeds) == 1:
                try:
                    feed = Feed.objects.get(link=feeds[0])
                except Feed.DoesNotExist:
                    # add the feed to the db
                    fp = feedparser.parse(feeds[0])
                    feed = Feed(title=fp.feed.title, link=feeds[0], is_active=True)
                    feed.save()
                    feed.update()

                try:
                    AccountFeed(account=account, feed=feed, title=feed.title).save()
                except Exception:
                    pass  # the account is already subscribed

                return HttpResponseRedirect("/feeds/%s/" % (feed.slug,))

            else:
                # several candidate feeds: let the user pick one
                if 'feed' in data:
                    form = SelectFeedForm(data, account=account, link=data['link'], feeds=feeds)
                    if form.is_valid():
                        feed = form.save()
                        return HttpResponseRedirect("/feeds/%s/" % (feed.slug,))
                else:
                    initial = {
                        'link': data['link'],
                        'feed': feeds[0]
                    }
                    form = SelectFeedForm(initial=initial, account=account, link=data['link'], feeds=feeds)

                return render_to_response('feeds/select.html', {
                    'feeds': unread_feeds,
                    'form': form,
                    'collapse_sidebar': True
                }, context_instance=RequestContext(request))
        except UnicodeDecodeError:
            request.session['error'] = """
Encoding error.
<a href="http://feedparser.org/">Universal Feed Parser</a>
threw an exception while trying to make sense of the feed.
When this happens it is almost certainly the feed's fault.
            """.strip().replace('\n', ' ')
        except feedfinder.TimeoutError:
            request.session['error'] = "Timeout"
            # TODO: handle properly
            return HttpResponseRedirect('/feeds/unread/')
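A hypothetical wiring of this view into an old-style Django URLconf (the module path feeds.views is an assumption, not taken from the source):

from django.conf.urls.defaults import patterns

urlpatterns = patterns('',
    (r'^feeds/add/$', 'feeds.views.add_feed'),  # hypothetical module path
)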
Example No. 8
import json

import feedfinder as feedfind  # the snippet refers to feedfinder by this alias


def getFeedsFromUrl(url_addr):
    '''Check url_addr for any RSS feeds and return their
    URLs as a JSON array.'''

    feeds = feedfind.feeds(url_addr)
    return json.dumps(feeds)
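A minimal usage sketch (the URL is illustrative); the function returns a JSON-encoded list of feed URLs:

print(getFeedsFromUrl('http://example.com'))
# e.g. '["http://example.com/atom.xml", "http://example.com/rss"]'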
Example No. 9
                        blogs.append(l1)

                    a = crawl(l1)
    except:
        pass

for url in urls:
    try:
        html = requests.get(url)
        soup = BeautifulSoup(html.content, 'html.parser')
        name = soup.title.string
        fr = feedfinder.feeds(url)
        print(fr)

        ill = ill + 1
        blogs = []
        with open('link%i.json' % ill, "w") as outfile:
            html = requests.get(url)