def get_first_working_feed_link(url):
    """
    Try to use the current URL as a feed. If it works, return it.
    If it doesn't, load the HTML, extract candidate feed links from it,
    then test them one by one and return the first one that works.

    >>> get_first_working_feed_link('http://www.codinghorror.com/blog/')
    u'http://feeds.feedburner.com/codinghorror/'
    >>> get_first_working_feed_link('http://feeds.feedburner.com/codinghorror/')
    u'http://feeds.feedburner.com/codinghorror/'
    """
    # if the URL is a feed itself, return it
    html = urllib2.urlopen(url).read(1000000)
    feed = feedparser.parse(html)
    if not feed.get("bozo", 1):
        return unicode(url)

    # construct the site URL from the domain name and the protocol name
    parsed_url = urllib2.urlparse.urlparse(url)
    site_url = u"%s://%s" % (parsed_url.scheme, parsed_url.netloc)

    # parse the HTML extracted from the URL, get all the potential
    # feed links from it, then try them one by one
    for link in extract_feed_links(html):
        if '://' not in link:  # if we got a relative URL, make it absolute
            link = site_url + link
        feed = feedparser.parse(link)
        if not feed.get("bozo", 1):
            return link
    return None
def getFeed(self, url): """Fetch a feed. This may return a cached result if the cache entry is considered to be fresh. Returned feeds have been cleaned using the cleanFeed method. """ now=time.time() chooser=getUtility(ICacheChooser) cache=chooser("collective.portlet.feedmixer.FeedCache") cached_data=cache.get(url, None) cache_timeout = int(self.cache_timeout) if cached_data is not None: (timestamp, feed)=cached_data if now-timestamp<cache_timeout: return feed newfeed=feedparser.parse(url, etag=getattr(feed, "etag", None), modified=getattr(feed, "modified", None)) if newfeed.status==304: self.cleanFeed(feed) cache[url]=(now+cache_timeout, feed) return feed feed=feedparser.parse(url) self.cleanFeed(feed) cache[url]=(now+cache_timeout, feed) return feed
def test_empty_guid(self, get):
    get.return_value = responses(304)

    parsed = feedparser.parse(data_file('no-guid.xml'))
    data = list(filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]
    ))
    feed = FeedFactory.create(user__ttl=99999)
    with self.assertNumQueries(2):
        store_entries(feed.url, data)
    [entry] = es.manager.user(feed.user).fetch(annotate=feed.user)['hits']
    self.assertTrue(entry.guid)
    entry.delete()

    parsed = feedparser.parse(data_file('no-link-guid.xml'))
    data = list(filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]  # noqa
    ))
    feed = FeedFactory.create(user__ttl=99999)
    with self.assertNumQueries(2):
        store_entries(feed.url, data)
    [entry] = es.manager.user(feed.user).fetch()['hits']
    self.assertTrue(entry.guid)
def check_github():
    old_version = {}
    for repo in repo_names:
        old_version[repo] = feedparser.parse(
            'https://github.com/' + account_name + '/' + repo +
            '/commits/' + branch + '.atom'
        )

    time.sleep(SLEEP_SECONDS)  # Wait, then compare

    for repo in repo_names:
        new = feedparser.parse('https://github.com/' + account_name + '/' +
                               repo + '/commits/' + branch + '.atom')
        try:
            if new.entries[0] != old_version[repo].entries[0]:
                #author = new.entries[0].author.split()[0]  # First name
                author = new.entries[0].author_detail.href.split('/')[-1]
                commit_msg = new.entries[0].title
                print '\n'
                print "[" + repo + "] " + author + ": " + commit_msg
                print '\n'
                irc_msg("[" + repo + "] " + author + ": " + commit_msg)
        except:
            print "GitHub f****d up, I think. Here's what they gave us:"
            print new
def test_classify_from_feed():
    rssParser = RssFeedParser()
    rssUrl_ny = 'http://newyork.craigslist.org/stp/index.rss'
    rssUrl_sf = 'http://sfbay.craigslist.org/stp/index.rss'
    rss_ny = feedparser.parse(rssUrl_ny)
    rss_sf = feedparser.parse(rssUrl_sf)
    vocabList, pSF, pNY = rssParser.localWords(rss_ny, rss_sf)
def getFeed(self, url): """Fetch a feed. """ now = time.time() chooser = getUtility(ICacheChooser) cache = chooser("observatorio.tema.FeedCache") cached_data = cache.get(url, None) if cached_data is not None: (timestamp, feed) = cached_data if now-timestamp < self.data.get('cache_timeout'): return feed newfeed = feedparser.parse(url, etag = getattr(feed, "etag", None), modified = getattr(feed, "modified", None)) if newfeed.status == 304: self.cleanFeed(feed) cache[url] = (now + self.data.get('cache_timeout'), feed) return feed feed = feedparser.parse(url) self.cleanFeed(feed) cache[url] = (now + self.data.get('cache_timeout'), feed) return feed
def dl_rss(url, party='Democratic'):
    """Takes url to media/press release page and finds the rss feed there"""
    try:
        if party == 'Democratic':
            links = get_prs_from_url(url, '//tr/td/a')
            for url in links:
                match = re.search(r'RSS_reader_Member\.asp\?Feed=(\d+)', url)
                if (match):
                    feed_id = match.group(1)
                    rss_feed = feedparser.parse('http://www.pahouse.com/pr/xml/%s.xml' % (feed_id))
        if party == 'Republican':
            links = get_prs_from_url(url, '//div[@id="NewsRSS"]')
            try:
                rss_feed = feedparser.parse(links[0])
            except:
                rss_feed = feedparser.parse(links[1])
        # print rss_feed
        list_of_pr_dicts = []
        for entry in rss_feed['entries']:
            # print entry
            if entry['link'] == None:
                continue
            list_of_pr_dicts.append({"title": entry['title'],
                                     "datestamp": parse_dates(entry['published']),
                                     "url": entry['link']})
        return list_of_pr_dicts
    except:
        print "Could not get RSS Feed for %s.\nHere are the links:%s" % (url, links)
def read(request):
    # get initial time from feed
    getInitialFeed = feedparser.parse(PROTO + USERNAME + ":" + PASSWORD + "@" + SERVER + PATH)
    lastModified = getInitialFeed.entries[0].modified
    while True:  # keep checking for feed
        scrapedFeed = feedparser.parse(PROTO + USERNAME + ":" + PASSWORD + "@" + SERVER + PATH)
        scrapedModified = scrapedFeed.entries[0].modified  # get time when feed is being checked
        if lastModified < scrapedModified:
            # there is a new message (timestamp is greater than the last time the feed was checked)
            lastModified = scrapedModified  # update the last time a new message arrived
            name1 = scrapedFeed.entries[0].author_detail.name  # get details
            email1 = scrapedFeed.entries[0].author_detail.email
            content = str(scrapedFeed.entries[0].title)
            try:
                user = User.objects.get(email=email1)  # try to get user who sent it from database
            except:
                # if user does not exist, create user in database
                x = find(name1, ' ') + 1
                first = name1[:x]
                addUser(name1, email1, first)
                user = User.objects.get(email=email1)
            time1 = str(scrapedModified)  # parse into string so it can be sliced
            time2 = time1[:10] + ' ' + time1[11:19]  # edit string into a time that can be parsed
            time3 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')  # parse string into a datetime object
            underscorename = convert(user.username, ' ', '_')
            addMessage(user, email1, content, time3, underscorename)  # add new Message object to database
            current = str(time.strftime('%X'))  # get current time
            today = date.today()
            dayofweek = today.isoweekday()  # get day of week
            check(current, dayofweek)
    return HttpResponse()
def getfeed(cls, feedurl, settings):
    feeddata = None
    retval = Notification()
    log = logging.getLogger('FeedGateway')
    log.info("Getting feed for %s", feedurl)
    try:
        if settings and settings["http_proxy"] and settings["http_proxy"] != "":
            prxy = urllib2.ProxyHandler({"http": settings["http_proxy"]})
            log.debug("Proxy being used is : %s", prxy)
        else:
            log.debug("No Proxy in use")
            prxy = None

        #print "before calling parse"
        if prxy:
            feeddata = feedparser.parse(feedurl, handlers=[prxy])
        else:
            feeddata = feedparser.parse(feedurl)
        #log.debug(feeddata)
        #print "after calling parse"
    except Exception, err:
        log.error("Getfeed failed for %s with error: %s", feedurl, str(err))
        raise
def expand_macro(self, formatter, name, args):
    outputText = ""
    # check arguments
    if (args == None or len(args) == 0):
        return None

    largs, kwargs = parse_args(args)
    if not kwargs.has_key('url'):
        return _usage()
    url = kwargs['url']

    if kwargs.has_key('details'):
        details = kwargs['details']
    else:
        details = "false"

    if kwargs.has_key('proxy'):
        proxy = kwargs['proxy']
    else:
        proxy = None

    try:
        if proxy != None:
            proxyHandler = urllib2.ProxyHandler({"http": proxy})
            feedData = feedparser.parse(url, handlers=[proxyHandler])
        else:
            response = urlopen(Request(url))
            response.close()
            feedData = feedparser.parse(url)
    except HTTPError, e:
        outputText += "HTTP_ERROR(" + str(e.code) + ")"
def view_rss_feed(request, rssID):
    # Get the rss feed by its ID
    qs = RssFeed.objects.filter(id=rssID)
    # If it doesn't exist, or its mark is too low, return empty entries as an error
    if not qs or qs[0].mark < 5:
        return render(request, "viewrss.html", {'entries': None})
    # if it exists and is accepted, get the entries
    else:
        rss = qs[0]
        entries = FeedEntry.objects.filter(rssfeed=rss)
        # if no entries exist yet, add all of them
        if not entries:
            feed = feedparser.parse(rss.url)
            entries = feed['entries']
            for x in entries:
                x['published'] = parser.parse(x['published']).replace(tzinfo=None)
                entry = FeedEntry(rssfeed=rss, title=x['title'], date=x['published'],
                                  link=x['link'], summary=x['summary'])
                entry.save()
        # if entries already exist, check the feed's updated date and add only new entries
        else:
            feed = feedparser.parse(rss.url)
            entries = feed['entries']
            for x in entries:
                x['published'] = parser.parse(x['published']).replace(tzinfo=None)
                if x['published'] > rss.updatedDate:
                    entry = FeedEntry(rssfeed=rss, title=x['title'], date=x['published'],
                                      link=x['link'], summary=x['summary'])
                    entry.save()
        # Update the rss update date
        rss.updatedDate = parser.parse(feed['feed']['updated']).replace(tzinfo=None)
        rss.save()
        return render(request, "viewrss.html", {'rss': rss, 'entries': entries})
def test_generate_utf8_encode_guid_bug(self):
    """Some feeds trigger utf8 bugs when the guid is generated."""
    feed_str = get_data_file("mobile_it.rss")
    feed = feedparser.parse(feed_str)
    for entry in feed["entries"]:
        guid = feedutil.get_entry_guid(feed, entry)
        self.assertTrue(guid.startswith("http://"))

    feed_str = get_data_file("no-steam.rss")
    feed = feedparser.parse(feed_str)
    for entry in feed["entries"]:
        guid = feedutil.get_entry_guid(feed, entry)
        self.assertTrue(guid)

    feed_str = get_data_file("fakultet.xml")
    feed = feedparser.parse(feed_str)
    for entry in feed["entries"]:
        guid = feedutil.get_entry_guid(feed, entry)
        self.assertTrue(guid)

    feed_str = get_data_file("poker_pl.rss")
    feed = feedparser.parse(feed_str)
    for entry in feed["entries"]:
        guid = feedutil.get_entry_guid(feed, entry)
        self.assertTrue(guid)
def lookup(geo): """Look up articles for geo""" # Check cache try: if geo in lookup.cache: return lookup.cache[geo] except AttributeError: lookup.cache = {} # Replace special characters escaped = urllib.parse.quote(geo, safe="") # Get feed from Google feed = feedparser.parse(f"https://news.google.com/news/rss/local/section/geo/{escaped}") # If no items in feed, get feed from Onion if not feed["items"]: feed = feedparser.parse("http://www.theonion.com/feeds/rss") # Cache results lookup.cache[geo] = [{"link": item["link"], "title": item["title"]} for item in feed["items"]] # Return results return lookup.cache[geo]
def craigslist_example():
    sf = feedparser.parse('http://sfbay.craigslist.org/eng/index.rss')
    ny = feedparser.parse('http://newyork.craigslist.org/eng/index.rss')
    sf_file = '/tmp/sf_top_words.csv'
    ny_file = '/tmp/ny_top_words.csv'
    get_top_words(sf, ny, sf_file, ny_file)
def tech_blog_example():
    mashable = feedparser.parse('http://feeds.mashable.com/Mashable')
    tech_crunch = feedparser.parse('http://feeds.feedburner.com/TechCrunch/')
    mashable_file = '/tmp/mashable_top_words.csv'
    tech_crunch_file = '/tmp/tech_crunch_top_words.csv'
    get_top_words(mashable, tech_crunch, mashable_file, tech_crunch_file)
def test_304(self):
    # first retrieve the url
    u = 'http://localhost:8097/tests/http/http_status_304.xml'
    f = feedparser.parse(u)
    self.assertEqual(f.status, 200)
    self.assertEqual(f.entries[0].title, 'title 304')
    # extract the etag and last-modified headers
    e = [v for k, v in f.headers.items() if k.lower() == 'etag'][0]
    mh = [v for k, v in f.headers.items() if k.lower() == 'last-modified'][0]
    ms = f.updated
    mt = f.updated_parsed
    md = datetime.datetime(*mt[0:7])
    self.assertTrue(isinstance(mh, basestring))
    self.assertTrue(isinstance(ms, basestring))
    self.assertTrue(isinstance(mt, time.struct_time))
    self.assertTrue(isinstance(md, datetime.datetime))
    # test that sending back the etag results in a 304
    f = feedparser.parse(u, etag=e)
    self.assertEqual(f.status, 304)
    # test that sending back last-modified (string) results in a 304
    f = feedparser.parse(u, modified=ms)
    self.assertEqual(f.status, 304)
    # test that sending back last-modified (9-tuple) results in a 304
    f = feedparser.parse(u, modified=mt)
    self.assertEqual(f.status, 304)
    # test that sending back last-modified (datetime) results in a 304
    f = feedparser.parse(u, modified=md)
    self.assertEqual(f.status, 304)
def get_urls(self): """ Returns a list of URLs to scrape """ urls = [] # NR comittees are LLP based if self.LLP: for i in self.LLP: roman_numeral = roman.toRoman(i) options = self.URLOPTIONS.copy() options['GP'] = roman_numeral options['NRBR'] = 'NR' url_options = urlencode(options) url_llp = "{}?{}".format(self.BASE_URL, url_options) rss = feedparser.parse(url_llp) print "GP {}: NR: {} Comittees".format( roman_numeral, len(rss['entries'])) urls = urls + [entry['link'] for entry in rss['entries']] # AKT = aktiv, AUF = aufgeloest for aktauf in ['AKT', 'AUF']: options['NRBR'] = 'BR' options['R_AKTAUF'] = aktauf url_options = urlencode(options) url_br = "{}?{}".format(self.BASE_URL, url_options) rss = feedparser.parse(url_br) print "BR {}: {} Comittees".format( aktauf, len(rss['entries'])) urls = urls + [entry['link'] for entry in rss['entries']] return urls
def fetch_and_parse_feed(url, etag=None, last_modified=None):
    # TODO implement etag & last_modified header
    url = sanitize_url(url)
    feed_parsed = feedparser.parse(url)
    if not hasattr(feed_parsed, 'status'):
        raise FetchingException("Connection error")
    elif feed_parsed.status not in (200, 301, 302):
        raise FetchingException("status_code is %d" % feed_parsed.status)

    if feed_parsed.version == '':
        # it's probably html instead of rss/atom
        resp = fetch_url(url)
        if resp.status_code not in (200, 301, 302):
            raise FetchingException("status_code is %d" % resp.status_code)
        soup = BeautifulSoup(resp.content)
        try:
            url = soup.find_all("link", rel="alternate")[0]['href']
        except (IndexError, KeyError):
            # alternate-link is missing
            raise FetchingException("Neither RSS nor good HTML...")
        if not url.startswith("http"):
            url = concat_urls(resp.url, url)
        feed_parsed = feedparser.parse(url)
        if feed_parsed.status not in (200, 301, 302):
            raise FetchingException("status_code is %d" % feed_parsed.status)

    if feed_parsed.status == 301:
        # moved permanently: remember the new location
        logger.warning("/!\\ permanent redirect (301) for %s", url)
        url = feed_parsed.href
    elif feed_parsed.status == 302:
        logger.warning("/!\\ temporary redirect (302) for %s", url)

    return {"feed": feed_parsed, "real_url": url}
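# Note (added): one way the TODO above about etag / last_modified could be
# addressed. feedparser.parse() accepts `etag` and `modified` arguments and
# reports status 304 when the server says nothing has changed. This is only a
# sketch; the function name and return shape are illustrative, not part of the
# original code.
def fetch_if_changed(url, etag=None, last_modified=None):
    parsed = feedparser.parse(url, etag=etag, modified=last_modified)
    if getattr(parsed, "status", None) == 304:
        return None  # nothing new since the last fetch
    # remember these validators for the next call
    return parsed, getattr(parsed, "etag", None), getattr(parsed, "modified", None)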
def get_accidents():
    feed = feedparser.parse(RAIB_ATOM_URL)
    accidents = []
    page = 1
    while feed.entries:
        for entry in feed.entries:
            if not is_report_or_bulletin(entry):
                continue
            new_accident = accident.Accident(
                'en',
                get_pdf_link(entry.content[0].value, feed.href),
                'gb',
                # Strip "Press release: "
                ': '.join(entry.title.split(": ")[1:]),
                "RAIB")
            # Location is too hard to parse for now
            new_accident.longdesc = get_longdesc(entry.content[0].value)
            # Company is not provided (usually)
            # Let's parse the date with regex!
            regex = re.compile(
                ".* ([0-9]?[0-9](st|nd|rd|th)? [^ ]* [0-9][0-9][0-9][0-9]).*")
            matches = regex.match(entry.summary)
            if matches is not None:
                new_accident.date = dateutil.parser.parse(matches.group(1))
            new_accident.published = dateutil.parser.parse(entry.published)
            new_accident.alturls = {'landing': entry.link}
            accidents.append(new_accident)
        page += 1
        feed = feedparser.parse(RAIB_ATOM_URL + "&page={}".format(page))
    return accidents
def test_empty_guid(self, get):
    get.return_value = responses(304)

    parsed = feedparser.parse(test_file('no-guid.xml'))
    data = filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]
    )
    feed = FeedFactory.create()
    with self.assertNumQueries(5):
        store_entries(feed.url, data)
    self.assertTrue(feed.entries.get().guid)
    feed.entries.all().delete()

    parsed = feedparser.parse(test_file('no-link-guid.xml'))
    data = filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]
    )
    feed = FeedFactory.create()
    with self.assertNumQueries(5):
        store_entries(feed.url, data)
    self.assertTrue(feed.entries.get().guid)
def albums(request, template_name="g_gallery/gallery.html"):
    """
    A page with public albums from Picasa
    """
    # An old-fashioned way to access albums data
    #gd_client = gdata.photos.service.PhotosService()
    #albums = gd_client.GetUserFeed(user=DJANGO_SETTINGS_MODULE.GOOGLE_ACCOUNT)
    g_albums = []
    feed = feedparser.parse("https://picasaweb.google.com/data/feed/api/user/" +
                            DJANGO_SETTINGS_MODULE.GOOGLE_ACCOUNT)
    for album in feed['entries']:
        g_alb = g_album(album.title, album.link, album.media_thumbnail[0]['url'],
                        album.gphoto_numphotos, album.gphoto_id, album.summary)
        g_albums.append(g_alb)

    g_comments = []
    comments = feedparser.parse("https://picasaweb.google.com/data/feed/api/user/" +
                                DJANGO_SETTINGS_MODULE.GOOGLE_ACCOUNT +
                                "?kind=comment&max-results=20")
    for comment in comments['entries']:
        com = g_comment(comment.content[0].value, comment.author,
                        comment.links[0].href, comment.gphoto_thumbnail, comment.id,
                        comment.gphoto_photoid)
        g_comments.append(com)

    return render_to_response(template_name, {
        "g_albums": g_albums,
        "g_comments": g_comments,
    }, context_instance=RequestContext(request))
def on_feed_input(self, feed, config):
    config = self.build_config(config)

    log.debug('Requesting feed `%s` url `%s`' % (feed.name, config['url']))

    # check etags and last modified -headers
    # let's not, flexget works better when feed contains all entries all the time ?
    etag = None
    modified = None
    """
    etag = feed.cache.get('etag', None)
    if etag:
        log.debug('Sending etag %s for feed %s' % (etag, feed.name))
    modified = feed.cache.get('modified', None)
    if modified:
        log.debug('Sending last-modified %s for feed %s' % (etag, feed.name))
    """

    # set timeout to one minute
    orig_timout = socket.getdefaulttimeout()
    try:
        socket.setdefaulttimeout(60)
        # get the feed & parse
        if urllib2._opener:
            rss = feedparser.parse(config['url'], etag=etag, modified=modified,
                                   handlers=urllib2._opener.handlers)
        else:
            rss = feedparser.parse(config['url'], etag=etag, modified=modified)
    except LookupError, e:
        raise PluginError('Unable to parse the RSS: %s' % e)
def preprocess_public_feed(): """ reads the public feed - http://chroniclingamerica.loc.gov/batches/feed/ and returns a dictionary of {batch name: released datetime} """ LOGGER.info("processing public feed for released datetime") feed = feedparser.parse("http://chroniclingamerica.loc.gov/batches/feed/") batch_release_times = {} if len(feed.entries) == 0: LOGGER.error( "public feed did not return any batches! Check to make sure chroniclingamerica.loc.gov is running correctly" ) cont = True while cont: for entry in feed.entries: batch_name = re.match(r'info:lc/ndnp/batch/(.+)', entry.id).group(1) # convert time.struct from feedparser into a datetime for django released = datetime.fromtimestamp(mktime(entry.updated_parsed)) batch_release_times[batch_name] = released # if the batch starts with batch_ remove it, so that it works regardless of that prefix if batch_name.startswith("batch_"): batch_release_times[batch_name[6:]] = released next_page = get_next_page(feed) if next_page: feed = feedparser.parse(next_page) else: cont = False return batch_release_times
def clean_fields(self, exclude=None):
    super(Feed, self).clean_fields(exclude)
    errors = {}

    parsed_feed = feedparser.parse(self.url)

    # parsed feed is an invalid feed
    # TODO add more robust error handling
    if (parsed_feed.bozo and not parsed_feed.entries):
        # try finding a feed at the site
        feeds = find_feeds(self.url)
        if (feeds):
            self.url = feeds[0]
            parsed_feed = feedparser.parse(self.url)
        else:
            urlname = self._meta.get_field('url').name
            message = "Unable to find a feed at '{0}'".format(self.url)
            errors[urlname] = ([message])
            raise ValidationError(errors)

    # if no title then use the feed's title
    if not self.title:
        self.title = parsed_feed.feed.title

    # set the source of the feed
    if parsed_feed.feed.title:
        self.source = urlparse.urlparse(parsed_feed.feed.link).hostname
    else:
        self.source = urlparse.urlparse(self.url).hostname
def test_details(self):
    [lp] = self.english_1.license_pools

    lp.suppressed = False
    with self.app.test_request_context("/"):
        response = self.manager.admin_work_controller.details(
            lp.data_source.name, lp.identifier.type, lp.identifier.identifier)
        eq_(200, response.status_code)
        feed = feedparser.parse(response.get_data())
        [entry] = feed['entries']
        suppress_links = [x['href'] for x in entry['links']
                          if x['rel'] == "http://librarysimplified.org/terms/rel/hide"]
        unsuppress_links = [x['href'] for x in entry['links']
                            if x['rel'] == "http://librarysimplified.org/terms/rel/restore"]
        eq_(0, len(unsuppress_links))
        eq_(1, len(suppress_links))
        assert lp.identifier.identifier in suppress_links[0]

    lp.suppressed = True
    with self.app.test_request_context("/"):
        response = self.manager.admin_work_controller.details(
            lp.data_source.name, lp.identifier.type, lp.identifier.identifier)
        eq_(200, response.status_code)
        feed = feedparser.parse(response.get_data())
        [entry] = feed['entries']
        suppress_links = [x['href'] for x in entry['links']
                          if x['rel'] == "http://librarysimplified.org/terms/rel/hide"]
        unsuppress_links = [x['href'] for x in entry['links']
                            if x['rel'] == "http://librarysimplified.org/terms/rel/restore"]
        eq_(0, len(suppress_links))
        eq_(1, len(unsuppress_links))
        assert lp.identifier.identifier in unsuppress_links[0]
def get_feed(url):
    try:
        feed = feedparser.parse(url)
        if feed.bozo:
            exc = feed.bozo_exception
            print ("Error detected on %s:" % url)
            print exc
        if feed.has_key("status"):
            if feed.status == 302:
                feed = feedparser.parse(feed.href)
            elif feed.status == 200:
                pass
            else:
                print ("%s returned http status code %s" % (url, feed.status))
    except AttributeError as e:
        print ("skipping", url, "- error:", e.args[0])
    except KeyError as e:
        print ("skipping", url, "- error:", e.args[0])
    except KeyboardInterrupt:
        print ("^C recognized - stopping now")
        exit(1)
    return feed
def fetch(self, force=False):
    etag = self.feed.etag
    modified = self.feed.last_modified
    address = self.feed.feed_address

    if force:
        etag = None
        modified = None

    USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                  '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) '
                  'AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % (
                      self.feed.num_subscribers,
                      's' if self.feed.num_subscribers > 1 else '',
                      settings.READER4YOU_URL))

    try:
        fpfeed = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified)
    except (TypeError, ValueError), e:
        feedparser.PARSE_MICROFORMATS = False
        fpfeed = feedparser.parse(address, agent=USER_AGENT, etag=etag, modified=modified)
        feedparser.PARSE_MICROFORMATS = True
def get_feed(self, url): """Given url might be point to http document or to actual feed. In case of http document, we try to find first feed auto discovery url. """ stripped = url.strip() try: resp = requests.get(stripped) except Exception as err: return {'success': False, 'output': str(err)} feed = feedparser.parse(resp.text) if feed.version != '': return {'success': True, 'output': (feed, stripped)} urls = FeedFinder.parse(resp.text) feed_url = '' if len(urls) > 0: # Each url is tuple where href is first element. # NOTE : Sites might have several feeds available and we are just # naively picking first one found. feed_url = urls[0][0] if urlparse(feed_url)[1] == '': # We have empty 'netloc', meaning we have relative url feed_url = urljoin(stripped, feed_url) return {'success': True, 'output': (feedparser.parse(feed_url), feed_url)}
def get_latest_links(lang_name):
    import feedparser
    links = []
    i = 1
    if lang_name == "az":
        feed = feedparser.parse("http://milli.az/rss.php")
        for e in feed.entries:
            if "qtisadiyyat" in e.category:
                links.append((e.title, e.link))
                if i != 5:
                    i = i + 1
                else:
                    break
    if lang_name == "en":
        feed = feedparser.parse("http://today.az/rss.php")
        for e in feed.entries:
            if "Business" in e.category:
                links.append((e.title, e.link))
                if i != 5:
                    i = i + 1
                else:
                    break
    else:
        feed = feedparser.parse("http://1news.az/rss.php?sec_id=21")
        for e in feed.entries[:4]:
            links.append((e.title, e.link))
    return links
def test_same_guids(self, get):
    get.return_value = responses(304)
    feed = FeedFactory.create()

    parsed = feedparser.parse(test_file('aldaily-06-27.xml'))
    data = filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]
    )
    with self.assertNumQueries(5):
        store_entries(feed.url, data)
    self.assertEqual(feed.entries.count(), 4)

    data = filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]
    )
    with self.assertNumQueries(2):
        store_entries(feed.url, data)
    self.assertEqual(feed.entries.count(), 4)

    parsed = feedparser.parse(test_file('aldaily-06-30.xml'))
    data = filter(
        None,
        [UniqueFeed.objects.entry_data(entry, parsed)
         for entry in parsed.entries]
    )
    with self.assertNumQueries(5):
        store_entries(feed.url, data)
    self.assertEqual(feed.entries.count(), 10)
import RPi.GPIO as GPIO, feedparser, time, os.path

USERNAME = "******"
PASSWORD = "******"

GPIO.setwarnings(False)
GPIO.setmode(GPIO.BOARD)
LIGHTS = 18
GPIO.setup(7, GPIO.OUT)

cur_mails = int(
    feedparser.parse("https://" + USERNAME + ":" + PASSWORD +
                     "@mail.google.com/gmail/feed/atom")["feed"]["fullcount"])
print("You have " + str(cur_mails) + " emails in your inbox.")

if os.path.isfile("emails.txt") == False:
    # create the file if it doesn't exist
    f = open('emails.txt', 'w')
    f.write('1')  # The interpreter doesn't like reading from an empty file
    f.close()

f = open('emails.txt', 'r')
last_mails = int(f.read())
f.close()
print("Last known number of emails is " + str(last_mails))

if cur_mails < last_mails:
    last_mails = cur_mails
    f = open('emails.txt', 'w')
        try:
            data = open(feed_link, 'r').read()
        except IOError:
            self.feed.feed_link = 'http://' + feed_link
            self.fetch_page(urllib_fallback=True)
            return
        if data:
            html = self.rewrite_page(data)
            self.save_page(html)
        else:
            self.save_no_page()
        return
    except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL,
            requests.exceptions.ConnectionError), e:
        self.feed.save_page_history(401, "Bad URL", e)
        fp = feedparser.parse(self.feed.feed_address)
        feed_link = fp.feed.get('link', "")
        self.feed.save()
    except (urllib2.HTTPError), e:
        self.feed.save_page_history(e.code, e.msg, e.fp.read())
    except (httplib.IncompleteRead), e:
        self.feed.save_page_history(500, "IncompleteRead", e)
    except (requests.exceptions.RequestException,
            requests.packages.urllib3.exceptions.HTTPError), e:
        logging.debug('   ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
        # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
        return self.fetch_page(urllib_fallback=True, requests_exception=e)
    except Exception, e:
        logging.debug('[%d] ! -------------------------' %
def get_news(publication):
    feed = feedparser.parse(RSS_Feed[publication.lower()])
    return feed['entries']
def download_rss(url):
    return feedparser.parse(url)
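# Note (added): a small, assumed usage example for the wrapper above, showing
# the parts of the FeedParserDict returned by feedparser.parse() that are most
# commonly consumed (bozo flag, feed title, entries). Not part of the original.
def print_headlines(url):
    parsed = download_rss(url)
    if parsed.bozo:  # set when the feed was malformed or could not be fetched cleanly
        print("warning:", parsed.get("bozo_exception"))
    print(parsed.feed.get("title", "(no title)"))
    for entry in parsed.entries[:5]:
        print("-", entry.get("title"), entry.get("link"))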
import csv, feedparser, time
from datetime import date
from storefront.models import BlogPostInfo

blog_posts = BlogPostInfo.objects.all()
feed = feedparser.parse('http://blog.indextank.com/feed/')

for item in feed['items']:
    if not any(item['link'] == post.url for post in blog_posts):
        # if there isn't a post with this url, then create it
        d = date(item['date_parsed'][0], item['date_parsed'][1], item['date_parsed'][2])
        BlogPostInfo.objects.create(url=item['link'].encode('utf-8'),
                                    title=item['title'].encode('utf-8'),
                                    author=item['author'].encode('utf-8'),
                                    date=d)
]

# Date and time setup. I want only posts from "today,"
# where the day lasts until 2 AM.
utc = pytz.utc
homeTZ = pytz.timezone('US/Central')
dt = datetime.now(homeTZ)
if dt.hour < 2:
    dt = dt - timedelta(hours=24)
start = dt.replace(hour=0, minute=0, second=0, microsecond=0)
start = start.astimezone(utc)

# Collect all of today's posts and put them in a list of tuples.
posts = []
for s in subscriptions:
    f = fp.parse(s)
    try:
        blog = f['feed']['title']
    except KeyError:
        continue
    for e in f['entries']:
        try:
            when = e['updated_parsed']
        except KeyError:
            when = e['published_parsed']
        when = utc.localize(datetime.fromtimestamp(time.mktime(when)))
        if when > start:
            title = e['title']
            try:
                body = e['content'][0]['value']
            except KeyError:
import feedparser rss_url = "http://news.google.com/?output=rss" feed = feedparser.parse(rss_url) count = len(feed['entries']) for i in range(0, count): if (i >= 9): break print '{0: <83} {1}...'.format(' ', feed.entries[i].title[0:40].encode('utf8'))
def get_random(): response = feedparser.parse("https://news.ycombinator.com/rss") entries = response["entries"] number_of_articles = len(entries) article_selected = random.randrange(number_of_articles) return entries[article_selected]
def onHeartbeat(self): now = datetime.now() rss = "" feed = "" FeedValueFTd = "" FeedValueFTm = "" if now >= self.nextpoll: self.nextpoll = now + timedelta(seconds=self.pollinterval) rss = Parameters["Mode1"] feed = feedparser.parse(rss) for key in feed["entries"]: FeedValue = str(key["description"]) FeedValue = '<tr>TODAY ' + FeedValue.split('Today')[1] FeedValueFTd = FeedValue.split('Tomorrow')[0] FeedValueFTm = FeedValue.split('Tomorrow')[1] Domoticz.Log("Gathering Data for:" + str(key["title"])) if not (FeedValueFTd.strip().find('wflag-l1') == -1): Domoticz.Debug( "Alarm(s) for Today: No special awareness required.") #Domoticz.Log("Data Of Warning:"+str(FeedValueFTd.strip())) Domoticz.Debug("Type Of Warning:" + str(FeedValueFTd.find('wflag-l1-t5.jpg'))) Domoticz.Debug("Data:" + str(FeedValueFTd).replace('<', '-')) ValueToUpdate = "No special awareness required" #Devices[1].Update(1,ValueToUpdate, Image=Images[icon].ID) Domoticz.Debug("Current Awareness Status:" + Devices[1].sValue + " with Level " + str(Devices[1].nValue)) if (ValueToUpdate != Devices[1].sValue): Domoticz.Log("Awareness for Today Updated to:" + ValueToUpdate) Devices[1].Update(1, ValueToUpdate) else: Domoticz.Log("Awareness Remains Unchanged for Today.") else: Domoticz.Debug( "------FEEDPARSER OUTPUT for TODAY:------------------") #Domoticz.Log("Type Of Warning:"+str(FeedValueFTd.find('wflag-l1-t5.jpg'))) #Domoticz.Log("Data:"+str(FeedValueFTd).replace('<br>','').replace('</br>','').replace('<td>','').replace('</td>','').replace('<tr>','').replace('</tr>','').replace('<b>','').replace('</b>','').replace('<i>','').replace('</i>','').replace('<','')) FeedValueFTdPeriod = FeedValueFTd.split('<td>')[0] FeedValueFTdPeriod = FeedValueFTdPeriod.split('alt="')[1] FeedValueFTdPeriod = FeedValueFTdPeriod.split(':') Domoticz.Debug( "Icon:" + FeedValueFTd.split('<td>')[0].replace('<', '-')) AWTPossitions = FeedValueFTd.replace('<', '-').split('awt:') #if AWTPossitions[2]: Domoticz.Log("AWT Possitions 2:"+AWTPossitions[2]) WarningText = "" for AWTPos in range(1, len(AWTPossitions)): AWTvalue = "" LEVELvalue = "" AWTvalue = AWTPossitions[AWTPos].split( 'level')[0].strip() Domoticz.Debug("AWT Possitions Value " + str(AWTPos) + ":" + AWTvalue) #LEVELvalue = AWTPossitions[AWTPos].split('level:')[1].split('border')[0].replace('"','').strip() LEVELvalue = AWTPossitions[AWTPos].split( 'level:')[1].split('"')[0] Domoticz.Debug("Level Possitions Value " + str(AWTPos) + ":" + LEVELvalue) AWTtext = AWTvalue if (AWTvalue == "1"): AWTtext = "Wind" if (AWTvalue == "2"): AWTtext = "Snow/Ice" if (AWTvalue == "3"): AWTtext = "ThunderStorm" if (AWTvalue == "4"): AWTtext = "Fog" if (AWTvalue == "5"): AWTtext = "High Temp" if (AWTvalue == "6"): AWTtext = "Low Temp" if (AWTvalue == "7"): AWTtext = "Coastal Event" if (AWTvalue == "8"): AWTtext = "Forestfire" if (AWTvalue == "9"): AWTtext = "Avalanches" if (AWTvalue == "10"): AWTtext = "Rain" if (AWTvalue == "11"): AWTtext = "Flood" if (AWTvalue == "12"): AWTtext = "Rain-Flood" if (AWTPos > 1): WarningText = WarningText + ", " WarningText = WarningText + AWTtext + "(" + LEVELvalue + ")" Domoticz.Debug("Alarm(s) for today:" + str(WarningText)) Domoticz.Debug("AWT:" + FeedValueFTdPeriod[1].split(' ') [0].replace('<', '-').replace('>', '-')) Domoticz.Debug("Level:" + FeedValueFTdPeriod[2].split('"') [0].strip().replace('<', '-')) Domoticz.Debug( "Period:" + FeedValueFTd.split('<td>')[1].strip( ).replace('<br>', '').replace('</br>', '').replace( '<td>', '').replace('</td>', '').replace('<', '-')) 
#Domoticz.Log("MessageLocal:"+FeedValueFTd.split('<td>')[2].split('.')[0].strip()) #Domoticz.Log("MessageEn:"+FeedValueFTd.split('<td>')[2].split('.')[1].strip().replace('<','-')) #Domoticz.Log("MessageEn:"+FeedValueFTd.split('<td>')[2].split('.')[1].split('english:')[1].strip()) #ValueToUpdate=FeedValueFTd.split('<td>')[2].split('.')[1].split('english:')[1].strip() if (LEVELvalue == "5"): LEVELvalue = "1" Domoticz.Debug("Current Awareness Status:" + Devices[1].sValue + " with Level " + str(Devices[1].nValue)) if (WarningText != Devices[1].sValue) or ( int(LEVELvalue) != Devices[1].nValue): Domoticz.Log("Awareness for Today Updated to:" + WarningText) Devices[1].Update(int(LEVELvalue), WarningText) else: Domoticz.Log("Awareness Remains Unchanged for Today.") if not (FeedValueFTm.strip().find('wflag-l1') == -1): Domoticz.Debug( "Alarm(s) for Tomorrow: No special awareness required") #Domoticz.Log("Data Of Warning:"+str(FeedValueFTm.strip())) Domoticz.Debug("Type Of Warning:" + str(FeedValueFTm.find('wflag-l1-t5.jpg'))) ValueToUpdate = "No special awareness required" Domoticz.Debug("Current Awareness Status:" + Devices[2].sValue + " with Level " + str(Devices[2].nValue)) if (ValueToUpdate != Devices[2].sValue): Domoticz.Log("Awareness for Tomorrow Updated to:" + ValueToUpdate) Devices[2].Update(1, ValueToUpdate) else: Domoticz.Log( "Awareness Remains Unchanged for Tomorrow.") else: #FeedValueFTm = FeedValueFTd.split('<tr>') Domoticz.Debug( "------FEEDPARSER OUTPUT for TOMORROW:------------------" ) #Domoticz.Log("Type Of Warning:"+str(FeedValueFTm.find('awt:5'))) FeedValueFTmPeriod = FeedValueFTm.split('<td>')[0] FeedValueFTmPeriod = FeedValueFTmPeriod.split('alt="')[1] FeedValueFTmPeriod = FeedValueFTmPeriod.split(':') Domoticz.Debug( "Icon:" + FeedValueFTm.split('<td>')[0].replace('<', '-')) AWTPossitions = FeedValueFTm.replace('<', '-').split('awt:') #if AWTPossitions[2]: Domoticz.Log("AWT Possitions 2:"+AWTPossitions[2]) WarningText = "" HLEVELvalue = 1 for AWTPos in range(1, len(AWTPossitions)): AWTvalue = "" LEVELvalue = "" AWTvalue = AWTPossitions[AWTPos].split( 'level')[0].strip() Domoticz.Debug("AWT Possitions Value " + str(AWTPos) + ":" + AWTvalue) #LEVELvalue = AWTPossitions[AWTPos].split('level:')[1].split('border')[0].replace('"','').strip() LEVELvalue = AWTPossitions[AWTPos].split( 'level:')[1].split('"')[0] Domoticz.Debug("Level Possitions Value " + str(AWTPos) + ":" + LEVELvalue) AWTtext = AWTvalue if (AWTvalue == "1"): AWTtext = "Wind" if (AWTvalue == "2"): AWTtext = "Snow/Ice" if (AWTvalue == "3"): AWTtext = "ThunderStorm" if (AWTvalue == "4"): AWTtext = "Fog" if (AWTvalue == "5"): AWTtext = "High Temp" if (AWTvalue == "6"): AWTtext = "Low Temp" if (AWTvalue == "7"): AWTtext = "Coastal Event" if (AWTvalue == "8"): AWTtext = "Forestfire" if (AWTvalue == "9"): AWTtext = "Avalanches" if (AWTvalue == "10"): AWTtext = "Rain" if (AWTvalue == "11"): AWTtext = "Flood" if (AWTvalue == "12"): AWTtext = "Rain-Flood" WarningText = WarningText + AWTtext + "(" + LEVELvalue + ")" if (AWTPos > 1): WarningText = WarningText + ", " Domoticz.Debug("Alarm(s) for Tomorrow:" + str(WarningText)) if (int(LEVELvalue) > HLEVELvalue): HLEVELvalue = int(LEVELvalue) Domoticz.Debug( "Icon:" + FeedValueFTm.split('<td>')[0].replace('<', '-')) Domoticz.Debug("AWT:" + FeedValueFTmPeriod[1].split(' ') [0].strip().replace('<', '-')) Domoticz.Debug("Level:" + FeedValueFTmPeriod[2].split('"') [0].strip().replace('<', '-')) #Domoticz.Log("Period:"+FeedValueFTm.split('<td>')[1].strip().replace('<','-')) 
#Domoticz.Log("MessageLocal:"+FeedValueFTm.split('<td>')[2].split('.')[0].strip().replace('<','-')) #Domoticz.Log("MessageEn:"+FeedValueFTm.split('<td>')[2].split('.')[1].split('english:')[1].strip().replace('<','-')) #Domoticz.Log(FeedValueFTm) #ValueToUpdate=FeedValueFTm.split('<td>')[2].split('.')[1].split('english:')[1].strip().replace('<','-') if (HLEVELvalue == 5): HLEVELvalue = 0 Domoticz.Debug("Current Awareness Status:" + Devices[2].sValue + " with Level " + str(Devices[2].nValue)) if (WarningText != Devices[2].sValue) or ( int(HLEVELvalue) != Devices[2].nValue): Domoticz.Log("Awareness for Tomorrow Updated to:" + WarningText) Devices[2].Update(HLEVELvalue, WarningText) else: Domoticz.Log( "Awareness Remains Unchanged for Tomorrow.") Domoticz.Debug( "----------------------------------------------------")
outfile = codecs.open(outputfile, 'a', 'utf-8')
allWords = []

with open(inputfile) as infile:
    for account in infile:
        account = account.strip()
        time.sleep(1)
        print "Getting post titles for user: "******"http://%s" % urllib.quote(account)
        url += ".livejournal.com/data/rss"
        try:
            feed = feedparser.parse(
                url, agent='Web scraper by [email protected] @LawrenceA_UK')
        except:
            print "Error getting user: " + account
            continue

        # Get titles of last five posts
        for i in range(0, 4):
            try:
                # Get post title
                titleText = feed.entries[i].title
                # Get list of words from post title
                titleWords = re.findall(r'[\w]+', titleText, re.U)
# Author - Samaksh Yadav
# Description - The following scraper scrapes recent content available at TECHCRUNCH.COM
# Version 1.0

import feedparser
from urllib2 import urlopen
from bs4 import BeautifulSoup

rss = {'http://feeds.feedburner.com/TechCrunch/'}

for key in rss:
    print(key)
    d = feedparser.parse(key)
    for post in d.entries:
        try:
            html = urlopen(post.link)
            bsObj = BeautifulSoup(html, "html.parser")
            str1 = str(bsObj.find("div", attrs={"class": "article-entry text"}))
            str2 = str(bsObj.find("div", attrs={"class": "aside aside-related-articles"}))
            str3 = bsObj.findAll("script")
            cleantext = bsObj.find("div", attrs={"class": "article-entry text"}).get_text()
            date = bsObj.find("meta", attrs={"class": "swiftype",
import feedparser

newsfeed = feedparser.parse(
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms")
print("number of rss", len(newsfeed.entries))

entry = newsfeed.entries[1]
print(entry.title)
print(entry.published)
print(entry.summary)
print(entry.link)
# https://gist.github.com/lukf/9785293
# -*- coding: utf-8 -*-
import feedparser, webbrowser, urllib, console, sys, datetime, urllib2, time

console.clear()
selected = "no"
feedURL = "http://feeds.pinboard.in/rss/secret:YOURSECRET/u:USERNAME/"  # RSS feed URL
outp = "Bookmarks from today \n|Time|Page|Tags|\n|:---|:---|:---|\n"  # header
dayone_footer = "#bookmark"  # Gets appended to the entry
now = datetime.datetime.now()

for post in feedparser.parse(
        urllib2.urlopen(feedURL).read().replace('dc:subject>', 'subject>')).entries:
    postDate = datetime.datetime.strptime(
        post.date[:-6], '%Y-%m-%dT%H:%M:%S') + datetime.timedelta(
            seconds=-time.timezone, hours=1)  # 2014-04-10T15:00:01+00:00
    timediff = now - postDate
    if timediff < datetime.timedelta(days=1):
        # add to outp
        subject = ""
        try:
            subject = "#" + post.subject.replace(' ', ' #')
        except:
            pass
        outp = outp + "|" + (datetime.datetime.strftime(
            postDate, '%H:%M')) + "|" + "[" + post.title.replace(
                '[priv] ', '').replace(
                    '|', '–') + "](" + post.link + ")|" + subject + "|\n"

dayone_entry = outp + dayone_footer

# User confirmation
def Parse(rsslink):
    return feedparser.parse(rsslink)
    trainMat = []; trainClasses = []
    for docIndex in trainningSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(trainMat, trainClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print "the error rate is:", float(errorCount) / len(testSet)
    return vocabList, p0V, p1V


import feedparser
ny = feedparser.parse("http://newyork.craigslist.org/res/index.rss")
sf = feedparser.parse("http://sfbay.craigslist.org/apa/index.rss")


def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"
    for item in sortedSF:
        print item[0]
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
        message('writing to {}'.format(filename))
        with open(filename, 'wb') as outfile:
            outfile.write(data)
        q.task_done()


# Set up some threads to fetch the enclosures.
for i in range(num_fetch_threads):
    worker = threading.Thread(
        target=download_enclosures,
        args=(enclosure_queue,),
        name='worker-{}'.format(i),
    )
    worker.setDaemon(True)
    worker.start()

# Download the feed(s) and put the enclosure URLs into
# the queue.
for url in feed_urls:
    response = feedparser.parse(url, agent='py_02_queueFetchPodcasts.py')
    for entry in response['entries'][:5]:
        for enclosure in entry.get('enclosures', []):
            parsed_url = urlparse(enclosure['url'])
            message('queuing {}'.format(parsed_url.path.rpartition('/')[-1]))
            enclosure_queue.put(enclosure['url'])

# Now wait for the queue to be empty, indicating that we have
# processed all of the downloads.
message('*** main thread waiting')
enclosure_queue.join()
message('*** done')
import feedparser
import csv

purl = "https://gdata.youtube.com/feeds/api/playlists/"
f = open('youtube_failed.csv', 'a')
w = csv.writer(f)

playlistid = raw_input("Enter the playlist id : ")
prss = feedparser.parse(purl + str(playlistid))

for i in range(prss.entries.__len__()):
    try:
        vrss = feedparser.parse(str(prss.entries[i].id))
    except:
        print(prss.entries[i].id, ' not parsed')
    #out = str.join(str(plid,vrss.entries[0].title,vrss.entries[0].description))
    try:
        #print(playlist[plid],vrss.entries[0].title,vrss.entries[0].description)
        w.writerow([
            prss.feed.title.encode('utf-8'),
            vrss.entries[0].title.encode('utf-8'),
            vrss.entries[0].link.encode('utf-8'),
            vrss.entries[0].description.encode('utf-8')
        ])
    except:
        print("Error detected")

f.close()
    print e
    print 'starting from an empty database'
    db = {}

# -----------------------------------------------------------------------------
# main loop where we fetch the new results
print 'database has %d entries at start' % (len(db), )

num_added_total = 0
for i in range(args.start_index, args.max_index, args.results_per_iteration):

    print "Results %i - %i" % (i, i + args.results_per_iteration)
    query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (
        args.search_query, i, args.results_per_iteration)
    response = urllib.urlopen(base_url + query).read()
    parse = feedparser.parse(response)
    num_added = 0
    num_skipped = 0
    for e in parse.entries:

        j = encode_feedparser_dict(e)

        # extract just the raw arxiv id and version for this paper
        rawid, version = parse_arxiv_url(j['id'])
        j['_rawid'] = rawid
        j['_version'] = version

        # add to our database if we didn't have it before, or if this is a new version
        if not rawid in db or j['_version'] > db[rawid]['_version']:
            db[rawid] = j
            print 'updated %s added %s' % (j['updated'], j['title'])
def __init__(self, ticker):
    self.data = feedparser.parse(
        f'http://articlefeeds.nasdaq.com/nasdaq/symbols?symbol={ticker}')
def FetchURL(url):
    feed = feedparser.parse(url)
    feed_items = feed['items']
def rss():
    url = request.form['rss_url']
    dictionary = feedparser.parse(url)
    import json
    return json.dumps(dictionary.entries)
def __init__(self, loop_counter): # 1回目のループで insert google_url = 'http://feeds.feedburner.com/GDBcode' dena_url = 'https://engineer.dena.com/index.xml' merukari_url = 'https://tech.mercari.com/feed' sakura_url = 'https://knowledge.sakura.ad.jp/rss/' # published ---> updated あとで処理 smatrcamp_url = 'https://tech.smartcamp.co.jp/rss' salesforce_url = 'https://developer.salesforce.com/jpblogs/feed/' asana_url = 'https://blog.asana.com/category/eng/feed/' insta_url = 'https://instagram-engineering.com/feed' indeed_url = 'https://engineering.indeedblog.com/blog/feed/' dropbox_url = 'https://dropbox.tech/feed' uber_url = 'https://eng.uber.com/feed/' spotify_url = 'https://labs.spotify.com/feed/' xml_urls = [ google_url, dena_url, merukari_url, sakura_url, smatrcamp_url, salesforce_url, asana_url, insta_url, indeed_url, dropbox_url, uber_url, spotify_url ] xml_titles = [ 'Google', 'Dena', 'Merukari', 'Sakura', 'Smatrcamp', 'SalesForce', 'Asana', 'Insta', 'indeed', 'DropBox', 'Uber', 'Spotify' ] #xml_urls =[google_url,dena_url,merukari_url,sakura_url] if loop_counter == 0: con = psycopg2.connect(host="localhost", database="feed", user="******", password="******") cur = con.cursor() item_id = 0 for x, t in zip(xml_urls, xml_titles): loop_count = 0 while loop_count < 3: feed_url = feedparser.parse(x) media_title = t print(media_title) post_title = feed_url['entries'][loop_count]['title'] post_url = feed_url['entries'][loop_count]['link'] if 'published' in feed_url['entries'][loop_count].keys(): post_date = feed_url['entries'][loop_count][ 'published'] elif 'updated' in feed_url['entries'][loop_count].keys(): post_date = feed_url['entries'][loop_count]['updated'] postgres_insert_query = """ INSERT INTO feed (id, media, title, url, date) VALUES (%s,%s,%s,%s,%s)""" record_to_insert = (item_id, media_title, post_title, post_url, post_date) cur.execute(postgres_insert_query, record_to_insert) con.commit() loop_count += 1 item_id += 1 # XML クロール終了 cur.execute("select id,media,title,url,date from feed") for r in cur: print(r) else: # 2回目のループで update con = psycopg2.connect(host="localhost", database="feed", user="******", password="******") cur = con.cursor() item_id = 0 for x, t in zip(xml_urls, xml_titles): loop_count = 0 while loop_count < 3: feed_url = feedparser.parse(x) media_title = t print(media_title) post_title = feed_url['entries'][loop_count]['title'] post_url = feed_url['entries'][loop_count]['link'] if 'published' in feed_url['entries'][loop_count].keys(): post_date = feed_url['entries'][loop_count][ 'published'] elif 'updated' in feed_url['entries'][loop_count].keys(): post_date = feed_url['entries'][loop_count]['updated'] sql_update_query = """Update feed set title=%s, url=%s, date=%s where id=%s""" cur.execute( sql_update_query, (post_title, post_url, post_date, str(item_id))) con.commit() loop_count += 1 item_id += 1 #cur.execute("select id,media,title,url,date from feed") #for r in cur: # print(r) cur.close() con.close()
link_container = soup.findAll("div", {"class": "blog-details clear"}) #print('linkcontainer ====>', link_container) links = list( map(lambda row: row.find_all("pre", class_="links"), link_container)) flat_list = [item for sublist in links for item in sublist] #print('flatlist ========>', flat_list) filteredLinks = list( filter(lambda entry: hosterName in entry.string, flat_list)) if (len(filteredLinks) > 0): return filteredLinks[0] return None if __name__ == '__main__': d = feedparser.parse('http://rmz.cr/feed') download_folder = config.get('downloadfolder') quality = config.get('quality') shows = config.get('shows') hosterShort = config.get('hosterShort') hosterName = config.get('hosterName') # Iterate through the entries and fetch the title and link, which is the relevant data print('###################start################### ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) prefiltered_values = list( filter(lambda x: hosterShort in x.title and quality in x.title, d['entries'])) for entry in prefiltered_values: raw_title = entry['title']
[["http://rss.lemonde.fr/c/205/f/3054/index.rss"],["le monde"]], [["http://fr.news.yahoo.com/rss/france"],["yahoo"]], [["http://fr.news.yahoo.com/rss/societe"],["yahoo"]], [["http://www.tf1.fr/xml/rss/0,,1,00.xml"],["tf1"]], [["http://www.20min.ch/rss/rss.tmpl?type=channel&get=6"],["20minutes"]], [["http://www.20min.ch/rss/rss.tmpl?type=channel&get=17"],["20minutes"]], [["http://www.francetvinfo.fr/titres.rss"],["francetv"]]] feeds = [] listemotcle=[] n=0 for element,journal in listeurl: listemotcle.append([]) MyFeedsConfig =element for feed in MyFeedsConfig: feeds.append(feedparser.parse(feed)) for feed in feeds: for item in feed[ "items" ]: phrase=item.title listemots = phrase.split() for elt in listemots: elt=elt.lower() elt=Enleve_Accents(elt) try: if elt[-1]=="s": elt=elt[0:-1] if elt[-1]==","or elt[-1]==".": elt=elt[0:-1] if "'" in elt: elt=elt[2:] if elt[-1]=='"':
#!/usr/bin/python
# encoding: utf-8
import hashlib, feedparser, requests, httplib, urllib

feed = feedparser.parse('http://www.addic7ed.com/rss.php?mode=hotspot')


# send a pushover message
def pushover(str):
    conn = httplib.HTTPSConnection("api.pushover.net:443")
    conn.request(
        "POST", "/1/messages.json",
        urllib.urlencode({
            "token": "yourtoken",
            "user": "******",
            "message": str,
        }), {"Content-type": "application/x-www-form-urlencoded"})
    conn.getresponse()
    return


def pushover2(str):
    conn = httplib.HTTPSConnection("api.pushover.net:443")
    conn.request(
        "POST", "/1/messages.json",
        urllib.urlencode({
            "token": "yourtoken",
            "user": "******",
            "message": str,
        }), {"Content-type": "application/x-www-form-urlencoded"})
    conn.getresponse()
import sys
from configparser import ConfigParser

import feedparser

import db_operations
import read_content

config_file_path = 'datafeeder/conn.config'
section_name = 'rss links'

# prepare database
db = db_operations.db_operations()
if not db.connect():
    exit(2)
print('rss read and database connected')

# read rss links from config file
config_parser = ConfigParser()
config_parser.read(config_file_path)
if (config_parser.has_section(section_name)):
    links = config_parser.items(section_name)
    for rss_link in links[0][1].split(','):
        feed = feedparser.parse(rss_link)
        if len(feed.entries) == 0:
            continue
        for post in feed.entries:
            title = post.title
            link = post.link
            if (not db.found_duplicate(link, title)):
                db.insert(rss_link, title, link)

read_content.read_content()
def getForecast(profile): return feedparser.parse("http://rss.wunderground.com/auto/rss_full/" + str(profile['location']))['entries']
import feedparser
import re
import sys
import os

# ../mylib.py
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from mylib import print_console


def strip(html):
    return re.sub('<[^<]+?>', '', html)


url = "http://www.priberam.pt/dlpo/DoDiaRSS.aspx"

try:
    f = feedparser.parse(url)
    wotd_l = strip(f["items"][0]["summary"]).split("\n")
except Exception:
    print_console("Error parsing results.")
    exit(-1)

s = "\002%s\002" % wotd_l[0]
for l in wotd_l[1:]:
    if len(l) > 1:
        l = l.strip()
        s += "%s\002;\002 " % (l)

print_console(s)
def get_urls(): try: urls = [] hit_list = [ 'http://braziljournal.com/rss', 'http://jovempan.uol.com.br/feed', 'http://www.huffpostbrasil.com/rss/index.xml', 'http://www.jb.com.br/noticias/rss.xml', 'http://www.jornaisvirtuais.com.br/feed/', 'http://aosfatos.org/noticias/feed/', 'https://apublica.org/feed/', 'http://br.sputniknews.com/export/rss2/archive/index.xml', 'https://catracalivre.com.br/feed/', 'https://www.metropoles.com/feed', 'http://www.opopular.com.br/cmlink/o-popular-%C3%BAltimas-1.272904', 'http://altamiroborges.blogspot.com/feeds/posts/default', 'http://avozeavezdajuventude.blogspot.com/feeds/posts/default', 'http://blogdoprimo.com.br/feed/', 'http://blogdoriella.com.br/feed/', 'http://blogdoskarlack.com/feed/', 'http://www.jornalcash.com.br/?feed=rss2', 'http://blogmarcosfrahm.com/feed/', 'http://congressoemfoco.uol.com.br/feed/', 'http://elielbezerra.blogspot.com/feeds/posts/default', 'http://osamigosdopresidentelula.blogspot.com/feeds/posts/default', 'http://outraspalavras.net/feed/', 'http://outroladodanoticia.com.br/feed/', 'http://polibiobraga.blogspot.com/feeds/posts/default', 'http://poncheverde.blogspot.com/feeds/posts/default', 'http://previdi.blogspot.com/feeds/posts/default', 'http://sembalela.com/feed/', 'http://www.blogdafloresta.com.br/feed/', 'http://www.blogdoataide.com.br/feed/', 'http://www.blogdoluciosorge.com.br/feed/', 'http://www.diariodocentrodomundo.com.br/feed/', 'http://www.fabiocampana.com.br/feed/', 'http://www.imprensaviva.com/feeds/posts/default', 'http://www.jaderbarbalho.com/v3/index.php/feed/', 'http://www.ma10.com.br/minard/feed/', 'https://www.ocafezinho.com/feed/', 'http://www.ocombatente.com/feed/', 'http://www.politicanarede.com/feeds/posts/default', 'http://www.redebrasilatual.com.br/ultimas-noticias/atom.xml', 'http://www.saibamais.jor.br/feed/', 'http://www.tijolaco.com.br/blog/feed/', 'http://www.vermelho.org.br/xml/rss_noticias.xml', 'https://blogdoneylopes.wordpress.com/feed/', 'http://br.sputniknews.com/export/rss2/archive/index.xml', 'https://osdivergentes.com.br/feed/', 'https://www.balaiodokotscho.com.br/feed/', 'https://www.brasildefato.com.br/rss2.xml', 'https://www.ceticismopolitico.org/feed/', 'https://www.domingoscosta.com.br/feed/', 'https://www.oantagonista.com/feed/', 'https://jornalivre.com/feed/', 'http://marcossilverio.blogspot.com/feeds/posts/default', 'http://mauriciostycer.blogosfera.uol.com.br/feed/', 'http://www.otvfoco.com.br/feed/', 'http://www.telesintese.com.br/feed/', 'http://www.vcfaz.tv/rssnews.php?f=17', 'http://feed.observatoriodatelevisao.bol.uol.com.br/feed', 'http://comunicadores.info/feed/', 'http://portaldapropaganda.com.br/noticias/feed/', 'http://www.administradores.com.br/rss/noticias/', 'http://www.bluebus.com.br/feed/', 'http://www.inteligemcia.com.br/feed/', 'http://www.papelpop.com/feed/', 'http://www.updateordie.com/feed/', 'http://www.mundodomarketing.com.br/feed/rss.xml', 'https://www.promoview.com.br/feed.rss', 'http://feeds.feedburner.com/gospelprime', 'http://justificando.cartacapital.com.br/feed/', 'https://www.comunique-se.com.br/feed/', 'https://www.torcedores.com/comments/feed', 'http://www.portalmidiaesporte.com/feeds/posts/default', 'http://www.esporteemidia.com/feeds/posts/default', 'https://blogdopaulinho.com.br/feed/', 'http://www.mktesportivo.com/feed/', 'http://www.mtesporte.com.br/rss.php', 'http://lulacerda.ig.com.br/feed/', 'https://www.tecmundo.com.br/busca?q=feed', 'https://www12.senado.leg.br/noticias/feed/todasnoticias/RSS', 
'https://www.ancine.gov.br/pt-br/rss.xml', 'https://gife.org.br/feed/', 'http://www.pt.org.br/feed/', 'http://servicios.lanacion.com.ar/herramientas/rss/origen=2' ] future_calls = [feedparser.parse(rss_url) for rss_url in hit_list] entries = [] for feed in future_calls: entries.extend(feed["items"]) for entrie in entries: href = entrie['link'] urls.append(href) return urls except: raise Exception('Exception in rss_multiplos')
def parse(self): # fetch etag and last modified head_response = requests.head( self.podcast.rss, headers=get_headers(), timeout=5 ) head_response.raise_for_status() headers = head_response.headers # if etag hasn't changed then we can skip etag = headers.get("ETag") if etag and etag == self.podcast.etag: return [] response = requests.get( self.podcast.rss, headers=get_headers(), stream=True, timeout=5 ) response.raise_for_status() data = feedparser.parse(response.content) feed = data["feed"] entries = {e["id"]: e for e in data.get("entries", []) if "id" in e}.values() if not entries: return [] dates = [d for d in [parse_date(e.get("published")) for e in entries] if d] now = timezone.now() if dates: pub_date = max([date for date in dates if date and date < now]) do_update = ( pub_date and self.podcast.last_updated is None or self.podcast.last_updated < pub_date ) if not do_update: return [] if etag: self.podcast.etag = etag self.podcast.title = feed["title"] self.podcast.description = feed["description"] self.podcast.language = feed.get("language", "en")[:2].strip().lower() self.podcast.explicit = bool(feed.get("itunes_explicit", False)) if not self.podcast.cover_image: image_url = None # try itunes image first soup = BeautifulSoup(response.content, "lxml") itunes_img_tag = soup.find("itunes:image") if itunes_img_tag and "href" in itunes_img_tag.attrs: image_url = itunes_img_tag.attrs["href"] if not image_url: try: image_url = feed["image"]["href"] except KeyError: pass try: if image_url and (img := fetch_image_from_url(image_url)): self.podcast.cover_image = img except InvalidImageURL: pass self.podcast.link = feed.get("link") categories_dct = get_categories_dict() keywords = [t["term"] for t in feed.get("tags", [])] categories = [categories_dct[kw] for kw in keywords if kw in categories_dct] self.podcast.last_updated = now self.podcast.pub_date = pub_date keywords = [kw for kw in keywords if kw not in categories_dct] self.podcast.keywords = " ".join(keywords) authors = set( [ author["name"] for author in feed.get("authors", []) if "name" in author and author["name"] ] ) self.podcast.authors = ", ".join(authors) self.podcast.extracted_text = self.extract_text(categories, entries) self.podcast.save() self.podcast.categories.set(categories) new_episodes = self.create_episodes_from_feed(entries) if new_episodes: self.podcast.pub_date = max(e.pub_date for e in new_episodes) self.podcast.save(update_fields=["pub_date"]) return new_episodes