# Example no. 1
	def get(self):
		"""Scrape article links from a DN.se section page via YQL and persist them.

		Reads the section URL from the request, queries YQL for all anchors
		inside div#main, filters out external/ad/mailto links, normalises the
		remaining hrefs to absolute dn.se URLs, and stores a SectionPageList
		keyed by date+url. Also records an md5 of the raw YQL result on the
		Section for change detection.
		"""
		url = self.request.get("url")
		section = models.Section.get_by_key_name(url)
		query = "select * from html where url = '"+ url +"' and xpath='//div[@id=\"main\"]//a'"
		result = helpers.do_yql(query)
		# Fingerprint the raw result so a later run can tell whether anything changed.
		resulthash = hashlib.md5(str(result)).hexdigest()
		section.last_hash = resulthash
		section.last_scraped = datetime.datetime.today()
		# Fragments that disqualify a link (partner sites, ad networks, mail links).
		# Replaces the previous eight-level nested-if pyramid with one check.
		excluded = (
			"http://www.newsmill.se",
			"http://www.pastan.nu",
			"http://www.nyteknik.se",
			"http://www.alltommotor.se",
			"http://www.motimate.se",
			"http://altfarm.mediaplex.com",
			"http://affiliate.se.espotting.com",
			"mailto:",
		)
		linklist = []
		for link in result['query']['results']['a']:
			href = link['href']
			if any(fragment in href for fragment in excluded):
				continue
			# Make relative links absolute against www.dn.se.
			if "http://www.dn.se" in href:
				linkurl = href
			else:
				linkurl = "http://www.dn.se" + href
			linkurl = linkurl.replace("#article-readers", "")
			if linkurl not in linklist:
				linklist.append(linkurl)
		# First 10 chars of the squashed timestamp give YYYYMMDDHH, so each
		# scrape hour gets its own SectionPageList entity.
		datekey = str(datetime.datetime.today()).replace("-", "").replace(" ", "")[0:10]
		sectionkey = datekey + url
		sectionpagelist = models.SectionPageList.get_or_insert(sectionkey, section=section, section_datekey=datekey, pagelist=linklist)
		self.response.out.write(sectionkey + "<br />")
		self.response.out.write(str(datetime.datetime.today()) + "<br />")
		self.response.out.write(str(len(linklist)) + "<br />")
		self.response.out.write(str(linklist) + "<br />")
		section.put()
# Example no. 2
	def get(self):
		"""Find a large (>200x200) image inside div#main of a DN.se page via
		YQL and store its absolute URL on the matching Item.

		On success, queues a /scrape/imagecache task for the item; when the
		lookup/parse fails, dumps the raw YQL result for debugging instead.
		"""
		url = self.request.get("url")
		item = models.Item.get_by_key_name(url)
		query = "select src, height, width from html where url='"+ url +"' and xpath='//div[@id=\"main\"]//img' and height > 200 and width > 200"
		result = helpers.do_yql(query)
		self.response.out.write(url)
		self.response.out.write("<br />")
		imgurl = False
		try:
			# Take the first matching image; odd/missing result shapes and a
			# missing item fall through to the except branch below.
			element = result['query']['results']['img'][0]
			if "www.dn.se" in element['src']:
				imgurl = element['src']
			else:
				# Relative src: make it absolute.
				imgurl = "http://www.dn.se%s" % (element['src'])
			self.response.out.write("<img src=\"%s\"/>" % imgurl)
			# imgurl is always a non-empty string here, so the previous
			# `if imgurl: ... else: item.img_url = False` branch was dead code.
			item.img_url = imgurl
			item.put()
			self.response.out.write("<br />")
			self.response.out.write(imgurl)
		except (KeyError, IndexError, TypeError, AttributeError):
			# Narrowed from a bare except: these are the failures the original
			# best-effort path was meant to absorb (no image / item is None).
			self.response.out.write(result)
		self.response.out.write("<br />")
		if item:
			self.response.out.write("is an item")
			taskqueue.add(url='/scrape/imagecache', params={"url":item.item_url, "imgurl":imgurl}, method='GET')
		else:
			self.response.out.write("scrape item")
		self.response.out.write("<br />")
# Example no. 3
	def get(self):
		"""Scrape an RSS feed via YQL, upsert an Item per entry, and echo each
		entry's fields; records an md5 of the raw result on the Feed for
		change detection.
		"""
		url = self.request.get("url")
		query = "select title, link, description, category.content, creator, pubDate, link from rss where url='"+ url +"'"
		result = helpers.do_yql(query)
		feed = models.Feed.get_by_key_name(url)
		# Fingerprint the raw result so a later run can tell whether anything changed.
		resulthash = hashlib.md5(str(result)).hexdigest()
		feed.last_hash = resulthash
		feed.last_scraped = datetime.datetime.today()
		self.response.out.write(resulthash)
		self.response.out.write("<br />")
		self.response.out.write("<br />")
		try:
			for element in result['query']['results']['item']:
				# Keyed on the entry link, so re-runs are idempotent.
				models.Item.get_or_insert(element['link'], item_url=element['link'], title=element['title'], category=element['category'], description=element['description'], byline=element['creator'], pubDate=element['pubDate'])
				for field in ('category', 'description', 'title', 'pubDate', 'creator', 'link'):
					self.response.out.write(element[field])
					self.response.out.write("<br />")
				self.response.out.write("<br />")
		except (KeyError, TypeError):
			# Narrowed from a bare except: an empty or malformed feed result
			# (missing key, or a None level in the response dict).
			self.response.out.write("fail")
		feed.put()
# Example no. 4
	def get(self):
		"""Scrape guide-placed markers out of a hard-coded Google My Maps page.

		Parses the inline JavaScript of the map page fetched via YQL, extracts
		lat/lng and note text for markers whose payload contains the "KEOB"
		token, and stores Location / NewsItem records keyed by an md5 of the
		coordinate string.
		"""
		# NOTE(review): `url` is read from the request but never used — the map
		# URL in the query below is hard-coded; confirm this is intentional.
		url = self.request.get("url")
		query = "select script from html where url=\"http://maps.google.com/maps/ms?ie=UTF8&hl=en&msa=0&ll=-3.073324,37.411366&spn=0.126847,0.245819&t=h&z=13&msid=116890249293007182618.00048088bb3d7fc8d89e1\""
		results = helpers.do_yql(query)['query']['results']['body']
		for result in results:
			item = str(result['script']['content'])
			# Only scripts containing the marker token "KEOB" carry marker data.
			if "KEOB" in item:
				things = item.split("KEOB")
				# Assumes segment 1 holds the "lat,lng" pair and segment 3 the
				# info-window markup — TODO confirm against the live page source.
				geoelements = things[1].split("@")
				messageelements = things[3].split("@")
				# Coordinates run up to the first double quote.
				latlng = geoelements[1][0:geoelements[1].find("\"")]
				lat = latlng.split(",")[0]
				lng = latlng.split(",")[1]
				# Strip escaped JS/HTML noise (x3c/x3e are escaped < and >) and
				# split the info window on its ":brbr" separator into two parts.
				messagebits = messageelements[1].replace("\",infoWindow:{title:\"", "").replace("\\", "").replace("x3c", "").replace("x3e", "").replace("/div", "").replace("\n", " ").split(":brbr")
				self.response.out.write(lat)
				self.response.out.write("<br />")
				self.response.out.write(lng)
				self.response.out.write("<br />")
				self.response.out.write("<br />")
				note = "<strong>From TeamKilimanjaro Guides</strong><br />" + messagebits[1].strip() + "<br />" + messagebits[0].strip()
				self.response.out.write(note)
				self.response.out.write("<br />")
				self.response.out.write("<br />")
				# Keyed by coordinates, so re-scraping the same marker is idempotent.
				keyhash = hashlib.md5(lat + lng).hexdigest();
				location = models.Location.get_or_insert(keyhash, lat=lat, lng=lng, note=note)
				geolist = models.List.get_or_insert("geolist", list=[])
				if keyhash not in geolist.list:
					geolist.list.append(keyhash)
					# NOTE(review): geolist is appended to but never put(), unlike
					# the other list-tracking handlers in this file — the seen-set
					# may not persist between requests; confirm.
					newsitem = models.NewsItem.get_or_insert(keyhash, text=note)
# Example no. 5
	def get(self):
		"""Pull the ten most recent public photos for the expedition Flickr
		account via YQL, persist any not seen before, and echo the details."""
		query = "SELECT * FROM flickr.people.publicphotos(0,10) WHERE user_id='51711675@N02' AND extras='url_sq, date_upload, url_m'"
		results = helpers.do_yql(query)['query']['results']['photo']
		flickrlist = models.List.get_or_insert("flickrlist", list=[])
		for entry in results:
			uploaded_at = datetime.datetime.fromtimestamp(float(entry['dateupload']))
			photo_id = entry['id']
			secret = entry['secret']
			thumb_url = entry['url_sq']
			medium_url = entry['url_m']
			caption = entry['title']
			if secret not in flickrlist.list:
				# First sighting: remember the secret, store a Flickr record,
				# queue short-url generation, and publish a news item.
				flickrlist.list.append(secret)
				flickrlist.put()
				flickr = models.Flickr.get_or_insert(secret, flickr_secret=secret, created_at=uploaded_at, flickr_id=photo_id, square=thumb_url, medium=medium_url, title=caption)
				taskqueue.add(url="/scrape/flickrshortner", params={"flickr_secret": secret}, method='GET')
				message = "<span class='newsimg'><img src='"+ thumb_url +"' height='45' width='45' alt='"+ caption +"'/></span>" + "<a href='javascript:showImage(\""+ secret +"\")'>" + caption + "</a>"
				newsitem = models.NewsItem.get_or_insert(secret, text=message, created_at=uploaded_at)
			# Debug echo of every photo, seen before or not.
			for value in (uploaded_at, thumb_url, medium_url, caption, photo_id):
				self.response.out.write(value)
				self.response.out.write("<br />")
			self.response.out.write("<br />")
# Example no. 6
 def get(self):
     """Pick N random trackers of a Lanyrd event page.

     Fetches the tracker profile links via YQL, draws random indices from
     Random.org, and writes the winning usernames to the response.
     """
     people = []
     # gets the Lanyrd URL
     lanyrd = self.request.get("lanyrd")
     # number of winners requested; defaults to 5 when absent or not an int
     number = self.request.get("number")
     if number:
         try:
             number = int(number)
         except ValueError:
             # narrowed from a bare except: only a non-numeric value applies
             number = 5
     else:
         number = 5
     # YQL query for the anchors of everyone tracking the event
     yqlquery = "select href from html where url=\"" + lanyrd + "\" and xpath=\"//div[@class='trackers-placeholder placeholder']/ul/li/a\""
     logging.warn(yqlquery)
     # need to put some handling in here for when there are no trackers, assumes that there are
     results = helpers.do_yql(yqlquery)["query"]["results"]["a"]
     if results:
         # over-sample random numbers to survive collisions in small sample
         # sets; still needs optimising for large tracker/winner counts
         maxrandoms = (len(results) - 1) * number
         maxvalue = len(results) - 1
         # build query for Random.org (plain text, one integer per line)
         randomquery = "http://www.random.org/integers/?num=" + str(
             maxrandoms) + "&min=0&max=" + str(
                 maxvalue) + "&col=1&base=10&format=plain&rnd=new"
         # and go fetch it
         result = urlfetch.fetch(randomquery)
         if result.status_code == 200:
             # split() (rather than split("\n")) fixes the old TODO: it drops
             # the empty trailing entry Random.org's final newline produced
             randoms = result.content.split()
             # show how many numbers and, for transparency, the raw array
             self.response.out.write(
                 "<div>Array of " + str(maxrandoms) +
                 " random numbers from Random.org</div><hr /><div>" +
                 str(randoms) + "</div><hr /><div>Selection of " +
                 str(number) + " random trackers from " +
                 str(len(results)) + " of " + lanyrd + ":</div><br />")
             # walk the random indices until enough unique people are picked
             for i in randoms:
                 if len(people) < number:
                     try:
                         name = results[int(i)]["href"].replace(
                             "/profile/", "").replace("/", "")
                         # collisions are likely in small sample sets
                         if name not in people:
                             people.append(name)
                             self.response.out.write("<div>" + name +
                                                     "</div>")
                     except (ValueError, IndexError):
                         # narrowed from a bare except: unparsable or
                         # out-of-range index — skip this draw
                         pass
                 else:
                     break
# Example no. 7
	def get(self):
		"""Scrape a JustGiving fundraising page (totals and individual
		donations) via YQL, refresh the DonationSet, and create
		Donation/NewsItem records for donations not seen before."""
		url = self.request.get("url")
		# One query, two xpaths: the totals table cells and the donation rows.
		query = "select * from html where url=\"%s\" and (xpath=\"//table[@class='frp-totals']/tbody/tr/td/p\" or xpath=\"//table[@class='tbl-donations']/tbody/tr\")" % url
		results = helpers.do_yql(query)['query']['results']
		totals = results['p']
		donations = results['tr']
		# [1:] drops the leading character (presumably a currency symbol —
		# TODO confirm) and the replace strips thousands separators.
		online = float(totals[0][1:len(totals[0])].replace(",", ""))
		offline = float(totals[1][1:len(totals[1])].replace(",", ""))
		giftaid = float(totals[2][1:len(totals[2])].replace(",", ""))
		donationset = models.DonationSet.get_or_insert(url, url=url, online=online, offline=offline, giftaid=giftaid)
		donationlist = models.List.get_or_insert("donationlist", list=[])
		# get_or_insert only sets values on first creation; re-assign and put()
		# so the totals stay fresh on every scrape.
		donationset.online = online
		donationset.offline = offline
		donationset.giftaid = giftaid
		donationset.put()
		self.response.out.write(online)
		self.response.out.write("<br />")
		self.response.out.write(offline)
		self.response.out.write("<br />")
		self.response.out.write(giftaid)
		self.response.out.write("<br />")
		self.response.out.write("<br />")
		for donation in donations:
			donationdetails = donation['td'][1]['div']['p']['span']
			self.response.out.write(donationdetails)
			message = donationdetails[0]['content'].replace("\n", " ")
			# Positional layout differs for anonymous donations: when there is
			# no <strong> donor name, the date sits one slot earlier.
			try:
				donor = donationdetails[1]['strong'].strip().replace("\n", " ")
				date = helpers.convert_justgiving_datetime(donationdetails[2])
				rawdate = donationdetails[2]
			except:
				donor = "Anonymous"
				date = helpers.convert_justgiving_datetime(donationdetails[1])
				rawdate = donationdetails[1]
			# The amount cell may be absent entirely (e.g. hidden amounts).
			try:
				rawamount = donation['td'][2]['strong']
				amount = rawamount
				#			amount = rawamount[1:len(rawamount)]
			except:
				amount = ""
			# Key on url+donor+date so re-scrapes don't duplicate donations.
			keyhash = hashlib.md5(str(url + donor + rawdate.replace("/", ""))).hexdigest()
			if keyhash not in donationlist.list:
				donation = models.Donation.get_or_insert(keyhash, keyhash=keyhash, date=date, amount=amount, donor=donor, message=message)
				donationlist.list.append(keyhash)
				donationlist.put()
				message = "<strong>" + donor + "</strong> has donated " + amount
				newsitem = models.NewsItem.get_or_insert(keyhash, text=message)
			self.response.out.write(message)
			self.response.out.write("<br />")
			self.response.out.write(donor)
			self.response.out.write("<br />")
			self.response.out.write(date)
			self.response.out.write("<br />")
			self.response.out.write(amount)
			self.response.out.write("<br />")
			self.response.out.write(keyhash)
			self.response.out.write("<br />")
			self.response.out.write("<br />")
# Example no. 8
	def get(self):
		"""Resolve a short link for a Flickr photo via YQL against
		timparenti.com, store it on the Flickr record the first time, and
		queue a tweet announcing the photo."""
		flickr = models.Flickr.get_by_key_name(self.request.get("flickr_secret"))
		query = "select content from html where url=\"http://www.timparenti.com/dev/flickr/shortlink/?id=%s\" and xpath=\"//a[@id='shortLink']\"" % flickr.flickr_id
		results = helpers.do_yql(query)['query']['results']['a']
		# Only the first successful resolution writes/tweets; later calls are no-ops.
		if not flickr.short_url:
			flickr.short_url = results
			flickr.put()
			tweet_text = flickr.title + " : " + flickr.short_url
			taskqueue.add(url="/twitter/post", params={"secret": "k1lim4njar0", "message": tweet_text}, method='GET')
		self.response.out.write(results)
# Example no. 9
def get_location(lockey, lat, lon):
	"""Reverse-geocode a lat/lon pair to a "place, country" string via YQL.

	Args:
		lockey: memcache key under which the resolved string is cached.
		lat, lon: coordinate strings interpolated into the YQL query.

	Returns:
		"<place name>, <country>" on success, or "not found" when the
		lookup yields no usable place. Successful results are cached in
		memcache for 10000 seconds.
	"""
	location = memcache.get(lockey)
	if not location:
		query = "select * from geo.places where woeid in (select place.woeid from flickr.places where lat='" + lat +"' and  lon='"+ lon +"')"
		result = helpers.do_yql(query)
		try:
			yql_location = result['query']['results']['place']
			location = yql_location['name'] + ", " + yql_location['country']['content']
			memcache.add(lockey, location, 10000)
		except (KeyError, TypeError):
			# Narrowed from a bare except: a missing key or a None level in
			# the YQL response means there is no place for these coordinates.
			location = "not found"
	return location
# Example no. 10
    def get(self):
        """Pick N random trackers of a Lanyrd event page.

        Fetches the tracker profile links via YQL, draws random indices from
        Random.org, and writes the winning usernames to the response.
        """
        people = []
        # gets the Lanyrd URL
        lanyrd = self.request.get("lanyrd")
        # number of winners requested; defaults to 5 when absent or not an int
        number = self.request.get("number")
        if number:
            try:
                number = int(number)
            except ValueError:
                # narrowed from a bare except: only a non-numeric value applies
                number = 5
        else:
            number = 5
        # YQL query for the anchors of everyone tracking the event
        yqlquery = "select href from html where url=\""+ lanyrd +"\" and xpath=\"//div[@class='trackers-placeholder placeholder']/ul/li/a\""
        logging.warn(yqlquery)
        # need to put some handling in here for when there are no trackers, assumes that there are
        results = helpers.do_yql(yqlquery)["query"]["results"]["a"]
        if results:
            # over-sample random numbers to survive collisions in small sample
            # sets; still needs optimising for large tracker/winner counts
            maxrandoms = (len(results) - 1) * number
            maxvalue = len(results) - 1
            # build query for Random.org (plain text, one integer per line)
            randomquery = "http://www.random.org/integers/?num=" + str(maxrandoms) + "&min=0&max="+ str(maxvalue) +"&col=1&base=10&format=plain&rnd=new"
            # and go fetch it
            result = urlfetch.fetch(randomquery)
            if result.status_code == 200:
                # split() (rather than split("\n")) fixes the old TODO: it drops
                # the empty trailing entry Random.org's final newline produced
                randoms = result.content.split()
                # show how many numbers and, for transparency, the raw array
                self.response.out.write("<div>Array of "+ str(maxrandoms) +" random numbers from Random.org</div><hr /><div>" + str(randoms) + "</div><hr /><div>Selection of "+ str(number)+ " random trackers from "+ str(len(results)) + " of " + lanyrd +":</div><br />")
                # walk the random indices until enough unique people are picked
                for i in randoms:
                    if len(people) < number:
                        try:
                            name = results[int(i)]["href"].replace("/profile/", "").replace("/", "")
                            # collisions are likely in small sample sets
                            if name not in people:
                                people.append(name)
                                self.response.out.write("<div>" + name + "</div>")
                        except (ValueError, IndexError):
                            # narrowed from a bare except: unparsable or
                            # out-of-range index — skip this draw
                            pass
                    else:
                        break
# Example no. 11
	def get(self):
		"""Scrape the expedition Twitter account's RSS timeline via YQL,
		persist tweets not seen before, and echo each tweet's details."""
		query = "select guid, title, pubDate from rss where url='http://twitter.com/statuses/user_timeline/162386848.rss'"
		results = helpers.do_yql(query)['query']['results']['item']
		twitterlist = models.List.get_or_insert("twitterlist", list=[])
		for entry in results:
			created_at = helpers.convert_twitter_rss_datetime(entry['pubDate'])
			text = entry['title'].replace("\n", " ")
			# The guid is the tweet's status URL; strip the prefix to get the id.
			tweet_id = entry['guid'].replace("http://twitter.com/childsiklimb/statuses/", "")
			if tweet_id not in twitterlist.list:
				# First sighting: remember the id, store the tweet, publish news.
				twitterlist.list.append(tweet_id)
				twitterlist.put()
				tweet = models.Tweet.get_or_insert(tweet_id, tweet_id=tweet_id, text=text, created_at=created_at)
				# Note: `message` is built but the NewsItem stores the raw text.
				message = "<span class='tweet'>" + text + "</span>"
				newsitem = models.NewsItem.get_or_insert(tweet_id, text=text, created_at=created_at)
			# Debug echo of every tweet, seen before or not.
			for value in (created_at, tweet_id, text):
				self.response.out.write(value)
				self.response.out.write("<br />")
			self.response.out.write("<br />")
			self.response.out.write("<br />")
	def get(self):
		"""Index one page (3 entries) of an RSS feed fetched via YQL.

		Reads `url` and an optional 1-based `page` from the request, processes
		feed entries numbered [low, high], normalises each entry's link
		according to the feed's site code, persists Items, and queues the next
		page when the feed had more entries than this window.
		"""
		# parsedatetime calendar, intended for fuzzy pubDate parsing below.
		p = pdt.Calendar()
		url = self.request.get("url")
		if self.request.get("page"):
			page = self.request.get("page")
		else: 
			page = "1"
		# Page window: 3 entries per page, 1-based inclusive bounds.
		pagesize = 3
		low = ((int(page) - 1) * pagesize) + 1
		high = int(page) * pagesize
		query = "select title, link, description, category.content, creator, pubDate, link from rss where url='"+ url +"'"
		result = helpers.do_yql(query)
		self.response.out.write(url)
		self.response.out.write("<br />")
		feed = models.Feed.get_by_key_name(url)
		self.response.out.write(feed.site.code)
		self.response.out.write("<br />")
		# Fingerprint the raw result for change detection on the Feed.
		resulthash = hashlib.md5(str(result)).hexdigest()
		feed.last_hash = resulthash
		feed.last_scraped = datetime.datetime.today()
		self.response.out.write(resulthash)
		self.response.out.write("<br />")
		self.response.out.write("<br />")
		self.response.out.write(str(low) + "/" + str(high))
		self.response.out.write("<br />")
		self.response.out.write("<br />")
		index = True
		i = 0
		for element in result['query']['results']['item']:
			i += 1
			try:
				# Only process entries that fall inside the current page window.
				if i <= high and i >= low:
					self.response.out.write(i)
					self.response.out.write("<br />")
					category = ""
					description = ""
					title = ""
					pub_date = ""
					creator = ""
					link = ""
					# Every field is optional: each try/except leaves the ""
					# default in place when the feed entry lacks that key.
					try:
						category = element['category']
						self.response.out.write(element['category'])
						self.response.out.write("<br />")
					except:
						error = True
					try:
						# Descriptions that embed images are skipped entirely.
						if not "img" in element['description']:
							description = element['description']
							self.response.out.write(element['description'])
							self.response.out.write("<br />")
					except:
						error = True
					try:
						title = element['title']
						self.response.out.write(element['title'])
						self.response.out.write("<br />")
					except:
						error = True
					try:
						self.response.out.write(element['pubDate'])
						self.response.out.write("<br />")
						# NOTE(review): this calls the `datetime` *module*
						# (used as datetime.datetime elsewhere in this file),
						# which raises and is swallowed below — pub_date always
						# stays "". Probably meant something like
						# datetime.datetime(*p.parse(...)[0][:6]); confirm.
						pub_date = datetime(p.parse(element['pubDate']))
						self.response.out.write(pub_date)
						self.response.out.write("<br />")
					except:
						error = True
					try:
						creator = element['creator'].title()
						self.response.out.write(element['creator'].title())
						self.response.out.write("<br />")
					except:
						error = True
					try:
						self.response.out.write(element['link'])
						# "phdo" links are redirectors: fetch to find the final URL.
						if "phdo" in str(element['link']):
							lookup = urlfetch.fetch(element['link'])
							self.response.out.write("<br />")
							self.response.out.write(lookup.status_code)
							self.response.out.write("<br />")
							self.response.out.write(lookup.final_url)
							self.response.out.write("<br />")
							link = lookup.final_url
						else:
							link = element['link']
						# Strip each site's RSS tracking junk from the link.
						if feed.site.code == "nyt":
							link = link.replace("?partner=rss&emc=rss", "")
						elif feed.site.code == "wsj":
							link, sep, discard = link.partition("?mod")
						elif feed.site.code == "sfc":
							link, sep, discard = link.partition("&feed")
						elif feed.site.code == "wp":
							link = link.replace("?nav=rss_email/components", "")
						else:
							self.response.out.write("unknown url")
						self.response.out.write(link)
						self.response.out.write("<br />")
						self.response.out.write("Now to persist")
						self.response.out.write("<br />")
						# NOTE(review): keyed on the *feed* url, so every entry
						# of a feed collapses into one Item entity — possibly
						# meant to key on `link` as the other handlers do; confirm.
						item = models.Item.get_or_insert(url, site=feed.site, url=link, title=title, category=category, description=description, byline=creator, pub_date=pub_date)
					except:
						temp = True
						self.response.out.write("failed to persist")
						self.response.out.write("<br />")
			except:
				self.response.out.write(element)
		self.response.out.write("<br />")
		# Queue the next page when the feed had entries beyond this window.
		if i > high:
			taskqueue.add(url='/index/feed', params={'page': str(int(page) + 1), "url":url}, method='GET')
		feed.put()