def get(self):
    """Scrape a dn.se section page and store the article links found on it.

    Reads the ``url`` request parameter, asks YQL for every ``<a>`` inside
    ``div#main`` of that page, records a hash of the raw result and the
    scrape time on the matching ``models.Section``, and stores the
    de-duplicated list of absolute links in a ``models.SectionPageList``
    keyed by date-hour + url.  A short debug summary is written to the
    response.
    """
    # Links containing any of these markers (partner sites, ad networks,
    # mail links) are skipped.
    ignored_markers = (
        "http://www.newsmill.se",
        "http://www.pastan.nu",
        "http://www.nyteknik.se",
        "http://www.alltommotor.se",
        "http://www.motimate.se",
        "http://altfarm.mediaplex.com",
        "http://affiliate.se.espotting.com",
        "mailto:",
    )
    site_root = "http://www.dn.se"

    url = self.request.get("url")
    section = models.Section.get_by_key_name(url)
    # NOTE(review): ``url`` is spliced into the YQL statement unescaped --
    # confirm that only trusted callers can reach this handler.
    query = "select * from html where url = '" + url + "' and xpath='//div[@id=\"main\"]//a'"
    result = helpers.do_yql(query)

    section.last_hash = hashlib.md5(str(result)).hexdigest()
    section.last_scraped = datetime.datetime.today()

    linklist = []
    for link in result['query']['results']['a']:
        href = link['href']
        if any(marker in href for marker in ignored_markers):
            continue
        # Relative links are made absolute against the site root.
        if site_root in href:
            linkurl = href
        else:
            linkurl = site_root + href
        linkurl = linkurl.replace("#article-readers", "")
        if linkurl not in linklist:
            linklist.append(linkurl)

    # "YYYY-MM-DD HH:MM:SS..." with "-" and " " removed, cut to 10 chars,
    # i.e. YYYYMMDDHH: at most one page list per section per hour.
    datekey = str(datetime.datetime.today()).replace("-", "").replace(" ", "")[0:10]
    sectionkey = datekey + url
    models.SectionPageList.get_or_insert(
        sectionkey, section=section, section_datekey=datekey, pagelist=linklist)

    out = self.response.out
    out.write(sectionkey + "<br />")
    out.write(str(datetime.datetime.today()) + "<br />")
    out.write(str(len(linklist)) + "<br />")
    out.write(str(linklist) + "<br />")
    section.put()
def get(self):
    """Find the first large image on an article page and queue it for caching.

    Reads the ``url`` request parameter, asks YQL for images inside
    ``div#main`` that are larger than 200x200, and stores the first match
    (made absolute against www.dn.se) on the ``models.Item`` keyed by that
    url.  When the item exists an ``/scrape/imagecache`` task is queued with
    the image url (``False`` when none was found).  Progress is echoed to
    the response for debugging.

    Only a missing or malformed YQL result is treated as "no image";
    datastore errors from ``item.put()`` are no longer swallowed.
    """
    out = self.response.out
    url = self.request.get("url")
    item = models.Item.get_by_key_name(url)
    query = "select src, height, width from html where url='" + url + "' and xpath='//div[@id=\"main\"]//img' and height > 200 and width > 200"
    result = helpers.do_yql(query)
    out.write(url)
    out.write("<br />")

    imgurl = False
    try:
        src = result['query']['results']['img'][0]['src']
        if "www.dn.se" in src:
            imgurl = src
        else:
            imgurl = "http://www.dn.se%s" % (src)
    except (KeyError, IndexError, TypeError):
        # No usable image in the response: dump the raw result instead.
        imgurl = False
        out.write(result)
        out.write("<br />")
    else:
        out.write("<img src=\"%s\"/>" % imgurl)
        if item:
            item.img_url = imgurl
            item.put()
        out.write("<br />")
        out.write(imgurl)

    if item:
        out.write("is an item")
        taskqueue.add(url='/scrape/imagecache',
                      params={"url": item.item_url, "imgurl": imgurl},
                      method='GET')
    else:
        out.write("scrape item")
    out.write("<br />")
def get(self):
    """Scrape an RSS feed through YQL and store each entry as a ``models.Item``.

    Reads the ``url`` request parameter, records a hash of the raw YQL
    result and the scrape time on the matching ``models.Feed``, then
    inserts one ``models.Item`` per feed entry (keyed by the entry link)
    and echoes the entry fields to the response.

    The first entry that fails stops the loop and writes ``fail``; the feed
    entity is saved either way.
    """
    out = self.response.out
    url = self.request.get("url")
    query = "select title, link, description, category.content, creator, pubDate, link from rss where url='" + url + "'"
    result = helpers.do_yql(query)
    feed = models.Feed.get_by_key_name(url)

    resulthash = hashlib.md5(str(result)).hexdigest()
    feed.last_hash = resulthash
    feed.last_scraped = datetime.datetime.today()
    out.write(resulthash)
    out.write("<br />")
    out.write("<br />")

    # Order in which the entry fields are echoed to the response.
    echoed_fields = ('category', 'description', 'title', 'pubDate', 'creator', 'link')
    try:
        for element in result['query']['results']['item']:
            models.Item.get_or_insert(
                element['link'],
                item_url=element['link'],
                title=element['title'],
                category=element['category'],
                description=element['description'],
                byline=element['creator'],
                pubDate=element['pubDate'])
            for field in echoed_fields:
                out.write(element[field])
                out.write("<br />")
            out.write("<br />")
    except Exception:
        # Best effort: a bad/missing entry must not prevent feed.put() below.
        # ``Exception`` (rather than a bare except) lets interpreter-level
        # interrupts propagate.
        out.write("fail")
    feed.put()
def get(self):
    """Scrape guide notes and coordinates from a fixed Google Maps page.

    Fetches the ``<script>`` contents of a hard-coded Google "My Maps" url
    through YQL.  For every script containing the ``KEOB`` marker it cuts
    out a ``lat,lng`` pair and a two-part message, echoes them to the
    response, and stores them as ``models.Location`` / ``models.NewsItem``
    keyed by the md5 of ``lat + lng``.  New keys are appended to the
    ``geolist`` list entity, which is saved so they are remembered across
    requests.
    """
    out = self.response.out
    query = "select script from html where url=\"http://maps.google.com/maps/ms?ie=UTF8&hl=en&msa=0&ll=-3.073324,37.411366&spn=0.126847,0.245819&t=h&z=13&msid=116890249293007182618.00048088bb3d7fc8d89e1\""
    results = helpers.do_yql(query)['query']['results']['body']
    for result in results:
        item = str(result['script']['content'])
        if "KEOB" not in item:
            continue

        things = item.split("KEOB")
        geoelements = things[1].split("@")
        messageelements = things[3].split("@")

        # The coordinates run from the start of the second "@" chunk up to
        # the first double quote.
        latlng = geoelements[1][0:geoelements[1].find("\"")]
        coords = latlng.split(",")
        lat = coords[0]
        lng = coords[1]

        # Strip the infoWindow prefix, backslashes, the "x3c"/"x3e" escape
        # remnants and "/div", then split the two message parts on ":brbr".
        messagebits = (
            messageelements[1]
            .replace("\",infoWindow:{title:\"", "")
            .replace("\\", "")
            .replace("x3c", "")
            .replace("x3e", "")
            .replace("/div", "")
            .replace("\n", " ")
            .split(":brbr")
        )

        out.write(lat)
        out.write("<br />")
        out.write(lng)
        out.write("<br />")
        out.write("<br />")
        note = "<strong>From TeamKilimanjaro Guides</strong><br />" + messagebits[1].strip() + "<br />" + messagebits[0].strip()
        out.write(note)
        out.write("<br />")
        out.write("<br />")

        keyhash = hashlib.md5(lat + lng).hexdigest()
        models.Location.get_or_insert(keyhash, lat=lat, lng=lng, note=note)
        geolist = models.List.get_or_insert("geolist", list=[])
        if keyhash not in geolist.list:
            geolist.list.append(keyhash)
            # Persist the updated list; without this the append is lost at
            # the end of the request.
            geolist.put()
            models.NewsItem.get_or_insert(keyhash, text=note)
def get(self):
    """Import a fixed Flickr user's public photos reported by YQL.

    For each photo whose secret is not yet in the ``flickrlist`` list
    entity: remember the secret, store a ``models.Flickr`` entity, queue a
    ``/scrape/flickrshortner`` task and create a ``models.NewsItem`` with a
    thumbnail + link snippet.  Every photo's details are echoed to the
    response.
    """
    write = self.response.out.write
    query = "SELECT * FROM flickr.people.publicphotos(0,10) WHERE user_id='51711675@N02' AND extras='url_sq, date_upload, url_m'"
    photos = helpers.do_yql(query)['query']['results']['photo']
    flickrlist = models.List.get_or_insert("flickrlist", list=[])

    for photo in photos:
        # 'dateupload' is converted from a timestamp to a datetime.
        created_at = datetime.datetime.fromtimestamp(float(photo['dateupload']))
        flickr_id = photo['id']
        flickr_secret = photo['secret']
        square = photo['url_sq']
        medium = photo['url_m']
        title = photo['title']

        if flickr_secret not in flickrlist.list:
            flickrlist.list.append(flickr_secret)
            flickrlist.put()
            models.Flickr.get_or_insert(
                flickr_secret,
                flickr_secret=flickr_secret,
                created_at=created_at,
                flickr_id=flickr_id,
                square=square,
                medium=medium,
                title=title)
            taskqueue.add(url="/scrape/flickrshortner",
                          params={"flickr_secret": flickr_secret},
                          method='GET')
            # NOTE(review): title/square are inserted into HTML unescaped.
            message = "".join([
                "<span class='newsimg'><img src='", square,
                "' height='45' width='45' alt='", title, "'/></span>",
                "<a href='javascript:showImage(\"", flickr_secret, "\")'>",
                title, "</a>",
            ])
            models.NewsItem.get_or_insert(flickr_secret, text=message, created_at=created_at)

        for value in (created_at, square, medium, title, flickr_id):
            write(value)
            write("<br />")
        write("<br />")
def get(self):
    """Pick random trackers of a Lanyrd event using numbers from Random.org.

    Request parameters:
        lanyrd: url of the Lanyrd page whose tracker list is scraped via YQL.
        number: how many distinct trackers to pick; defaults to 5 when
            missing or not an integer.

    Writes the fetched random numbers (for transparency) followed by one
    ``<div>`` per selected tracker name.  Nothing is written when the page
    has no trackers or Random.org does not answer with status 200.
    """
    # Local import: escapes &, < and > in values echoed into the HTML response.
    from xml.sax.saxutils import escape

    people = []
    lanyrd = self.request.get("lanyrd")
    try:
        number = int(self.request.get("number"))
    except (TypeError, ValueError):
        number = 5

    # NOTE(review): ``lanyrd`` is spliced into the YQL statement unescaped.
    yqlquery = "select href from html where url=\"" + lanyrd + "\" and xpath=\"//div[@class='trackers-placeholder placeholder']/ul/li/a\""
    logging.warn(yqlquery)
    try:
        results = helpers.do_yql(yqlquery)["query"]["results"]["a"]
    except (KeyError, TypeError):
        # YQL returned no matching anchors.
        results = None
    if not results:
        logging.warn("no trackers found for " + lanyrd)
        return

    # Ask for many more numbers than needed so that duplicates in small
    # sample sets still leave enough distinct picks.  max(..., 1) keeps the
    # request valid when there is exactly one tracker.
    maxvalue = len(results) - 1
    maxrandoms = max(maxvalue, 1) * number
    randomquery = "http://www.random.org/integers/?num=" + str(maxrandoms) + "&min=0&max=" + str(maxvalue) + "&col=1&base=10&format=plain&rnd=new"
    result = urlfetch.fetch(randomquery)
    if result.status_code != 200:
        return

    # One number per line; split() also discards the trailing empty entry.
    randoms = result.content.split()
    self.response.out.write(
        "<div>Array of " + str(maxrandoms) + " random numbers from Random.org</div><hr /><div>"
        + str(randoms) + "</div><hr /><div>Selection of " + str(number)
        + " random trackers from " + str(len(results)) + " of " + escape(lanyrd) + ":</div><br />")

    for raw in randoms:
        if len(people) >= number:
            break
        try:
            name = results[int(raw)]["href"].replace("/profile/", "").replace("/", "")
        except (ValueError, IndexError, KeyError, TypeError):
            # Not a usable index (e.g. an unexpected line in the reply).
            continue
        # The same person can be drawn twice, especially in small sets.
        if name not in people:
            people.append(name)
            self.response.out.write("<div>" + escape(name) + "</div>")
def get(self):
    """Scrape donation totals and individual donations from a fundraising page.

    Reads the ``url`` request parameter and asks YQL for the totals table
    (``frp-totals``) and the donations table (``tbl-donations``) of that
    page.  The three totals are stored on a ``models.DonationSet`` keyed by
    the url.  Each donation row gets a key derived from url + donor + date;
    rows whose key is not yet in the ``donationlist`` list entity are stored
    as ``models.Donation`` plus a ``models.NewsItem``.  Parsed values are
    echoed to the response for debugging.
    """
    out = self.response.out
    url = self.request.get("url")
    query = "select * from html where url=\"%s\" and (xpath=\"//table[@class='frp-totals']/tbody/tr/td/p\" or xpath=\"//table[@class='tbl-donations']/tbody/tr\")" % url
    results = helpers.do_yql(query)['query']['results']
    totals = results['p']
    donations = results['tr']

    # Each total drops its first character (presumably a currency symbol --
    # verify) and its thousands separators before conversion.
    online, offline, giftaid = [
        float(total[1:].replace(",", "")) for total in totals[:3]]

    donationset = models.DonationSet.get_or_insert(
        url, url=url, online=online, offline=offline, giftaid=giftaid)
    donationlist = models.List.get_or_insert("donationlist", list=[])
    # get_or_insert returns an already stored entity unchanged, so the
    # totals are refreshed explicitly.
    donationset.online = online
    donationset.offline = offline
    donationset.giftaid = giftaid
    donationset.put()

    for total in (online, offline, giftaid):
        out.write(total)
        out.write("<br />")
    out.write("<br />")

    for row in donations:
        donationdetails = row['td'][1]['div']['p']['span']
        out.write(donationdetails)
        message = donationdetails[0]['content'].replace("\n", " ")
        try:
            donor = donationdetails[1]['strong'].strip().replace("\n", " ")
            date = helpers.convert_justgiving_datetime(donationdetails[2])
            rawdate = donationdetails[2]
        except Exception:
            # No donor element: treat the row as anonymous and take the
            # date from the second span instead.
            donor = "Anonymous"
            date = helpers.convert_justgiving_datetime(donationdetails[1])
            rawdate = donationdetails[1]
        try:
            amount = row['td'][2]['strong']
        except (KeyError, IndexError, TypeError):
            amount = ""

        # Encode explicitly: str() on text containing non-ASCII characters
        # (e.g. accented donor names) raises UnicodeEncodeError on Python 2.
        # ASCII input hashes to the same key as before.
        rawkey = url + donor + rawdate.replace("/", "")
        keyhash = hashlib.md5(rawkey.encode("utf-8")).hexdigest()

        if keyhash not in donationlist.list:
            models.Donation.get_or_insert(
                keyhash, keyhash=keyhash, date=date, amount=amount,
                donor=donor, message=message)
            donationlist.list.append(keyhash)
            donationlist.put()
            message = "<strong>" + donor + "</strong> has donated " + amount
            models.NewsItem.get_or_insert(keyhash, text=message)

        for value in (message, donor, date, amount, keyhash):
            out.write(value)
            out.write("<br />")
        out.write("<br />")
def get(self):
    """Look up the short link for a stored Flickr photo and announce it once.

    Loads the ``models.Flickr`` entity named by the ``flickr_secret``
    request parameter and scrapes its short link through YQL.  If the
    entity has no ``short_url`` yet, the link is saved and a
    ``/twitter/post`` task is queued with the photo title and short link.
    The scraped value is always written to the response.
    """
    secret_key = self.request.get("flickr_secret")
    flickr = models.Flickr.get_by_key_name(secret_key)

    query = "select content from html where url=\"http://www.timparenti.com/dev/flickr/shortlink/?id=%s\" and xpath=\"//a[@id='shortLink']\"" % flickr.flickr_id
    yql_reply = helpers.do_yql(query)
    results = yql_reply['query']['results']['a']

    already_shortened = bool(flickr.short_url)
    if not already_shortened:
        flickr.short_url = results
        flickr.put()
        # NOTE(review): the shared secret is hard-coded in source.
        post_params = {
            "secret": "k1lim4njar0",
            "message": flickr.title + " : " + flickr.short_url,
        }
        taskqueue.add(url="/twitter/post", params=post_params, method='GET')

    self.response.out.write(results)
def get_location(lockey, lat, lon):
    """Return a "name, country" label for a coordinate, cached in memcache.

    Args:
        lockey: memcache key under which the label is cached.
        lat: latitude, as a string or number.
        lon: longitude, as a string or number.

    Returns:
        The cached label when present; otherwise the label built from the
        YQL ``geo.places`` lookup (which is then cached), or the string
        ``"not found"`` when the response holds no usable place.  The
        "not found" result is not cached.
    """
    location = memcache.get(lockey)
    if location:
        return location

    # %-formatting (rather than +) also accepts numeric coordinates.
    query = "select * from geo.places where woeid in (select place.woeid from flickr.places where lat='%s' and lon='%s')" % (lat, lon)
    result = helpers.do_yql(query)
    try:
        place = result['query']['results']['place']
        location = place['name'] + ", " + place['country']['content']
    except (KeyError, TypeError):
        # Missing keys, a null result, or an unexpected result shape.
        return "not found"

    memcache.add(lockey, location, 10000)
    return location
def get(self):
    """Pick random trackers of a Lanyrd event using numbers from Random.org.

    Request parameters:
        lanyrd: url of the Lanyrd page whose tracker list is scraped via YQL.
        number: how many distinct trackers to pick; defaults to 5 when
            missing or not an integer.

    Writes the fetched random numbers (for transparency) followed by one
    ``<div>`` per selected tracker name.  Nothing is written when the page
    has no trackers or Random.org does not answer with status 200.
    """
    # Local import: escapes &, < and > in values echoed into the HTML response.
    from xml.sax.saxutils import escape

    people = []
    lanyrd = self.request.get("lanyrd")
    try:
        number = int(self.request.get("number"))
    except (TypeError, ValueError):
        number = 5

    # NOTE(review): ``lanyrd`` is spliced into the YQL statement unescaped.
    yqlquery = "select href from html where url=\"" + lanyrd + "\" and xpath=\"//div[@class='trackers-placeholder placeholder']/ul/li/a\""
    logging.warn(yqlquery)
    try:
        results = helpers.do_yql(yqlquery)["query"]["results"]["a"]
    except (KeyError, TypeError):
        # YQL returned no matching anchors.
        results = None
    if not results:
        logging.warn("no trackers found for " + lanyrd)
        return

    # Ask for many more numbers than needed so that duplicates in small
    # sample sets still leave enough distinct picks.  max(..., 1) keeps the
    # request valid when there is exactly one tracker.
    maxvalue = len(results) - 1
    maxrandoms = max(maxvalue, 1) * number
    randomquery = "http://www.random.org/integers/?num=" + str(maxrandoms) + "&min=0&max=" + str(maxvalue) + "&col=1&base=10&format=plain&rnd=new"
    result = urlfetch.fetch(randomquery)
    if result.status_code != 200:
        return

    # One number per line; split() also discards the trailing empty entry.
    randoms = result.content.split()
    self.response.out.write(
        "<div>Array of " + str(maxrandoms) + " random numbers from Random.org</div><hr /><div>"
        + str(randoms) + "</div><hr /><div>Selection of " + str(number)
        + " random trackers from " + str(len(results)) + " of " + escape(lanyrd) + ":</div><br />")

    for raw in randoms:
        if len(people) >= number:
            break
        try:
            name = results[int(raw)]["href"].replace("/profile/", "").replace("/", "")
        except (ValueError, IndexError, KeyError, TypeError):
            # Not a usable index (e.g. an unexpected line in the reply).
            continue
        # The same person can be drawn twice, especially in small sets.
        if name not in people:
            people.append(name)
            self.response.out.write("<div>" + escape(name) + "</div>")
def get(self):
    """Import tweets from a fixed Twitter RSS timeline reported by YQL.

    For each feed entry whose id is not yet in the ``twitterlist`` list
    entity: remember the id and store a ``models.Tweet`` and a
    ``models.NewsItem``.  Every entry's date, id and text are echoed to the
    response.
    """
    write = self.response.out.write
    line_break = "<br />"
    status_prefix = "http://twitter.com/childsiklimb/statuses/"

    query = "select guid, title, pubDate from rss where url='http://twitter.com/statuses/user_timeline/162386848.rss'"
    entries = helpers.do_yql(query)['query']['results']['item']
    twitterlist = models.List.get_or_insert("twitterlist", list=[])

    for item in entries:
        created_at = helpers.convert_twitter_rss_datetime(item['pubDate'])
        text = item['title'].replace("\n", " ")
        # The tweet id is the guid with the status url prefix removed.
        tweet_id = item['guid'].replace(status_prefix, "")

        is_new = tweet_id not in twitterlist.list
        if is_new:
            twitterlist.list.append(tweet_id)
            twitterlist.put()
            models.Tweet.get_or_insert(
                tweet_id, tweet_id=tweet_id, text=text, created_at=created_at)
            # NOTE(review): this wrapped message is built but the NewsItem
            # is stored with the plain ``text`` -- confirm which was intended.
            message = "<span class='tweet'>" + text + "</span>"
            models.NewsItem.get_or_insert(tweet_id, text=text, created_at=created_at)

        for value in (created_at, tweet_id, text):
            write(value)
            write(line_break)
        write(line_break)
        write(line_break)
def get(self):
    """Index one page (3 entries) of an RSS feed into ``models.Item`` entities.

    Request parameters:
        url: feed url; also the key name of the ``models.Feed`` entity.
        page: 1-based page number, defaults to "1".

    The feed is fetched through YQL and its hash and scrape time are
    recorded on the feed entity.  Entries whose 1-based position falls in
    the current page are parsed field by field (a field that is missing or
    unparsable is left empty), the link is cleaned of site-specific
    tracking suffixes, and the entry is stored keyed by that link.  If the
    feed holds more entries than the current page covers, an
    ``/index/feed`` task is queued for the next page.  Progress is echoed
    to the response for debugging.
    """
    out = self.response.out

    def emit(value):
        # Write one debug value followed by a line break.
        out.write(value)
        out.write("<br />")

    p = pdt.Calendar()
    url = self.request.get("url")
    page = self.request.get("page") or "1"
    pagesize = 3
    low = ((int(page) - 1) * pagesize) + 1
    high = int(page) * pagesize

    query = "select title, link, description, category.content, creator, pubDate, link from rss where url='" + url + "'"
    result = helpers.do_yql(query)
    emit(url)
    feed = models.Feed.get_by_key_name(url)
    emit(feed.site.code)

    resulthash = hashlib.md5(str(result)).hexdigest()
    feed.last_hash = resulthash
    feed.last_scraped = datetime.datetime.today()
    emit(resulthash)
    out.write("<br />")
    emit(str(low) + "/" + str(high))
    out.write("<br />")

    i = 0
    for element in result['query']['results']['item']:
        i += 1
        if not low <= i <= high:
            continue
        try:
            emit(i)
            category = ""
            description = ""
            title = ""
            pub_date = ""
            creator = ""
            link = ""

            try:
                category = element['category']
                emit(category)
            except (KeyError, TypeError):
                pass

            try:
                # Descriptions containing "img" are skipped.
                if "img" not in element['description']:
                    description = element['description']
                    emit(description)
            except (KeyError, TypeError):
                pass

            try:
                title = element['title']
                emit(title)
            except (KeyError, TypeError):
                pass

            try:
                emit(element['pubDate'])
                # ``datetime`` is the module here, so the class must be
                # named explicitly.  NOTE(review): assumes Calendar.parse
                # returns (time_struct, status) -- confirm against the
                # bundled parsedatetime version.
                parsed_time = p.parse(element['pubDate'])[0]
                pub_date = datetime.datetime(*parsed_time[:6])
                emit(pub_date)
            except Exception:
                pub_date = ""

            try:
                creator = element['creator'].title()
                emit(creator)
            except (KeyError, TypeError, AttributeError):
                pass

            try:
                out.write(element['link'])
                if "phdo" in str(element['link']):
                    # Resolve redirecting links to their final target.
                    lookup = urlfetch.fetch(element['link'])
                    out.write("<br />")
                    emit(lookup.status_code)
                    emit(lookup.final_url)
                    # presumably final_url is empty when no redirect was
                    # followed; fall back to the original link then.
                    link = lookup.final_url or element['link']
                else:
                    link = element['link']

                # Strip per-site tracking suffixes.
                site_code = feed.site.code
                if site_code == "nyt":
                    link = link.replace("?partner=rss&emc=rss", "")
                elif site_code == "wsj":
                    link = link.partition("?mod")[0]
                elif site_code == "sfc":
                    link = link.partition("&feed")[0]
                elif site_code == "wp":
                    link = link.replace("?nav=rss_email/components", "")
                else:
                    out.write("unknown url")
                emit(link)
                emit("Now to persist")
                # Key each item by its own link; keying by the feed url
                # would make every entry of a feed share one entity.
                models.Item.get_or_insert(
                    link, site=feed.site, url=link, title=title,
                    category=category, description=description,
                    byline=creator, pub_date=pub_date)
            except Exception:
                emit("failed to persist")
        except Exception:
            emit(element)

    # More entries than this page covered: continue with the next page.
    if i > high:
        taskqueue.add(url='/index/feed',
                      params={'page': str(int(page) + 1), "url": url},
                      method='GET')
    feed.put()