def post(self):
    route_loader_key = self.request.get('rll_key')
    logging.debug('work on %s' % route_loader_key)
    route_loader = RouteListingLoader.get(route_loader_key)
    if route_loader is None:
        logging.error('total fail. unable to find %s' % route_loader_key)
    else:
        logging.debug(route_loader.routeID)

        # find the corresponding stop details
        stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", route_loader.stopID).get()
        if stop is None:
            logging.error("Missing stop %s which should be impossible" % route_loader.stopID)

        try:
            url = CRAWL_URLBASE + '?r=' + route_loader.routeCode + '&d=' + route_loader.directionCode + '&s=' + route_loader.stopCode
            logging.debug(url)

            route = RouteListing()
            route.route = route_loader.routeID
            route.routeCode = route_loader.routeCode
            route.direction = route_loader.directionCode
            route.stopID = route_loader.stopID
            route.stopCode = route_loader.stopCode
            route.scheduleURL = url
            route.stopLocation = stop
            route.put()
            logging.info("added new route listing entry to the database!")

            DestinationListing.get_or_insert(route_loader.direction,
                                             id=route_loader.directionCode,
                                             label=route_loader.direction)
        except TransactionFailedError:
            logging.error('FAIL : unable to store RouteListing for route %s, stop %s'
                          % (route_loader.routeID, route_loader.stopID))
            # signal failure so the task queue will retry this work item
            self.response.set_status(500)
            self.response.out.write('transaction fail')

    return
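# --- hypothetical usage sketch, not part of the handler above ---
# The post() handler expects an 'rll_key' parameter naming a stored
# RouteListingLoader entity. A caller might enqueue that work roughly as
# sketched below; the '/routelist/task' URL and the 'routelistloader'
# queue name are illustrative assumptions, not this app's actual routing
# or queue configuration.
def enqueueRouteListingLoader(route_loader):
    from google.appengine.api import taskqueue

    # hand only the datastore key to the worker; the handler re-fetches
    # the RouteListingLoader entity from it
    task = taskqueue.Task(url='/routelist/task',
                          params={'rll_key': str(route_loader.key())})
    task.add('routelistloader')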
def getRoutes(refresh):
    if refresh is False:
        # do we already have it in the datastore?
        api = db.GqlQuery('select * from StaticAPIs where method = :1', api_utils.GETROUTES).get()
        if api is not None:
            logging.debug('---> datastore hit')
            return api.json

    logging.debug('---> datastore lookup starting!')

    offset = 0
    q = RouteListing.all()
    routes = q.fetch(1000)
    hits = {}
    response_dict = {'status': 0, 'timestamp': api_utils.getLocalTimestamp()}
    while len(routes) > 0:
        offset += len(routes)

        # build a map of the unique route/direction pairs we have seen
        for r in routes:
            # are we tracking this route/direction pair?
            key = r.route + ':' + r.direction
            hits[key] = hits.get(key, 0) + 1

        # get more routes
        routes = q.fetch(1000, offset)

    routeMap = {}
    for k, v in hits.iteritems():
        key = k.split(':')
        routeID = key[0]
        direction = key[1]
        directionLabel = api_utils.getDirectionLabel(direction)
        logging.debug('adding direction %s to route %s' % (directionLabel, routeID))
        if routeID in routeMap:
            routeMap[routeID].append(directionLabel)
        else:
            routeMap[routeID] = [directionLabel]

    route_results = []
    for k, v in routeMap.iteritems():
        route_results.append({'routeID': k, 'directions': v})

    # add the populated route details to the response
    response_dict.update({'routes': route_results})
    json_results = json.dumps(response_dict)

    # cache the result for subsequent calls
    static = StaticAPIs()
    static.method = api_utils.GETROUTES
    static.json = json_results
    static.put()

    return json_results
def getRoutes(refresh):
    if refresh is False:
        # do we already have it in the datastore?
        api = db.GqlQuery('select * from StaticAPIs where method = :1', utils.GETROUTES).get()
        if api is not None:
            logging.debug('---> datastore hit')
            return api.json

    logging.debug('---> datastore lookup starting!')

    offset = 0
    q = RouteListing.all()
    routes = q.fetch(1000)
    hits = {}
    response_dict = {'status': 0, 'timestamp': utils.getLocalTimestamp()}
    while len(routes) > 0:
        offset += len(routes)

        # build a map of the unique route/direction pairs we have seen
        for r in routes:
            # are we tracking this route/direction pair?
            key = r.route + ':' + r.direction
            hits[key] = hits.get(key, 0) + 1

        # get more routes
        routes = q.fetch(1000, offset)

    routeMap = {}
    for k, v in hits.iteritems():
        key = k.split(':')
        routeID = key[0]
        direction = key[1]
        directionLabel = utils.getDirectionLabel(direction)
        logging.debug('adding direction %s to route %s' % (directionLabel, routeID))
        if routeID in routeMap:
            routeMap[routeID].append(directionLabel)
        else:
            routeMap[routeID] = [directionLabel]

    route_results = []
    for k, v in routeMap.iteritems():
        route_results.append({'routeID': k, 'directions': v})

    # add the populated route details to the response
    response_dict.update({'routes': route_results})
    json_results = simplejson.dumps(response_dict)

    # cache the result for subsequent calls
    static = StaticAPIs()
    static.method = utils.GETROUTES
    static.json = json_results
    static.put()

    return json_results
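# --- hypothetical usage sketch, not part of this project's code ---
# A minimal webapp handler that could sit in front of getRoutes(). The
# class name, URL mapping, and the ?refresh=true convention are
# illustrative assumptions; only getRoutes() itself comes from above.
from google.appengine.ext import webapp

class RoutesHandler(webapp.RequestHandler):
    def get(self):
        # allow an explicit refresh to bypass the StaticAPIs cache
        refresh = self.request.get('refresh') == 'true'
        json_response = getRoutes(refresh)
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json_response)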
def post(self):
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL, direction, routeID))

        loop = 0
        done = False
        result = None
        start = quota.get_request_cpu_usage()
        while not done and loop < 3:
            try:
                # fetch the page
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1
        end = quota.get_request_cpu_usage()
        #logging.info("scraping took %s cycles" % (end-start))

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        stopUpdates = []
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)

                # route crawler looks for titles with an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for

                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]

                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))

                    # check for conflicts...
                    stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                    if stop is None:
                        # add the new stop
                        stop = StopLocation()
                        stop.stopID = stopID
                        stop.routeID = routeID
                        stop.intersection = intersection.upper()
                        stop.direction = direction.upper()
                        stopUpdates.append(stop)
                        # stop.put()
                        logging.info("ADDED StopLocation (%s) - MINUS geo location" % stopID)
                    else:
                        logging.info("StopLocation entity already exists for %s..." % stopID)
                        stop.routeID = routeID
                        stopUpdates.append(stop)

                    # pull the route and direction data from the URL
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href

                    # check for conflicts...
                    r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                    routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)

                #else: # title.split(",")[0].isdigit():
                elif href.find("?r=") > -1:
                    # create a new task with this link
                    crawlURL = CRAWL_URLBASE + href
                    if routeID == '00':
                        routeID = href.split('r=')[1]
                    elif href.find("&") > -1:
                        routeID = href.split('&')[0].split('r=')[1]
                    task = Task(url='/routelist/crawlingtask',
                                params={'crawl': crawlURL, 'direction': title, 'routeID': routeID})
                    task.add('crawler')
                    logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0], title, routeID))

                # label crawler looks for titles with letters for extraction/persistence
                #elif title.replace('-','').replace(' ','').isalpha():
                #    routeData = href.split('?')[1]
                #    logging.info("found the route LABEL page! href: %s" % href)
                #    routeArgs = routeData.split('&')
                #    directionID = routeArgs[1].split('=')[1]
                #
                #    l = DestinationListing.get_or_insert(title, id=directionID, label=title)

        # push the stop updates to the datastore
        db.put(stopUpdates)

    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return

    return
def post(self):
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL, direction, routeID))

        loop = 0
        done = False
        result = None
        #start = quota.get_request_cpu_usage()
        while not done and loop < 3:
            try:
                # fetch the page
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1
        #end = quota.get_request_cpu_usage()
        #logging.info("scraping took %s cycles" % (end-start))

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)

                # route crawler looks for titles with an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for

                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]

                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))

                    # check for conflicts...
                    stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                    if stop is None:
                        logging.error("Missing stop %s which should be impossible" % stopID)

                    # pull the route and direction data from the URL
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href

                    # check for conflicts...
                    r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                    routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)

                #else: # title.split(",")[0].isdigit():
                else:
                    if href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        if routeID == '00':
                            routeID = href.split('r=')[1]
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/crawl/routelist/crawlingtask',
                                    params={'crawl': crawlURL, 'direction': title, 'routeID': routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0], title, routeID))

                    # label crawler looks for titles with letters for extraction/persistence
                    if title.replace('-', '').replace(' ', '').isalpha():
                        logging.info("found the route LABEL page! href: %s" % href)
                        routeData = href.split('?')[1]
                        routeArgs = routeData.split('&')
                        directionID = routeArgs[1].split('=')[1]

                        l = DestinationListing.get_or_insert(title, id=directionID, label=title)

    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return

    return
def post(self):
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')

        loop = 0
        done = False
        result = None
        while not done and loop < 3:
            try:
                # fetch the page
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "adalink"):
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)

                # route crawler looks for titles with an ID# string
                #
                # stop links have the following format
                # <a class="adalink" title="CAMPUS & BABCOCK RR [EB#0809]" href="?r=61&d=102&s=3457">CAMPUS & BABCOCK RR [EB#0809]</a>
                #
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for

                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]

                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))

                    # check for conflicts...
                    stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                    if stop is None:
                        logging.error("Missing stop %s which should be impossible" % stopID)

                    # pull the route and direction data from the URL
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    fakeRouteID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href

                    # check for conflicts...
                    r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                    routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)

                else:
                    # direction links look like the following,
                    # <a class="adalink" title="CapSq" href="?r=61&d=102">CapSq</a>
                    #
                    # fetch the next page depth to get to the stop details
                    #
                    if href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        task = Task(url='/crawl/routelist/crawlingtask',
                                    params={'crawl': crawlURL, 'direction': title, 'routeID': routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0], title, routeID))

                    # label crawler looks for titles with letters for extraction/persistence
                    if title.replace('-', '').replace(' ', '').isalpha():
                        # the direction code follows 'd=' in the href
                        directionID = href.split('d=')[1]
                        logging.info("found the route LABEL page! href: %s" % href)
                        l = DestinationListing.get_or_insert(title, id=directionID, label=title)

    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return

    return
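# --- hypothetical usage sketch, not part of this project's code ---
# The crawler handlers above are driven entirely by tasks on the 'crawler'
# queue. Something along these lines could seed the very first task (for
# example from a cron or admin request); the function name and the
# 'unknown' direction placeholder are illustrative assumptions, while the
# task URL, queue name, and the '00' routeID sentinel mirror the handlers
# above.
def seedRouteListCrawl():
    from google.appengine.api import taskqueue

    task = taskqueue.Task(url='/crawl/routelist/crawlingtask',
                          params={'crawl': CRAWL_URLBASE,
                                  'direction': 'unknown',
                                  'routeID': '00'})
    task.add('crawler')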