Example #1
    def post(self):
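        """Task handler: copy the queued RouteListingLoader identified by 'rll_key' into a persistent RouteListing."""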
        route_loader_key = self.request.get('rll_key')
        logging.debug('work on %s' % self.request.get('rll_key'))
        route_loader = RouteListingLoader.get(route_loader_key)
        if route_loader is None:
            logging.error('total fail. unable to find %s' % route_loader_key)
        else:
            logging.debug(route_loader.routeID)
            # find the corresponding stop details
            stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", route_loader.stopID).get()
            if stop is None:
              logging.error("Missing stop %s which should be impossible" % route_loader.stopID);

            try:
                url = CRAWL_URLBASE + '?r=' + route_loader.routeCode + '&d=' + route_loader.directionCode + '&s=' + route_loader.stopCode
                logging.debug(url)
                route = RouteListing()
                route.route = route_loader.routeID
                route.routeCode = route_loader.routeCode
                route.direction = route_loader.directionCode
                route.stopID = route_loader.stopID
                route.stopCode = route_loader.stopCode
                route.scheduleURL = url
                route.stopLocation = stop
                route.put()
                logging.info("added new route listing entry to the database!")

                DestinationListing.get_or_insert(route_loader.direction, id=route_loader.directionCode, label=route_loader.direction)
            except TransactionFailedError:
                logging.error('FAIL : unable to store RouteListing for route %s, stop %s',
                              route_loader.routeID, route_loader.stopID)
                self.response.set_status(2)
                self.response.out.write('transaction fail')

        return
Example #2
def getRoutes(refresh):
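    """Return the route/direction listing as JSON.

    Unless refresh is True, a cached copy from StaticAPIs is returned when one
    exists; otherwise the listing is rebuilt from RouteListing entities and the
    result is cached back to StaticAPIs.
    """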

    if refresh is False:
        # do we already have it in the datastore?
        api = db.GqlQuery('select * from StaticAPIs where method = :1',
                          api_utils.GETROUTES).get()
        if api is not None:
            logging.debug('---> datastore hit')
            return api.json

    logging.debug('---> datastore lookup starting!')
    offset = 0
    q = RouteListing.all()
    routes = q.fetch(1000)

    hits = {}
    response_dict = {'status': 0, 'timestamp': api_utils.getLocalTimestamp()}
    while len(routes) > 0:
        offset += len(routes)

        ## count each unique route/direction pair seen in the listings
        ##
        for r in routes:
            # are we tracking this route/direction pair?
            key = r.route + ':' + r.direction
            hits[key] = hits.get(key, 0) + 1

        # get more routes
        routes = q.fetch(1000, offset)

    routeMap = {}
    for k, v in hits.iteritems():
        key = k.split(':')
        routeID = key[0]
        direction = key[1]
        directionLabel = api_utils.getDirectionLabel(direction)

        logging.debug('adding direction %s to route %s' %
                      (directionLabel, routeID))
        if routeID in routeMap:
            routeMap[routeID].append(directionLabel)
        else:
            routeMap[routeID] = list()
            routeMap[routeID].append(directionLabel)

    route_results = []
    for k, v in routeMap.iteritems():
        route_results.append(dict({'routeID': k, 'directions': routeMap[k]}))

    # add the populated route details to the response
    response_dict.update({'routes': route_results})
    json_results = json.dumps(response_dict)

    static = StaticAPIs()
    static.method = api_utils.GETROUTES
    static.json = json_results
    static.put()

    return json_results
Example #3
def getRoutes(refresh):
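    """Return the route/direction listing as JSON, serving the cached StaticAPIs copy when refresh is False and one exists; otherwise rebuild and re-cache it."""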

    if refresh is False:
        # do we already have it in the datastore?
        api = db.GqlQuery('select * from StaticAPIs where method = :1', utils.GETROUTES).get()
        if api is not None:
            logging.debug('---> datastore hit')
            return api.json

    logging.debug('---> datastore lookup starting!')
    offset = 0
    q = RouteListing.all()
    routes = q.fetch(1000)

    hits = {}
    response_dict = {'status':0,'timestamp':utils.getLocalTimestamp()}
    while len(routes) > 0:
        offset += len(routes)

        ## count each unique route/direction pair seen in the listings
        ##
        for r in routes:
            # are we tracking this route/direction pair?
            key = r.route + ':' + r.direction
            hits[key] = hits.get(key,0) + 1

        # get more routes
        routes = q.fetch(1000,offset)
        
    routeMap = {}        
    for k,v in hits.iteritems():
        key = k.split(':')
        routeID = key[0]
        direction = key[1]
        directionLabel = utils.getDirectionLabel(direction)
        
        logging.debug('adding direction %s to route %s' % (directionLabel,routeID))
        if routeID in routeMap:
            routeMap[routeID].append(directionLabel)
        else:
            routeMap[routeID] = list()
            routeMap[routeID].append(directionLabel)
        
    route_results = []
    for k,v in routeMap.iteritems():
        route_results.append(dict({'routeID':k,'directions':routeMap[k]}))
    
    # add the populated route details to the response
    response_dict.update({'routes':route_results})
    json = simplejson.dumps(response_dict)
    
    static = StaticAPIs()
    static.method = utils.GETROUTES
    static.json = json
    static.put()

    return json
Example #4
    def post(self):
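        """Crawler task: fetch the page at 'crawl', persist StopLocation and RouteListing entries for stop links, and queue follow-up tasks for route links."""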
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')
            logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL,direction,routeID))
            
            loop = 0
            done = False
            result = None
            start = quota.get_request_cpu_usage()
            while not done and loop < 3:
                try:
                    # fetch the page
                    result = urlfetch.fetch(scrapeURL)
                    done = True
                except urlfetch.DownloadError:
                    logging.info("Error loading page (%s)... sleeping" % loop)
                    if result:
                        logging.debug("Error status: %s" % result.status_code)
                        logging.debug("Error header: %s" % result.headers)
                        logging.debug("Error content: %s" % result.content)
                    # back off and count the attempt even when result is still None,
                    # otherwise a failed first fetch would retry forever
                    time.sleep(4)
                    loop = loop + 1
            end = quota.get_request_cpu_usage()
            #logging.info("scraping took %s cycles" % (end-start))

            # start to interrogate the results
            soup = BeautifulSoup(result.content)
            stopUpdates = []
            for slot in soup.html.body.findAll("a","ada"):
                logging.info("pulling out data from page... %s" % slot)

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)
                    # route crawler looks for titles with an ID# string
                    if title.find("#") > 0:
                        # we finally got down to the page we're looking for
                        
                        # pull the stopID from the page content...
                        stopID = title.split("#")[1].split("]")[0]
                        
                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()
                        
                        logging.info("found stop %s, %s" % (stopID,intersection))
                        
                        # check for conflicts...
                        stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                        if stop is None:
                            # add the new stop
                            stop = StopLocation()
                            stop.stopID = stopID
                            stop.routeID = routeID
                            stop.intersection = intersection.upper()
                            stop.direction = direction.upper()
                            stopUpdates.append(stop)  # stop.put()
                            logging.info("ADDED StopLocation (%s) - MINUS geo location" % stopID)
                        else:
                            logging.info("StopLoation entity already exists for %s..." % stopID)
                            stop.routeID = routeID
                            stopUpdates.append(stop)
                        
                        # pull the route and direction data from the URL
                        routeData = scrapeURL.split('?')[1]
                        logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData,stopID))
                        routeArgs = routeData.split('&')
                        routeID = routeArgs[0].split('=')[1]
                        directionID = routeArgs[1].split('=')[1]
                        timeEstimatesURL = CRAWL_URLBASE + href
                    
                        # check for conflicts...
                        r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                        routeID, directionID, stopID).get()
                        if r is None:
                            # add the new route to the DB
                            route = RouteListing()
                            route.route = routeID
                            route.direction = directionID
                            route.stopID = stopID
                            route.scheduleURL = timeEstimatesURL
                            route.put()
                            logging.info("added new route listing entry to the database!")
                        else:
                            logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                    #else: # title.split(",")[0].isdigit():
                    elif href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        if routeID == '00':
                            routeID = href.split('r=')[1]
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/routelist/crawlingtask', params={'crawl':crawlURL,'direction':title,'routeID':routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0],title,routeID))                    
                    # label crawler looks for titles with letters for extraction/persistence
                    #elif title.replace('-','').replace(' ','').isalpha():
                    #    routeData = href.split('?')[1]
                    #    logging.info("found the route LABEL page! href: %s" % href)
                    #    routeArgs = routeData.split('&')
                    #    directionID = routeArgs[1].split('=')[1]
                    #    
                    #    l = DestinationListing.get_or_insert(title, id=directionID, label=title)

            # push the vehicle updates to the datastore
            db.put(stopUpdates)
                                        
        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return
            
        return
Example #5
    def post(self):
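        """Crawler task variant: expects StopLocation entities to already exist, links new RouteListing entries to them, queues follow-up tasks, and records DestinationListing labels."""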
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')
            logging.debug("task scraping for %s, direction %s, route %s" %
                          (scrapeURL, direction, routeID))

            loop = 0
            done = False
            result = None
            #start = quota.get_request_cpu_usage()
            while not done and loop < 3:
                try:
                    # fetch the page
                    result = urlfetch.fetch(scrapeURL)
                    done = True
                except urlfetch.DownloadError:
                    logging.info("Error loading page (%s)... sleeping" % loop)
                    if result:
                        logging.debug("Error status: %s" % result.status_code)
                        logging.debug("Error header: %s" % result.headers)
                        logging.debug("Error content: %s" % result.content)
                    # back off and count the attempt even when result is still None,
                    # otherwise a failed first fetch would retry forever
                    time.sleep(4)
                    loop = loop + 1
            #end = quota.get_request_cpu_usage()
            #logging.info("scraping took %s cycles" % (end-start))

            # start to interrogate the results
            soup = BeautifulSoup(result.content)
            for slot in soup.html.body.findAll("a", "ada"):
                logging.info("pulling out data from page... %s" % slot)

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)
                    # route crawler looks for titles with an ID# string
                    if title.find("#") > 0:
                        # we finally got down to the page we're looking for

                        # pull the stopID from the page content...
                        stopID = title.split("#")[1].split("]")[0]

                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()

                        logging.info("found stop %s, %s" %
                                     (stopID, intersection))

                        # check for conflicts...
                        stop = db.GqlQuery(
                            "SELECT * FROM StopLocation WHERE stopID = :1",
                            stopID).get()
                        if stop is None:
                            logging.error(
                                "Missing stop %s which should be impossible" %
                                stopID)

                        # pull the route and direction data from the URL
                        routeData = scrapeURL.split('?')[1]
                        logging.info(
                            "FOUND THE PAGE ---> arguments: %s stopID: %s" %
                            (routeData, stopID))
                        routeArgs = routeData.split('&')
                        routeID = routeArgs[0].split('=')[1]
                        directionID = routeArgs[1].split('=')[1]
                        timeEstimatesURL = CRAWL_URLBASE + href

                        # check for conflicts...
                        r = db.GqlQuery(
                            "SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                            routeID, directionID, stopID).get()
                        if r is None:
                            # add the new route to the DB
                            route = RouteListing()
                            route.route = routeID
                            route.direction = directionID
                            route.stopID = stopID
                            route.scheduleURL = timeEstimatesURL
                            route.stopLocation = stop
                            route.put()
                            logging.info(
                                "added new route listing entry to the database!"
                            )
                        else:
                            logging.error("we found a duplicate entry!?! %s",
                                          r.scheduleURL)
                    #else: # title.split(",")[0].isdigit():
                    else:
                        if href.find("?r=") > -1:
                            # create a new task with this link
                            crawlURL = CRAWL_URLBASE + href
                            if routeID == '00':
                                routeID = href.split('r=')[1]
                            elif href.find("&") > -1:
                                routeID = href.split('&')[0].split('r=')[1]
                            task = Task(url='/crawl/routelist/crawlingtask',
                                        params={
                                            'crawl': crawlURL,
                                            'direction': title,
                                            'routeID': routeID
                                        })
                            task.add('crawler')
                            logging.info(
                                "Added new task for %s, direction %s, route %s"
                                % (title.split(",")[0], title, routeID))
                        # label crawler looks for titles with letters for extraction/persistence
                        if title.replace('-', '').replace(' ', '').isalpha():
                            logging.info(
                                "found the route LABEL page! href: %s" % href)
                            routeData = href.split('?')[1]
                            routeArgs = routeData.split('&')
                            directionID = routeArgs[1].split('=')[1]

                            l = DestinationListing.get_or_insert(
                                title, id=directionID, label=title)

        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return

        return
Example #6
    def post(self):
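        """Crawler task variant: walk the 'adalink' anchors, storing RouteListing entries for stop pages and queueing tasks plus DestinationListing labels for direction pages."""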
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')

            loop = 0
            done = False
            result = None
            while not done and loop < 3:
                try:
                    # fetch the page
                    result = urlfetch.fetch(scrapeURL)
                    done = True
                except urlfetch.DownloadError:
                    logging.info("Error loading page (%s)... sleeping" % loop)
                    if result:
                        logging.debug("Error status: %s" % result.status_code)
                        logging.debug("Error header: %s" % result.headers)
                        logging.debug("Error content: %s" % result.content)
                    # back off and count the attempt even when result is still None,
                    # otherwise a failed first fetch would retry forever
                    time.sleep(4)
                    loop = loop + 1

            # start to interrogate the results
            soup = BeautifulSoup(result.content)
            for slot in soup.html.body.findAll("a","adalink"):

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)

                    # route crawler looks for titles with an ID# string
                    #
                    # stop links have the following format
                    #    <a class="adalink" title="CAMPUS &amp; BABCOCK RR [EB#0809]" href="?r=61&amp;d=102&amp;s=3457">CAMPUS &amp; BABCOCK RR [EB#0809]</a>
                    #
                    if title.find("#") > 0:
                        # we finally got down to the page we're looking for

                        # pull the stopID from the page content...
                        stopID = title.split("#")[1].split("]")[0]

                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()

                        logging.info("found stop %s, %s" % (stopID,intersection))

                        # check for conflicts...
                        stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                        if stop is None:
                          logging.error("Missing stop %s which should be impossible" % stopID);

                        # pull the route and direction data from the URL
                        routeData = scrapeURL.split('?')[1]
                        logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData,stopID))
                        routeArgs = routeData.split('&')
                        fakeRouteID = routeArgs[0].split('=')[1]
                        directionID = routeArgs[1].split('=')[1]
                        timeEstimatesURL = CRAWL_URLBASE + href

                        # check for conflicts...
                        r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                        routeID, directionID, stopID).get()
                        if r is None:
                            # add the new route to the DB
                            route = RouteListing()
                            route.route = routeID
                            route.direction = directionID
                            route.stopID = stopID
                            route.scheduleURL = timeEstimatesURL
                            route.stopLocation = stop
                            route.put()
                            logging.info("added new route listing entry to the database!")
                        else:
                            logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                    else:
                        # direction links look like the following,
                        #    <a class="adalink" title="CapSq" href="?r=61&amp;d=102">CapSq</a>
                        #
                        # fetch the next page depth to get to the stop details
                        #

                        if href.find("?r=") > -1:
                            # create a new task with this link
                            crawlURL = CRAWL_URLBASE + href
                            task = Task(url='/crawl/routelist/crawlingtask', params={'crawl':crawlURL,'direction':title,'routeID':routeID})
                            task.add('crawler')
                            logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0],title,routeID))

                        # label crawler looks for titles with letters for extraction/persistence
                        if title.replace('-','').replace(' ','').isalpha():
                            # take the value after 'd=' (e.g. '102' from '?r=61&d=102'), not the text before it
                            directionID = href.split('d=')[1]
                            logging.info("found the route LABEL page! href: %s" % href)
                            l = DestinationListing.get_or_insert(title, id=directionID, label=title)

        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return

        return