def writeVenueProviderRecord(yelpID, details):
    """Merge *details* into the stored record for *yelpID* and write each
    provider's data under the venue's ``providers`` child.

    Best-effort: any failure is logged and swallowed, never raised.
    """
    try:
        venue = representation.updateRecord(yelpID, **details)
        # Iterate the dict view directly; no need to materialize a list first.
        for provider, data in venue["providers"].items():
            db().child(venuesTable, "details", yelpID, "providers").update(
                {provider: data})
    except Exception as e:
        log.error("Error writing record: {}\n{}".format(details, e))
def _writePlaceDataToLocations(place_data):
    """Write a geo record, derived from each place's Yelp coordinates, to the
    locations table for every place in *place_data*."""
    place_id_to_georecord = {}
    # Renamed loop variable: the original `id` shadowed the builtin.
    for place_id, data in place_data.items():
        coord = data['providers']['yelp']['coordinates']
        place_id_to_georecord[place_id] = geoRecordFromCoord(
            coord['lat'], coord['lng'])
    db().child(locationsTable).update(place_id_to_georecord)
def fetchDetails(placeID): placeStatus = statusTable[placeID] # Get a list of (src, version) pairs that could be updated, skip searched places # TODO: Gracefully handle if TripAdvisor-mapper runs out of API calls (25k) newProviders = [ src for src in config if src not in placeStatus or ( config[src] > placeStatus[src] and placeStatus[src] != Status.NOT_FOUND.value) ] if not newProviders: # log.info("No new sources for {}".format(placeID)) return try: placeProviderIDs = proxwalk.getAndCacheProviderIDs( placeID, newProviders, placeStatus["identifiers"]) except Exception as e: log.error("Error fetching or caching provider id: {}".format(e)) return updatedProviders = request_handler.researchPlace( placeID, placeProviderIDs) # Write updated sources to /status newStatus = makeNewStatusTable(config, updatedProviders, placeProviderIDs, newProviders) try: placeStatus.update(newStatus) db().child(venuesTable, "status", placeID).update(placeStatus) except Exception as e: log.error("Error accessing status table for {}: {}".format( placeID, e)) log.info("{} done: {}".format(placeID, str(updatedProviders)))
def _guessYelpId(placeName, lat, lon):
    """Best-effort mapping from an event place name + coordinate to a Yelp
    business ID, cached in the events table under md5(placeName).

    :return: the Yelp business id, or None when no candidate is found.
    """
    # Bug fix: hashlib.md5 requires bytes on Python 3; encode the name.
    safePlaceId = hashlib.md5(placeName.encode("utf-8")).hexdigest()
    cachedId = db().child(eventsTable).child("cache/" + safePlaceId).get().val()
    if cachedId:
        return cachedId

    opts = {
        # 'term': placeName,  # Yelp does a bad job with term searching
        'limit': 20,
        # 'radius_filter': 1000,
        # 'sort_by': 'distance',
        'sort': 1,
    }
    r = yelpClient.search_by_coordinates(lat, lon, **opts)
    if r.businesses:
        location = (lat, lon)
        businessesWithCoords = [
            b for b in r.businesses
            if b.location is not None and b.location.coordinate is not None
        ]
        if not businessesWithCoords:
            # Bug fix: min() below would raise ValueError on an empty list.
            log.info("Can't find %s" % placeName)
            return None
        # Pick the candidate closest to the requested coordinate.
        biz = min(
            businessesWithCoords,
            key=lambda b: geo.distance(
                location,
                (b.location.coordinate.latitude,
                 b.location.coordinate.longitude)))
        log.debug("%s --> %s" % (placeName, biz.name))
        researchVenue(biz)

        # Add bizId to cache
        record = {"cache/" + safePlaceId: str(biz.id)}
        db().child(eventsTable).update(record)
        return biz.id
    else:
        log.info("Can't find %s" % placeName)
        return None
def writeSearchRecord(lat, lng, key=None):
    """Store a timestamped geo record for a search at (lat, lng).

    `key` is accepted for interface compatibility but unused here.
    """
    from datetime import datetime
    import time

    record = representation.geoRecordFromCoord(lat, lng)
    record["timestamp"] = datetime.utcnow().isoformat()
    record["time"] = time.time()
    db().child(searchesTable).update({record["g"]: record})
def expandPlaces(config, center, radius_km):
    """
    Expands cached venue details by fetching additional sources
    Config is of the form:
      { <provider>: <version> }
    where version is the newest version status
    """
    # Snapshot of /status, shared (read-mostly) by all worker threads below.
    statusTable = db().child(venuesTable).child("status").get().val()

    # Fetch placeIDs to expand
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)
    log.info("{} places found".format(len(placeIDs)))

    def fetchDetails(placeID):
        # Crawl one place: determine which providers are new (or have a newer
        # version), resolve provider IDs, fetch details, persist the status.
        placeStatus = statusTable[placeID]
        # Get a list of (src, version) pairs that could be updated, skip searched places
        # TODO: Gracefully handle if TripAdvisor-mapper runs out of API calls (25k)
        newProviders = [
            src for src in config if src not in placeStatus or (
                config[src] > placeStatus[src]
                and placeStatus[src] != Status.NOT_FOUND.value)
        ]
        if not newProviders:
            # log.info("No new sources for {}".format(placeID))
            return
        try:
            placeProviderIDs = proxwalk.getAndCacheProviderIDs(
                placeID, newProviders, placeStatus["identifiers"])
        except Exception as e:
            # Best-effort: a failed ID lookup skips this place entirely.
            log.error("Error fetching or caching provider id: {}".format(e))
            return

        updatedProviders = request_handler.researchPlace(
            placeID, placeProviderIDs)

        # Write updated sources to /status
        newStatus = makeNewStatusTable(config, updatedProviders,
                                       placeProviderIDs, newProviders)
        try:
            placeStatus.update(newStatus)
            db().child(venuesTable, "status", placeID).update(placeStatus)
        except Exception as e:
            log.error("Error accessing status table for {}: {}".format(
                placeID, e))

        log.info("{} done: {}".format(placeID, str(updatedProviders)))

    # Crawl concurrently; pool size of 8 caps the number of parallel API calls.
    pool = ThreadPool(8)
    pool.map(fetchDetails, placeIDs)
    log.info("Finished crawling other sources")
def writeEventRecord(eventObj):
    """Write an event's details and its geo record to the events table."""
    key = representation.createEventKey(eventObj)
    # Renamed from `geo`: that name shadowed the geofire module used elsewhere
    # in this file; also dropped the pointless `event = eventObj;` alias.
    geoRecord = representation.geoRecordFromCoord(
        float(eventObj["coordinates"]["lat"]),
        float(eventObj["coordinates"]["lng"]))
    db().child(eventsTable).update({
        "details/" + key: eventObj,
        "locations/" + key: geoRecord,
    })
def forceCache(locationDict):
    """For each (place name -> yelp ID) mapping, research the venue and write
    the md5(name) -> yelpID entry into the events cache."""
    for name, yelpID in locationDict.items():
        # Bug fix: hashlib.md5 requires bytes on Python 3; encode the key.
        safePlaceId = hashlib.md5(name.encode("utf-8")).hexdigest()

        # Fetch location and add it to Locations
        biz = yelpClient.get_business(yelpID)
        researchVenue(biz.business)

        record = {"cache/" + safePlaceId: yelpID}
        db().child(eventsTable).update(record)
def recordAPIStatus(apiName):
    """Probe one external API to check whether we are still within its rate
    limits, and record a timestamped availability string in Firebase.

    :param apiName: one of "tripadvisor", "tripadvisor-mapper", "factual",
        "yelp", "yelp3"
    :return: True if the API still accepts requests, False when capped.

    NOTE(review): the ValueError raised for an unknown apiName is caught by
    the blanket `except Exception` below and only logged — the function still
    returns True in that case. Confirm that is intended.
    """
    req = True
    try:
        if apiName == "tripadvisor":
            TA_TEST_ID = "8364980"
            params = {"key": tripadvisorkey}
            r = requests.get(TRIP_ADVISOR_API.format(TA_TEST_ID), params)
            if r.status_code == 429:
                # NOTE(review): e_code is assigned but never used.
                e_code = r.json().get("code")
                req = False
        elif apiName == "tripadvisor-mapper":
            TA_TEST_LOC = "37.774125,-122.422099"
            params = {"key": tripadvisorkey + "-mapper", "q": "kittea"}
            r = requests.get(
                TRIP_ADVISOR_LOC_MAPPER_API.format(TA_TEST_LOC), params)
            if r.status_code == 429:
                e_code = r.json().get("code")
                req = False
        elif apiName == "factual":
            try:
                cw = factualClient.crosswalk()
                r = cw.filters(
                    {"factual_id": "1b5a13e0-d022-4a66-b7cd-9f48801f1196"}).data()
            except api.APIException as e:
                if e.get_status_code() == 403:
                    # Assume this means API limit has been reached b/c we don't want
                    # to string-check the message (the only place where the specific
                    # error type is listed)
                    req = False
        elif apiName == "yelp":
            try:
                # Check an existing location
                yelpClient.get_business("kittea-cat-cafe-san-francisco-4")
            except errors.ExceededReqs:
                req = False
        elif apiName == "yelp3":
            try:
                yelp3Client.request(
                    "/businesses/{0}".format("kittea-cat-cafe-san-francisco-4"))
            except HTTPError as e:
                if e.code == 429:
                    req = False
        else:
            raise ValueError("Unknown API name; see app/util.py for API values")
    except Exception as e:
        log.exception("Unknown error while checking for cap limits: %s" % e)

    # Timestamped response
    response = "{} {} {}".format(
        req, time.strftime("%a %x %X", time.localtime()), time.tzname[0])
    # Status updated at:
    # https://console.firebase.google.com/project/prox-server-cf63e/database/data/api_availability
    db().child(apiAvailabilityTable).update({apiName: response})
    return req
def writeYelpRecords(yelpVenues):
    """Write details/locations/status entries for every Yelp business in a
    single multi-path update to the venues table."""
    record = {}
    for biz in yelpVenues:
        key = representation.createKey(biz)
        record["details/" + key] = representation.baseRecord(biz)
        record["locations/" + key] = representation.geoRecord(biz)
        record["status/" + key] = representation.baseStatus(biz)
    db().child(venuesTable).update(record)
def calculate_crawled_provider_stats(center, radius_km):
    """Print per-provider crawl statistics (match / no_match / error counts,
    plus proxwalk child counts) for places within radius_km of center."""
    statusTable = db().child(venuesTable, "status").get().val()

    # Fetch placeIDs to check
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)
    print("{} total places found".format(len(placeIDs)))

    provider_dict = {"match": {}, "no_match": {}, "error": {}}

    # Tally how many places reference each provider in the proxwalk table.
    proxwalkTable = db().child(venuesTable, "proxwalk").get().val()
    prox_dict = {"total": len(proxwalkTable)}
    for children in proxwalkTable.values():
        for child in children:
            prox_dict[child] = prox_dict.get(child, 0) + 1
    provider_dict["proxwalk"] = prox_dict

    # Bucket every (place, provider) status into match / no_match / error.
    for placeID in placeIDs:
        placeStatus = statusTable[placeID]
        for provider, status in placeStatus.items():
            if provider == "identifiers":
                continue
            if status == Status.NOT_FOUND.value:
                state = "no_match"
            elif status == Status.FETCH_FAILED.value:
                state = "error"
            else:
                state = "match"
            state_dict = provider_dict[state]
            state_dict[provider] = state_dict.get(provider, 0) + 1

    pprint(provider_dict)
def _writePlaceDataToStatus(place_data):
    """Write a fabricated status entry for each place, carrying identifiers
    (coords + name) and faked provider statuses, to the status table."""
    place_id_to_status = {}
    # Renamed loop variable: the original `id` shadowed the builtin.
    for place_id, data in place_data.items():
        yelp_place = data['providers']['yelp']
        coord = yelp_place['coordinates']
        status = {
            'identifiers': {
                'lat': coord['lat'],
                'lng': coord['lng'],
                'name': yelp_place['name'],
            },
        }
        for provider in _STATUS_PROVIDERS_TO_FAKE:
            # If we set status to not found, we won't recrawl. It's more correct to set it to a positive number (data found)
            # but since that value increments with versions, it's possible it'll get overwritten eventually.
            status[provider] = Status.NOT_FOUND.value
        place_id_to_status[place_id] = status
    db().child(statusTable).update(place_id_to_status)
def pruneEvents():
    """Delete events that have no localEndTime or ended before the cutoff
    (one day and one hour ago)."""
    eventDetails = db().child(eventsTable).child("details").get().each()
    cutoff = datetime.datetime.today() - datetime.timedelta(days=1, hours=1)
    naive_cutoff = cutoff.replace(tzinfo=None)
    for event in eventDetails:
        key = event.key()
        value = event.val()
        if "localEndTime" not in value:
            # No end time recorded — treat as expired.
            deleteEvent(key)
            continue
        endTime = parser.parse(value.get("localEndTime"))
        if endTime < naive_cutoff:
            deleteEvent(key)
def fixStatus(center, radius_km, provider, detailsProvider, version):
    # Backfill status versions: for every place in range whose details contain
    # `detailsProvider` but whose status for `provider` is -1, set it to
    # `version`, then print the affected place IDs.
    # NOTE(review): `dryRun` is a free name (presumably a module-level flag)
    # — confirm it is defined where this function lives.
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)
    print("number found {}".format(len(placeIDs)))
    statusTable = db().child(venuesTable, "status").get().val()
    placeDetails = db().child(venuesTable, "details").get().val()
    count = 0
    placeList = []
    for placeID in placeIDs:
        providers = placeDetails[placeID]["providers"]
        if detailsProvider in providers and statusTable[placeID][
                provider] == -1:
            st = db().child(venuesTable, "status", placeID)
            # When dryRun is set, only count/report; don't write.
            if not dryRun:
                st.update({provider: version})
            count += 1
            placeList.append(placeID)
    placeList.sort()
    print("Places updated: {}".format(placeList))
    print("total {} places with details: {}".format(detailsProvider, count))
def _get_place_caches_missing_provider_data(center, radius_km,
                                            check_missing_ta=True,
                                            check_missing_wiki=True,
                                            check_missing_web=True):
    """Like get_get_places_missing_provider_data but returns the caches directly,
    for debug purposes."""
    if not any((check_missing_ta, check_missing_wiki, check_missing_web)):
        raise ValueError('expected at least one provider')

    # Map each enabled flag to the cache key it requires.
    flag_to_key = (
        (check_missing_ta, _KEY_TA),
        (check_missing_wiki, _KEY_WIKI),
        (check_missing_web, _KEY_FACTUAL),
    )
    required_keys = {key for enabled, key in flag_to_key if enabled}

    location_table = db().child(locationsTable).get().val()
    place_ids_in_range = geo.get_place_ids_in_radius(center, radius_km,
                                                     location_table)
    caches_for_place_ids = handler.readCachedVenueIterableDetails(
        place_ids_in_range)
    return _filter_caches_by_required_keys(required_keys, caches_for_place_ids)
def readCachedVenueIterableDetails(place_ids):
    """Retrieves the cache objects matching the given place IDs.

    This method retrieves the whole cache/ child when making a call so call
    sparingly. We do this because it's slow to make network requests for each
    child individually. To pull down less data, you can use
    `readCacheVenueDetails`.

    :param place_ids: Iterable of place_ids
    :return: a list of cache objects. If a place_id is not in the cache, it
        will be dropped from the results.
    """
    out = []
    try:
        cache = db().child(cacheTable).get().val()
    except Exception:
        # Bug fix: the old handler logged `place_id`, which is unbound when
        # the fetch itself fails, raising NameError inside the handler.
        log.error("Error fetching venue details cache")
        return out
    for place_id in place_ids:
        if place_id in cache:
            out.append(cache[place_id])
    return out
To visualize the returned GPS coordinates, hampster map is recommended. To log results from the production database, see readme. """ from app.firebase import db import app.geofire as geo # --- START MODIFIABLE PARAMETERS --- # # The location around which you'd like to log. focus = (19.915403, -155.8961577) # --- END MODIFIABLE PARAMETERS --- # from app.constants import venuesTable venueTableComplete = db().child(venuesTable).get().val() venueData = venueTableComplete["details"] cacheData = venueTableComplete["cache"] import json stats = { "website" : 0, "TOTAL" : 0, } factualStats = {} for venue in list(venueData.values()): yelpID = venue["id"] coord = venue["coordinates"] coord = (coord["lat"], coord["lng"]) if geo.distance(coord, focus) > 500: # 500 km
def _writePlaceDataToDetails(place_data):
    # Bulk-write the place-data dict straight into the details table.
    db().child(detailsTable).update(place_data)
def _get_locations_table():
    # Fetch and return the full locations table snapshot.
    return db().child(locationsTable).get().val()
def backup(path, out_path="out.json"):
    """Dump the database subtree at *path* to a local JSON file.

    :param path: database child path to back up
    :param out_path: destination filename (defaults to "out.json" for
        backward compatibility with the original hard-coded name)
    """
    # Renamed local: the original `backup` variable shadowed this function.
    snapshot = db().child(path).get().val()
    with open(out_path, "w") as f:
        json.dump(snapshot, f)
def deleteEvent(key):
    # Remove the event's details and location entries; the cache entry is
    # intentionally preserved (see inline comment).
    db().child(eventsTable).update({
        "details/" + key: None,
        # "cache/" + key: None,  # For Kona, do not delete the cache because we hard-code the moz-specific locations
        "locations/" + key: None
    })
def writeVenueRecord(yelpID, details, idObj = None):
    """Merge *details* into the stored record for *yelpID* and write it to
    the venue details table."""
    # idObj is remnant of live-search client calls.
    venue = representation.updateRecord(yelpID, **details)
    db().child(venuesTable, "details", yelpID).update(venue)
def readCachedVenueDetails(key):
    """Return the cached venue details for *key*; on any fetch error, log it
    and return None."""
    try:
        return db().child(venuesTable).child("cache/" + key).get().val()
    except Exception:
        log.error("Error fetching cached venue details for " + key)
# Destructive maintenance script: permanently wipes the searches and venues
# tables from the database.
from app.constants import searchesTable, venuesTable
from app.firebase import db

print("Purging " + searchesTable)
db().child(searchesTable).remove()
print("Purging " + venuesTable)
db().child(venuesTable).remove()
def _get_proxwalk_db():
    # Convenience accessor for the proxwalk child of the database.
    return db().child(proxwalkTable)
def clearEventsDir():
    # Remove the entire events subtree from the database.
    db().child(eventsTable).remove()