Exemplo n.º 1
0
def writeVenueProviderRecord(yelpID, details):
    """Merge ``details`` into a venue record and persist its provider entries.

    The merged record comes from ``representation.updateRecord``. Each
    key/value pair under its ``"providers"`` entry is written with its own
    ``update`` call beneath ``<venuesTable>/details/<yelpID>/providers``.

    Any exception raised along the way is logged and swallowed, so callers
    never see a failure. Returns ``None``.
    """
    try:
        merged = representation.updateRecord(yelpID, **details)
        for providerName, payload in merged["providers"].items():
            # A fresh reference is requested for every write.
            providersRef = db().child(venuesTable, "details", yelpID, "providers")
            providersRef.update({providerName: payload})
    except Exception as e:
        log.error("Error writing record: {}\n{}".format(details, e))
Exemplo n.º 2
0
def _writePlaceDataToLocations(place_data):
    """Write one geo record per place to the locations table.

    For every ``place_id -> data`` pair, the coordinates found at
    ``data['providers']['yelp']['coordinates']`` are converted with
    ``geoRecordFromCoord``; all resulting records are sent in one ``update``.
    """
    def _to_georecord(entry):
        # Only the Yelp coordinates are consulted.
        yelp_coord = entry['providers']['yelp']['coordinates']
        return geoRecordFromCoord(yelp_coord['lat'], yelp_coord['lng'])

    geo_records = {
        place_id: _to_georecord(entry)
        for place_id, entry in place_data.items()
    }
    db().child(locationsTable).update(geo_records)
Exemplo n.º 3
0
    def fetchDetails(placeID):
        """Crawl the providers that are new or outdated for one place.

        Uses the free variables ``statusTable`` and ``config`` (defined outside
        this function). Provider IDs are looked up, the place is researched and
        the merged status is written back under ``status/<placeID>``.
        Returns ``None`` in every case.
        """
        placeStatus = statusTable[placeID]

        def needsCrawl(src):
            # Providers never seen for this place are always crawled; known
            # ones only when the configured version is newer and the stored
            # value is not NOT_FOUND.
            if src not in placeStatus:
                return True
            return (config[src] > placeStatus[src]
                    and placeStatus[src] != Status.NOT_FOUND.value)

        # TODO: Gracefully handle if TripAdvisor-mapper runs out of API calls (25k)
        newProviders = [src for src in config if needsCrawl(src)]
        if not newProviders:
            # Nothing to refresh for this place.
            return

        identifiers = None
        try:
            identifiers = placeStatus["identifiers"]
        except Exception:
            # Re-raise unchanged: this lookup is not covered by the error
            # handling below.
            raise

        try:
            placeProviderIDs = proxwalk.getAndCacheProviderIDs(
                placeID, newProviders, identifiers)
        except Exception as e:
            log.error("Error fetching or caching provider id: {}".format(e))
            return

        updatedProviders = request_handler.researchPlace(placeID,
                                                         placeProviderIDs)

        # The merged status is what gets written to /status.
        newStatus = makeNewStatusTable(config, updatedProviders,
                                       placeProviderIDs, newProviders)

        try:
            placeStatus.update(newStatus)
            statusRef = db().child(venuesTable, "status", placeID)
            statusRef.update(placeStatus)
        except Exception as e:
            log.error("Error accessing status table for {}: {}".format(
                placeID, e))

        log.info("{} done: {}".format(placeID, str(updatedProviders)))
Exemplo n.º 4
0
def _guessYelpId(placeName, lat, lon):
    """Guess the Yelp business ID of the venue closest to ``(lat, lon)``.

    The MD5 hex digest of ``placeName`` is used as a cache key under
    ``<eventsTable>/cache``. On a cache hit the cached ID is returned.
    Otherwise Yelp is searched by coordinates, the business nearest to the
    given point is researched via ``researchVenue``, cached, and returned.

    :param placeName: venue name (text or bytes); only used for the cache key
        and log messages, not as a search term.
    :param lat: latitude of the place.
    :param lon: longitude of the place.
    :return: the cached or newly guessed business ID, or ``None`` when the
        search yields no business with coordinates.
    """
    # hashlib only accepts bytes, so text names are encoded first; byte
    # strings are hashed unchanged.
    if isinstance(placeName, bytes):
        nameBytes = placeName
    else:
        nameBytes = placeName.encode("utf-8")
    safePlaceId = hashlib.md5(nameBytes).hexdigest()

    cachedId = db().child(eventsTable).child("cache/" + safePlaceId).get().val()
    if cachedId:
        return cachedId

    opts = {
        # 'term' is deliberately not sent: Yelp does a bad job with term searching
        'limit': 20,
        'sort': 1,
    }
    r = yelpClient.search_by_coordinates(lat, lon, **opts)

    # Businesses without coordinates cannot be ranked by distance.
    businessesWithCoords = [
        b for b in r.businesses
        if b.location is not None and b.location.coordinate is not None
    ]
    if not businessesWithCoords:
        # Covers both an empty result and results lacking coordinates
        # (min() would raise ValueError on an empty list).
        log.info("Can't find %s" % placeName)
        return None

    location = (lat, lon)
    biz = min(
        businessesWithCoords,
        key=lambda b: geo.distance(
            location,
            (b.location.coordinate.latitude, b.location.coordinate.longitude)),
    )
    log.debug("%s --> %s" % (placeName, biz.name))
    researchVenue(biz)

    # Add bizId to cache
    record = {"cache/" + safePlaceId: str(biz.id)}
    db().child(eventsTable).update(record)

    return biz.id
Exemplo n.º 5
0
def writeSearchRecord(lat, lng, key=None):
    """Record a search at ``(lat, lng)`` in the searches table.

    The geo record built by ``representation.geoRecordFromCoord`` gains a UTC
    ISO-8601 ``"timestamp"`` and a ``time.time()`` value under ``"time"``, and
    is stored under its own ``"g"`` value. ``key`` is accepted but not used.
    """
    entry = representation.geoRecordFromCoord(lat, lng)

    import time
    from datetime import datetime

    entry["timestamp"] = datetime.utcnow().isoformat()
    entry["time"] = time.time()

    recordKey = entry["g"]
    db().child(searchesTable).update({recordKey: entry})
Exemplo n.º 6
0
def expandPlaces(config, center, radius_km):
    """
    Expands cached venue details by fetching additional sources
    Config is of the form:
        { <provider>: <version> }
    where version is the newest version status

    Places within ``radius_km`` of ``center`` are looked up in the locations
    table and processed by a pool of 8 worker threads. Places that have no
    entry in the status table are logged and skipped. Returns ``None``.
    """
    # An empty status node is treated as "no statuses" rather than crashing
    # on the first lookup.
    statusTable = db().child(venuesTable).child("status").get().val() or {}

    # Fetch placeIDs to expand
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)

    log.info("{} places found".format(len(placeIDs)))

    def fetchDetails(placeID):
        placeStatus = statusTable.get(placeID)
        if placeStatus is None:
            # An exception here would propagate out of pool.map and abort the
            # whole crawl, so a place without status is skipped instead.
            log.error("No status entry for {}; skipping".format(placeID))
            return

        # Get a list of providers that could be updated, skip searched places
        # TODO: Gracefully handle if TripAdvisor-mapper runs out of API calls (25k)
        newProviders = [
            src for src in config if src not in placeStatus or (
                config[src] > placeStatus[src]
                and placeStatus[src] != Status.NOT_FOUND.value)
        ]
        if not newProviders:
            return

        try:
            placeProviderIDs = proxwalk.getAndCacheProviderIDs(
                placeID, newProviders, placeStatus["identifiers"])
        except Exception as e:
            log.error("Error fetching or caching provider id: {}".format(e))
            return

        updatedProviders = request_handler.researchPlace(
            placeID, placeProviderIDs)

        # Write updated sources to /status
        newStatus = makeNewStatusTable(config, updatedProviders,
                                       placeProviderIDs, newProviders)

        try:
            placeStatus.update(newStatus)
            db().child(venuesTable, "status", placeID).update(placeStatus)
        except Exception as e:
            log.error("Error accessing status table for {}: {}".format(
                placeID, e))

        log.info("{} done: {}".format(placeID, str(updatedProviders)))

    pool = ThreadPool(8)
    try:
        pool.map(fetchDetails, placeIDs)
    finally:
        # Release the worker threads even when a task raised.
        pool.close()
        pool.join()

    log.info("Finished crawling other sources")
Exemplo n.º 7
0
def writeEventRecord(eventObj):
    """Store an event and its geo record in the events table in one update.

    The key comes from ``representation.createEventKey``. The event itself is
    written under ``details/<key>`` and the geo record derived from
    ``eventObj["coordinates"]`` (``lat``/``lng`` converted to float) under
    ``locations/<key>``.
    """
    eventKey = representation.createEventKey(eventObj)

    latitude = float(eventObj["coordinates"]["lat"])
    longitude = float(eventObj["coordinates"]["lng"])
    geoRecord = representation.geoRecordFromCoord(latitude, longitude)

    payload = {}
    payload["details/" + eventKey] = eventObj
    payload["locations/" + eventKey] = geoRecord
    db().child(eventsTable).update(payload)
def forceCache(locationDict):
    """Pre-populate the place-name -> Yelp ID cache from a fixed mapping.

    For each ``place name -> yelpID`` pair the business is fetched from Yelp,
    passed to ``researchVenue``, and the ID is stored under
    ``<eventsTable>/cache/<md5 of the place name>``.

    :param locationDict: mapping of place name (text or bytes) to Yelp ID.
    """
    for placeName, yelpID in locationDict.items():
        # hashlib only accepts bytes, so text names are encoded first; byte
        # strings are hashed unchanged.
        if isinstance(placeName, bytes):
            nameBytes = placeName
        else:
            nameBytes = placeName.encode("utf-8")
        safePlaceId = hashlib.md5(nameBytes).hexdigest()

        # Fetch location and add it to Locations
        biz = yelpClient.get_business(yelpID)
        researchVenue(biz.business)

        record = {"cache/" + safePlaceId: yelpID}
        db().child(eventsTable).update(record)
Exemplo n.º 9
0
def recordAPIStatus(apiName):
    """Probe one external API for rate limiting and record the result.

    A fixed test resource is requested from the API named by ``apiName``
    (``"tripadvisor"``, ``"tripadvisor-mapper"``, ``"factual"``, ``"yelp"`` or
    ``"yelp3"``). The outcome is written to ``apiAvailabilityTable`` under the
    ``apiName`` key as ``"<bool> <local timestamp> <timezone name>"``.

    :param apiName: name of the API to check.
    :return: ``False`` when the probe reported that the request cap was hit,
        ``True`` otherwise. Any other failure, including an unknown
        ``apiName``, is logged and still reported as ``True``.
    """
    req = True
    try:
        if apiName == "tripadvisor":
            TA_TEST_ID = "8364980"
            params = {"key": tripadvisorkey}
            r = requests.get(TRIP_ADVISOR_API.format(TA_TEST_ID), params)
            # The status code alone decides; the body is not parsed, so a 429
            # with a non-JSON body is still recorded as rate limited.
            if r.status_code == 429:
                req = False
        elif apiName == "tripadvisor-mapper":
            TA_TEST_LOC = "37.774125,-122.422099"
            params = {"key": tripadvisorkey + "-mapper",
                      "q": "kittea"}
            r = requests.get(TRIP_ADVISOR_LOC_MAPPER_API.format(TA_TEST_LOC), params)
            if r.status_code == 429:
                req = False
        elif apiName == "factual":
            try:
                cw = factualClient.crosswalk()
                cw.filters({"factual_id": "1b5a13e0-d022-4a66-b7cd-9f48801f1196"}).data()
            except api.APIException as e:
                if e.get_status_code() == 403:
                    # Assume this means API limit has been reached b/c we don't want
                    # to string-check the message (the only place where the specific
                    # error type is listed)
                    req = False
        elif apiName == "yelp":
            try:
                # Check an existing location
                yelpClient.get_business("kittea-cat-cafe-san-francisco-4")
            except errors.ExceededReqs:
                req = False
        elif apiName == "yelp3":
            try:
                yelp3Client.request("/businesses/{0}".format("kittea-cat-cafe-san-francisco-4"))
            except HTTPError as e:
                if e.code == 429:
                    req = False
        else:
            # NOTE: this is caught by the handler below, so callers only see
            # a log entry, never the exception itself.
            raise ValueError("Unknown API name; see app/util.py for API values")
    except Exception as e:
        log.exception("Unknown error while checking for cap limits: %s" % e)

    # Timestamped response
    response = "{} {} {}".format(req, time.strftime("%a %x %X", time.localtime()), time.tzname[0])

    # Status updated at:
    # https://console.firebase.google.com/project/prox-server-cf63e/database/data/api_availability
    db().child(apiAvailabilityTable).update({apiName: response})
    return req
Exemplo n.º 10
0
def writeYelpRecords(yelpVenues):
    """Write details, location and status for each Yelp business at once.

    For every business, ``representation`` supplies a key plus a base record,
    a geo record and a base status. These are collected under
    ``details/<key>``, ``locations/<key>`` and ``status/<key>`` and sent to
    the venues table in a single ``update``.
    """
    updates = {}

    for business in yelpVenues:
        venueKey = representation.createKey(business)
        updates.update({
            "details/" + venueKey: representation.baseRecord(business),
            "locations/" + venueKey: representation.geoRecord(business),
            "status/" + venueKey: representation.baseStatus(business),
        })

    db().child(venuesTable).update(updates)
def calculate_crawled_provider_stats(center, radius_km):
    """Pretty-print crawl statistics for places around ``center``.

    The printed dict has four entries: ``"match"``, ``"no_match"`` and
    ``"error"`` count, per provider, the status values of the places within
    ``radius_km``; ``"proxwalk"`` counts how often each child key appears in
    the whole proxwalk table, plus a ``"total"`` of its entries.
    """
    statusTable = db().child(venuesTable, "status").get().val()

    # Fetch placeIDs to check
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)
    print("{} total places found".format(len(placeIDs)))

    # Proxwalk counts cover the entire table, not just the places in range.
    proxwalkTable = db().child(venuesTable, "proxwalk").get().val()
    prox_counts = {"total": len(proxwalkTable)}
    for walkedID in proxwalkTable:
        for child in proxwalkTable[walkedID]:
            prox_counts[child] = prox_counts.get(child, 0) + 1

    provider_dict = {
        "match": {},
        "no_match": {},
        "error": {},
        "proxwalk": prox_counts,
    }

    def _bucket(status):
        # Anything that is neither NOT_FOUND nor FETCH_FAILED is a match.
        if status == Status.NOT_FOUND.value:
            return "no_match"
        if status == Status.FETCH_FAILED.value:
            return "error"
        return "match"

    for placeID in placeIDs:
        placeStatus = statusTable[placeID]
        for provider in placeStatus:
            if provider != "identifiers":
                tally = provider_dict[_bucket(placeStatus[provider])]
                tally[provider] = tally.get(provider, 0) + 1

    pprint(provider_dict)
Exemplo n.º 12
0
def _writePlaceDataToStatus(place_data):
    """Write a status entry for every place to the status table.

    Each entry holds the Yelp ``lat``/``lng``/``name`` under ``'identifiers'``
    and marks every provider in ``_STATUS_PROVIDERS_TO_FAKE`` as NOT_FOUND.
    All entries are sent in one ``update``.
    """
    statuses = {}
    for place_id, entry in place_data.items():
        yelp = entry['providers']['yelp']
        coordinates = yelp['coordinates']
        identifiers = {
            'lat': coordinates['lat'],
            'lng': coordinates['lng'],
            'name': yelp['name'],
        }
        record = {'identifiers': identifiers}

        # If we set status to not found, we won't recrawl. It's more correct
        # to set it to a positive number (data found) but since that value
        # increments with versions, it's possible it'll get overwritten
        # eventually.
        for provider in _STATUS_PROVIDERS_TO_FAKE:
            record[provider] = Status.NOT_FOUND.value

        statuses[place_id] = record

    db().child(statusTable).update(statuses)
Exemplo n.º 13
0
def pruneEvents():
    """Delete events that lack an end time or ended over 25 hours ago.

    Every entry under ``<eventsTable>/details`` is inspected. Entries without
    a ``"localEndTime"`` field are deleted outright; the others are deleted
    when their parsed end time is earlier than the local "now" minus one day
    and one hour.
    """
    # each() may yield None instead of a list when the node is empty
    # (pyrebase behaviour; harmless if it always returns a list).
    eventDetails = db().child(eventsTable).child("details").get().each() or []
    cutoff = datetime.datetime.today() - datetime.timedelta(days=1, hours=1)

    for event in eventDetails:
        key = event.key()
        details = event.val()
        if "localEndTime" not in details:
            deleteEvent(key)
            continue

        # The cutoff is naive, so any UTC offset on the parsed end time is
        # dropped (keeping its wall-clock value); comparing an aware datetime
        # with a naive one would raise TypeError.
        endTime = parser.parse(details.get("localEndTime")).replace(tzinfo=None)
        if endTime < cutoff:
            deleteEvent(key)
def fixStatus(center, radius_km, provider, detailsProvider, version):
    """Repair status entries that say -1 although details exist.

    For every place within ``radius_km`` of ``center`` whose details contain
    ``detailsProvider`` while its status for ``provider`` equals ``-1``, the
    status is set to ``version``. When the module-level ``dryRun`` flag is
    truthy nothing is written, but the affected places are still listed.

    :param center: centre of the search area.
    :param radius_km: search radius in kilometres.
    :param provider: key inspected (and rewritten) in the status entry.
    :param detailsProvider: key that must be present in the place's
        ``"providers"`` details.
    :param version: value written as the new status.
    """
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)
    print("number found {}".format(len(placeIDs)))

    statusTable = db().child(venuesTable, "status").get().val()
    placeDetails = db().child(venuesTable, "details").get().val()

    placeList = []
    for placeID in placeIDs:
        providers = placeDetails[placeID]["providers"]
        if detailsProvider not in providers:
            continue
        # -1 is presumably Status.NOT_FOUND.value -- TODO confirm.
        if statusTable[placeID][provider] != -1:
            continue

        if not dryRun:
            # The reference is only built when it is actually used; some
            # Firebase clients keep path state on references that are
            # never consumed.
            db().child(venuesTable, "status", placeID).update({provider: version})

        placeList.append(placeID)

    placeList.sort()
    print("Places updated: {}".format(placeList))
    print("total {} places with details: {}".format(detailsProvider, len(placeList)))
def _get_place_caches_missing_provider_data(center,
                                            radius_km,
                                            check_missing_ta=True,
                                            check_missing_wiki=True,
                                            check_missing_web=True):
    """Like get_get_places_missing_provider_data but returns the caches directly, for debug purposes."""
    if not (check_missing_ta or check_missing_wiki or check_missing_web):
        raise ValueError('expected at least one provider')

    required_keys = set()
    if check_missing_ta: required_keys.add(_KEY_TA)
    if check_missing_wiki: required_keys.add(_KEY_WIKI)
    if check_missing_web: required_keys.add(_KEY_FACTUAL)

    location_table = db().child(locationsTable).get().val()
    place_ids_in_range = geo.get_place_ids_in_radius(center, radius_km,
                                                     location_table)
    caches_for_place_ids = handler.readCachedVenueIterableDetails(
        place_ids_in_range)
    return _filter_caches_by_required_keys(required_keys, caches_for_place_ids)
Exemplo n.º 16
0
def readCachedVenueIterableDetails(place_ids):
    """Retrieves the cache objects matching the given place IDs.

    This method retrieves the whole cache/ child when making a call so call sparingly.
    We do this because it's slow to make network requests for each child individually.
    To pull down less data, you can use `readCachedVenueDetails`.

    Errors are logged, not raised; whatever was collected before the error
    (possibly nothing) is returned.

    :param place_ids: Iterable of place_ids
    :return: a list of cache objects. If a place_id is not in the cache, it will be dropped from the results.
    """
    out = []
    try:
        cache = db().child(cacheTable).get().val()
        # An empty cache node yields a falsy value; nothing can match then.
        if cache:
            for place_id in place_ids:
                if place_id in cache:
                    out.append(cache[place_id])
    except Exception as e:
        # The message must not use the loop variable: it is unbound when the
        # fetch itself fails and is not guaranteed to be a string.
        log.error("Error fetching cached venue details: {}".format(e))

    return out
Exemplo n.º 17
0
To visualize the returned GPS coordinates, Hamster Map is recommended.

To log results from the production database, see readme.

"""
from app.firebase import db
import app.geofire as geo

# --- START MODIFIABLE PARAMETERS --- #
# The location around which you'd like to log.
focus = (19.915403, -155.8961577)
# --- END MODIFIABLE PARAMETERS --- #

from app.constants import venuesTable

venueTableComplete = db().child(venuesTable).get().val()
venueData = venueTableComplete["details"]
cacheData = venueTableComplete["cache"]

import json
stats = {
  "website"    : 0,
  "TOTAL"      : 0,
}
factualStats = {}
for venue in list(venueData.values()):
    yelpID = venue["id"]
    coord = venue["coordinates"]
    coord = (coord["lat"], coord["lng"])

    if geo.distance(coord, focus) > 500: # 500 km
Exemplo n.º 18
0
def _writePlaceDataToDetails(place_data):
    """Send ``place_data`` to the details table as a single update."""
    details_ref = db().child(detailsTable)
    details_ref.update(place_data)
Exemplo n.º 19
0
def _get_locations_table():
    """Fetch the locations table and return the value of the response."""
    response = db().child(locationsTable).get()
    return response.val()
Exemplo n.º 20
0
def backup(path, out_path="out.json"):
    """Dump the database subtree at ``path`` to a JSON file.

    :param path: database path whose value is fetched.
    :param out_path: file to write; defaults to ``"out.json"`` in the current
        working directory. An existing file is overwritten.
    """
    # Named differently from the function so the function is not shadowed.
    snapshot = db().child(path).get().val()
    with open(out_path, "w") as f:
        json.dump(snapshot, f)
Exemplo n.º 21
0
def deleteEvent(key):
    """Clear an event's ``details`` and ``locations`` entries in one update.

    Both ``details/<key>`` and ``locations/<key>`` under the events table are
    set to ``None``.
    """
    # "cache/<key>" is intentionally left alone: for Kona, do not delete the
    # cache because we hard-code the moz-specific locations.
    removals = dict.fromkeys(("details/" + key, "locations/" + key))
    db().child(eventsTable).update(removals)
Exemplo n.º 22
0
def writeVenueRecord(yelpID, details, idObj=None):
    """Merge ``details`` into a venue record and write it to the details node.

    The record returned by ``representation.updateRecord`` is written under
    ``<venuesTable>/details/<yelpID>``. ``idObj`` is a remnant of live-search
    client calls and is not used.
    """
    merged = representation.updateRecord(yelpID, **details)
    details_ref = db().child(venuesTable, "details", yelpID)
    details_ref.update(merged)
Exemplo n.º 23
0
def readCachedVenueDetails(key):
    """Return the cached venue details stored under ``cache/<key>``.

    Any exception is logged and swallowed; ``None`` is returned in that case.
    """
    try:
        cache_ref = db().child(venuesTable).child("cache/" + key)
        return cache_ref.get().val()
    except Exception:
        log.error("Error fetching cached venue details for " + key)
    return None
Exemplo n.º 24
0
from app.constants import searchesTable, venuesTable
from app.firebase import db

# Announce and remove the searches table first, then the venues table.
for tableName in (searchesTable, venuesTable):
    print("Purging " + tableName)
    db().child(tableName).remove()
Exemplo n.º 25
0
def _get_proxwalk_db():
    """Return a database reference positioned at the proxwalk table."""
    proxwalk_ref = db().child(proxwalkTable)
    return proxwalk_ref
def clearEventsDir():
    """Remove everything stored under the events table."""
    events_ref = db().child(eventsTable)
    events_ref.remove()