def _mark_as_failed(
    owners_coll,
    owner_id,
    now,
    reason='',
):
    save = OrderedDict([
        ('ubernear', OrderedDict([
            ('lookup_failed', OrderedDict([
                # If the owner is retried and the retry
                # succeeds, it is useful to know when it
                # previously failed.
                ('when', now),
                ('reason', reason),
            ])),
        ])),
    ])
    mongo.save_no_replace(
        owners_coll,
        _id=owner_id,
        save=save,
    )
def test_save_no_replace_simple():
    collection = fudge.Fake('collection')
    collection.remember_order()
    update = collection.expects('update')
    fields = OrderedDict([
        ('foo', 'bar'),
        ('sna', 'foo'),
    ])
    update.with_args(
        OrderedDict([
            ('_id', 'foo_id'),
        ]),
        OrderedDict([
            ('$set', fields),
        ]),
        upsert=True,
        safe=True,
    )
    save = OrderedDict([
        ('foo', 'bar'),
        ('sna', 'foo'),
    ])
    mongo.save_no_replace(
        collection,
        'foo_id',
        save=save,
    )
def _mark_as_failed(
    events_coll,
    event_id,
    now,
    field,
    reason='',
):
    save = OrderedDict([
        ('ubernear', OrderedDict([
            (field, OrderedDict([
                # If the event is retried and the retry
                # succeeds, it is useful to know when it
                # previously failed.
                ('when', now),
                ('reason', reason),
            ])),
        ])),
    ])
    mongo.save_no_replace(
        events_coll,
        _id=event_id,
        save=save,
    )
def test_save_no_replace_save_none():
    collection = fudge.Fake('collection')
    collection.remember_order()
    update = collection.expects('update')
    fields2 = OrderedDict([
        ('fee', 'fi'),
        ('fo', 'fum'),
    ])
    update.with_args(
        OrderedDict([
            ('_id', 'foo_id'),
        ]),
        OrderedDict([
            ('$addToSet', fields2),
        ]),
        upsert=True,
        safe=True,
    )
    add = OrderedDict([
        ('fee', 'fi'),
        ('fo', 'fum'),
    ])
    mongo.save_no_replace(
        collection,
        'foo_id',
        add=add,
    )
def _save_venues(
    events,
    events_coll,
    usps_id,
    now,
):
    # Don't waste a call to the USPS API
    if not events:
        return
    venues = [event['facebook']['venue'] for event in events]
    usps_venues = [
        OrderedDict([
            ('address', venue['street']),
            ('city', venue['city']),
            ('state', venue['state']),
        ])
        for venue in venues
    ]
    matches = address_information.verify(
        usps_id,
        *usps_venues
    )
    # TODO fugly: verify() returns a single match rather than
    # a list when only one venue is passed, so normalize to a list.
    if len(usps_venues) == 1:
        matches = [matches]
    for event, match in zip(events, matches):
        if isinstance(match, ValueError):
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='normalization_failed',
                reason=str(match),
            )
            continue
        match['country'] = 'US'
        save = OrderedDict([
            ('normalized', match),
            ('ubernear.normalization_completed', now),
            ('ubernear.normalization_source', 'usps'),
        ])
        log.debug(
            'Storing normalized venue for {event_id}'.format(
                event_id=event['_id'],
            )
        )
        mongo.save_no_replace(
            events_coll,
            _id=event['_id'],
            save=save,
        )
def update_owners(
    events_coll,
    expired_coll,
    owners_coll,
):
    # Insert any owner ids that have migrated and mark the current
    # owners with ubernear.id_migrated = True
    find_regex = r'\(#21\) Page ID [0-9]+ was migrated to page ID [0-9]+'
    cursor = owners_coll.find(
        {'$and': [
            {'ubernear.lookup_failed.reason': {'$regex': find_regex}},
            {'ubernear.id_migrated': {'$ne': True}},
        ],
        }
    )
    regex = (r'\(#21\) Page ID (?P<old_id>[0-9]+) was migrated to '
             r'page ID (?P<new_id>[0-9]+)'
             )
    for owner in cursor:
        reason = owner['ubernear']['lookup_failed']['reason']
        match = re.search(regex, reason)
        if match is None:
            log.error(
                'Found unexpected lookup_failed reason {reason} '
                'for owner {_id}. Skipping'.format(
                    reason=reason,
                    _id=owner['_id'],
                )
            )
            continue
        old_id = match.group('old_id')
        new_id = match.group('new_id')
        if old_id != owner['_id']:
            log.warn(
                'lookup_failed reason {reason} for owner {_id} has a '
                'different id {old_id}.'.format(
                    reason=reason,
                    old_id=old_id,
                    _id=owner['_id'],
                )
            )
        # TODO find a more efficient way to do this
        one = owners_coll.find_one({'_id': new_id})
        if one is None:
            owners_coll.insert({'_id': new_id})
        save = OrderedDict([
            ('ubernear.id_migrated', True),
        ])
        mongo.save_no_replace(
            owners_coll,
            _id=owner['_id'],
            save=save,
        )
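# For illustration only: the migration regex in update_owners parses
# lookup_failed reasons of the following shape (hypothetical ids):
#
#   >>> reason = '(#21) Page ID 123 was migrated to page ID 456'
#   >>> match = re.search(regex, reason)
#   >>> match.group('old_id'), match.group('new_id')
#   ('123', '456')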
def test_save_no_replace_add_and_add_each():
    collection = fudge.Fake('collection')
    collection.remember_order()
    update = collection.expects('update')
    fields1 = OrderedDict([
        ('foo', 'bar'),
        ('sna', 'foo'),
    ])
    fields2 = OrderedDict([
        ('fi', 'fo'),
        ('fo', 'fum'),
        ('fee', OrderedDict([
            ('$each', ['fo', 'fi', 'fum']),
        ])),
        ('fum', OrderedDict([
            ('$each', ['fi', 'fo', 'fee']),
        ])),
    ])
    update.with_args(
        OrderedDict([
            ('_id', 'foo_id'),
        ]),
        OrderedDict([
            ('$set', fields1),
            ('$addToSet', fields2),
        ]),
        upsert=True,
        safe=True,
    )
    save = OrderedDict([
        ('foo', 'bar'),
        ('sna', 'foo'),
    ])
    add = OrderedDict([
        ('fi', 'fo'),
        ('fo', 'fum'),
    ])
    add_each = OrderedDict([
        ('fee', ['fo', 'fi', 'fum']),
        ('fum', ['fi', 'fo', 'fee']),
    ])
    mongo.save_no_replace(
        collection,
        'foo_id',
        save=save,
        add=add,
        add_each=add_each,
    )
def _save_events(
    events_coll,
    expired_coll,
    owners_coll,
    owner_ids,
    graph,
):
    # Don't waste a call to the Facebook Graph API
    if not owner_ids:
        return
    batch = [
        OrderedDict([
            ('method', 'GET'),
            ('relative_url', '{owner_id}/events?date_format=c'.format(
                owner_id=owner_id,
            )),
        ])
        for owner_id in owner_ids
    ]
    responses = graph.batch(batch)
    now = datetime.utcnow()
    for owner_id, response in zip(owner_ids, responses):
        if isinstance(response, FacepyError):
            _mark_as_failed(
                owners_coll=owners_coll,
                owner_id=owner_id,
                now=now,
                reason=str(response),
            )
            continue
        # Object does not exist anymore
        if response is False:
            _mark_as_failed(
                owners_coll=owners_coll,
                owner_id=owner_id,
                now=now,
                reason='False response',
            )
            continue
        if response is None:
            # None has special significance in mongodb searches
            # so use 'null' instead.
            _mark_as_failed(
                owners_coll=owners_coll,
                owner_id=owner_id,
                now=now,
                reason='Null response',
            )
            continue
        for event in response['data']:
            _id = event.pop('id')
            # TODO find a more efficient way to do this
            if (
                events_coll.find_one({'_id': _id})
                or expired_coll.find_one({'_id': _id})
            ):
                continue
            try:
                # This information is not complete. Save it for now
                # but allow it to be replaced with more detailed
                # information later.
                # TODO what happens if this overwrites good values?
                # Is that OK? Is it bad?
                # The check above does not guarantee some other
                # process won't update this same event before we
                # get here.
                save = OrderedDict([
                    ('facebook', event),
                    ('ubernear', OrderedDict([
                        ('source', 'facebook'),
                        ('fetched', now),
                    ])),
                ])
                save['facebook']['start_time'] = utc_from_iso8601(
                    save['facebook']['start_time'],
                    naive=True,
                )
                save['facebook']['end_time'] = utc_from_iso8601(
                    save['facebook']['end_time'],
                    naive=True,
                )
                if 'updated_time' in save['facebook']:
                    save['facebook']['updated_time'] = utc_from_iso8601(
                        save['facebook']['updated_time'],
                        naive=True,
                    )
                mongo.save_no_replace(
                    events_coll,
                    _id=_id,
                    save=save,
                )
            except Exception:
                log.error(
                    'Could not save event {_id}'.format(
                        _id=_id,
                    )
                )
        try:
            save = {'ubernear.last_lookup': now}
            mongo.save_no_replace(
                owners_coll,
                _id=owner_id,
                save=save,
            )
        except Exception:
            log.error(
                'Could not update owner {_id}'.format(
                    _id=owner_id,
                )
            )
def main():
    parser = optparse.OptionParser(
        usage='%prog [OPTS]',
    )
    parser.add_option(
        '-v', '--verbose',
        help='Verbose mode [default %default]',
        action='store_true',
        dest='verbose',
    )
    parser.add_option(
        '--csv',
        help='Path to the CSV file containing the places to import',
        metavar='PATH',
    )
    parser.add_option(
        '--config',
        help=('Path to the config file with information on how to '
              'import places'
              ),
        metavar='PATH',
    )
    parser.add_option(
        '--db-config',
        help=('Path to the file with information on how to '
              'retrieve and store data in the database'
              ),
        metavar='PATH',
    )
    parser.set_defaults(
        verbose=False,
    )
    options, args = parser.parse_args()
    if args:
        parser.error('Wrong number of arguments.')
    if options.csv is None:
        parser.error('Missing option --csv=.')
    if options.config is None:
        parser.error('Missing option --config=.')
    if options.db_config is None:
        parser.error('Missing option --db-config=.')
    logging.basicConfig(
        level=logging.DEBUG if options.verbose else logging.INFO,
        format='%(asctime)s.%(msecs)03d %(name)s: %(levelname)s: %(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    places_csv = absolute_path(options.csv)
    config = config_parser(options.config)
    coll = collections(options.db_config)
    places_coll = coll['places-collection']
    usps_id = config.get('usps', 'user_id')
    delimiter = config.get('csv', 'delimiter')
    delimiter = delimiter.decode('string-escape')
    fieldnames = [
        'id',
        'name',
        'address',
        'address_extended',
        'po_box',
        'locality',
        'region',
        'country',
        'postcode',
        'tel',
        'fax',
        'category',
        'website',
        'email',
        'latitude',
        'longitude',
        'status',
    ]
    log.info('Start...')
    with open(places_csv, 'rb') as places_fp:
        places = csv.DictReader(
            places_fp,
            delimiter=delimiter,
            fieldnames=fieldnames,
        )
        for place in places:
            # Don't store empty fields
            save = defaultdict(dict)
            for k, v in place.iteritems():
                if v != '':
                    save['info'][k] = v
            try:
                lat = float(save['info']['latitude'])
                lng = float(save['info']['longitude'])
            except (KeyError, ValueError):
                log.debug(
                    'Did not find a valid latitude and longitude '
                    'for place {_id}'.format(
                        _id=save['info']['id'],
                    )
                )
            else:
                save['info']['latitude'] = lat
                save['info']['longitude'] = lng
                # Coordinates are always stored in the form [lng,lat],
                # in that order. Anything else might result in
                # incorrect MongoDB Geospatial queries.
                save['ubernear.location'] = [lng, lat]
                error_msg = ('Bad coordinates (lng,lat) {coord} for id '
                             '{_id}'
                             )
                error_msg = error_msg.format(
                    coord=(lng, lat),
                    _id=save['info']['id'],
                )
                if (lng < -180 or lng >= 180) or (lat < -90 or lat > 90):
                    log.error(error_msg)
                    del save['info']['latitude']
                    del save['info']['longitude']
                    del save['ubernear.location']
            if 'address' not in save['info']:
                log.error(
                    'Found place {_id} with no address information. '
                    'Skipping'.format(
                        _id=save['info']['id'],
                    )
                )
                continue
            match = dict([
                ('address', save['info']['address']),
                ('city', save['info']['locality']),
                ('state', save['info']['region']),
                ('zipcode', save['info']['postcode']),
            ])
            if 'address_extended' in save['info']:
                match['address_extended'] = save['info']['address_extended']
            try:
                norm = address_information.verify(usps_id, match)
            except Exception:
                log.error(
                    'The USPS API could not find an address for place '
                    '{_id}'.format(
                        _id=save['info']['id'],
                    )
                )
            else:
                norm['name'] = save['info']['name'].upper()
                norm['country'] = 'US'
                save['normalized'] = norm
                save['ubernear.normalization_source'] = 'usps'
            save['ubernear.source'] = 'factual'
            mongo.save_no_replace(
                places_coll,
                _id=save['info']['id'],
                save=save,
            )
    indices = [
        {'ubernear.location': pymongo.GEO2D},
        {'ubernear.last_checked': pymongo.ASCENDING},
    ]
    mongo.create_indices(
        collection=places_coll,
        indices=indices,
    )
    log.info('End')
def locate(
    events_coll,
    places_coll,
    database,
    process_all=False,
    _log=None,
    _datetime=None,
    _match_with_place_fn=None,
    _match_with_venue_fn=None,
):
    if _log is None:
        _log = log
    if _datetime is None:
        _datetime = datetime
    if _match_with_place_fn is None:
        _match_with_place_fn = _match_with_place
    if _match_with_venue_fn is None:
        _match_with_venue_fn = _match_with_venue
    now = _datetime.utcnow()
    lookup_query = OrderedDict([
        ('ubernear.lookup_completed', OrderedDict([
            ('$exists', True),
        ])),
    ])
    if process_all:
        events = events_coll.find(
            lookup_query,
            sort=[('ubernear.fetched', pymongo.ASCENDING)],
        )
    else:
        match_completed_query = OrderedDict([
            ('ubernear.match_completed', OrderedDict([
                ('$exists', False),
            ])),
        ])
        match_failed_query = OrderedDict([
            ('ubernear.match_failed', OrderedDict([
                ('$exists', False),
            ])),
        ])
        query = OrderedDict([
            ('$and', [
                lookup_query,
                match_completed_query,
                match_failed_query,
            ]),
        ])
        events = events_coll.find(
            query,
            sort=[('ubernear.fetched', pymongo.ASCENDING)],
        )
    count = events.count()
    if count != 0:
        _log.info(
            'Matching {count} event{s}'.format(
                count=count,
                s='' if count == 1 else 's',
            )
        )
    found_work = False
    for event in events:
        found_work = True
        ubernear = event['ubernear']
        place_ids = ubernear.get('place_ids', [])
        match = _match_with_place_fn(
            event=event,
            place_ids=place_ids,
            places_coll=places_coll,
            database=database,
        )
        if match is not None:
            save = OrderedDict([
                ('match', match),
                ('ubernear.match_completed', now),
            ])
        else:
            save = OrderedDict([
                ('ubernear.match_failed', 'No place match'),
            ])
        mongo.save_no_replace(
            events_coll,
            _id=event['_id'],
            save=save,
        )
    if process_all:
        events = events_coll.find(
            lookup_query,
            sort=[('ubernear.fetched', pymongo.ASCENDING)],
        )
    else:
        location_query = OrderedDict([
            ('facebook.location', OrderedDict([
                ('$exists', True),
            ])),
        ])
        owner_query = OrderedDict([
            ('facebook.owner.name', OrderedDict([
                ('$exists', True),
            ])),
        ])
        or_query = OrderedDict([
            ('$or', [
                location_query,
                owner_query,
            ]),
        ])
        match_completed_query = OrderedDict([
            ('ubernear.match_completed', OrderedDict([
                ('$exists', False),
            ])),
        ])
        no_place_query = OrderedDict([
            ('ubernear.match_failed', 'No place match'),
        ])
        latitude_query = OrderedDict([
            ('facebook.venue.latitude', OrderedDict([
                ('$exists', True),
            ])),
        ])
        longitude_query = OrderedDict([
            ('facebook.venue.longitude', OrderedDict([
                ('$exists', True),
            ])),
        ])
        street_query = OrderedDict([
            ('facebook.venue.street', OrderedDict([
                ('$exists', True),
            ])),
        ])
        city_query = OrderedDict([
            ('facebook.venue.city', OrderedDict([
                ('$exists', True),
            ])),
        ])
        query = OrderedDict([
            ('$and', [
                match_completed_query,
                no_place_query,
                latitude_query,
                longitude_query,
                street_query,
                city_query,
                or_query,
            ]),
        ])
        events = events_coll.find(
            query,
            sort=[('ubernear.fetched', pymongo.ASCENDING)],
        )
    count = events.count()
    if count != 0:
        _log.info(
            'Resolving {count} venue{s}'.format(
                count=count,
                s='' if count == 1 else 's',
            )
        )
    for event in events:
        found_work = True
        match = _match_with_venue_fn(
            event=event,
            _log=_log,
        )
        if match is not None:
            save = OrderedDict([
                ('match', match),
                ('ubernear.match_completed', now),
            ])
            mongo.save_no_replace(
                events_coll,
                _id=event['_id'],
                save=save,
            )
    return found_work
def test_save_no_replace_noop():
    collection = fudge.Fake('collection')
    mongo.save_no_replace(
        collection,
        'foo_id',
    )
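# The tests above pin down the behavior expected of
# mongo.save_no_replace: `save` maps to $set, `add` to $addToSet,
# `add_each` to $addToSet with $each, and a call with no fields is a
# no-op. For reference, here is a minimal sketch consistent with those
# expectations. It is an assumption, not the real helper (which lives
# in the mongo module), and the name _save_no_replace_sketch is
# hypothetical.
def _save_no_replace_sketch(collection, _id, save=None, add=None, add_each=None):
    document = OrderedDict()
    if save is not None:
        # Plain field assignments
        document['$set'] = save
    add_to_set = OrderedDict(add) if add is not None else OrderedDict()
    if add_each is not None:
        # Each list is wrapped in $each so all of its
        # members are added to the set
        for field, values in add_each.items():
            add_to_set[field] = OrderedDict([('$each', values)])
    if add_to_set:
        document['$addToSet'] = add_to_set
    # Nothing to update; don't touch the collection
    if not document:
        return
    collection.update(
        OrderedDict([('_id', _id)]),
        document,
        upsert=True,
        safe=True,
    )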
def update_coordinate(
    events_coll,
    yahoo_id,
    process_all,
):
    now = datetime.utcnow()
    if process_all:
        events = events_coll.find()
    else:
        latitude_query = OrderedDict([
            ('facebook.venue.latitude', OrderedDict([
                ('$exists', False),
            ])),
        ])
        longitude_query = OrderedDict([
            ('facebook.venue.longitude', OrderedDict([
                ('$exists', False),
            ])),
        ])
        or_query = OrderedDict([
            ('$or', [
                latitude_query,
                longitude_query,
            ]),
        ])
        failed_query = OrderedDict([
            ('ubernear.geocoding_failed', OrderedDict([
                ('$exists', False),
            ])),
        ])
        completed_query = OrderedDict([
            ('ubernear.geocoding_completed', OrderedDict([
                ('$exists', False),
            ])),
        ])
        lookup_query = OrderedDict([
            ('ubernear.lookup_completed', OrderedDict([
                ('$exists', True),
            ])),
        ])
        query = OrderedDict([
            ('$and', [
                or_query,
                failed_query,
                completed_query,
                lookup_query,
            ]),
        ])
        events = events_coll.find(
            query,
            sort=[('ubernear.fetched', pymongo.ASCENDING)],
        )
    count = events.count()
    if count != 0:
        log.info(
            'Geocoding {count} event{s}'.format(
                count=count,
                s='' if count == 1 else 's',
            ),
        )
    found_work = OrderedDict([
        ('found_work', False),
        ('sleep', None),
    ])
    # TODO This cursor may timeout if there are too many results
    for event in events:
        found_work['found_work'] = True
        # Check for missing values here instead of in the query
        # so it is explicitly known which events are not
        # eligible for geocoding
        if 'venue' not in event['facebook']:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='geocoding_failed',
                reason='No venue',
            )
            continue
        venue = event['facebook']['venue']
        # The minimal requirements for geocoding
        if 'normalized' in event:
            address = event['normalized']['address']
            city = event['normalized']['city']
        elif (
            'street' not in venue
            or 'city' not in venue
        ):
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='geocoding_failed',
                reason='No street or city',
            )
            continue
        else:
            address = venue['street']
            city = venue['city']
        request = '{address},{city}'.format(
            address=address.encode('utf-8'),
            city=city.encode('utf-8'),
        )
        try:
            # TODO figure out which error corresponds to the
            # rate limit reached and return the number of hours
            # to sleep
            response = geocoder.geocode_yahoo(request, yahoo_id)
        except geocoder.GeocoderAmbiguousResultError as e:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='geocoding_failed',
                reason=str(e),
            )
            continue
        if response is None:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='geocoding_failed',
                reason='Null response',
            )
            continue
        save = OrderedDict([
            ('facebook.venue.latitude', response['lat']),
            ('facebook.venue.longitude', response['lng']),
            ('ubernear.geocoding_completed', now),
            ('ubernear.geocoding_source', 'yahoo'),
        ])
        log.debug(
            'Storing coordinates for {event_id}'.format(
                event_id=event['_id'],
            )
        )
        mongo.save_no_replace(
            events_coll,
            _id=event['_id'],
            save=save,
        )
    return found_work
def expire(
    events_coll,
    expired_coll,
    _datetime=None,
):
    if _datetime is None:
        _datetime = datetime
    last_week = _datetime.utcnow() - timedelta(days=7)
    end_parts = [
        # No guarantees in the documentation that
        # $lt doesn't return rows where
        # the field doesn't exist
        OrderedDict([
            ('facebook.end_time', OrderedDict([
                ('$exists', True),
            ])),
        ]),
        OrderedDict([
            ('facebook.end_time', OrderedDict([
                ('$lt', last_week),
            ])),
        ]),
    ]
    end_query = OrderedDict([
        ('$and', end_parts),
    ])
    false_query = OrderedDict([
        ('ubernear.lookup_failed.reason', 'False response'),
    ])
    # It seems facebook should return false instead of this error,
    # i.e., the id cannot be found. No bug report has been found to
    # confirm this, although some reports suggest it.
    unsupported_query = OrderedDict([
        ('ubernear.lookup_failed.reason', OrderedDict([
            ('$regex',
             'GraphMethodException error on get.*'
             ': Unsupported get request..',
             ),
            ('$options', 'i'),
        ])),
    ])
    alias_query = OrderedDict([
        ('ubernear.lookup_failed.reason', OrderedDict([
            ('$regex',
             'OAuthException error on get.*Some '
             'of the aliases you requested do not exist.*',
             ),
            ('$options', 'i'),
        ])),
    ])
    or_query = OrderedDict([
        ('$or', [
            false_query,
            unsupported_query,
            alias_query,
        ]),
    ])
    facebook_query = OrderedDict([
        ('ubernear.lookup_completed', OrderedDict([
            ('$exists', False),
        ])),
    ])
    failed_query = OrderedDict([
        ('$and', [facebook_query, or_query]),
    ])
    cursor = events_coll.find(
        OrderedDict([
            ('$or', [
                end_query,
                failed_query,
            ]),
        ]),
        sort=[('facebook.end_time', pymongo.ASCENDING)],
    )
    for event in cursor:
        event_id = event.pop('_id')
        kwargs = OrderedDict([
            ('_id', event_id),
            ('save', event),
        ])
        ubernear = event['ubernear']
        place_ids = ubernear.get('place_ids')
        if place_ids is not None:
            # Add to a set of ubernear.place_ids
            kwargs['add_each'] = OrderedDict([
                ('ubernear.place_ids', place_ids),
            ])
            del ubernear['place_ids']
        mongo.save_no_replace(
            expired_coll,
            **kwargs
        )
        events_coll.remove(
            OrderedDict([
                ('_id', event_id),
            ])
        )
def _save_events(
    events,
    events_coll,
    graph,
    now,
    _log=None,
):
    if _log is None:
        _log = log
    # Don't waste a call to the Facebook Graph API
    if not events:
        return
    batch = [
        OrderedDict([
            ('method', 'GET'),
            ('relative_url', '{event_id}?date_format=c'.format(
                event_id=event['_id'],
            )),
        ])
        for event in events
    ]
    responses = graph.batch(batch)
    for event, response in zip(events, responses):
        if isinstance(response, FacepyError):
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='lookup_failed',
                reason=str(response),
            )
            continue
        # Event does not exist anymore
        if response is False:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='lookup_failed',
                reason='False response',
            )
            continue
        if response is None:
            # None has special significance in mongodb searches
            # so use 'null' instead.
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='lookup_failed',
                reason='Null response',
            )
            continue
        # We seem to have a valid response, but the ids differ
        if response['id'] != event['_id']:
            _log.error(
                'Facebook returned information for an event other than '
                '{event_id}. Skipping event.'.format(
                    event_id=event['_id'],
                )
            )
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='lookup_failed',
                reason='Response id is different',
            )
            continue
        save = OrderedDict([
            ('facebook', response),
            ('ubernear', OrderedDict([
                # Depending on where the event came from,
                # the event source may not have already
                # been set
                ('source', 'facebook'),
                ('lookup_completed', now),
            ])),
        ])
        # Skip responses without a start_time or end_time.
        # Sometimes the Graph API returns events without these.
        if (
            'start_time' in save['facebook']
            and 'end_time' in save['facebook']
        ):
            save['facebook']['start_time'] = utc_from_iso8601(
                save['facebook']['start_time'],
                naive=True,
            )
            save['facebook']['end_time'] = utc_from_iso8601(
                save['facebook']['end_time'],
                naive=True,
            )
        else:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='lookup_failed',
                reason='Missing start_time or end_time',
            )
            continue
        if 'updated_time' in save['facebook']:
            save['facebook']['updated_time'] = utc_from_iso8601(
                save['facebook']['updated_time'],
                naive=True,
            )
        _log.debug(
            'Storing event {event_id}'.format(
                event_id=event['_id'],
            )
        )
        mongo.save_no_replace(
            events_coll,
            _id=event['_id'],
            save=save,
        )
            )
        else:
            # Coordinates are always stored in the form [lng,lat],
            # in that order. Anything else might result in incorrect
            # MongoDB Geospatial queries.
            lat = float(location['lat'])
            lng = float(location['lng'])
            location = [lng, lat]
            save = {
                'ubernear.location': location,
                'info.latitude': lat,
                'info.longitude': lng,
            }
            mongo.save_no_replace(
                places_coll,
                _id=place['_id'],
                save=save,
            )

    if not found_work:
        hours = 24
        delay = random.randint(60*60*hours, 60*60*hours+1)
        log.info(
            'Did not find any work. '
            'Sleeping about {hours} hours...'.format(
                hours=hours,
            )
        )
        time.sleep(delay)

    log.info('End')