# TODO: Why doesn't this update the event? Because add_event_tuple_if_updating seems to do nothing, probably because no fb_event is returned if real_fb_event['empty'] == fb_api.EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS and db_event.visible_to_fb_uids: empty_fb_event_ids.append(db_event.fb_event_id) else: # Otherwise if it's visible to our main token, or there are no other tokens to try, deal with it here. add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated) except fb_api.NoFetchedDataException as e: logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e) # Now trigger off a background reloading of empty fb_events if empty_fb_event_ids: logging.info("Couldn't fetch, using backup tokens for events: %s", empty_fb_event_ids) deferred.defer(load_fb_events_using_backup_tokens, empty_fb_event_ids, allow_cache=fbl.allow_cache, only_if_updated=only_if_updated, update_geodata=update_geodata) logging.info("Updating events: %s", [x[0].id for x in events_to_update]) # And then re-save all the events in here event_updates.update_and_save_fb_events(events_to_update, update_geodata=update_geodata) map_load_fb_event = fb_mapreduce.mr_wrap(yield_load_fb_event) load_fb_event = fb_mapreduce.nomr_wrap(yield_load_fb_event) def yield_load_fb_event_attending(fbl, all_events): db_events = [x for x in all_events if x.is_fb_event] fbl.get_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events], allow_fail=True) map_load_fb_event_attending = fb_mapreduce.mr_wrap(yield_load_fb_event_attending) load_fb_event_attending = fb_mapreduce.nomr_wrap(yield_load_fb_event_attending) def mr_load_fb_events(fbl, load_attending=False, time_period=None, update_geodata=True, only_if_updated=True, queue='slow-queue'): if load_attending: event_or_attending = 'Event Attendings' mr_func = 'map_load_fb_event_attending' else:
try: discovered_list.update(_process_thing_feed(fbl, source)) except fb_api.NoFetchedDataException, e: logging.warning("Failed to fetch data for thing: %s", str(e)) logging.info("Discovered %s items: %s", len(discovered_list), discovered_list) return discovered_list def scrape_events_from_source_ids(fbl, source_ids): sources = thing_db.Source.get_by_key_name(source_ids) sources = [x for x in sources if x] logging.info("Looking up %s source_ids, found %s sources", len(source_ids), len(sources)) scrape_events_from_sources(fbl, sources) map_scrape_events_from_sources = fb_mapreduce.mr_wrap(scrape_events_from_sources) def mapreduce_scrape_all_sources(fbl, min_potential_events=None, queue='slow-queue'): # Do not do the min_potential_events>1 filter in the mapreduce filter, # or it will want to do a range-shard on that property. Instead, pass-it-down # and use it as an early-return in the per-Source processing. # TODO:....maybe we do want a range-shard filter? save on loading all the useless sources... fb_mapreduce.start_map( fbl, 'Scrape All Sources', 'event_scraper.thing_scraper.map_scrape_events_from_sources', 'event_scraper.thing_db.Source', handle_batch_size=10, extra_mapper_params={'min_potential_events': min_potential_events}, queue=queue,
# NOTE(review): this excerpt opens on the except-clause of a try inside a per-event
# loop; the enclosing function header is not visible and indentation is
# reconstructed -- confirm.
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e)
    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        deferred.defer(load_fb_events_using_backup_tokens, empty_fb_event_ids,
                       allow_cache=fbl.allow_cache, only_if_updated=only_if_updated,
                       update_geodata=update_geodata)
    # And then re-save all the events in here
    event_updates.update_and_save_fb_events(events_to_update, update_geodata=update_geodata)


# Wrapped variants of yield_load_fb_event built by fb_mapreduce (mr_wrap /
# nomr_wrap are defined elsewhere).
map_load_fb_event = fb_mapreduce.mr_wrap(yield_load_fb_event)
load_fb_event = fb_mapreduce.nomr_wrap(yield_load_fb_event)


def yield_load_fb_event_attending(fbl, all_events):
    """Request LookupEventAttending data for every FB-backed event in all_events.

    Events whose is_fb_event is falsy are skipped; the get_multi result is
    discarded.
    """
    db_events = [x for x in all_events if x.is_fb_event]
    fbl.get_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events])


# Wrapped variants of yield_load_fb_event_attending.
map_load_fb_event_attending = fb_mapreduce.mr_wrap(
    yield_load_fb_event_attending)
load_fb_event_attending = fb_mapreduce.nomr_wrap(yield_load_fb_event_attending)


# NOTE(review): the signature below is cut off at the end of this excerpt.
def mr_load_fb_events(fbl,
# NOTE(review): this excerpt opens inside a generator whose header is not visible
# (presumably dump_fb_json, which is wrapped below) -- confirm.
    # Queue every LookupEvent request, then fetch them in a single batch.
    fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list])
    fbl.batch_fetch()
    csv_file = StringIO.StringIO()
    csv_writer = csv.writer(csv_file)
    for pe in pe_list:
        try:
            # One CSV row per event: (cache key, JSON-encoded fetched data).
            result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id))
            cache_key = fbl.key_to_cache_key(fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id))
            csv_writer.writerow([cache_key, result])
        except fb_api.NoFetchedDataException:
            # Events with no fetched data are logged and left out of the CSV.
            logging.error("skipping row for event id %s", pe.fb_event_id)
    # The whole batch is emitted as a single CSV text chunk.
    yield csv_file.getvalue()


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_dump_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Potential FB Event Data',
        'logic.mr_dump.map_dump_fb_json',
        'event_scraper.potential_events.PotentialEvent',
        handle_batch_size=80,
        queue=None,
        filters=[('looked_at', '=', None)],
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
source.put() if new_source: backgrounder.load_sources([source_id], fb_uid=fbl.fb_uid) return source return None def create_sources_from_event(fbl, db_event): logging.info('create_sources_from_event: %s', db_event.id) create_source_from_id(fbl, db_event.owner_fb_uid) for admin in db_event.admins: if admin['id'] != db_event.owner_fb_uid: create_source_from_id(fbl, admin['id']) map_create_sources_from_event = fb_mapreduce.mr_wrap(create_sources_from_event) def explode_per_source_count(pe): db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id) is_potential_event = pe.match_score > 0 real_event = db_event != None false_negative = bool(db_event and not is_potential_event) result = (is_potential_event, real_event, false_negative) for source_id in pe.source_ids_only(): yield (source_id, json.dumps(result)) def combine_source_count(source_id, counts_to_sum):
# NOTE(review): this excerpt opens deep inside a function whose header is not
# visible (presumably classify_events); the nesting depth of these first
# statements is reconstructed -- confirm.
            pe2.auto_looked_at = True
            pe2.put()
            # Result rows for not-added events start with '-': "-<event id>\t<event name>\n".
            result = '-%s\n' % '\t'.join(unicode(x) for x in (pe.fb_event_id, fb_event['info'].get('name', '')))
            results.append(result)
            mr.increment('auto-notadded-dance-events')
    return results


def classify_events_with_yield(fbl, pe_list):
    """Classify a batch of potential events and yield the combined result text.

    Fetches LookupEvent data for every potential event (allow_fail=True),
    passes it to classify_events, and yields the joined result strings as a
    single UTF-8 encoded value. Asserts that fbl.allow_cache is truthy.
    """
    assert fbl.allow_cache
    fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True)
    # DISABLE_ATTENDING
    results = classify_events(fbl, pe_list, fb_list)
    yield ''.join(results).encode('utf-8')


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_classify_potential_events(fbl, past_event):
    # Only potential events not yet looked at and flagged should_look_at.
    filters = [('looked_at', '=', None), ('should_look_at', '=', True)]
    # past_event=None means "do not filter on past_event at all".
    if past_event is not None:
        filters.append(('past_event', '=', past_event))
    fb_mapreduce.start_map(
        fbl,
        'Auto-Add Events',
        'event_scraper.auto_add.map_classify_events',
        'event_scraper.potential_events.PotentialEvent',
        filters=filters,
        handle_batch_size=20,
        queue='fast-queue',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
# NOTE(review): this excerpt opens at the tail of a function whose header is not
# visible (presumably create_source_for_id) -- confirm.
    source.compute_derived_properties(fb_data)
    logging.info('Getting source for id %s: %s', source.graph_id, source.name)
    return source


def create_source_from_event(fbl, db_event):
    """Create and store a Source for the event's owner, if it has one.

    Returns None straight away when db_event.owner_fb_uid is falsy. Otherwise
    fetches the owner's LookupThingFeed and, unless it is flagged 'empty',
    builds the Source via create_source_for_id and put()s it.
    """
    if not db_event.owner_fb_uid:
        return
    # technically we could check if the object exists in the db, before we bother fetching the feed
    thing_feed = fbl.get(fb_api.LookupThingFeed, db_event.owner_fb_uid)
    if not thing_feed['empty']:
        s = create_source_for_id(db_event.owner_fb_uid, thing_feed)
        s.put()


# Mapreduce-wrapped variant of create_source_from_event.
map_create_source_from_event = fb_mapreduce.mr_wrap(create_source_from_event)


# NOTE(review): the definition below is cut off at the end of this excerpt.
def export_sources(fbl, sources):
    # Queue every LookupThingFeed request, then fetch them in a single batch.
    fbl.request_multi(fb_api.LookupThingFeed, [x.graph_id for x in sources])
    fbl.batch_fetch()
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed, source.graph_id)
            # Feeds without an 'info' entry are skipped.
            if 'info' not in thing_feed:
                continue
            name = thing_feed['info'].get('name', '').encode('utf8')
            desc = thing_feed['info'].get('description', '').encode('utf8')
            fields = (
                source.graph_id,
def create_source_for_id(source_id, fb_data):
    """Return the Source keyed by source_id, refreshed from fb_data.

    Uses the existing Source when get_by_key_name returns a truthy entity,
    otherwise instantiates a new one with street_dance_related=False. Derived
    properties are then recomputed from fb_data. The entity is returned without
    being put() here.
    """
    source = Source.get_by_key_name(source_id) or Source(key_name=source_id, street_dance_related=False)
    source.compute_derived_properties(fb_data)
    logging.info('Getting source for id %s: %s', source.graph_id, source.name)
    return source


def create_source_from_event(fbl, db_event):
    """Create and store a Source for the event's owner, if it has one.

    Returns None straight away when db_event.owner_fb_uid is falsy. Otherwise
    fetches the owner's LookupThingFeed and, unless it is flagged 'empty',
    builds the Source via create_source_for_id and put()s it.
    """
    if not db_event.owner_fb_uid:
        return
    # technically we could check if the object exists in the db, before we bother fetching the feed
    thing_feed = fbl.get(fb_api.LookupThingFeed, db_event.owner_fb_uid)
    if not thing_feed['empty']:
        s = create_source_for_id(db_event.owner_fb_uid, thing_feed)
        s.put()


# Mapreduce-wrapped variant of create_source_from_event.
map_create_source_from_event = fb_mapreduce.mr_wrap(create_source_from_event)


# NOTE(review): the definition below is cut off at the end of this excerpt.
def export_sources(fbl, sources):
    # Queue every LookupThingFeed request, then fetch them in a single batch.
    fbl.request_multi(fb_api.LookupThingFeed, [x.graph_id for x in sources])
    fbl.batch_fetch()
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed, source.graph_id)
            # Feeds without an 'info' entry are skipped.
            if 'info' not in thing_feed:
                continue
            name = thing_feed['info'].get('name', '').encode('utf8')
            desc = thing_feed['info'].get('description', '').encode('utf8')
            fields = (
                source.graph_id,
                source.graph_type,
                source.creation_time,
for source in sources: try: thing_feed = fbl.fetched_data(fb_api.LookupThingFeed, source.graph_id) discovered_list.update(process_thing_feed(source, thing_feed)) except fb_api.NoFetchedDataException, e: logging.warning("Failed to fetch data for thing: %s", str(e)) logging.info("Discovered %s items: %s", len(discovered_list), discovered_list) return discovered_list def scrape_events_from_source_ids(fbl, source_ids): sources = thing_db.Source.get_by_key_name(source_ids) sources = [x for x in sources if x] logging.info("Looking up %s source_ids, found %s sources", len(source_ids), len(sources)) scrape_events_from_sources(fbl, sources) map_scrape_events_from_sources = fb_mapreduce.mr_wrap(scrape_events_from_sources) def mapreduce_scrape_all_sources(fbl, min_potential_events=None, queue='super-slow-queue'): # Do not do the min_potential_events>1 filter in the mapreduce filter, # or it will want to do a range-shard on that property. Instead, pass-it-down # and use it as an early-return in the per-Source processing. # TODO:....maybe we do want a range-shard filter? save on loading all the useless sources... fb_mapreduce.start_map( fbl, 'Scrape All Sources', 'event_scraper.thing_scraper.map_scrape_events_from_sources', 'event_scraper.thing_db.Source', handle_batch_size=10, extra_mapper_params={'min_potential_events': min_potential_events}, queue=queue, randomize_tokens=True,
# NOTE(review): this excerpt opens inside a try-block of a per-event loop in a
# generator whose header is not visible (presumably count_private_events, which
# is wrapped below); indentation is reconstructed -- confirm.
            if 'info' not in fbe:
                logging.error("skipping row2 for event id %s", e.fb_event_id)
                continue
            attendees = fb_api.get_all_members_count(fbe)
            # Count events that are not public but still "public-ish".
            if not fb_events.is_public(fbe) and fb_events.is_public_ish(fbe):
                mr.increment('nonpublic-and-large')
            privacy = fbe['info'].get('privacy', 'UNKNOWN')
            # One counter per distinct privacy value.
            mr.increment('privacy-%s' % privacy)
            start_date = e.start_time.strftime('%Y-%m-%d') if e.start_time else ''
            # Output row: event id, start date, privacy, attendee count (tab-separated).
            yield '%s\n' % '\t'.join(str(x) for x in [e.fb_event_id, start_date, privacy, attendees])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", e.fb_event_id)


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_dump_private_events = fb_mapreduce.mr_wrap(count_private_events)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_private_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Private Events',
        'servlets.tools.map_dump_private_events',
        'events.eventdata.DBEvent',
        handle_batch_size=80,
        queue=None,
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
# NOTE(review): this excerpt opens inside a try-block of a per-event loop in a
# generator whose header is not visible (presumably count_private_events, which
# is wrapped below); indentation is reconstructed -- confirm.
            attendees = fb_api.get_all_members_count(fbe)
            # A missing 'privacy' entry is treated as 'OPEN'.
            privacy = fbe['info'].get('privacy', 'OPEN')
            # Count non-open events with more than 60 members.
            if privacy != 'OPEN' and attendees > 60:
                mr.increment('nonpublic-and-large')
            # One counter per distinct privacy value.
            mr.increment('privacy-%s' % privacy)
            start_date = e.start_time.strftime(
                '%Y-%m-%d') if e.start_time else ''
            # Output row: event id, start date, privacy, attendee count (tab-separated).
            yield '%s\n' % '\t'.join(
                str(x) for x in [e.fb_event_id, start_date, privacy, attendees])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", e.fb_event_id)


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_dump_private_events = fb_mapreduce.mr_wrap(count_private_events)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_private_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Private Events',
        'servlets.tools.map_dump_private_events',
        'events.eventdata.DBEvent',
        handle_batch_size=80,
        queue=None,
        output_writer_spec=
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
# NOTE(review): this excerpt opens inside a try-block of a per-event loop in a
# generator whose header is not visible (presumably classify_events, which is
# wrapped below); indentation is reconstructed -- confirm.
            fb_event = fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id)
            fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, pe.fb_event_id)
        except fb_api.NoFetchedDataException:
            # No data for this event: skip it silently.
            continue
        if fb_event['empty']:
            continue
        # Obtain the prediction service on first use and reuse it afterwards.
        predict_service = predict_service or gprediction.get_predict_service()
        pe = potential_events.update_scores_for_potential_event(pe, fb_event, fb_event_attending, predict_service)
        logging.info("%s has ms=%s, d=%s, nd=%s", pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score)
        # Only events scoring above 0.5 on BOTH bias scores are written out.
        if pe.dance_bias_score > 0.5 and pe.non_dance_bias_score > 0.5:
            result = '%s:%s:%s:%s\n' % (pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score)
            results.append(result)
    # The whole batch is emitted as a single UTF-8 encoded chunk.
    yield ''.join(results).encode('utf-8')


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_classify_events = fb_mapreduce.mr_wrap(classify_events)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_classify_potential_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Auto-Classify Events',
        'ml.mr_prediction.map_classify_events',
        'event_scraper.potential_events.PotentialEvent',
        filters=[('looked_at', '=', None)],
        handle_batch_size=20,
        queue='slow-queue',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
# NOTE(review): this excerpt opens on an else-branch deep inside a function whose
# header is not visible (presumably classify_events); the nesting depth of these
# first statements is reconstructed -- confirm.
                else:
                    mr.increment('auto-added-dance-events-future')
            except fb_api.NoFetchedDataException as e:
                logging.error("Error adding event %s, no fetched data: %s", event_id, e)
            except add_entities.AddEventException as e:
                # NOTE(review): this message repeats the "no fetched data" wording of the
                # handler above although it handles AddEventException -- looks
                # copy-pasted; confirm the intended text.
                logging.warning("Error adding event %s, no fetched data: %s", event_id, e)
    return results


def classify_events_with_yield(fbl, pe_list):
    """Classify a batch of potential events and yield the combined result text.

    Fetches LookupEvent data for every potential event (allow_fail=True),
    passes it to classify_events, and yields the joined result strings as a
    single UTF-8 encoded value.
    """
    fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True)
    results = classify_events(fbl, pe_list, fb_list)
    yield ''.join(results).encode('utf-8')


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_classify_potential_events(fbl, past_event, dancey_only):
    filters = []
    # dancey_only restricts the run to entities flagged should_look_at.
    if dancey_only:
        filters.append(('should_look_at', '=', True))
    # past_event=None means "do not filter on past_event at all".
    if past_event is not None:
        filters.append(('past_event', '=', past_event))
    fb_mapreduce.start_map(
        fbl,
        'Auto-Add Events',
        'event_scraper.auto_add.map_classify_events',
        'event_scraper.potential_events.PotentialEvent',
        filters=filters,
        handle_batch_size=20,
# NOTE(review): this excerpt opens deep inside a function whose header is not
# visible; the nesting depth of these first statements is reconstructed -- confirm.
                db_event.visible_to_fb_uids = []
                db_event.put()
        # Let's update the DBEvent as necessary (note, this uses the last-updated FBLookup)
        # Unfortunately, we failed to get anything in our fbl, as it was raising an ExpiredOAuthToken
        # So instead, let's call it and just have it use the db_event.fb_event
        if fbl:
            add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
    # Skip the save call entirely when nothing was collected.
    if events_to_update:
        event_updates.update_and_save_fb_events(events_to_update, disable_updates=disable_updates)


def yield_resave_display_event(fbl, all_events):
    """Re-save the display events for all_events; fbl is accepted but unused here."""
    event_updates.resave_display_events(all_events)


# Mapreduce-wrapped variant of yield_resave_display_event.
map_resave_display_event = fb_mapreduce.mr_wrap(yield_resave_display_event)


# NOTE(review): the definition below is cut off at the end of this excerpt.
def yield_load_fb_event(fbl, all_events):
    ctx = context.get()
    if ctx:
        # A truthy context supplies the options via its mapper params.
        params = ctx.mapreduce_spec.mapper.params
        disable_updates = params['disable_updates']
        only_if_updated = params['only_if_updated']
    else:
        # No context: fall back to defaults.
        disable_updates = []
        only_if_updated = True

    # Process web_events
    web_events = [x for x in all_events if not x.is_fb_event]
    events_to_update = []
# NOTE(review): this excerpt opens inside a per-event loop of a generator whose
# header is not visible (presumably training_data_for_pevents, which is wrapped
# below); indentation is reconstructed -- confirm.
        # Only potential events that have already been looked at are used.
        if not potential_event.looked_at:
            continue
        try:
            # Label column. NOTE(review): "cond and a or b" idiom; it works here only
            # because 'dance' is truthy.
            good_event = potential_event.fb_event_id in good_event_ids and 'dance' or 'nodance'
            fb_event = fbl.fetched_data(fb_api.LookupEvent, potential_event.fb_event_id)
            if fb_event['empty']:
                continue
            fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, potential_event.fb_event_id)
            training_features = get_training_features(potential_event, fb_event, fb_event_attending)
            # CSV row: label first, then the feature columns.
            csv_writer.writerow([good_event] + list(training_features))
        except fb_api.NoFetchedDataException:
            logging.info("No data fetched for event id %s", potential_event.fb_event_id)
    # The whole batch is emitted as a single CSV text chunk.
    yield csv_file.getvalue()


# Mapreduce-wrapped variant of training_data_for_pevents.
map_training_data_for_pevents = fb_mapreduce.mr_wrap(training_data_for_pevents)


# NOTE(review): the definition below is cut off at the end of this excerpt; its
# return value is not visible.
def get_training_features(potential_event, fb_event, fb_event_attending):
    # Owner feature: 'id<owner id>' when the event info has an owner, else ''.
    if 'owner' in fb_event['info']:
        owner_name = 'id%s' % fb_event['info']['owner']['id']
    else:
        owner_name = ''
    location = event_locations.get_address_for_fb_event(fb_event).encode('utf-8')

    def strip_text(s):
        # UTF-8 encode, strip punctuation via strip_punctuation, then lower-case.
        return strip_punctuation(s.encode('utf8')).lower()

    name = strip_text(fb_event['info'].get('name', ''))
    description = strip_text(fb_event['info'].get('description', ''))
    # Space-separated 'id<...>' tokens, one per attendee / per source id.
    attendee_list = ' '.join(['id%s' % x['id'] for x in fb_event_attending['attending']['data']])
    source_list = ' '.join('id%s' % x for x in potential_event.source_ids)
# NOTE(review): this excerpt opens inside a generator whose header is not visible
# (presumably dump_fb_json, which is wrapped below) -- confirm.
    csv_file = StringIO.StringIO()
    csv_writer = csv.writer(csv_file)
    for pe in pe_list:
        try:
            # One CSV row per event: (cache key, JSON-encoded fetched data).
            result = json.dumps(
                fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id))
            cache_key = fbl.key_to_cache_key(
                fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id))
            csv_writer.writerow([cache_key, result])
        except fb_api.NoFetchedDataException:
            # Events with no fetched data are logged and left out of the CSV.
            logging.error("skipping row for event id %s", pe.fb_event_id)
    # The whole batch is emitted as a single CSV text chunk.
    yield csv_file.getvalue()


# Mapreduce-wrapped variant; referenced by dotted path in start_map below.
map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json)


# NOTE(review): the start_map call below is cut off at the end of this excerpt.
def mr_dump_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Potential FB Event Data',
        'logic.mr_dump.map_dump_fb_json',
        'event_scraper.potential_events.PotentialEvent',
        handle_batch_size=80,
        queue=None,
        filters=[('looked_at', '=', None)],
        output_writer_spec=
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',