        try:
            good_event = potential_event.fb_event_id in good_event_ids and 'dance' or 'nodance'

            fb_event = fbl.fetched_data(fb_api.LookupEvent,
                                        potential_event.fb_event_id)
            if fb_event['empty']:
                continue
            fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending,
                                                  potential_event.fb_event_id)

            training_features = get_training_features(potential_event,
                                                      fb_event,
                                                      fb_event_attending)
            csv_writer.writerow([good_event] + list(training_features))
        except fb_api.NoFetchedDataException:
            logging.info("No data fetched for event id %s",
                         potential_event.fb_event_id)
    yield csv_file.getvalue()


map_training_data_for_pevents = fb_mapreduce.mr_wrap(training_data_for_pevents)


def get_training_features(potential_event, fb_event, fb_event_attending):
    if 'owner' in fb_event['info']:
        owner_name = 'id%s' % fb_event['info']['owner']['id']
    else:
        owner_name = ''
    location = event_locations.get_address_for_fb_event(fb_event).encode(
        'utf-8')

    def strip_text(s):
        return strip_punctuation(s.encode('utf8')).lower()

    name = strip_text(fb_event['info'].get('name', ''))
    description = strip_text(fb_event['info'].get('description', ''))
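The strip_punctuation helper called above is imported from elsewhere in the
project and its implementation is not shown here. A minimal sketch of what
such a helper might look like (the regex approach is an assumption, not the
project's actual code):

import re

# Hypothetical stand-in for the project's strip_punctuation helper:
# replaces any non-word, non-space character with a space so that
# tokens stay separated after stripping.
_PUNCT_RE = re.compile(r'[^\w\s]', re.UNICODE)

def strip_punctuation(s):
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return _PUNCT_RE.sub(' ', s)

# strip_punctuation('Hip-Hop Battle!') -> 'Hip Hop Battle '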
Example 2
            try:
                fb_event = fbl.fetched_data(fb_api.LookupEvent,
                                            pe.fb_event_id)
                fb_event_attending = fbl.fetched_data(
                    fb_api.LookupEventAttending, pe.fb_event_id)
            except fb_api.NoFetchedDataException:
                continue
            if fb_event['empty']:
                continue
            predict_service = predict_service or gprediction.get_predict_service()
            pe = potential_events.update_scores_for_potential_event(
                pe, fb_event, fb_event_attending, predict_service)
        logging.info("%s has ms=%s, d=%s, nd=%s", pe.fb_event_id,
                     pe.match_score, pe.dance_bias_score,
                     pe.non_dance_bias_score)
        if pe.dance_bias_score > 0.5 and pe.non_dance_bias_score > 0.5:
            result = '%s:%s:%s:%s\n' % (pe.fb_event_id, pe.match_score,
                                        pe.dance_bias_score,
                                        pe.non_dance_bias_score)
            results.append(result)
    yield ''.join(results).encode('utf-8')


map_classify_events = fb_mapreduce.mr_wrap(classify_events)


def mr_classify_potential_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Auto-Classify Events',
        'dancedeets.ml.mr_prediction.map_classify_events',
        'dancedeets.event_scraper.potential_events.PotentialEvent',
        filters=[('looked_at', '=', None)],
        handle_batch_size=20,
        queue='slow-queue',
        output_writer_spec=
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
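classify_events writes one colon-separated record per event that clears both
bias thresholds. A small sketch of parsing those records back out of the
mapreduce output (the function is mine; only the field order comes from the
'%s:%s:%s:%s' format string above):

def parse_classified_line(line):
    # Field order mirrors classify_events above:
    # fb_event_id, match_score, dance_bias_score, non_dance_bias_score.
    event_id, match_score, dance_bias, non_dance_bias = line.strip().split(':')
    return event_id, float(match_score), float(dance_bias), float(non_dance_bias)

# parse_classified_line('12345:3:0.92:0.61') -> ('12345', 3.0, 0.92, 0.61)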
Example 3
    fbl.batch_fetch()

    csv_file = StringIO.StringIO()
    csv_writer = csv.writer(csv_file)

    for pe in pe_list:
        try:
            result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id))
            cache_key = fbl.key_to_cache_key(fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id))
            csv_writer.writerow([cache_key, result])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", pe.fb_event_id)
    yield csv_file.getvalue()


map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json)


def mr_dump_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Potential FB Event Data',
        'dancedeets.logic.mr_dump.map_dump_fb_json',
        'dancedeets.event_scraper.potential_events.PotentialEvent',
        handle_batch_size=80,
        queue=None,
        filters=[('looked_at', '=', None)],
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
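Each CSV row pairs the event's cache key with its JSON-encoded lookup data,
so a dump can later be replayed into a cache. A sketch of the reader side
(the loader is an assumption; only the two-column row layout comes from
dump_fb_json above):

import csv
import json

def load_dumped_events(csv_path):
    # Rows were written as [cache_key, json.dumps(fb_event)] above.
    cache = {}
    with open(csv_path, 'rb') as f:  # 'rb' matches the Python 2 csv module
        for cache_key, raw_json in csv.reader(f):
            cache[cache_key] = json.loads(raw_json)
    return cache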
Example 4

        try:
            discovered_list.update(_process_thing_feed(fbl, source))
        except fb_api.NoFetchedDataException as e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
    logging.info("Discovered %s items: %s", len(discovered_list), discovered_list)
    return discovered_list


def scrape_events_from_source_ids(fbl, source_ids):
    sources = thing_db.Source.get_by_key_name(source_ids)
    sources = [x for x in sources if x]
    logging.info("Looking up %s source_ids, found %s sources", len(source_ids), len(sources))
    scrape_events_from_sources(fbl, sources)


map_scrape_events_from_sources = fb_mapreduce.mr_wrap(scrape_events_from_sources)


def mapreduce_scrape_all_sources(fbl, min_potential_events=None, queue='slow-queue'):
    # Do not do the min_potential_events>1 filter in the mapreduce filter,
    # or it will want to do a range-shard on that property. Instead, pass-it-down
    # and use it as an early-return in the per-Source processing.
    # TODO:....maybe we do want a range-shard filter? save on loading all the useless sources...
    fb_mapreduce.start_map(
        fbl,
        'Scrape All Sources',
        'dancedeets.event_scraper.thing_scraper.map_scrape_events_from_sources',
        'dancedeets.event_scraper.thing_db.Source',
        handle_batch_size=10,
        extra_mapper_params={'min_potential_events': min_potential_events},
        queue=queue,
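Per the comment above, min_potential_events rides along in extra_mapper_params
and the per-Source code is expected to bail out early. A sketch of that early
return (hypothetical: the real scrape_events_from_sources body is not shown,
and num_potential_events is my guess at the Source property name):

def scrape_events_from_sources_sketch(fbl, sources, min_potential_events=None):
    # Skip sources below the threshold instead of range-sharding on the
    # property in the mapreduce filter, as the comment above explains.
    for source in sources:
        if min_potential_events and (source.num_potential_events or 0) < min_potential_events:
            continue
        _process_thing_feed(fbl, source)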
Example 5
                for vertical in e.verticals:
                    mr.increment('auto-added-dance-event-vertical-%s' % vertical)
            except fb_api.NoFetchedDataException as e:
                logging.error("Error adding event %s, no fetched data: %s", event_id, e)
            except add_entities.AddEventException as e:
                logging.warning("Error adding event %s, no fetched data: %s", event_id, e)
    return results


def classify_events_with_yield(fbl, pe_list):
    fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True)
    results = classify_events(fbl, pe_list, fb_list)
    yield ''.join(results).encode('utf-8')


map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield)


def mr_classify_potential_events(fbl, past_event, dancey_only):
    filters = []
    if dancey_only:
        filters.append(('should_look_at', '=', True))
    if past_event is not None:
        filters.append(('past_event', '=', past_event))
    fb_mapreduce.start_map(
        fbl,
        'Auto-Add Events',
        'dancedeets.event_scraper.auto_add.map_classify_events',
        'dancedeets.event_scraper.potential_events.PotentialEvent',
        filters=filters,
        # Make sure we don't process so many that we cause the tasks to time out
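The two optional filters above compose, so callers can scope a run. For
example (a sketch; the argument values are illustrative):

# Only future events that were pre-flagged as worth looking at:
# filters become [('should_look_at', '=', True), ('past_event', '=', False)]
mr_classify_potential_events(fbl, past_event=False, dancey_only=True)

# Everything, regardless of date:
mr_classify_potential_events(fbl, past_event=None, dancey_only=False)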
Example 7
            if 'info' not in fbe:
                logging.error("skipping row2 for event id %s", e.fb_event_id)
                continue
            attendees = fb_events.get_all_members_count(fbe)
            if not fb_events.is_public(fbe) and fb_events.is_public_ish(fbe):
                mr.increment('nonpublic-and-large')
            privacy = fbe['info'].get('privacy', 'UNKNOWN')
            mr.increment('privacy-%s' % privacy)

            start_date = e.start_time.strftime(
                '%Y-%m-%d') if e.start_time else ''
            yield '%s\n' % '\t'.join(
                str(x)
                for x in [e.fb_event_id, start_date, privacy, attendees])
        except fb_api.NoFetchedDataException:
            logging.error("skipping row for event id %s", e.fb_event_id)


map_dump_private_events = fb_mapreduce.mr_wrap(count_private_events)


def mr_private_events(fbl):
    fb_mapreduce.start_map(
        fbl,
        'Dump Private Events',
        'dancedeets.servlets.tools.map_dump_private_events',
        'dancedeets.events.eventdata.DBEvent',
        handle_batch_size=80,
        queue=None,
        output_writer_spec=
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        output_writer={
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
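count_private_events yields tab-separated rows of event id, start date,
privacy, and attendee count. A sketch of tallying attendance per privacy
level from that output (the aggregation is mine; only the column order comes
from the code above):

import collections

def tally_privacy(lines):
    # Columns follow the '\t'.join([fb_event_id, start_date, privacy,
    # attendees]) layout emitted above.
    counts = collections.Counter()
    for line in lines:
        _event_id, _start_date, privacy, attendees = line.rstrip('\n').split('\t')
        counts[privacy] += int(attendees)
    return counts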
Example 9
                db_event.visible_to_fb_uids = visible_to_fb_uids
                db_event.put()
            # Let's update the DBEvent as necessary (note, this uses the last-updated FBLookup)
            # Unfortunately, we failed to get anything in our fbl, as it was raising an ExpiredOAuthToken
            # So instead, let's call it and just have it use the db_event.fb_event
            if fbl:
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
    if events_to_update:
        event_updates.update_and_save_fb_events(events_to_update, disable_updates=disable_updates)


def yield_resave_display_event(fbl, all_events):
    event_updates.resave_display_events(all_events)


map_resave_display_event = fb_mapreduce.mr_wrap(yield_resave_display_event)


def yield_load_fb_event(fbl, all_events):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        disable_updates = params['disable_updates']
        only_if_updated = params['only_if_updated']
    else:
        disable_updates = []
        only_if_updated = True

    for event in all_events:
        # Temporary hack to fix our double-street events
        verticals = sorted(list(set(event.verticals)))
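yield_load_fb_event reads disable_updates and only_if_updated from the mapper
params, so the job that starts it must pass them through extra_mapper_params
(the same mechanism used in mapreduce_scrape_all_sources above). A sketch of
such a kickoff, with an illustrative job name and handler path (not taken
from the source):

def mr_load_fb_events_sketch(fbl, disable_updates=None, only_if_updated=True):
    fb_mapreduce.start_map(
        fbl,
        'Load FB Events',
        'dancedeets.event_updates.map_load_fb_event',  # illustrative path
        'dancedeets.events.eventdata.DBEvent',
        extra_mapper_params={
            'disable_updates': disable_updates or [],
            'only_if_updated': only_if_updated,
        },
    )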
Example 10
            source.compute_derived_properties(fb_source_common, fb_source_data)
            return source
        return None
    finally:
        fbl.allow_cache = original_allow_cache


def create_sources_from_event(fbl, db_event):
    logging.info('create_sources_from_event: %s', db_event.id)
    create_source_from_id(fbl, db_event.owner_fb_uid, verticals=db_event.verticals)
    for admin in db_event.admins:
        if admin['id'] != db_event.owner_fb_uid:
            create_source_from_id(fbl, admin['id'], verticals=db_event.verticals)


map_create_sources_from_event = fb_mapreduce.mr_wrap(create_sources_from_event)


def explode_per_source_count(pe):
    db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id)

    is_potential_event = pe.match_score > 0
    real_event = db_event is not None
    false_negative = bool(db_event and not is_potential_event)
    result = (is_potential_event, real_event, false_negative)

    for source_id in pe.source_ids_only():
        yield (source_id, json.dumps(result))


def combine_source_count(source_id, counts_to_sum):
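combine_source_count is cut off above. Since explode_per_source_count yields
a JSON-encoded (is_potential_event, real_event, false_negative) triple per
source id, a plausible reducer simply sums the booleans; a sketch under that
assumption (not the project's actual reducer):

import json

def combine_source_count_sketch(source_id, counts_to_sum):
    # Sum the JSON triples emitted by explode_per_source_count above.
    potential = real = false_negatives = 0
    for raw in counts_to_sum:
        is_potential_event, real_event, false_negative = json.loads(raw)
        potential += int(is_potential_event)
        real += int(real_event)
        false_negatives += int(false_negative)
    return source_id, potential, real, false_negatives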
Example 16
        # If it's coming up soon
        elif start_time_delta.days < 30:
            changefreq_node.text = 'daily'

        else:
            changefreq_node.text = 'weekly'

        url_node.append(loc_node)
        url_node.append(changefreq_node)
        url_node.append(priority_node)
        # prints out as one line
        yield '%s\n' % etree.tostring(url_node)


map_sitemap_event = fb_mapreduce.mr_wrap(yield_sitemap_event)
sitemap_event = fb_mapreduce.nomr_wrap(yield_sitemap_event)
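
The visible branches above map how soon an event starts to a sitemap
changefreq. Restated as a standalone helper (the under-30-days and fallback
cases come from the snippet; its first branch is truncated, so any
more-frequent tier is not reproduced here):

def changefreq_for_event(start_time_delta):
    # "If it's coming up soon", refresh daily; otherwise weekly.
    if start_time_delta.days < 30:
        return 'daily'
    return 'weekly'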


@app.route('/tasks/generate_sitemaps')
class ReloadEventsHandler(base_servlet.BaseTaskFacebookRequestHandler):
    def get(self):
        queue = self.request.get('queue', 'fast-queue')
        time_period = self.request.get('time_period', None)
        vertical = self.request.get('vertical', None)

        filters = []
        if vertical:
            filters.append(('verticals', '=', vertical))
            vertical_string = '%s ' % vertical
        else: