# Build the keyword query for one person, record it, and optionally count
# matching stories.
# NOTE(review): this first statement sits at column 0 while the rest of the
# fragment is indented; the leading indentation looks lost, and the enclosing
# block is not visible here -- confirm before running.
query = ""
    name_key = person['full_name']
    # An entry in custom_query_keywords overrides the name-based query.
    if name_key in custom_query_keywords:
        log.info("  adjustment: %s -> %s" %
                 (name_key, custom_query_keywords[name_key]))
        query = custom_query_keywords[name_key]
    # Otherwise require both name parts, each as its own quoted phrase.
    elif 'first_name' in person.keys() and 'last_name' in person.keys():
        query = '"{0}" AND "{1}"'.format(person['first_name'],
                                         person['last_name'])
    # Fall back to the full name as a single quoted phrase.
    else:
        query = '"{0}"'.format(person['full_name'])

    # Filter options (only option b is active):
    # a) limit query to correct date range only
    # query_filter = build_mpv_daterange(row)
    # b) also limit query to us media sources (msm, regional, partisan sets)
    date_range_query = build_mpv_daterange(person['date_of_death'])

    # Active filter: the date-range clause AND-ed with media_filter_query.
    query_filter = "( " + date_range_query + " AND " + media_filter_query + " )"
    # c) also limit query to non-spidered us media sources (msm, regional, partisan sets)
    # query_filter = build_mpv_daterange(row) + " AND (tags_id_media:(8875027 2453107 129 8878292 8878293 8878294)) " + " AND NOT (tags_id_stories:8875452) "
    # Record the keyword+date query and the date-only query for this person.
    # Both use date_range_query alone; the media filter is not included here.
    queries.append("(" + query + " AND " + date_range_query + ")")
    no_keyword_queries.append("(" + date_range_query + ")")

    if WRITE_STORY_COUNT_CSVS:
        # Collect this person's counts; both count_stories calls share
        # the same query_filter.
        data = {}
        data['full_name'] = name_key
        data['date_of_death'] = person['date_of_death']
        # '*' presumably matches every story within the filter -- confirm
        # against count_stories.
        data['total_stories'] = count_stories('*', query_filter)
        data['stories_about_person'] = count_stories(query, query_filter)
        # Ratio of the person-specific count to the overall count.
        # NOTE(review): raises ZeroDivisionError when total_stories is 0.
        normalized_story_count = float(data['stories_about_person']) / float(
            data['total_stories'])
    # Disabled filter that would skip every person except the one named:
    # if person['full_name']!="Akai Gurley":
    #   continue
    # build the in-controversy query for stories about this person
    # The query always starts with the topic clause; the branches below
    # append exactly one name clause to it with AND.
    query = "{~ topic:" + CONTROVERSY_ID + "}"

    name_key = person["full_name"]
    # An entry in custom_query_keywords replaces the name-based clause.
    if name_key in custom_query_keywords:
        log.info("  adjustment: %s -> %s" % (name_key, custom_query_keywords[name_key]))
        query += " AND " + custom_query_keywords[name_key]
    # Otherwise require both name parts, each as its own quoted phrase.
    elif "first_name" in person.keys() and "last_name" in person.keys():
        query += ' AND "{0}" AND "{1}"'.format(person["first_name"], person["last_name"])
    # Fall back to the full name as a single quoted phrase.
    else:
        query += ' AND "{0}"'.format(person["full_name"])

    # The filter is whatever build_mpv_daterange returns for the date of death.
    query_filter = build_mpv_daterange(person["date_of_death"])

    # fetch the stories
    # The fetch is timed and its duration added to the running total
    # time_spent_querying.
    query_start = time.time()
    stories = fetch_all_stories(query, query_filter)
    query_duration = float(time.time() - query_start)
    time_spent_querying = time_spent_querying + query_duration

    # Timestamp and counters set up before the per-story loop; how they are
    # consumed lies past the end of this excerpt.
    queue_start = time.time()
    duplicate_stories = 0
    urls_already_done = []  # build a list of unique urls for de-duping

    log.info("  found %d stories" % len(stories))
    for story in stories:
        # figure out the base url so we can de-duplicate results from MC
        # Here base_url starts out as the story's full url; any further
        # normalisation is not visible in this excerpt.
        story["base_url"] = story["url"]
Пример #3
0
    # (tail of a commented-out statement whose start is not visible here)
    #   continue
    # build the in-controversy query for stories about this person
    # The query always starts with the topic clause; the branches below
    # append exactly one name clause to it with AND.
    query = "{~ topic:" + CONTROVERSY_ID + "}"

    name_key = person['full_name']
    # An entry in custom_query_keywords replaces the name-based clause.
    if name_key in custom_query_keywords:
        log.info("  adjustment: %s -> %s" %
                 (name_key, custom_query_keywords[name_key]))
        query += " AND " + custom_query_keywords[name_key]
    # Otherwise require both name parts, each as its own quoted phrase.
    elif 'first_name' in person.keys() and 'last_name' in person.keys():
        query += ' AND "{0}" AND "{1}"'.format(person['first_name'],
                                               person['last_name'])
    # Fall back to the full name as a single quoted phrase.
    else:
        query += ' AND "{0}"'.format(person['full_name'])

    # The filter is whatever build_mpv_daterange returns for the date of death.
    query_filter = build_mpv_daterange(person['date_of_death'])

    # fetch the stories
    # The fetch is timed and its duration added to the running total
    # time_spent_querying.
    query_start = time.time()
    stories = fetch_all_stories(query, query_filter)
    query_duration = float(time.time() - query_start)
    time_spent_querying = time_spent_querying + query_duration

    # Timestamp and counters set up before the per-story loop; how they are
    # consumed lies past the end of this excerpt.
    queue_start = time.time()
    duplicate_stories = 0
    urls_already_done = []  # build a list of unique urls for de-duping

    log.info("  found %d stories" % len(stories))
    for story in stories:
        # figure out the base url so we can de-duplicate results from MC
        # Here base_url starts out as the story's full url; any further
        # normalisation is not visible in this excerpt.
        story['base_url'] = story['url']
# For each person: build a keyword query, record it alongside a date-only
# query, and (when WRITE_STORY_COUNT_CSVS is set) count matching stories.
for person in data:
    log.info("  Working on %s" % person['full_name'])
    query = ""
    name_key = person['full_name']
    # An entry in custom_query_keywords overrides the name-based query.
    if name_key in custom_query_keywords:
        log.info("  adjustment: %s -> %s" % (name_key,custom_query_keywords[name_key]))
        query = custom_query_keywords[name_key]
    # Otherwise require both name parts, each as its own quoted phrase.
    elif 'first_name' in person.keys() and 'last_name' in person.keys():
        query = '"{0}" AND "{1}"'.format(person['first_name'], person['last_name']) 
    # Fall back to the full name as a single quoted phrase.
    else:
        query = '"{0}"'.format(person['full_name'])

    # Filter options (only option b is active):
    # a) limit query to correct date range only
    # query_filter = build_mpv_daterange(row)
    # b) also limit query to us media sources (msm, regional, partisan sets)
    date_range_query = build_mpv_daterange(person['date_of_death'])

    # Active filter: the date-range clause AND-ed with media_filter_query.
    query_filter = "( " + date_range_query + " AND "+media_filter_query+" )"
    # c) also limit query to non-spidered us media sources (msm, regional, partisan sets)
    # query_filter = build_mpv_daterange(row) + " AND (tags_id_media:(8875027 2453107 129 8878292 8878293 8878294)) " + " AND NOT (tags_id_stories:8875452) "
    # Record the keyword+date query and the date-only query for this person.
    # Both use date_range_query alone; the media filter is not included here.
    queries.append("("+query+" AND "+date_range_query+")")
    no_keyword_queries.append("(" + date_range_query +")")

    if WRITE_STORY_COUNT_CSVS:
        # NOTE(review): this rebinds `data`, the name the loop iterates over.
        # The running loop is unaffected (the for statement already holds its
        # iterator), but from here on `data` no longer refers to the
        # collection of people -- consider a distinct name.
        data = {}
        data['full_name'] = name_key
        data['date_of_death'] = person['date_of_death']
        # '*' presumably matches every story within the filter -- confirm
        # against count_stories.
        data['total_stories'] = count_stories('*',query_filter)
        data['stories_about_person'] = count_stories(query,query_filter)
        # Ratio of the person-specific count to the overall count.
        # NOTE(review): raises ZeroDivisionError when total_stories is 0.
        normalized_story_count = float(data['stories_about_person']) / float(data['total_stories'])
        # Stored as a string with 15 digits after the decimal point.
        data['normalized_stories_about_person'] = "{0:.15f}".format(normalized_story_count)