def reverse_image_search(image_file_stream, bit_difference_threshold):
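    """Finds ad clusters containing images similar to the provided image.

    Computes the dhash of the image in image_file_stream, finds all simhashes
    in the image simhash BKTree within bit_difference_threshold bits of it, and
    returns ad cluster details for the matching archive IDs.
    """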
    image_dhash = get_image_dhash_as_int(image_file_stream)
    logging.info(
        'Got reverse_image_search request: %s bit_difference_threshold, file with dhash %x',
        bit_difference_threshold, image_dhash)
    image_simhash_tree = get_image_simhash_bktree()

    found = image_simhash_tree.find(
        ArchiveIDAndSimHash(sim_hash=image_dhash, archive_id=-1),
        bit_difference_threshold)
    logging.info('%d similar image archive IDs: %s', len(found), found)
    # BKTree.find returns tuples of form (bit difference, value). This extracts a set of all
    # archive IDs found.
    archive_ids = {x[1].archive_id for x in found}
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        return db_interface.ad_cluster_details_for_archive_ids(
            list(archive_ids),
            min_date=None,
            max_date=None,
            region=None,
            gender=None,
            age_group=None,
            language=None,
            order_by=None,
            order_direction=None)


def get_image_simhash_bktree():
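    """Constructs a BKTree of all ad creative image simhashes.

    Each simhash is stored once, paired with the lowest archive_id that has it,
    and the tree uses the number of differing bits as its distance metric.
    """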
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        simhash_to_archive_id_set = db_interface.all_ad_creative_image_simhashes(
        )

    total_sim_hashes = len(simhash_to_archive_id_set)
    logging.info('Got %d image simhashes to process.', total_sim_hashes)

    # Create BKTree with dhash bit difference function as distance_function, used to find similar
    # hashes
    image_simhash_tree = pybktree.BKTree(get_num_bits_different)

    sim_hashes_added_to_tree = 0
    tree_construction_start_time = time.time()
    for sim_hash, archive_id_set in simhash_to_archive_id_set.items():
        # Add single entry in BK tree for simhash with lowest archive_id.
        image_simhash_tree.add(
            ArchiveIDAndSimHash(sim_hash=sim_hash,
                                archive_id=min(archive_id_set)))
        sim_hashes_added_to_tree += 1
        if sim_hashes_added_to_tree % 1000 == 0:
            logging.debug('Added %d/%d simhashes to BKtree.',
                          sim_hashes_added_to_tree, total_sim_hashes)
    logging.info('Constructed BKTree in %s seconds',
                 (time.time() - tree_construction_start_time))
    return image_simhash_tree


def get_cluster_details_from_archive_id(archive_id):
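    """Redirects to the ad cluster details endpoint for the cluster containing
    the given archive ID, or aborts with 404 if no cluster is found.
    """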
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        ad_cluster_id = db_interface.get_cluster_id_from_archive_id(archive_id)
    if ad_cluster_id is None:
        abort(404)
    return redirect(
        url_for('ad_screener_core.get_ad_cluster_details',
                ad_cluster_id=ad_cluster_id))


def get_cluster_languages_code_to_name():
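    """Returns a map of cluster language codes to human-readable language names.

    Names come from LANGUAGE_CODE_TO_NAME_OVERRIDE_MAP if present, otherwise
    from pycountry; codes that cannot be resolved map to themselves.
    """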
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        language_code_list = db_interface.cluster_languages()
    language_code_to_name = {}
    for language_code in language_code_list:
        if language_code in LANGUAGE_CODE_TO_NAME_OVERRIDE_MAP:
            language_code_to_name[language_code] = (
                LANGUAGE_CODE_TO_NAME_OVERRIDE_MAP[language_code])
        else:
            try:
                language_code_to_name[language_code] = pycountry.languages.get(
                    alpha_2=language_code).name
            except AttributeError as err:
                logging.info(
                    'Unable to get language name for language code %s. error: %s',
                    language_code, err)
                language_code_to_name[language_code] = language_code
    return language_code_to_name


def set_ad_feedback_label(ad_cluster_id, feedback_label):
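    """Records a user's "is this ad problematic" feedback label against every
    ad in the specified cluster.

    Aborts with 422 if the cluster has no ads or the label name is unknown.
    """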
    with db_functions.get_ad_info_database_connection(
    ) as ad_info_db_connection:
        cluster_archive_ids = cluster_additional_ads(
            db_functions.AdsIfoDBInterface(ad_info_db_connection),
            ad_cluster_id)
        if not cluster_archive_ids:
            abort(422)

    with db_functions.get_ad_screener_database_connection() as db_connection:
        db_interface = db_functions.AdScreenerDBInterface(db_connection)
        label_name_to_id = db_interface.is_this_ad_problematic_label_name_to_id(
        )
        if feedback_label not in label_name_to_id:
            abort(422)

        db_interface.insert_is_this_ad_problematic_label(
            current_user.get_id(), ad_cluster_id, cluster_archive_ids,
            label_name_to_id[feedback_label])

    return {'ad_cluster_id': ad_cluster_id, 'feedback_label': feedback_label}


def suggest_ad_cluster_topics(ad_cluster_id):
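    """Records a user's suggested topics and comment against every ad in the
    specified cluster.

    Expects request JSON with 'topics' and 'comment' keys. Aborts with 422 if
    the cluster has no ads.
    """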
    try:
        topics = request.json['topics']
        comment = request.json['comment']
        with db_functions.get_ad_info_database_connection(
        ) as ad_info_db_connection:
            cluster_archive_ids = cluster_additional_ads(
                db_functions.AdsIfoDBInterface(ad_info_db_connection),
                ad_cluster_id)
            if not cluster_archive_ids:
                abort(422)

        with db_functions.get_ad_screener_database_connection(
        ) as db_connection:
            db_interface = db_functions.AdScreenerDBInterface(db_connection)
            db_interface.insert_user_suggested_topic_for_ad_cluster(
                current_user.get_id(), ad_cluster_id, cluster_archive_ids,
                topics, comment)

        return {'ad_cluster_id': ad_cluster_id}
    except KeyError:
        return {'Error': 'Request JSON must include both topics and comment'}


def get_ad_cluster_data_from_full_text_search(query, page_id, min_date,
                                              max_date, region, gender,
                                              age_group, language, order_by,
                                              order_direction, limit, offset):
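    """Runs a full text search against elastic search, then fetches ad cluster
    details for the matching archive IDs from the database.
    """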
    es_max_results = min(1000 * limit, 10000)
    query_results = elastic_search.query_elastic_search(
        cluster_url=elastic_search.DEFAULT_AD_SCREENER_ES_CLUSTER,
        ad_creative_query=query,
        max_results=es_max_results,
        page_id_query=page_id,
        ad_delivery_start_time=min_date,
        ad_delivery_stop_time=max_date,
        return_archive_ids_only=True)
    logging.debug('Full text search results: %s', query_results)
    archive_ids = query_results['data']
    logging.debug('Full text search returned %d archive_ids: %s',
                  len(archive_ids), archive_ids)
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        # TODO(macpd): use the archive_ids from search results for screenshot cover photo.
        return db_interface.ad_cluster_details_for_archive_ids(
            archive_ids, min_date, max_date, region, gender, age_group,
            language, order_by, order_direction, limit, offset)


def get_topic_id_to_name_map():
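    """Returns a map of topic ID to topic name."""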
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        return db_interface.topics()


def cached_get_ad_cluster_details(ad_cluster_id):
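    """Assembles full details for an ad cluster and returns them as a JSON
    string.

    Details include spend and impressions by region and demographic, topics,
    advertiser info, funder names, metadata, and languages.
    """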
    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)

        ad_cluster_data = defaultdict(list)
        ad_cluster_data['ad_cluster_id'] = ad_cluster_id
        region_impression_results = db_interface.ad_cluster_region_impression_results(
            ad_cluster_id)
        for row in region_impression_results:
            ad_cluster_data['region_impression_results'].append({
                'region': row['region'],
                'min_spend': row['min_spend_sum'],
                'max_spend': row['max_spend_sum'],
                'min_impressions': row['min_impressions_sum'],
                'max_impressions': row['max_impressions_sum']
            })

        demo_impression_results = db_interface.ad_cluster_demo_impression_results(
            ad_cluster_id)
        for row in demo_impression_results:
            ad_cluster_data['demo_impression_results'].append({
                'age_group': row['age_group'],
                'gender': row['gender'],
                'min_spend': row['min_spend_sum'],
                'max_spend': row['max_spend_sum'],
                'min_impressions': row['min_impressions_sum'],
                'max_impressions': row['max_impressions_sum']
            })

        cluster_topics = db_interface.ad_cluster_topics(ad_cluster_id)
        if cluster_topics:
            ad_cluster_data['topics'] = ', '.join(cluster_topics)

        ad_cluster_data['advertiser_info'] = cluster_advertiser_info(
            db_interface, ad_cluster_id)
        ad_cluster_data['funding_entity'] = list(
            db_interface.ad_cluster_funder_names(ad_cluster_id))
        ad_cluster_metadata = db_interface.ad_cluster_metadata(ad_cluster_id)
        ad_cluster_data['min_spend_sum'] = ad_cluster_metadata['min_spend_sum']
        ad_cluster_data['max_spend_sum'] = ad_cluster_metadata['max_spend_sum']
        ad_cluster_data['min_impressions_sum'] = ad_cluster_metadata[
            'min_impressions_sum']
        ad_cluster_data['max_impressions_sum'] = ad_cluster_metadata[
            'max_impressions_sum']
        ad_cluster_data['cluster_size'] = ad_cluster_metadata['cluster_size']
        ad_cluster_data['num_pages'] = ad_cluster_metadata['num_pages']
        canonical_archive_id = ad_cluster_metadata['canonical_archive_id']
        ad_cluster_data['canonical_archive_id'] = canonical_archive_id
        ad_cluster_data['min_ad_creation_date'] = (
            ad_cluster_metadata['min_ad_delivery_start_time'].isoformat())
        ad_cluster_data['max_ad_creation_date'] = (
            ad_cluster_metadata['max_last_active_date'].isoformat())
        ad_cluster_data['url'] = (AD_SCREENSHOT_URL_TEMPLATE % {
            'archive_id': canonical_archive_id
        })
        ad_cluster_data['archive_ids'] = cluster_additional_ads(
            db_interface, ad_cluster_id)
        # These fields are generated by NYU and show up in the Metadata tab
        ad_cluster_data['type'] = ', '.join(
            db_interface.ad_cluster_types(ad_cluster_id))
        ad_cluster_data['entities'] = ', '.join(
            db_interface.ad_cluster_recognized_entities(ad_cluster_id))
        language_code_to_name = get_cluster_languages_code_to_name()
        ad_cluster_data['languages'] = [
            language_code_to_name.get(lang, None)
            for lang in db_interface.ad_cluster_languages(ad_cluster_id)
        ]
    return json.dumps(ad_cluster_data)


def handle_ad_cluster_search(topic_id, min_date, max_date, gender, age_range,
                             region, language, order_by, order_direction,
                             num_requested, offset, full_text_search_query,
                             page_id):
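    """Searches for ad clusters by topic, page ID, or full text query.

    Validates pagination arguments, normalizes date and filter arguments, and
    dispatches to full text search, page ID lookup, or top clusters by spend
    for the given topic.
    """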
    if topic_id is not None and full_text_search_query is not None:
        abort(400,
              description='topic cannot be combined with full_text_search.')

    try:
        num_requested = int(num_requested)
    except ValueError:
        abort(400, description='numResults must be an integer')
    try:
        offset = int(offset)
    except ValueError:
        abort(400, description='offset must be an integer')

    if num_requested > 20 or offset > 1000:
        abort(400,
              description=('numResults greater than 20, or offset greater '
                           'than 1000, not allowed'))

    # This date parsing is needed because the FE passes raw UTC dates in Zulu
    # time. We could simplify this by not sending the time from the FE at all;
    # for now we strip the time info and keep only the date.
    if min_date and max_date:
        try:
            min_date = datetime.datetime.strptime(
                min_date, "%Y-%m-%dT%H:%M:%S.%fZ").date()
        except ValueError:
            min_date = date_utils.parse_date_arg(min_date)

        try:
            max_date = datetime.datetime.strptime(
                max_date, "%Y-%m-%dT%H:%M:%S.%fZ").date()
        except ValueError:
            max_date = date_utils.parse_date_arg(max_date)

    if gender:
        if gender.lower() == 'all':
            gender = None
        elif gender.lower() == 'f':
            gender = 'female'
        elif gender.lower() == 'm':
            gender = 'male'
        elif gender.lower() == 'u':
            gender = 'unknown'
    if region and region.lower() == 'all':
        region = None
    if age_range and age_range.lower() == 'all':
        age_range = None
    if language and language.lower() == 'all':
        language = None

    if full_text_search_query:
        return get_ad_cluster_data_from_full_text_search(
            full_text_search_query,
            page_id=page_id,
            min_date=min_date,
            max_date=max_date,
            region=region,
            gender=gender,
            age_group=age_range,
            language=language,
            order_by=order_by,
            order_direction=order_direction,
            limit=num_requested,
            offset=offset)

    if page_id:
        with db_functions.get_ad_info_database_connection() as db_connection:
            db_interface = db_functions.AdsIfoDBInterface(db_connection)
            return db_interface.ad_cluster_details_for_page_id(
                page_id,
                min_date=min_date,
                max_date=max_date,
                region=region,
                gender=gender,
                age_group=age_range,
                language=language,
                order_by=order_by,
                order_direction=order_direction,
                limit=num_requested,
                offset=offset)

    with db_functions.get_ad_info_database_connection() as db_connection:
        db_interface = db_functions.AdsIfoDBInterface(db_connection)
        return db_interface.topic_top_ad_clusters_by_spend(
            topic_id,
            min_date=min_date,
            max_date=max_date,
            region=region,
            gender=gender,
            age_group=age_range,
            language=language,
            order_by=order_by,
            order_direction=order_direction,
            limit=num_requested,
            offset=offset,
            min_topic_percentage_threshold=0.25)