def count_comments_likes(months=12):
    '''
    Returns, per user:
        * total number of comments
        * total number of likes
        * number of photos with 0 comments
        * number of photos with 0 likes
        * number of photos with > 0 comments
        * number of photos with > 0 likes
    '''
    img_usr = __data.load_image_data(months)[["user_id", "image_id"]]
    comments_likes = __img_f.final_like_and_comments()

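    # Boolean masks for photos with and without engagement; turning them into
    # 0/1 indicator columns below lets one groupby-sum produce all the
    # per-user photo counts at once.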
    com_l1 = comments_likes.comments > 0
    com_l2 = comments_likes.comments == 0

    like_l1 = comments_likes.likes > 0
    like_l2 = comments_likes.likes == 0

    comments_likes["num_images_with_comments"] = __np.where(com_l1, 1, 0)
    comments_likes["num_images_no_comments"] = __np.where(com_l2, 1, 0)

    comments_likes["num_images_with_likes"] = __np.where(like_l1, 1, 0)
    comments_likes["num_images_no_likes"] = __np.where(like_l2, 1, 0)

    out = img_usr.merge(comments_likes, on="image_id", how="left").drop("image_id",axis=1)\
                    .groupby("user_id").sum().reset_index()

    out = out.rename(columns={
        "likes": "total_number_of_likes",
        "comments": "total_number_of_comments"
    })

    return out
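# A minimal usage sketch (hypothetical call; assumes the private __data and
# __img_f loaders imported by this module are available):
#
#   engagement = count_comments_likes(months=6)
#   # one row per user_id with total_number_of_comments, total_number_of_likes
#   # and the four zero/non-zero photo counts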


def ratio_of_topics(confidence=80, subset=True, months=12):
    '''
    NOTE: This is an expensive operation.

    Returns a matrix of percentages showing, for each user, in what share of
    their photos each object (topic) appears.

    Args:
        confidence: minimum confidence that the topic is in the image
        subset: if True, return only the subset of topics that we selected in
                advance as indicators of lifestyle and which were not sparse
        months: number of months of image data to load
    '''

    image_data = __data.load_image_data(months)
    #__pd.read_pickle("../../data/Visual_well_being/image_data.pickle")

    user_img = image_data[["image_id", "user_id"]].drop_duplicates()

    ob = __img_f.binary_object_matrix(confidence)
    photo_counts = instagram_account_stats()[["user_id", "user_posted_photos"]]

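    # Right-merge so every user photo is kept, even those with no detected
    # objects; summing the binary matrix per user then gives per-topic photo
    # counts.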
    counts_per_user = ob.merge(user_img,how="right",on="image_id")\
                .groupby("user_id").sum()\
                .merge(photo_counts,on="user_id",how="inner")

    f = counts_per_user.set_index("user_id")
    # Drop sparse topics: keep only columns whose total count exceeds 30.
    df = f.loc[:, f.sum() > 30]

    df.columns = [
        "ratio_" + c if c != "user_posted_photos" else c for c in df.columns
    ]

    df = df.reset_index()

    out = df.iloc[:,1:-1]\
                        .divide(
                                df.user_posted_photos,
                                axis=0
                                )

    out["user_id"] = df.user_id

    if subset:
        # Topics selected in advance as lifestyle indicators that were not
        # sparse; they are matched against the "ratio_" column names built above.
        topics_considered = ["Person","Plant","Food","Collage","Animal","Outdoors","Pet","Book","Dog","Canine","Sky","Alcohol","Crowd","Toy","Cat","Coast","Tree","Beach","Sport","Teddy Bear","Sunlight","Light","Drawing","Sea Life","TV","Dusk","Bikini","Sunrise","Sunset","Swimwear","Selfie","Beard","Woman","Cocktail","Pool","Performer","Coffee Cup","Tattoo","Downtown","Musical Instrument","Festival","City","Laptop","Pizza","Cloud","Beer Bottle","Money","Club","Airplane","Sketch","Sandwich","Cafeteria","Breakfast","Child"]
        keep = ["user_id"] + ["ratio_" + t for t in topics_considered]
        out = out[[c for c in keep if c in out.columns]]

    return out
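# A minimal usage sketch (hypothetical call):
#
#   topic_ratios = ratio_of_topics(confidence=80, subset=True)
#   # one row per user_id; each ratio_<Topic> column is the share of that
#   # user's posted photos in which the topic was detected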


def avg_posts_per_day(months=12):
    '''
    Returns the average number of posts per day for each user, split into 7
    averages:
    `early_day`: 8:00-12:00
    `late_day`: 12:00-20:00
    `early_night`: 20:00-00:00
    `late_night`: 00:00-8:00

    `day`: 8:00-20:00
    `night`: 20:00-8:00
    `whole_date`: the average over the whole day
    '''

    image_data = __data.load_image_data(months)

    x = image_data[["image_posted_time", "image_id",
                    "user_id"]].drop_duplicates()

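    # between_time() needs a DatetimeIndex, hence the set_index/isin
    # round-trips below. Both endpoints are inclusive, so a post at exactly
    # 12:00 falls into both the early_day and late_day windows.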
    early_day_b = x.image_posted_time.isin(
        x.set_index("image_posted_time").between_time('8:00', '12:00').index)
    late_day_b = x.image_posted_time.isin(
        x.set_index("image_posted_time").between_time('12:00', '20:00').index)
    early_night_b = x.image_posted_time.isin(
        x.set_index("image_posted_time").between_time('20:00', '00:00').index)
    late_night_b = x.image_posted_time.isin(
        x.set_index("image_posted_time").between_time('00:00', '8:00').index)

    day_b = x.image_posted_time.isin(
        x.set_index("image_posted_time").between_time('8:00', '20:00').index)
    night_b = x.image_posted_time.isin(
        x.set_index("image_posted_time").between_time('20:00', '8:00').index)

    x["avg_posts_early_day"] = __np.where(early_day_b, 1, 0)
    x["avg_posts_late_day"] = __np.where(late_day_b, 1, 0)
    x["avg_posts_early_night"] = __np.where(early_night_b, 1, 0)
    x["avg_posts_late_night"] = __np.where(late_night_b, 1, 0)

    x["avg_posts_day"] = __np.where(day_b, 1, 0)
    x["avg_posts_night"] = __np.where(night_b, 1, 0)
    x["avg_posts_whole_date"] = 1

    # Count posts per user per calendar day; only days on which the user
    # posted at all appear here, so the averages below are over active days.
    xx = x.drop(columns="image_id")\
    .groupby([x.user_id, x.image_posted_time.dt.date])\
    .sum().drop("user_id", axis=1).reset_index()

    out = xx.groupby("user_id").mean().reset_index()

    return out
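# A minimal usage sketch (hypothetical call):
#
#   posting_rates = avg_posts_per_day()
#   # one row per user_id with avg_posts_early_day ... avg_posts_whole_date,
#   # averaged over the days on which the user actually posted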


def avg_ratio_gender(confidence=90, months=12):
    '''
    Returns each user's average per-image gender ratio.
    '''
    image_data = __data.load_image_data(months)

    ratio_gender = __img_f.ratio_gender(confidence)
    avg_ratio_gender = __pd.merge(ratio_gender,
                                  image_data[['user_id', 'image_id']],
                                  on='image_id',
                                  how='right')
    avg_ratio_gender.fillna(0, inplace=True)
    return avg_ratio_gender.groupby('user_id').mean().reset_index()


def average_num_faces_per_image_and_emotion(months=12):
    '''
    Returns, per user, the average number of faces per emotion per image.
    '''
    image_data = __data.load_image_data(months)

    num_faces_df = __img_f.number_of_faces_per_emotion()
    num_faces_df = __pd.merge(num_faces_df,
                              image_data[['user_id', 'image_id']],
                              on='image_id',
                              how='right')
    num_faces_df.fillna(0, inplace=True)
    return num_faces_df.groupby('user_id').mean().reset_index()


def instagram_account_stats():
    '''
    Returns account-level statistics for each Instagram user:
    * number of followers
    * number of accounts the user follows
    * number of posted photos
    '''

    image_data = __data.load_image_data()

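    # Account-level columns repeat on every image row, so selecting them and
    # dropping duplicates yields one row per user.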
    instagram_account_info = image_data[["user_id","user_followed_by","user_follows","user_posted_photos"]]\
                            .drop_duplicates()

    return instagram_account_info


def avg_number_of_faces_from_photos_with_faces(months=12):
    '''Returns the average number of faces across photos that contain faces.'''
    image_data = __data.load_image_data(months)

    #__pd.read_pickle("../../data/Visual_well_being/image_data.pickle")

    num_faces = __img_f.number_of_faces()

    out = image_data[["image_id","user_id"]]\
    .merge(num_faces,on="image_id",how="left")\
    .fillna(0)\
    .groupby("user_id")\
    .mean()\
    .rename(columns = {'number_of_face': 'avg_number_of_faces_over_images_with_faces'})\
    .reset_index()

    return out


def count_face_emotions(months=12):
    '''
    Returns, per user, the total count of detected faces for each emotion.
    '''
    emotion = __img_f.number_of_faces_per_emotion()
    img_usr = __data.load_image_data(months)[["user_id", "image_id"]]
    out = img_usr.merge(emotion,on="image_id",how="inner").drop(["image_id"],axis=1)\
                    .groupby("user_id").sum().reset_index()

    out = out.rename(
        columns={
            "ANGRY": "count_ANGRY",
            "CALM": "count_CALM",
            "CONFUSED": "count_CONFUSED",
            "DISGUSTED": "count_DISGUSTED",
            "HAPPY": "count_HAPPY",
            "SAD": "count_SAD",
            "SUPRISED": "count_SUPRISED"  # spelling mirrors the source column
        })

    return out


def avg_engagement(months=12):
    '''
    Returns the average number of likes and comments per user.
    '''
    image_data = __data.load_image_data(months)

    updated_metrics = __img_f.final_like_and_comments(months)
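    # Left merge keeps every photo; photos missing from the updated metrics
    # get NaN likes/comments, which the groupby-mean below skips.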
    image_data = image_data.merge(updated_metrics, how='left', on='image_id')
    avg_engagement = image_data[['user_id', 'likes', 'comments'
                                 ]].groupby('user_id').mean().reset_index()
    avg_engagement = avg_engagement.rename(columns={
        'likes': 'avg_likes',
        'comments': 'avg_comments'
    })

    return avg_engagement


def counts_objects(months=12, confidence=80):
    '''
    Returns, per user, the number of photos in which each object appears,
    keeping only objects detected in more than 30 photos overall.
    '''
    o = __img_f.binary_object_matrix(confidence)
    image_data = __data.load_image_data(months)

    user_img = image_data[["image_id", "user_id"]].drop_duplicates()

    out = o.merge(user_img,how="right",on="image_id")\
                .groupby("user_id").sum()\
                .fillna(0)

    # Drop sparse objects: keep only columns whose total count exceeds 30.
    out = out.loc[:, out.sum() > 30]

    out.columns = ["count_" + c for c in out.columns]

    out = out.reset_index()

    return out


def proportion_image_cluster(months=12):
    '''
    Returns, per user, the proportion of posted photos falling into each ANP
    cluster group.
    '''
    anp_cg = __img_f.anp_cluster_groups()
    u = instagram_account_stats()[["user_id", "user_posted_photos"]]

    image_data = __data.load_image_data(months)
    image_user = image_data[["image_id", "user_id"]]

    user_clusters = image_user.merge(anp_cg, on="image_id",
                                     how="left").fillna(0).drop("image_id",
                                                                axis=1)

    user_clusters = user_clusters.groupby("user_id").sum()\
                                        .reset_index()\
                                        .merge(u,on="user_id",how="inner")\
                                        .reset_index()

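    # The second reset_index above adds an "index" column in front, so
    # iloc[:, 2:-1] picks out just the cluster columns (skipping index and
    # user_id, and the trailing user_posted_photos) before dividing by each
    # user's photo count.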
    cluster_proportions = __pd.concat([user_clusters.user_id,
                        user_clusters.iloc[:,2:-1]\
                                    .divide(user_clusters.user_posted_photos,
                                            axis=0)
                       ],axis=1)

    return cluster_proportions


def filter_features(months=12):
    '''
    Returns, per user, the percentage of happy and depressed filters and the
    ratio of happy to depressed filters.
    '''
    image_data = __data.load_image_data(months)

    # Keep only filters, remove Unknown and Normal entries
    filter_data = image_data[~image_data.image_filter.
                             isin(['Normal', 'Unknown'])][[
                                 'image_id', 'image_filter'
                             ]]
    # Load the filter categories
    filter_categories = __pd.read_csv(
        '../../data/Visual_well_being/filter_categories.csv', sep=';')
    filter_categories = filter_categories.rename(
        columns={'class': 'happiness_class'})

    # Remove images whose filter is not associated with a category in
    # filter_categories (only 23 such images)
    filter_data = filter_data[filter_data.image_filter.isin(
        filter_categories['filter'])]

    # Add filter category information to the dataFrame
    filter_data = filter_data.merge(filter_categories,
                                    how='left',
                                    left_on='image_filter',
                                    right_on='filter').drop('filter', axis=1)

    # Create Dummies that will help to summarize happy filters and depressed filters later on.
    filter_dummies = __pd.get_dummies(
        filter_data['happiness_class']).rename(columns={
            0: 'depressed_filter',
            1: 'happy_filter'
        })
    filter_data['happy_filter'] = filter_dummies['happy_filter']
    filter_data['depressed_filter'] = filter_dummies['depressed_filter']
    filter_data = filter_data.drop(['image_filter', 'happiness_class'], axis=1)

    # Merge filter information back into the original image data
    image_data = image_data.merge(filter_data, how='left', on='image_id')

    # Create Filter features dataframe
    filter_features = image_data[[
        'user_id', 'happy_filter', 'depressed_filter'
    ]].groupby('user_id').sum().reset_index()
    filter_features['total_photos'] = image_data[[
        'user_id', 'user_posted_photos'
    ]].groupby('user_id').max().reset_index()['user_posted_photos']

    # Build the features
    filter_features['happy_flt_pct'] = filter_features[
        'happy_filter'] / filter_features['total_photos']
    filter_features['depressed_flt_pct'] = filter_features[
        'depressed_filter'] / filter_features['total_photos']
    filter_features['happy_to_depressed_flt_ratio'] = filter_features[
        'happy_filter'] / filter_features['depressed_filter']
    # Infinities arise when a user has zero depressed filters; treat as NaN
    filter_features = filter_features.replace([__np.inf, -__np.inf], __np.nan)
    # Drop columns that are no longer needed
    filter_features = filter_features.drop(
        ['happy_filter', 'depressed_filter', 'total_photos'], axis=1)

    return filter_features
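# A minimal usage sketch (hypothetical call; assumes the filter_categories.csv
# path used above exists):
#
#   flt = filter_features()
#   # one row per user_id with happy_flt_pct, depressed_flt_pct and
#   # happy_to_depressed_flt_ratio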