import pandas, pandas_ext, param, features as ft
import useful_stuff
from useful_stuff import *

# List of features to be defined by this module.
# PageRank-style scores over the user/blog like graph — presumably computed
# downstream from the transition probabilities loaded in setup_func.
feature_list = [
    ft.Feature("user_blog", "pagerank_by_like_share_postrank"),
    ft.Feature("user_blog", "pagerank_by_like_share_blogprob")
]

# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):
    """Load the like-share data needed to compute the pagerank features.

    feature_list   -- the Feature objects this module must produce data for
    store          -- data store exposing per-entity frames (user, blog, user_blog)
    recreate_setup -- flag controlling whether cached setup data may be reused
                      (not used in the visible portion of this function)
    """
    # Get blog share of user likes and user share of blog likes over stim period "all".
    # Use these as transition probabilities for user moving to other blogs, users.
    user_blog_df = store.user_blog.load_df(
        ["all_blog_like_user_like_share", "all_user_like_blog_post_share"],
        reset_index=True)
    # Rename to short names: bu_prob = blog->user probability, ub_prob = user->blog
    # probability (per the share columns they are renamed from).
    user_blog_df = user_blog_df.rename(
        columns={
            "all_blog_like_user_like_share": "bu_prob",
            "all_user_like_blog_post_share": "ub_prob"
        })
    # get list of stim users - they're the only ones we ultimately care about;
    # fillna(False) treats users with no is_stim value as non-stim
    tmp = store.user.load_df(['is_stim'], reset_index=True)
    stim_users = tmp[tmp.is_stim.fillna(False)].user_id

    # function to get all current user blog indexes for which data is needed
    # NOTE(review): source chunk is truncated here — body of this inner
    # function (and the rest of setup_func) is not visible.
    def get_user_blog_index(store):
if __name__ == "__main__":
    # Set random seed (both numpy's and stdlib's, for reproducibility)
    np.random.seed(SEED)
    random.seed(SEED)

    # Load dataset — HASC (human activity) train/test splits; shapes printed
    # below for a quick sanity check
    x_train, y_train, x_test, y_test = dataset.load_hasc()
    print(x_train.shape)
    print(y_test.shape)

    # Feature extractor
    # Number of features: 75 (time domain: 16*3 = 48, frequency domain: 8*3 = 24)
    # NOTE(review): translated from Japanese. The stated breakdown sums to 72,
    # not 75 — some extractors presumably emit more than one value per axis;
    # verify against the feature implementations.
    # Each entry is (name, Feature wrapping the stat function applied per frame).
    extractors = [
        ('min', features.Feature(np.amin)),
        ('max', features.Feature(np.amax)),
        ('mean', features.Feature(np.mean)),
        ('std', features.Feature(np.std)),
        ('first_quartiles', features.Feature(features.first_quartiles)),
        ('median', features.Feature(np.median)),
        ('third_quartiles', features.Feature(features.third_quartiles)),
        ('iqr', features.Feature(sp.iqr)),
        ('corrcoef', features.Feature(features.corrcoef)),
        ('abs_corrcoef', features.Feature(features.abs_corrcoef)),
        ('frame_init', features.Feature(features.frame_init)),
        ('frame_end', features.Feature(features.frame_end)),
        ('intensity', features.Feature(features.intensity)),
        ('skewness', features.Feature(features.skewness)),
        ('kurtosis', features.Feature(features.kurtosis)),
        ('zcr', features.Feature(features.zcr)),
        ('power_spectrum_features_8', features.Feature(features.fft_features))
    ]
    # NOTE(review): source chunk is truncated here — the rest of the script
    # (feature extraction / model fitting) is not visible.
import pandas, pandas_ext, param, features as ft
import useful_stuff
from useful_stuff import *

# List of features to be defined by this module: per-period count and
# like-share features for blog, user and user_blog entities.
# Periods: 'hist' (index -1), 'all' (0), and the three trailing weeks.
feature_list = []
for (name, period) in [('hist', -1), ('all', 0), ('weekM1', 1), ('weekM2', 2), ('weekM3', 3)]:
    # NOTE(review): indentation reconstructed from a collapsed source line.
    # The grouping assumed here is that the plain count features are skipped
    # for the 'hist' period (they presumably come from history files in another
    # module) while the share features are built for every period — confirm
    # against the original file.
    if period != -1:
        feature_list.append(ft.Feature('blog', name + '_post_ct',
                                       period_name = name, period_index = period,
                                       calc_type = 'blog_post_ct'))
        feature_list.append(ft.Feature('blog', name + '_like_ct',
                                       period_name = name, period_index = period,
                                       calc_type = 'blog_like_ct'))
        feature_list.append(ft.Feature('user', name + '_like_ct',
                                       period_name = name, period_index = period,
                                       calc_type = 'user_like_ct'))
    feature_list.append(ft.Feature('user_blog', name + '_blog_post_user_like_share',
                                   period_name = name, period_index = period,
                                   calc_type = 'blog_post_user_like_share'))
    feature_list.append(ft.Feature('user_blog', name + '_blog_like_user_like_share',
                                   period_name = name, period_index = period,
                                   calc_type = 'blog_like_user_like_share'))
    feature_list.append(ft.Feature('user_blog', name + '_user_like_blog_post_share',
                                   period_name = name, period_index = period,
                                   calc_type = "user_like_blog_post_share"))

# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):
    """Load per-period data needed to compute the requested features.

    Only the periods actually referenced by feature_list are collected.
    """
    # maps feature key -> data frame with that feature's data
    retval = dict()
    # distinct period indexes needed by the requested features
    needed_periods = list(set(f.period_index for f in feature_list))
    # NOTE(review): source chunk is truncated here — the rest of setup_func
    # is not visible.
df = df[(df.user_blog_hist_like_ct > 0) | (df.user_blog_all_user_like_blog_post_share > 0) | ((df.user_blog_pagerank_by_like_share_postrank > 0) & (df.user_blog_pagerank_by_like_share_postrank < 1000)) | ((df.user_post_topic_proximity_rank > 0) & (df.user_post_topic_proximity_rank < 1000))] # return data print "End candidates", nowstring() return df[["user_id", "post_id"]] # feature list feature_list = [ ft.Feature("post", "time_zone"), ft.Feature("user", "mean_stim_like_time_zone"), ft.Feature("user_post", "tz_proximity_to_stim_likes") ] def setup_func(feature_list, store, recreate_setup): # store, recreate_setup = ft.dev_store, True # get all tz; convert to vectors (location of tz looking at world from N pole, with GMT at left post_tz = jf.PostFile.get_df(["time_zone"]).time_zone post_vec = post_tz.apply(lambda x: (math.sin(x * (math.pi / 12.0)), math.cos(x * (math.pi / 12.0)))) # calculate average time zone of user likes using TZ vector df = store.user_post.load_df(["like_is_stim"], reset_index=True)
import pandas, pandas_ext, param, features as ft
import jsonfiles as jf
import useful_stuff
from useful_stuff import *

# List of features to be defined by this module: the base/raw features.
# fileclass names the json file class each series is loaded from;
# fileclass=None marks series derived from the others (e.g. week from date).
feature_list = [
    ft.Feature('post', 'date', fileclass=jf.PostFile),
    ft.Feature('post', 'blog_id', fileclass=jf.PostFile),
    ft.Feature('post', 'is_test', fileclass=jf.PostFile),
    ft.Feature('post', 'week', fileclass=None),
    ft.Feature('post', 'weekday', fileclass=None),
    ft.Feature('user_post', 'is_like', fileclass=jf.LikeFile),
    ft.Feature('user_post', 'like_date', fileclass=jf.LikeFile),
    ft.Feature('user_post', 'like_week', fileclass=None),
    ft.Feature('user', 'is_test', fileclass=jf.UserFile),
    ft.Feature('user', 'hist_like_ct', fileclass=jf.UserHistFile),
    ft.Feature('user_blog', 'hist_like_ct', fileclass=jf.UserBlogHistFile),
    ft.Feature('blog', 'hist_like_ct', fileclass=jf.BlogHistFile),
    ft.Feature('blog', 'hist_post_ct', fileclass=jf.BlogHistFile)
]

# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):
    """Load the raw file data needed to compute the base features."""
    # will store feature key and data frame with data
    retval = dict()
    # features still to be produced, keyed so they can be removed as satisfied
    features_left = dict((f.key, f) for f in feature_list)
    # if reloading is allowed, it is ok to get series from prod store if they exist
    # NOTE(review): source chunk is truncated here — the rest of setup_func
    # is not visible.
# NOTE(review): this chunk begins mid-method — the `return` below is the tail
# of a method (presumably __len__ or a count accessor) of a class defined
# above the visible portion.
return len(self.files)

def veccos(vec1, vec2):
    """Cosine similarity between two sparse vectors.

    Each vector is an iterable of (index, value) pairs; None is treated as
    the zero vector. Returns 0 when either vector is None or has zero norm.
    Note the dot product is a nested scan over both vectors (quadratic in
    their lengths) matching values on equal indexes.
    """
    if (vec1 is None) or (vec2 is None):
        return 0
    else:
        # product of the two Euclidean norms
        denom = sqrt(sum(v*v for (i, v) in vec1)) * sqrt(sum(v*v for (i, v) in vec2))
        if denom == 0:
            return 0
        else:
            # dot product over matching indexes, normalised
            return sum(v1*v2 for (i1, v1) in vec1 for (i2, v2) in vec2 if i1 == i2) / denom

# topic-proximity features between users and posts
feature_list = [
    ft.Feature("user_post", "topic_proximity_rank"),
    ft.Feature("user_post", "topic_proximity_mean"),
    ft.Feature("user_post", "topic_proximity_max")
]

def setup_func(feature_list, store, recreate_setup):
    """Load (or recompute) topic data needed for the proximity features."""
    # store, recreate_setup = ft.prod_store, True
    # Unless recreation is forced, try the cached HDF5 result first;
    # fall through to recomputation on any failure (best-effort cache).
    if not recreate_setup:
        try:
            retval = pandas.HDFStore(store.path + "tmptopicdata.h5")["result"]
            print "loaded stored result"
            return retval
        except:
            print "didn't load stored result"
    # NOTE(review): source chunk is truncated here — the recomputation path
    # is not visible.
import pandas, pandas_ext, param, features as ft
import useful_stuff
from useful_stuff import *

# List of features to be defined by this module: stimulus/response flags for
# every entity type (stim = observation period, resp = the target week).
feature_list = [
    ft.Feature('post', 'is_stim'),
    ft.Feature('post', 'is_resp'),
    ft.Feature('user_post', 'like_is_stim'),
    ft.Feature('user_post', 'like_is_resp'),
    ft.Feature('user', 'is_stim'),
    ft.Feature('user', 'is_resp'),
    ft.Feature('blog', 'is_stim'),
    ft.Feature('blog', 'is_resp')
]

# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):
    """Derive stim/resp flags for posts and likes from the stored universe."""
    # Load universe of posts, set stim and resp.
    # Non-test posts before the week of 2012-08-06 are stim; posts exactly in
    # that week are resp (the cutoff week is hard-coded here).
    post_df = store.post.load_df(store.post.keys())
    post_df['is_stim'] = (post_df.is_test == False) & (post_df.week < datetime(2012, 8, 6))
    post_df['is_resp'] = (post_df.is_test == False) & (post_df.week == datetime(2012, 8, 6))
    # Load universe of likes, set stim and resp; keep only actual likes
    like_df = store.user_post.load_df(store.user_post.keys())
    like_df = like_df[like_df.is_like]
    # NOTE(review): source chunk is truncated here — the rest of setup_func
    # is not visible.
# NOTE(review): this chunk begins mid-function — the lines down to the
# `return` are the tail of an index-building function whose `def` (and the
# `user_blog` frame referenced in the return) are above the visible portion.
# Restrict to response-period (user, blog) pairs that have a topic rank.
user_post = store.user_post.load_df(["topic_proximity_rank"], reset_index = True)
post = store.post.load_df(["is_resp", "blog_id"], reset_index = True)
# pop() both filters to response posts and drops the flag column in one step
post = post[post.pop("is_resp").fillna(False)]
user_post = user_post.merge(post, on = "post_id", how = "inner")
user_post = user_post[
    (user_post.user_id.isin(store.user['is_resp'].get_matching_indexes(True)))
    & (user_post.blog_id.isin(store.blog['is_resp'].get_matching_indexes(True)))
]
# union of pairs derived from posts and pairs already in user_blog
return set(zip(user_post.user_id, user_post.blog_id)) | set(zip(user_blog.user_id, user_blog.blog_id))

# feature info: language-based proximity features
feature_list = [
    ft.Feature("user_blog", "lang_proximity"),
    ft.Feature("user", "is_english"),
    ft.Feature("blog", "is_english")
]

def setup_func(feature_list, store, recreate_setup):
    """Load language data needed to compute the language features."""
    # store, recreate_setup = ft.dev_store, False
    # Disabled pickle-cache fast path (kept for reference):
    # if not (recreate_setup):
    #     try:
    #         retval = cPickle.load(open(
    #             param.folders.root + "languagedata/" + store.name + "/all_setup_data.pickle", "r"))
    #         return retval
    #     except:
    #         pass
    # NOTE(review): source chunk is truncated here — the rest of setup_func
    # is not visible.
df = df[(df.user_blog_hist_like_ct > 0) | (df.user_blog_all_user_like_blog_post_share > 0) | ((df.user_blog_pagerank_by_like_share_postrank > 0) & (df.user_blog_pagerank_by_like_share_postrank < 1000)) | ((df.user_post_topic_proximity_rank > 0) & (df.user_post_topic_proximity_rank < 1000))] # return data print "End candidates", nowstring() return df[["user_id", "post_id"]] # list of ftures to be defined by this module feature_list = [ ft.Feature('post', 'author'), ft.Feature('blog', 'author_ct'), ft.Feature('user', 'as_author_post_ct'), ft.Feature('user', 'as_author_post_user_like_share'), ft.Feature('user_post', 'author_post_ct'), ft.Feature('user_post', 'author_like_ct'), ft.Feature('user_post', 'blog_post_author_post_share'), ft.Feature('user_post', 'blog_like_author_like_share'), ft.Feature('user_post', 'author_post_user_like_share'), ft.Feature('user_post', 'author_like_user_like_share'), ft.Feature('user_post', 'user_like_author_post_share'), ft.Feature('user_post', 'user_is_blog_author'), ft.Feature('user_post', 'user_is_post_author') ]