def extract_pgt(config, p, k, t, d):
    """Build and persist the PGT feature file for one (dataset p, mode k,
    time threshold t, distance threshold d) combination.

    Loads the personal (P), global (G) and temporal (T) factors as enabled
    by the ``config['kwargs']['pgt']['extract_pgt']`` flags, merges them on
    the (user1, user2) pair, labels each pair as friend/stranger, and writes
    the result to CSV. Work is skipped when the output file already exists.
    """
    names = config['dataset']
    out_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([out_root, names[p]]))
    ### Choose the (optionally bz2-compressed) output filename template
    if config['kwargs']['compress_output'] is True:
        template = config['intermediate']['pgt']['pgt_output_compressed']
        compression = 'bz2'
    else:
        template = config['intermediate']['pgt']['pgt_output']
        compression = None
    out_file = '/'.join([out_root, names[p], template.format(p, k, t, d)])
    if is_file_exists(out_file) is not False:
        return  # already extracted
    flags = config['kwargs']['pgt']['extract_pgt']
    if flags['run'] is not True:
        return
    g1 = g2 = g3 = g4 = None
    ### Extract each enabled factor
    if flags['personal'] is True:
        g1, g2 = personal_factor(config, p, k, t, d)  ### P in PGT
        debug('Finished loading personal factor', 'p', p, 'k', k, 't', t, 'd', d)
    if flags['global'] is True:
        g3 = global_factor(config, p, k, t, d, g2)  ### PG in PGT
        debug('Finished loading global factor', 'p', p, 'k', k, 't', t, 'd', d)
    if flags['temporal'] is True:
        g4 = temporal_factor(config, p, k, t, d, g2)  ### PGT in PGT
        debug('Finished loading temporal factor', 'p', p, 'k', k, 't', t, 'd', d)
    ### Merge all factors together (only when every factor is available)
    if flags['merge'] is True and all(
            g is not None for g in (g1, g2, g3, g4)):
        keys = ['user1', 'user2']
        merged = g1[keys + ['g1']].merge(g2[keys + ['g2']], on=keys)
        merged = merged.merge(g3[keys + ['g3']], on=keys)
        merged = merged.merge(g4[keys + ['g4']], on=keys)
        friend_df = extract_friendships(names[p], config)
        merged = determine_social_tie(merged, friend_df)
        merged.to_csv(out_file, header=True, index=False,
                      compression=compression)
def test_colocation_stats():
    """Report co-location statistics for every active dataset/mode/t/d combo.

    For each combination it streams the co-location file in 10**6-row chunks
    (to bound memory), labels each pair as friend/stranger, and prints via
    ``debug``: total co-locations, distinct user pairs, and friend pairs.

    Fixes vs. original: the config was read twice with a hard-coded filename
    instead of ``config_name``; dead commented-out debug code and the unused
    chunk counter ``i`` are removed.
    """
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config (once)
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        ### Friendships are per-dataset: load once, reuse across modes/t/d
        friend_df = extract_friendships(dataset_name, config)
        for mode in modes:
            k = all_modes.index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    total_user = 0
                    total_friend = 0
                    total_colocation = 0
                    for colocation_df in read_colocation_file(
                            config, p, k, t, d,
                            chunksize=10**6,
                            usecols=['user1', 'user2']):
                        colocation_df = determine_social_tie(
                            colocation_df, friend_df)
                        total_colocation += len(colocation_df)
                        ### Count each user pair once for user/friend totals
                        colocation_df = colocation_df.drop_duplicates(
                            ['user1', 'user2'])
                        total_user += len(colocation_df)
                        total_friend += sum(colocation_df['link'])
                    debug(total_colocation, total_user, total_friend,
                          p, k, t, d)
                    gc.collect()
    debug('Finished Test on SCI+')
def generating_walk2friend_data():
    """Export check-ins and friendships in the walk2friend input format.

    For each active dataset/mode pair this writes, into the intermediate
    directory:
      * ``<dataset>_<mode>_10.checkin`` with columns (mid, uid, locid)
      * ``<dataset>_<mode>_10.friends`` with columns (u1, u2)

    Fixes vs. original: the ``'user'`` column was renamed to a garbage name
    instead of ``'uid'`` (the column selected immediately afterwards, which
    would raise KeyError); the duplicate ``read_config`` call and duplicate
    ``friend_df.sort_values`` call are removed.
    """
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config (once)
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    directory = config['directory']['intermediate']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run Test on Dataset', dataset_name, p, 'Mode', mode, k)
            ### Check-ins: sequential message ids, walk2friend column names
            checkins, _ = extract_checkins_all(dataset_name, mode, config)
            checkins.sort_values(["user", "timestamp"], inplace=True)
            checkins['mid'] = range(1, len(checkins) + 1)
            ### BUGFIX: 'user' must map to 'uid' — that column is selected below
            checkins.rename(columns={
                "user": "uid",
                "location": "locid"
            }, inplace=True)
            checkins = checkins[['mid', 'uid', 'locid']]
            checkins.to_csv('/'.join(
                [directory, '%s_%s_10.checkin' % (dataset_name, mode)]),
                index=False, header=True)
            ### Friendships
            friend_df = extract_friendships(dataset_name, config)
            friend_df.sort_values(["user1", "user2"], inplace=True)
            friend_df.rename(columns={
                "user1": "u1",
                "user2": "u2"
            }, inplace=True)
            friend_df.to_csv('/'.join(
                [directory, '%s_%s_10.friends' % (dataset_name, mode)]),
                index=False, header=True)
def extract_colocation_features(stat_lp, config, p, k, t, d):
    """Compute and persist SCI evaluation statistics for one (p, k, t, d).

    Labels every co-located check-in pair as friend/stranger, aggregates
    per-pair statistics via ``aggregate_stats`` and writes them out with
    ``write_statistics``. Skipped entirely when the SCI intermediate file
    already exists.
    """
    debug('p', p, 'k', k, 't', t, 'd', d)
    ### Check if SCI intermediate exists
    names = config['dataset']
    root = config['directory']['sci']
    make_sure_path_exists('/'.join([root, names[p]]))
    if config['kwargs']['compress_output'] is True:
        template = config['intermediate']['sci']['evaluation_compressed']
    else:
        template = config['intermediate']['sci']['evaluation']
    sci_name = '/'.join([root, names[p], template.format(p, k, t, d)])
    if is_file_exists(sci_name):
        debug('File %s exists' % sci_name)
        return
    ### Read (original) friendship from file and tag each co-located pair
    ### as friend / stranger
    friend_df = extract_friendships(names[p], config)
    colocation_df = determine_social_tie(
        read_colocation_file(config, p, k, t, d), friend_df)
    debug('#colocations', len(colocation_df), 'p', p, 'k', k, 't', t, 'd', d)
    ### Stability value for each co-location pair, then write to CSV
    grouped = aggregate_stats(
        colocation_df.groupby(['user1', 'user2', 'link']),
        stat_lp, p, k, t, d)
    write_statistics(grouped, config, p, k, t, d)
    ### Memory management
    del friend_df
    del colocation_df
    del grouped
    debug('Finished extract_colocation_features',
          'p', p, 'k', k, 't', t, 'd', d)
def test_checkin_stats():
    """Print per-dataset/mode check-in statistics via ``debug``.

    Emitted values, in order: n_user_ori, n_user_friend, n_user_filter,
    n_checkins_ori, n_checkins_friend, n_checkins_filter, n_friend,
    n_locs, avg_checkin_ori, avg_checkin_friend, avg_checkin_filter.

    Fix vs. original: a large slab of dead commented-out debug code
    (split mid-string by line mangling) is removed; the computation is
    unchanged.
    """
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    n_core = kwargs['n_core']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run Test on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            ### Filtered check-ins (default extraction filter applied)
            checkins, _ = extract_checkins_all(dataset_name, mode, config)
            checkins['u_count'] = checkins.groupby('user')['user'].transform(
                'count')
            ### Unfiltered check-ins, restricted to users with >1 check-in
            df, _ = extract_checkins_all(dataset_name, mode, config,
                                         filter=False)
            df['u_count'] = df.groupby('user')['user'].transform('count')
            df = df[(df['u_count'] > 1)]
            n_user_ori = len(df['user'].unique())
            n_checkins_ori = len(df)
            n_checkins_filter = len(checkins)
            ### Friendships (n_friend counted BEFORE the check-in filter below)
            friend_df = extract_friendships(dataset_name, config)
            n_friend = len(friend_df)
            uids = checkins['user'].unique()
            n_user_filter = len(uids)
            locs = checkins['location'].unique()
            n_locs = len(locs)
            ### Keep only edges where both endpoints have (filtered) check-ins
            friend_match_checkin = friend_df.isin(uids)
            friend_df = friend_df[friend_match_checkin['user1']
                                  & friend_match_checkin['user2']]
            friend_ids = np.unique(
                np.concatenate((friend_df['user1'].values,
                                friend_df['user2'].values)))
            ### Check-ins belonging to users that appear in a friendship edge
            checkins = df.loc[df['user'].isin(friend_ids)]
            n_user_friend = len(checkins['user'].unique())
            n_checkins_friend = len(checkins)
            ### NOTE(review): the divisions assume non-empty user sets;
            ### an empty dataset would raise ZeroDivisionError — confirm
            ### whether that is acceptable for this test harness.
            avg_checkin_ori = n_checkins_ori / n_user_ori
            avg_checkin_filter = n_checkins_filter / n_user_filter
            avg_checkin_friend = n_checkins_friend / n_user_friend
            debug(n_user_ori, n_user_friend, n_user_filter, n_checkins_ori,
                  n_checkins_friend, n_checkins_filter, n_friend, n_locs,
                  avg_checkin_ori, avg_checkin_friend, avg_checkin_filter)
    debug('Finished Test on SCI+')