def read_indices(dataset):
    if dataset == "youtube":
        indices_filepath = get_package_path() + "/data_folder/uniform_data/youtube/data_splits.txt"
    elif dataset == "reddit":
        indices_filepath = get_package_path() + "/data_folder/uniform_data/reddit/data_splits.txt"
    else:
        raise ValueError("Invalid dataset.")

    with open(indices_filepath, "r") as fp:
        # The header row holds the tab-separated train/validation/test sizes.
        file_row = next(fp)
        clean_row = file_row.strip().split("\t")
        train_size = int(clean_row[0])
        val_size = int(clean_row[1])
        test_size = int(clean_row[2])

        # The remaining rows hold one item index each.
        indices = np.empty(train_size + val_size + test_size, dtype=np.int32)
        i = 0
        for file_row in fp:
            indices[i] = int(file_row.strip())
            i += 1

    train = indices[:train_size]
    val = indices[train_size:train_size + val_size]
    test = indices[train_size + val_size:]

    return train, val, test
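# Hypothetical illustration of the data_splits.txt layout that read_indices
# expects, inferred from the parsing above rather than from an actual file:
# a header row with the tab-separated split sizes, followed by one item index
# per row, ordered train first, then validation, then test.
#
#     3\t1\t1
#     17
#     4
#     231
#     9
#     56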
def form_item_to_popularity(platform):
    folder = get_package_path() + "/data_folder/anonymized_data/" + platform

    output_file_path = get_package_path() + "/data_folder/anonymized_data/" + platform + "/item_to_popularity" + ".txt"

    ####################################################################################################################
    # Extraction functions.
    ####################################################################################################################
    extraction_functions = dict()
    extraction_functions["comment_generator"] = anonymized_extract.comment_generator
    extraction_functions["extract_comment_name"] = anonymized_extract.extract_comment_name
    extraction_functions["extract_parent_comment_name"] = anonymized_extract.extract_parent_comment_name
    extraction_functions["extract_lifetime"] = anonymized_extract.extract_lifetime
    extraction_functions["extract_user_name"] = anonymized_extract.extract_user_name
    extraction_functions["calculate_targets"] = anonymized_extract.calculate_targets
    extraction_functions["anonymous_coward_name"] = "0"

    ####################################################################################################################
    # Iterate over all items.
    ####################################################################################################################
    input_file_path = folder + "/anonymized_data" + ".txt"

    counter = 0
    with open(output_file_path, "w") as fp:
        document_gen = anonymized_extract.document_generator(input_file_path)
        for document in document_gen:
            if counter % 50 == 0:
                print(input_file_path, counter)

            targets = extraction_functions["calculate_targets"](document)

            # One tab-separated row of target values per item.
            fp.write(repr(targets["comments"]) + "\t" +
                     repr(targets["users"]) + "\t" +
                     repr(targets["score_wilson"]) + "\t" +
                     repr(targets["controversiality_wilson"]) + "\n")

            counter += 1
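# The "score_wilson" and "controversiality_wilson" target names suggest
# Wilson-score smoothing of vote ratios. Below is a minimal sketch of the
# standard Wilson lower bound, assuming hypothetical `ups`/`downs` vote counts;
# calculate_targets (defined in anonymized_extract, not shown here) may
# compute these targets differently.
def wilson_lower_bound_sketch(ups, downs, z=1.96):
    # z is the normal quantile for the desired confidence level (1.96 ~ 95%).
    n = ups + downs
    if n == 0:
        return 0.0
    p_hat = float(ups) / n
    centre = p_hat + z * z / (2.0 * n)
    spread = z * np.sqrt(p_hat * (1.0 - p_hat) / n + z * z / (4.0 * n * n))
    return (centre - spread) / (1.0 + z * z / n)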
def read_weights(dataset):
    if dataset == "youtube":
        base_model_filepath = get_package_path() + "/data_folder/models/youtube_model.pkl"
    elif dataset == "reddit":
        base_model_filepath = get_package_path() + "/data_folder/models/reddit_model.pkl"
    else:
        raise ValueError("Invalid dataset.")

    # The model parameters are stored across numbered pickle files; the user
    # embeddings are the first parameter array in the first file.
    file_path_list = list()
    for i in range(3):
        file_path_list.append(base_model_filepath + "." + repr(i))

    with open(file_path_list[0], "rb") as fin:
        params = cPickle.load(fin)

    user_embeddings = params[0]

    return user_embeddings
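# Hypothetical usage sketch; the shape comment is an assumption based on the
# embedding matrices used elsewhere in this package:
#
#     user_embeddings = read_weights("youtube")
#     print(user_embeddings.shape)  # expected: (number_of_users, embedding_size)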
__author__ = "Georgios Rizos ([email protected])" from thread2vec.preprocessing.anonymize_datasets import anonymize_reddit_dataset, anonymize_youtube_dataset from thread2vec.preprocessing.anonymize_datasets import form_item_to_user, form_item_to_popularity from thread2vec.representation.utility import get_data from thread2vec.preprocessing.handcrafted import calculate_reddit_features, calculate_youtube_features from thread2vec.common import get_package_path if "__main__" == __name__: # Anonymize raw data anonymize_reddit_dataset( get_package_path() + "/data_folder/raw_data/reddit", get_package_path() + "/data_folder/anonymized_data/reddit") anonymize_youtube_dataset( get_package_path() + "/data_folder/raw_data/youtube", get_package_path() + "/data_folder/anonymized_data/youtube") # Form item to responding users arrays for different time scales. for scale in ["post", "min", "hour", "day", "week", "inf"]: form_item_to_user("youtube", scale) form_item_to_user("reddit", scale) # Extract label values from raw data. form_item_to_popularity("youtube") form_item_to_popularity("reddit") # Calculate engineered features. calculate_reddit_features() calculate_youtube_features() # Store data splits.
def calculate_youtube_features():
    input_file_path = get_package_path() + "/data_folder/anonymized_data/youtube/anonymized_data.txt"

    ####################################################################################################################
    # Iterate over all videos.
    ####################################################################################################################
    number_of_items = 411288  # TODO: Make this readable.

    graph_generator = form_graphs([input_file_path, ], item_id_set=set(range(number_of_items)))

    features_generator = extract_features(graph_generator, "youtube")

    youtube_feature_name_list = sorted(get_handcrafted_feature_names("YouTube"))
    number_of_youtube_features = len(youtube_feature_name_list)

    # One feature matrix per time-scale snapshot.
    features_post = np.empty((number_of_items, number_of_youtube_features), dtype=np.float32)
    features_minute = np.empty((number_of_items, number_of_youtube_features), dtype=np.float32)
    features_hour = np.empty((number_of_items, number_of_youtube_features), dtype=np.float32)
    features_day = np.empty((number_of_items, number_of_youtube_features), dtype=np.float32)
    features_week = np.empty((number_of_items, number_of_youtube_features), dtype=np.float32)
    features_inf = np.empty((number_of_items, number_of_youtube_features), dtype=np.float32)

    features_dict = dict()
    features_dict[0] = features_post
    features_dict[1] = features_minute
    features_dict[2] = features_hour
    features_dict[3] = features_day
    features_dict[4] = features_week
    features_dict[5] = features_inf

    counter = 0
    for features in features_generator:
        for s, snapshot in enumerate(features["snapshots"]):
            snapshot_features = snapshot["features"]
            for f, feature_name in enumerate(youtube_feature_name_list):
                features_dict[s][counter, f] = np.float32(snapshot_features[feature_name])

            # Propagate this snapshot's features forward to the later time
            # scales; later snapshots, if present, overwrite them, so scales
            # beyond the last available snapshot keep the final values.
            if s < 5:
                for s_extra in range(s + 1, 6):
                    for f, feature_name in enumerate(youtube_feature_name_list):
                        features_dict[s_extra][counter, f] = np.float32(snapshot_features[feature_name])

        counter += 1

    np.save(get_package_path() + "/data_folder/anonymized_data/youtube/features_post", features_post)
    np.save(get_package_path() + "/data_folder/anonymized_data/youtube/features_minute", features_minute)
    np.save(get_package_path() + "/data_folder/anonymized_data/youtube/features_hour", features_hour)
    np.save(get_package_path() + "/data_folder/anonymized_data/youtube/features_day", features_day)
    np.save(get_package_path() + "/data_folder/anonymized_data/youtube/features_week", features_week)
    np.save(get_package_path() + "/data_folder/anonymized_data/youtube/features_inf", features_inf)
def get_data(dataset, scale):
    if dataset == "youtube":
        item_to_userset_filepath = get_package_path() + "/data_folder/anonymized_data/youtube/item_to_userset_" + scale + ".txt"
        anonymize_user_filepath = get_package_path() + "/data_folder/anonymized_data/youtube/anonymize_user_" + scale + ".txt"
        popularity_filepath = get_package_path() + "/data_folder/anonymized_data/youtube/item_to_popularity.txt"
        anonymous_coward_name = repr(0)
        top_users = 200001
        total_number_of_items = 516995
    elif dataset == "reddit":
        item_to_userset_filepath = get_package_path() + "/data_folder/anonymized_data/reddit/item_to_userset_" + scale + ".txt"
        anonymize_user_filepath = get_package_path() + "/data_folder/anonymized_data/reddit/anonymize_user_" + scale + ".txt"
        popularity_filepath = get_package_path() + "/data_folder/anonymized_data/reddit/item_to_popularity.txt"
        anonymous_coward_name = repr(0)
        top_users = 20000
        total_number_of_items = 35844
    else:
        raise ValueError("Invalid dataset.")

    # Read popularity values.
    bad_popularity_items = list()
    popularity_matrix = np.empty((total_number_of_items, 4), dtype=np.float32)
    with open(popularity_filepath, "r") as fp:
        file_row = next(fp)  # Skip the header row.
        item_counter = 0
        for file_row in fp:
            clean_row = file_row.strip().split("\t")
            if clean_row[0] == "None":
                popularity_matrix[item_counter, :] = np.nan
                bad_popularity_items.append(item_counter)
            else:
                popularity_matrix[item_counter, 0] = float(clean_row[0])
                popularity_matrix[item_counter, 1] = float(clean_row[1])
                popularity_matrix[item_counter, 2] = float(clean_row[2])
                popularity_matrix[item_counter, 3] = float(clean_row[3])
            item_counter += 1
    bad_popularity_items = np.array(bad_popularity_items, dtype=np.int32)

    # Read user anonymizer.
    anonymize_user = dict()
    with open(anonymize_user_filepath, "r") as fp:
        for file_row in fp:
            clean_row = file_row.strip().split("\t")
            anonymize_user[clean_row[0]] = int(clean_row[1])
    total_number_of_users = len(anonymize_user)
    true_anonymize_user = copy.copy(anonymize_user)

    user_list = list()
    for i in range(total_number_of_users):
        user_list.append(None)
    for k, v in anonymize_user.items():
        user_list[v] = k

    anonymous_coward_within_discussion = anonymize_user[anonymous_coward_name]

    # Read item to userset; accumulate the sparse incidence matrix in chunks
    # of 10000 rows to keep the coordinate lists small.
    item_to_user_row = list()
    item_to_user_col = list()
    item_to_user_matrix = spsp.coo_matrix(
        (np.array(list(), dtype=np.int32),
         (np.array(list(), dtype=np.int32), np.array(list(), dtype=np.int32))),
        shape=(total_number_of_items, total_number_of_users))
    item_to_user_matrix = spsp.csc_matrix(item_to_user_matrix)
    with open(item_to_userset_filepath, "r") as fp:
        counter = 0
        for file_row in fp:
            clean_row = file_row.strip().split("\t")
            for user in clean_row[1:]:
                item_to_user_row.append(int(clean_row[0]))
                item_to_user_col.append(int(user))
            counter += 1
            if counter % 10000 == 0:
                item_to_user_row = np.array(item_to_user_row, dtype=np.int32)
                item_to_user_col = np.array(item_to_user_col, dtype=np.int32)
                item_to_user_data = np.ones_like(item_to_user_row, dtype=np.int32)
                item_to_user_matrix_to_add = spsp.coo_matrix(
                    (item_to_user_data, (item_to_user_row, item_to_user_col)),
                    shape=(total_number_of_items, total_number_of_users))
                item_to_user_matrix_to_add = spsp.csc_matrix(item_to_user_matrix_to_add)
                item_to_user_matrix = item_to_user_matrix + item_to_user_matrix_to_add
                item_to_user_row = list()
                item_to_user_col = list()
    # Add any remaining entries.
    item_to_user_row = np.array(item_to_user_row, dtype=np.int32)
    item_to_user_col = np.array(item_to_user_col, dtype=np.int32)
    item_to_user_data = np.ones_like(item_to_user_row, dtype=np.int32)
    item_to_user_matrix_to_add = spsp.coo_matrix(
        (item_to_user_data, (item_to_user_row, item_to_user_col)),
        shape=(total_number_of_items, total_number_of_users))
    item_to_user_matrix_to_add = spsp.csc_matrix(item_to_user_matrix_to_add)
    item_to_user_matrix = item_to_user_matrix + item_to_user_matrix_to_add

    # Keep only the most active users (or, when top_users is None, all users
    # that appear in more than one item), always dropping the anonymous coward.
    if top_users is not None:
        user_to_item_distribution = item_to_user_matrix.sum(axis=0)
        user_indices_sorted = np.empty(top_users, dtype=np.int32)
        user_indices_sorted_to_add = np.argsort(user_to_item_distribution)[0, -top_users:]
        user_indices_sorted[:] = user_indices_sorted_to_add
        user_indices_sorted = user_indices_sorted[user_indices_sorted != anonymous_coward_within_discussion]
    else:
        top_users = total_number_of_users
        user_to_item_distribution = np.empty(top_users, dtype=np.int32)
        user_to_item_distribution[:] = item_to_user_matrix.sum(axis=0)[0, :]
        user_indices_sorted = np.arange(user_to_item_distribution.size, dtype=np.int32)
        user_indices_sorted = user_indices_sorted[user_to_item_distribution > 1]
        user_indices_sorted = user_indices_sorted[user_indices_sorted != anonymous_coward_within_discussion]

    user_indices_sorted_set = set(list(user_indices_sorted))
    filtered_item_to_user_matrix = item_to_user_matrix[:, user_indices_sorted]
    new_user_list = list()
    new_anonymize_user = dict()
    counter = 0
    for user in user_list:
        if anonymize_user[user] in user_indices_sorted_set:
            new_user_list.append(user)
            new_anonymize_user[user] = counter
            counter += 1
    user_list = new_user_list
    anonymize_user = new_anonymize_user

    # item_to_user_distribution = filtered_item_to_user_matrix.sum(axis=1)
    # item_to_user_distribution = item_to_user_distribution[item_to_user_distribution > 1]

    # Keep only items with at least one responding user and valid popularity values.
    item_to_user_distribution = np.empty(total_number_of_items, dtype=np.int32)
    item_to_user_distribution[:] = filtered_item_to_user_matrix.sum(axis=1)[:, 0].transpose()
    item_indices_sorted = np.arange(total_number_of_items, dtype=np.int32)
    item_indices_sorted = item_indices_sorted[item_to_user_distribution > 0]
    item_indices_sorted = np.setdiff1d(item_indices_sorted, bad_popularity_items)

    filtered_item_to_user_matrix = spsp.csr_matrix(filtered_item_to_user_matrix)
    filtered_item_to_user_matrix = filtered_item_to_user_matrix[item_indices_sorted, :]
    popularity_matrix = popularity_matrix[item_indices_sorted, :]

    # After item filtering, drop users that no longer appear in any item.
    user_to_item_distribution = np.empty(len(anonymize_user), dtype=np.int32)
    user_to_item_distribution[:] = filtered_item_to_user_matrix.sum(axis=0)[0, :]
    user_indices_sorted = np.arange(user_to_item_distribution.size, dtype=np.int32)
    user_indices_sorted = user_indices_sorted[user_to_item_distribution > 0]
    user_indices_sorted = user_indices_sorted[user_indices_sorted != anonymous_coward_within_discussion]
    user_indices_sorted_set = set(list(user_indices_sorted))
    filtered_item_to_user_matrix = filtered_item_to_user_matrix[:, user_indices_sorted]
    new_user_list = list()
    new_anonymize_user = dict()
    counter = 0
    for user in user_list:
        if anonymize_user[user] in user_indices_sorted_set:
            new_user_list.append(user)
            new_anonymize_user[user] = counter
            counter += 1
    user_list = new_user_list
    anonymize_user = new_anonymize_user

    true_user_id_to_user_id = dict()
    for user in user_list:
        k = true_anonymize_user[user]
        v = anonymize_user[user]
        true_user_id_to_user_id[k] = v

    # Random 50/25/25 train/validation/test split over the filtered items.
    index_1 = int(np.ceil(filtered_item_to_user_matrix.shape[0] * 0.5))
    index_2 = int(np.ceil(filtered_item_to_user_matrix.shape[0] * 0.75))
    index_permutation = np.random.permutation(np.arange(filtered_item_to_user_matrix.shape[0], dtype=np.int32))
    train = index_permutation[:index_1]
    val = index_permutation[index_1:index_2]
    test = index_permutation[index_2:]
    data_splits = (train, val, test)

    data = dict()
    data["filtered_item_to_user_matrix"] = filtered_item_to_user_matrix
    data["popularity_matrix"] = popularity_matrix
    data["item_indices_sorted"] = item_indices_sorted
    data["anonymize_user"] = anonymize_user
    data["true_user_id_to_user_id"] = true_user_id_to_user_id
    data["user_list"] = user_list
    data["number_of_items"] = filtered_item_to_user_matrix.shape[0]
    data["number_of_users"] = filtered_item_to_user_matrix.shape[1]
    data["data_splits"] = data_splits

    return data
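# Hypothetical usage sketch of get_data; the keys match the dictionary filled
# above, and the popularity columns follow the order written by
# form_item_to_popularity (comments, users, score_wilson, controversiality_wilson):
#
#     data = get_data("reddit", "week")
#     train, val, test = data["data_splits"]
#     X = data["filtered_item_to_user_matrix"]  # sparse, items x users
#     y = data["popularity_matrix"][:, 1]       # e.g. the user-count target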
def handcrafted_features_versus_aggregation_comparison(dataset, vlad_clusters):
    method_names = list()
    results_list = list()

    train, val, test = read_indices(dataset)
    train = np.append(train, val)

    data = get_data(dataset, "week")
    y = data["popularity_matrix"]
    y_train = y[train, 2]
    y_test = y[test, 2]

    handcrafted_parameters = ["hour", "day", "week", "final"]

    X_vlad = make_features_vlad(
        dataset,
        number_of_vlad_clusters=vlad_clusters,
        filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
        user_id_set=set(list(data["true_user_id_to_user_id"].values())),
        do_power_norm=True,
        do_l2_norm=True)

    for star in handcrafted_parameters:
        handcrafted_features = np.load(get_package_path() + "/data_folder/uniform_data/" + dataset +
                                       "/features_" + star + ".npy")

        # Handcrafted features alone.
        method_names.append(star)
        X_train = handcrafted_features[train, :]
        X_test = handcrafted_features[test, :]
        model = LinearRegression().fit(X_train, y_train)
        y_pred = model.predict(X_test)
        loss = np.mean(np.power(y_pred - y_test, 2))
        print(loss)
        results_list.append(loss)

        # Handcrafted features combined with the VLAD aggregation.
        method_names.append(star + "vlad" + repr(vlad_clusters))
        X_combined = np.hstack([X_vlad, handcrafted_features])
        X_train = X_combined[train, :]
        X_test = X_combined[test, :]
        model = LinearRegression().fit(X_train, y_train)
        y_pred = model.predict(X_test)
        loss = np.mean(np.power(y_pred - y_test, 2))
        print(loss)
        results_list.append(loss)

    with open(get_package_path() + "/data_folder/uniform_data/" + dataset +
              "/handcrafted_benchmark.txt", "w") as fp:
        for name, loss in zip(method_names, results_list):
            fp.write(name + "\t" + repr(loss) + "\n")

    method_names = np.array(method_names)
    results_list = np.array(results_list)
    indices_sorted = np.argsort(results_list)
    print(method_names[indices_sorted])
    print(results_list[indices_sorted])
def mean_versus_vlad_aggregation(dataset):
    method_names = list()
    results_list = list()
    vlad_parameters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]

    train, val, test = read_indices(dataset)
    train = np.append(train, val)

    data = get_data(dataset, "week")
    y = data["popularity_matrix"]
    y_train = y[train, 1]
    y_test = y[test, 1]

    # Normalization configurations: (name suffix, do_power_norm, do_l2_norm).
    norm_configurations = [("", False, False),
                           ("_pnorm", True, False),
                           ("_l2norm", False, True),
                           ("_allnorm", True, True)]

    def evaluate(method_name, X):
        X_train = X[train, :]
        X_test = X[test, :]
        model = LinearRegression().fit(X_train, y_train)
        y_pred = model.predict(X_test)
        loss = np.mean(np.power(y_pred - y_test, 2))
        print(loss)
        method_names.append(method_name)
        results_list.append(loss)

    ####################################################################################################################
    # Mean
    ####################################################################################################################
    for suffix, do_power_norm, do_l2_norm in norm_configurations:
        X = make_features_mean(
            dataset,
            filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
            user_id_set=set(list(data["true_user_id_to_user_id"].values())),
            do_power_norm=do_power_norm,
            do_l2_norm=do_l2_norm)
        evaluate("mean" + suffix, X)

    ####################################################################################################################
    # VLAD
    ####################################################################################################################
    for vlad_clusters in vlad_parameters:
        for suffix, do_power_norm, do_l2_norm in norm_configurations:
            X = make_features_vlad(
                dataset,
                number_of_vlad_clusters=vlad_clusters,
                filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
                user_id_set=set(list(data["true_user_id_to_user_id"].values())),
                do_power_norm=do_power_norm,
                do_l2_norm=do_l2_norm)
            evaluate("vlad" + repr(vlad_clusters) + suffix, X)

    with open(get_package_path() + "/data_folder/uniform_data/" + dataset +
              "/aggregation_benchmark_user.txt", "w") as fp:
        for name, loss in zip(method_names, results_list):
            fp.write(name + "\t" + repr(loss) + "\n")

    method_names = np.array(method_names)
    results_list = np.array(results_list)
    indices_sorted = np.argsort(results_list)
    print(method_names[indices_sorted])
    print(results_list[indices_sorted])
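# For intuition, a minimal sketch of VLAD-style aggregation over user
# embeddings; make_features_vlad (defined elsewhere in the package) may differ
# in detail, and all names below are illustrative. Assumes a k-means codebook
# fitted on the user-embedding space with scikit-learn.
from sklearn.cluster import KMeans

def vlad_encode_sketch(embeddings, kmeans, do_power_norm=True, do_l2_norm=True):
    # embeddings: (number_of_users_in_item, embedding_size) array holding the
    # embeddings of the users who participated in one discussion.
    embedding_size = embeddings.shape[1]
    assignments = kmeans.predict(embeddings)
    vlad = np.zeros((kmeans.n_clusters, embedding_size), dtype=np.float32)
    for i, cluster in enumerate(assignments):
        # Accumulate the residual between each embedding and its cluster centre.
        vlad[cluster] += embeddings[i] - kmeans.cluster_centers_[cluster]
    vlad = vlad.ravel()
    if do_power_norm:
        # Signed square-root (power) normalization.
        vlad = np.sign(vlad) * np.sqrt(np.abs(vlad))
    if do_l2_norm:
        norm = np.linalg.norm(vlad)
        if norm > 0.0:
            vlad = vlad / norm
    return vlad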
__author__ = "Georgios Rizos ([email protected])" from thread2vec.common import get_package_path from thread2vec.representation.neural_embedding import Thread2Vec from thread2vec.representation.utility import get_data if __name__ == "__main__": data_folder = get_package_path() + "/data_folder" #################################################################################################################### # Run Reddit experiment. #################################################################################################################### batch_size = 64 negative_samples = .4 embedding_size = 64 window_size = 500 learning_rate = 1e-3 dropout = 0.2 dataset = "reddit" data = get_data(dataset, "week") print("Read data.") async_batch_size = 1000 shuffle = True user_user_iterations_number = None number_of_vlad_clusters = 50
def form_item_to_user(platform, time_scale):
    folder = get_package_path() + "/data_folder/anonymized_data/" + platform

    output_file_path = get_package_path() + "/data_folder/anonymized_data/" + platform + "/item_to_userset_" + time_scale + ".txt"
    anonymize_user_file_path = get_package_path() + "/data_folder/anonymized_data/" + platform + "/anonymize_user_" + time_scale + ".txt"

    time_scale_in_seconds = dict()
    time_scale_in_seconds["post"] = 0.0
    time_scale_in_seconds["min"] = 60.0
    time_scale_in_seconds["hour"] = 3600.0
    time_scale_in_seconds["day"] = 86400.0
    time_scale_in_seconds["week"] = 604810.0
    time_scale_in_seconds["inf"] = sys.maxsize

    ####################################################################################################################
    # Extraction functions.
    ####################################################################################################################
    extraction_functions = dict()
    extraction_functions["comment_generator"] = anonymized_extract.comment_generator
    extraction_functions["extract_comment_name"] = anonymized_extract.extract_comment_name
    extraction_functions["extract_parent_comment_name"] = anonymized_extract.extract_parent_comment_name
    extraction_functions["extract_lifetime"] = anonymized_extract.extract_lifetime
    extraction_functions["extract_user_name"] = anonymized_extract.extract_user_name
    extraction_functions["calculate_targets"] = anonymized_extract.calculate_targets
    extraction_functions["anonymous_coward_name"] = "0"

    ####################################################################################################################
    # Iterate over all items.
    ####################################################################################################################
    input_file_path = folder + "/anonymized_data" + ".txt"

    anonymize_user = dict()
    counter = 0
    fp = open(output_file_path, "w")
    document_gen = anonymized_extract.document_generator(input_file_path)
    for document in document_gen:
        if counter % 50 == 0:
            print(input_file_path, counter)

        user_set = list()

        ################################################################################################################
        # Within-discussion anonymization.
        ################################################################################################################
        comment_gen = extraction_functions["comment_generator"](document)
        comment_list = [comment for comment in comment_gen]

        initial_post = comment_list[0]
        initial_timestamp = extraction_functions["extract_lifetime"](initial_post)

        for comment in comment_list:
            # Keep only the users who responded within the time scale.
            lifetime = extraction_functions["extract_lifetime"](comment) - initial_timestamp
            if lifetime > time_scale_in_seconds[time_scale]:
                continue

            user_name = extraction_functions["extract_user_name"](comment)
            user_id = anonymize_user.get(user_name, len(anonymize_user))
            anonymize_user[user_name] = user_id
            user_set.append(user_id)

        user_set = set(user_set)
        user_set = [repr(u) for u in user_set]

        fp.write(repr(counter) + "\t" + "\t".join(sorted(user_set)) + "\n")

        counter += 1
    fp.close()

    with open(anonymize_user_file_path, "w") as fp:
        for k, v in anonymize_user.items():
            fp.write(k + "\t" + repr(v) + "\n")
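# Hypothetical example of one row of the item_to_userset_<scale>.txt output
# written above: the item counter, followed by the tab-separated anonymized
# ids (sorted as strings) of the users who responded within the time scale,
# under the per-time-scale mapping also written to anonymize_user_<scale>.txt:
#
#     42\t0\t1\t3\t7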