def main():
    """Preprocess implicit-feedback ratings into train/test CSVs.

    Filters low-activity users, remaps IDs to dense ranges, holds out each
    user's chronologically last item for the test set, and samples
    ``args.negatives`` negative items per user.
    """
    args = parse_args()
    np.random.seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    by_user = df.groupby(USER_COLUMN)
    df = by_user.filter(lambda group: len(group) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    # Dense re-indexing: original id -> position in first-seen order.
    user_map = dict(zip(original_users, range(len(original_users))))
    item_map = dict(zip(original_items, range(len(original_items))))

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))

    print("Generating {} negative samples for each user"
          .format(args.negatives))
    for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)):  # noqa: E501
        # The last (newest) interaction becomes the positive test example.
        test_item = user_to_items[user].pop()
        all_ratings.remove((user, test_item))
        candidate_negs = sorted(all_items - set(user_to_items[user]))  # determinism
        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(candidate_negs, args.negatives)))

    print("Saving train and test CSV files to {}".format(args.output))

    def write_tsv(frame, filename):
        # Header-less tab-separated output, the format downstream expects.
        frame.to_csv(os.path.join(args.output, filename),
                     index=False, header=False, sep='\t')

    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    write_tsv(df_train_ratings, TRAIN_RATINGS_FILENAME)

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    write_tsv(df_test_ratings, TEST_RATINGS_FILENAME)

    write_tsv(pd.DataFrame(test_negs), TEST_NEG_FILENAME)
def main():
    """Filter, remap, time-split ratings and pickle the train/test sets.

    Reads raw implicit-feedback data from ``args.path``, keeps users with at
    least ``MIN_RATINGS`` interactions, factorizes IDs into dense ranges,
    and writes each user's last interaction to the test pickle and the rest
    to the train pickle under ``args.output``.
    """
    args = parse_args()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    LOGGER.log(key=tags.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # now we have filtered and sorted by time data, we can split test data out
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    # Fix: use the USER_COLUMN constant rather than a hard-coded 'user_id'
    # literal, consistent with every other column reference in this function.
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # need to pop for each group
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])
    train_data = train_data.sort_values([USER_COLUMN, ITEM_COLUMN])

    train_data.to_pickle(args.output + '/train_ratings.pickle')
    test_data.to_pickle(args.output + '/test_ratings.pickle')
def main():
    """Filter, remap, time-split ratings and save them as torch tensors.

    Same pipeline as the pickle variant: keep active users, factorize IDs,
    sort by timestamp, take each user's last interaction as test data, and
    save train/test tensors to ``args.output``.
    """
    args = parse_args()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # now we have filtered and sorted by time data, we can split test data out
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    # Fix: use the USER_COLUMN constant rather than a hard-coded 'user_id'
    # literal, consistent with every other column reference in this function.
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # need to pop for each group
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    # Note: no way to keep reference training data ordering because use of
    # python set and multi-process. It should not matter since it will be
    # later randomized again.

    # save train and val data that is fixed.
    train_ratings = torch.from_numpy(train_data.values)
    torch.save(train_ratings, args.output + '/train_ratings.pt')
    test_ratings = torch.from_numpy(test_data.values)
    torch.save(test_ratings, args.output + '/test_ratings.pt')
def main():
    """Legacy preprocessing: filter, remap, split, and dump TSV files.

    Uses a fixed RNG seed and the module-level NUMBER_NEGATIVES constant;
    holds out each user's last interaction as the test positive.
    """
    # TODO: Add random seed as parameter
    np.random.seed(0)
    args = parse_args()

    df = implicit_load(args.path, sort=False)

    # Named constant instead of an inline magic number: a user must have at
    # least this many ratings to be kept (matches MIN_RATINGS elsewhere).
    min_ratings = 20
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= min_ratings)

    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in df.itertuples():
        user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
    for user in range(len(original_users)):
        test_item = user_to_items[user].pop()
        all_ratings.remove((user, test_item))
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism
        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, NUMBER_NEGATIVES)))

    # serialize
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')
def main():
    """Build history-augmented train/test rows plus per-user negatives.

    Each output row is ``[user, item, last-history_size previous items...]``.
    Negatives are sampled without replacement via random.sample.
    """
    args = parse_args()
    np.random.seed(args.seed)

    print("\nLoading raw data from {}\n".format(args.file))
    df = implicit_load(args.file, sort=False)

    print(
        "\nFiltering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()
    nb_users = len(original_users)
    nb_items = len(original_items)

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(
            getattr(row, ITEM_COLUMN))  # noqa: E501

    train_ratings = []
    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))

    print(
        "Generating {} negative samples for each user and creating training set"
        .format(args.negatives))
    for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)):  # noqa: E501
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism
        negs = random.sample(all_negs, args.negatives)

        # Newest interaction becomes the test positive.
        test_item = user_to_items[user].pop()
        all_ratings.remove((user, test_item))

        tmp = [user, test_item]
        tmp.extend(negs)
        test_negs.append(list(tmp))

        tmp = [user, test_item]
        tmp.extend(user_to_items[user][-args.history_size:])
        test_ratings.append(list(tmp))

        # Pop from the newest end until only history_size items remain; each
        # popped item becomes a training target with its preceding history.
        while len(user_to_items[user]) > args.history_size:
            tgItem = user_to_items[user].pop()
            tmp = [user, tgItem]
            tmp.extend(user_to_items[user][-args.history_size:])
            train_ratings.append(list(tmp))

    print("\nSaving train and test CSV files to {}".format(args.output))

    df_train_ratings = pd.DataFrame(list(train_ratings))

    print('Saving data description ...')
    # Fix: use a context manager so the summary file is flushed and closed
    # even on error (the original open() handle was never closed), and write
    # under args.output with the parsed history size instead of the
    # unrelated OUTPUT / HISTORY_SIZE globals — every other artifact in this
    # function goes to args.output.
    with open(os.path.join(args.output, 'data_summary.txt'), 'w') as f_writer:
        f_writer.write('users = ' + str(nb_users) + ', items = ' +
                       str(nb_items) + ', history_size = ' +
                       str(args.history_size) + ', train_entries = ' +
                       str(len(df_train_ratings)))

    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')

    print("Data preprocess done!\n")
def main():
    """Preprocess ratings with optional perturbation modes for testing.

    ``args.test`` selects a deliberate data perturbation (add/remove users,
    items, or test positives) used to exercise downstream validation; the
    normal path factorizes IDs, time-splits train/test, samples negatives,
    and saves tensors plus a feature spec to ``args.output``.
    """
    args = parse_args()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    if args.test == 'less_user':
        to_drop = set(list(df[USER_COLUMN].unique())[-100:])
        df = df[~df[USER_COLUMN].isin(to_drop)]
    if args.test == 'less_item':
        to_drop = set(list(df[ITEM_COLUMN].unique())[-100:])
        df = df[~df[ITEM_COLUMN].isin(to_drop)]
    if args.test == 'more_user':
        sample = df.sample(frac=0.2).copy()
        sample[USER_COLUMN] = sample[USER_COLUMN] + 10000000
        # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in
        # 2.0; pd.concat is the supported equivalent (same index behavior).
        df = pd.concat([df, sample])
        users = df[USER_COLUMN]
        df = df[users.isin(users[users.duplicated(
            keep=False)])]  # make sure something remains in the train set
    if args.test == 'more_item':
        sample = df.sample(frac=0.2).copy()
        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 10000000
        df = pd.concat([df, sample])

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)

    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    if args.valid_negative > 0:
        test_negs = test_negs.reshape(-1, args.valid_negative)
    else:
        test_negs = test_negs.reshape(test_data.shape[0], 0)

    if args.test == 'more_pos':
        mask = np.random.rand(len(test_data)) < 0.5
        sample = test_data[mask].copy()
        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 5
        test_data = pd.concat([test_data, sample])
        test_negs_copy = test_negs[mask]
        test_negs = torch.cat((test_negs, test_negs_copy), dim=0)
    if args.test == 'less_pos':
        mask = np.random.rand(len(test_data)) < 0.5
        test_data = test_data[mask]
        test_negs = test_negs[mask]

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
    test_items_pos = test_ratings[:, 1:2]
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1,
                                                  dim=0)
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs),
                           dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos,
                                       dtype=torch.float32).repeat(
                                           1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels),
                            dim=1).reshape(-1, 1)
    dtypes = {
        'user': str(test_users.dtype),
        'item': str(test_items.dtype),
        'label': str(test_labels.dtype)
    }
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    if args.test == 'other_names':
        dtypes = {
            'user_2': str(test_users.dtype),
            'item_2': str(test_items.dtype),
            'label_2': str(test_labels.dtype)
        }
        save_feature_spec(user_cardinality=user_cardinality,
                          item_cardinality=item_cardinality,
                          dtypes=dtypes,
                          test_negative_samples=args.valid_negative,
                          output_path=args.output + '/feature_spec.yaml',
                          user_feature_name='user_2',
                          item_feature_name='item_2',
                          label_feature_name='label_2')
    else:
        save_feature_spec(user_cardinality=user_cardinality,
                          item_cardinality=item_cardinality,
                          dtypes=dtypes,
                          test_negative_samples=args.valid_negative,
                          output_path=args.output + '/feature_spec.yaml')
def main():
    """Timed variant of the preprocessing pipeline, reported via ``exp``.

    Same split logic as the plain CSV variant, wrapped in a chrono timer:
    filter users, remap IDs, hold out each user's newest item as the test
    positive, sample negatives, and write TSVs.
    """
    args = parse_args()
    device = exp.get_device()
    chrono = exp.chrono()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    # ------------------------------------------------------------------
    with chrono.time('task', skip_obs=0):
        print("Filtering out users with less than {} ratings".format(
            MIN_RATINGS))
        by_user = df.groupby(USER_COLUMN)
        # mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
        df = by_user.filter(lambda group: len(group) >= MIN_RATINGS)

        print("Mapping original user and item IDs to new sequential IDs")
        original_users = df[USER_COLUMN].unique()
        original_items = df[ITEM_COLUMN].unique()

        # Dense re-indexing in first-seen order.
        user_map = dict(zip(original_users, range(len(original_users))))
        item_map = dict(zip(original_items, range(len(original_items))))

        df[USER_COLUMN] = df[USER_COLUMN].apply(lambda u: user_map[u])
        df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda i: item_map[i])

        assert df[USER_COLUMN].max() == len(original_users) - 1
        assert df[ITEM_COLUMN].max() == len(original_items) - 1

        print("Creating list of items for each user")
        # Need to sort before popping to get last item
        df.sort_values(by='timestamp', inplace=True)
        all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
        user_to_items = defaultdict(list)
        for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
            user_to_items[getattr(row, USER_COLUMN)].append(
                getattr(row, ITEM_COLUMN))  # noqa: E501

        test_ratings = []
        test_negs = []
        all_items = set(range(len(original_items)))

        print("Generating {} negative samples for each user".format(
            args.negatives))
        # mlperf notes: PREPROC_HP_NUM_EVAL = args.negatives; np.random.choice
        # samples with replacement by default; the first random call below is
        # what starts the mlperf clock (RUN_START / INPUT_STEP_EVAL_NEG_GEN).
        for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)):  # noqa: E501
            test_item = user_to_items[user].pop()
            all_ratings.remove((user, test_item))
            candidate_negs = sorted(all_items - set(user_to_items[user]))  # determinism
            test_ratings.append((user, test_item))
            test_negs.append(list(np.random.choice(candidate_negs, args.negatives)))

        print("Saving train and test CSV files to {}".format(args.output))

        df_train_ratings = pd.DataFrame(list(all_ratings))
        df_train_ratings['fake_rating'] = 1
        df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                                index=False, header=False, sep='\t')
        # mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(df_train_ratings))

        df_test_ratings = pd.DataFrame(test_ratings)
        df_test_ratings['fake_rating'] = 1
        df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                               index=False, header=False, sep='\t')

        df_test_negs = pd.DataFrame(test_negs)
        df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                            index=False, header=False, sep='\t')
    # ------------------------------------------------------------------
    exp.report()
def main():
    """Preprocess ratings into torch tensors plus a feature-spec YAML.

    Factorizes user/item IDs, time-splits each user's last interaction into
    the test set, samples ``args.valid_negative`` negatives per test
    positive, and saves train/test tensors and a feature spec under
    ``args.output``.
    """
    args = parse_args()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # assuming it keeps order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)

    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    # Fix: reshape(-1, 0) is ambiguous and raises when no negatives are
    # requested; give the reshape an explicit row count in that case,
    # matching the sibling preprocessing variant in this file.
    if args.valid_negative > 0:
        test_negs = test_negs.reshape(-1, args.valid_negative)
    else:
        test_negs = test_negs.reshape(test_data.shape[0], 0)

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    test_users_pos = test_ratings[:, 0:1]  # slicing instead of indexing to keep dimensions
    test_items_pos = test_ratings[:, 1:2]
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1,
                                                  dim=0)
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs),
                           dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos,
                                       dtype=torch.float32).repeat(
                                           1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels),
                            dim=1).reshape(-1, 1)

    dtypes = {
        'user': str(test_users.dtype),
        'item': str(test_items.dtype),
        'label': str(test_labels.dtype)
    }
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    save_feature_spec(user_cardinality=user_cardinality,
                      item_cardinality=item_cardinality,
                      dtypes=dtypes,
                      test_negative_samples=args.valid_negative,
                      output_path=args.output + '/feature_spec.yaml')