def main(db_fpath, db_name, cross_val_folder, probs_folder):
    '''
    Runs the experiment using probabilities which were pre-computed and
    stored under `probs_folder`.
    '''
    #Load the cross validation dictionaries from disk
    cross_val = load_train_test_validation(cross_val_folder)
    user_items_to_filter, user_validation_tags, user_test_tags = cross_val

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)

        #Drop the held-out (user, item) pairs from the trace
        filtered = FilteredUserItemAnnotations(user_items_to_filter)
        filtered_annots = filtered.annotations(reader.iterate())

        estimator = PrecomputedEstimator(probs_folder)
        calculator = ValueCalculator(estimator, filtered_annots)
        run_exp(user_validation_tags, user_test_tags, estimator, calculator)
def sanity_check(reader, user_items_to_filter):
    '''
    A simple sanity check to verify that we did not delete any user, item
    or tag from the trace.
    '''
    #Collect every user, item and tag present in the unfiltered trace
    all_users = set()
    all_items = set()
    all_tags = set()
    for annot in reader.iterate():
        all_users.add(annot['user'])
        all_items.add(annot['item'])
        all_tags.add(annot['tag'])

    #Replay the trace through the filter and collect the same three sets
    annot_filter = FilteredUserItemAnnotations(user_items_to_filter)
    seen_users = set()
    seen_items = set()
    seen_tags = set()
    for annot in annot_filter.annotations(reader.iterate()):
        user = annot['user']
        item = annot['item']
        tag = annot['tag']

        assert user in all_users
        assert item in all_items
        assert tag in all_tags
        #Filtered (user, item) pairs must never come through
        if user in user_items_to_filter:
            assert item not in user_items_to_filter[user]

        seen_users.add(user)
        seen_items.add(item)
        seen_tags.add(tag)

    #The filter may only drop (user, item) pairs, never whole entities.
    #Set equality is equivalent to the original len + symmetric_difference
    #checks taken together.
    assert seen_users == all_users
    assert seen_items == all_items
    assert seen_tags == all_tags
def main(db_fpath, db_name, cross_val_folder, probs_folder):
    '''
    Runs the experiment using pre-computed probabilities, passing the raw
    reader and the annotation filter down to `run_exp`.
    '''
    #Load the cross validation dictionaries from disk
    cross_val = load_train_test_validation(cross_val_folder)
    user_items_to_filter, user_validation_tags, user_test_tags, \
            user_test_items = cross_val

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)

        filtered = FilteredUserItemAnnotations(user_items_to_filter)
        estimator = PrecomputedEstimator(probs_folder)
        run_exp(user_validation_tags, user_test_tags, user_test_items,
                estimator, filtered, reader)
def main(db_fpath, db_name, cross_val_folder, param_value, est_name,
         rand_seed=None, num_cores=-1):
    '''Dispatches jobs in multiple cores'''
    #NOTE(review): `num_cores` is not used in this body — kept for interface
    #compatibility with callers; confirm whether it should feed a pool here.
    seed(rand_seed)

    #get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = \
            load_train_test_validation(cross_val_folder)

    #all tags used by all users. Used to create a random set of tags
    #excluding these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)
        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        #Generate 50 random tags not used by any user the test set
        #Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        #Auxiliary set for O(1) membership tests; the list keeps the
        #first-seen order (testing membership on the list was O(n^2)).
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation['user']
            item = annotation['item']
            tag = annotation['tag']

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)
        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        #Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        #Create estimator from a fresh pass over the filtered trace
        annotations = annot_filter.annotations(reader.iterate())
        if est_name == 'lda':
            est = create_lda_estimator(annotations, param_value, num_items,
                                       num_tags)
        else:
            est = create_bayes_estimator(annotations, param_value)

        annotations = annot_filter.annotations(reader.iterate())
        value_calc = ValueCalculator(est, annotations)
        run_exp(user_items_to_filter, user_test_tags, user_to_item,
                num_items, random_tags, value_calc)
def run_one(args):
    """
    This method will be run by parallel processes. Basically, it is the
    main method for each possible parameter being tested. It will work
    as follows:

    1. Loads train, validation and test separation from files
    2. Values of p(i|u) are computed for the gamma items set for each user
       based on the train set. Gamma items is just every item excluding
       the user items.
    3. Computes p(i|t,u) for a set of tags gamma items for each user.
       The set of tags is composed of the previous user tags (those on the
       test set), the tags which were used on the validation set, the tags
       used on the train set and 50 random tags not previously used by
       the user.
    4. Saves p(i|u) and p(i|t,u) for items and tags considered above on the
       output folder. This provides sufficient information for choosing the
       best estimator (on the validation set) and performing further
       experiments (actually computing tag values) on the test set.
    """
    # unbox arguments
    db_fpath, db_name, output_folder, cross_val_folder, est_name, \
            param_one, value_one, param_two, value_two = args

    # get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = \
            load_train_test_validation(cross_val_folder)

    # all tags used by all users. Used to create a random set of tags
    # excluding these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)
        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        # Generate 50 random tags not used by any user in validation or test
        # Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        # Auxiliary set for O(1) membership tests; the list keeps the
        # first-seen order (testing membership on the list was O(n^2)).
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation["user"]
            item = annotation["item"]
            tag = annotation["tag"]

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)
        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        # Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        # Create estimator from a fresh pass over the filtered trace
        annotations = annot_filter.annotations(reader.iterate())
        save_lhood = False
        if est_name == "lda":
            est = create_lda_estimator(annotations, value_one, num_items,
                                       num_tags, value_two)
            save_lhood = True
        else:
            est = create_bayes_estimator(annotations, value_one, value_two)

        # One output sub-folder per (param, value) combination being tested
        param_out_folder = os.path.join(
            output_folder,
            "params-%s-%f_%s-%f" % (param_one, value_one, param_two, value_two),
        )
        os.mkdir(param_out_folder)

        run_exp(
            user_items_to_filter,
            user_validation_tags,
            user_test_tags,
            user_to_item,
            num_items,
            random_tags,
            est,
            param_out_folder,
            save_lhood,
        )
def run_one(args):
    '''
    This method will be run by parallel processes. Basically, it is the
    main method for each possible parameter being tested. It will work
    as follows:

    1. Loads train, validation and test separation from files
    2. Values of p(i|u) are computed for the gamma items set for each user
       based on the train set. Gamma items is just every item excluding
       the user items.
    3. Computes p(i|t,u) for a set of tags gamma items for each user.
       The set of tags is composed of the previous user tags (those on the
       test set), the tags which were used on the validation set, the tags
       used on the train set and 50 random tags not previously used by
       the user.
    4. Saves p(i|u) and p(i|t,u) for items and tags considered above on the
       output folder. This provides sufficient information for choosing the
       best estimator (on the validation set) and performing further
       experiments (actually computing tag values) on the test set.
    '''
    #unbox arguments
    db_fpath, db_name, output_folder, cross_val_folder, est_name, \
            param_one, value_one, param_two, value_two = args

    #get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = \
            load_train_test_validation(cross_val_folder)

    #all tags used by all users. Used to create a random set of tags
    #excluding these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)
        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        #Generate 50 random tags not used by any user in validation or test
        #Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        #Auxiliary set for O(1) membership tests; the list keeps the
        #first-seen order (testing membership on the list was O(n^2)).
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation['user']
            item = annotation['item']
            tag = annotation['tag']

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)
        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        #Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        #Create estimator from a fresh pass over the filtered trace
        annotations = annot_filter.annotations(reader.iterate())
        save_lhood = False
        if est_name == 'lda':
            est = create_lda_estimator(annotations, value_one, num_items,
                                       num_tags, value_two)
            save_lhood = True
        else:
            est = create_bayes_estimator(annotations, value_one, value_two)

        #One output sub-folder per (param, value) combination being tested
        param_out_folder = os.path.join(output_folder, \
                'params-%s-%f_%s-%f' % \
                (param_one, value_one, param_two, value_two))
        os.mkdir(param_out_folder)

        run_exp(user_items_to_filter, user_validation_tags, user_test_tags,
                user_to_item, num_items, random_tags, est, param_out_folder,
                save_lhood)